diff --git "a/sft/Full_smoe_share/checkpoint-13312/trainer_state.json" "b/sft/Full_smoe_share/checkpoint-13312/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/Full_smoe_share/checkpoint-13312/trainer_state.json" @@ -0,0 +1,226337 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003607395160078, + "eval_steps": 500, + "global_step": 13312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.04675119, + "auxiliary_loss_mlp": 0.02099638, + "balance_loss_clip": 1.77205122, + "balance_loss_mlp": 2.27198935, + "epoch": 6.012325266796934e-05, + "flos": 24455432897280.0, + "grad_norm": 54.633257353768954, + "language_loss": 2.84989119, + "learning_rate": 0.0, + "loss": 1.94246852, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 24.0, + "step": 1, + "time_per_iteration": 19.307077646255493 + }, + { + "auxiliary_loss_clip": 0.03143228, + "auxiliary_loss_mlp": 0.01384828, + "balance_loss_clip": 1.18112338, + "balance_loss_mlp": 1.51281738, + "epoch": 0.00012024650533593868, + "flos": 20225010188160.0, + "grad_norm": 36.066244838101376, + "language_loss": 1.82575774, + "learning_rate": 4e-06, + "loss": 1.8710382, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 16.25, + "step": 2, + "time_per_iteration": 2.4628143310546875 + }, + { + "auxiliary_loss_clip": 0.03080973, + "auxiliary_loss_mlp": 0.0137341, + "balance_loss_clip": 1.17828774, + "balance_loss_mlp": 1.51664853, + "epoch": 0.000180369758003908, + "flos": 22308835996800.0, + "grad_norm": 32.82768891459672, + "language_loss": 1.57214069, + "learning_rate": 3.999999964312572e-06, + "loss": 1.61668456, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 15.625, + "step": 3, + "time_per_iteration": 2.409597635269165 + }, + { + "auxiliary_loss_clip": 0.03109568, + "auxiliary_loss_mlp": 0.01353198, + "balance_loss_clip": 1.127177, + "balance_loss_mlp": 1.51108968, + "epoch": 0.00024049301067187735, + "flos": 22413680409600.0, + "grad_norm": 24.052977564940104, + "language_loss": 1.37424958, + "learning_rate": 3.99999985725029e-06, + "loss": 1.41887736, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 16.0, + "step": 4, + "time_per_iteration": 2.4043149948120117 + }, + { + "auxiliary_loss_clip": 0.03127305, + "auxiliary_loss_mlp": 0.01403953, + "balance_loss_clip": 1.18498981, + "balance_loss_mlp": 1.50664127, + "epoch": 0.0003006162633398467, + "flos": 21395927099520.0, + "grad_norm": 21.146805273774273, + "language_loss": 1.415416, + "learning_rate": 3.999999678813158e-06, + "loss": 1.46072853, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 16.25, + "step": 5, + "time_per_iteration": 2.3708319664001465 + }, + { + "auxiliary_loss_clip": 0.03063031, + "auxiliary_loss_mlp": 0.01405226, + "balance_loss_clip": 1.18578613, + "balance_loss_mlp": 1.50378633, + "epoch": 0.000360739516007816, + "flos": 21651316761600.0, + "grad_norm": 6.818791628039744, + "language_loss": 1.17620254, + "learning_rate": 3.999999429001183e-06, + "loss": 1.22088504, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 15.5625, + "step": 6, + "time_per_iteration": 2.3820738792419434 + }, + { + "auxiliary_loss_clip": 0.02998424, + "auxiliary_loss_mlp": 0.01370393, + "balance_loss_clip": 1.17241049, + "balance_loss_mlp": 1.50680101, + "epoch": 0.0004208627686757854, + "flos": 27158586312960.0, + "grad_norm": 4.94199637075652, + "language_loss": 1.14681077, + "learning_rate": 3.9999991078143714e-06, + "loss": 1.19049883, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 14.9375, + "step": 7, + "time_per_iteration": 2.679643392562866 + }, + { + "auxiliary_loss_clip": 0.02951132, + "auxiliary_loss_mlp": 0.01311899, + "balance_loss_clip": 1.13022375, + "balance_loss_mlp": 1.49760175, + "epoch": 0.0004809860213437547, + "flos": 31317824292480.0, + "grad_norm": 23.61535064956306, + "language_loss": 0.9544208, + "learning_rate": 3.999998715252736e-06, + "loss": 0.99705112, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 14.5625, + "step": 8, + "time_per_iteration": 2.6806282997131348 + }, + { + "auxiliary_loss_clip": 0.02969223, + "auxiliary_loss_mlp": 0.01355113, + "balance_loss_clip": 1.16456866, + "balance_loss_mlp": 1.50150013, + "epoch": 0.000541109274011724, + "flos": 32159056435200.0, + "grad_norm": 5.007732554313651, + "language_loss": 1.11774778, + "learning_rate": 3.999998251316293e-06, + "loss": 1.16099107, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 14.6875, + "step": 9, + "time_per_iteration": 2.6619770526885986 + }, + { + "auxiliary_loss_clip": 0.02872593, + "auxiliary_loss_mlp": 0.01315594, + "balance_loss_clip": 1.14421892, + "balance_loss_mlp": 1.50026464, + "epoch": 0.0006012325266796934, + "flos": 18915801914880.0, + "grad_norm": 3.139622664196079, + "language_loss": 1.06644702, + "learning_rate": 3.9999977160050555e-06, + "loss": 1.10832882, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 13.75, + "step": 10, + "time_per_iteration": 2.564631223678589 + }, + { + "auxiliary_loss_clip": 0.02790104, + "auxiliary_loss_mlp": 0.01297754, + "balance_loss_clip": 1.13801312, + "balance_loss_mlp": 1.4966042, + "epoch": 0.0006613557793476627, + "flos": 20773879672320.0, + "grad_norm": 8.748217914557543, + "language_loss": 1.10217166, + "learning_rate": 3.9999971093190445e-06, + "loss": 1.14305019, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 12.9375, + "step": 11, + "time_per_iteration": 2.6023213863372803 + }, + { + "auxiliary_loss_clip": 0.02668227, + "auxiliary_loss_mlp": 0.01248991, + "balance_loss_clip": 1.09602213, + "balance_loss_mlp": 1.48630834, + "epoch": 0.000721479032015632, + "flos": 16580740896000.0, + "grad_norm": 4.094164265162429, + "language_loss": 1.09107757, + "learning_rate": 3.999996431258282e-06, + "loss": 1.13024974, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 11.875, + "step": 12, + "time_per_iteration": 2.528917074203491 + }, + { + "auxiliary_loss_clip": 0.02649503, + "auxiliary_loss_mlp": 0.0122052, + "balance_loss_clip": 1.08004415, + "balance_loss_mlp": 1.48940086, + "epoch": 0.0007816022846836014, + "flos": 23804340618240.0, + "grad_norm": 3.2974397637514343, + "language_loss": 0.99341649, + "learning_rate": 3.999995681822791e-06, + "loss": 1.03211677, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 11.625, + "step": 13, + "time_per_iteration": 2.6297576427459717 + }, + { + "auxiliary_loss_clip": 0.02601865, + "auxiliary_loss_mlp": 0.01248735, + "balance_loss_clip": 1.10196495, + "balance_loss_mlp": 1.48509979, + "epoch": 0.0008417255373515708, + "flos": 19171191576960.0, + "grad_norm": 4.238777303539045, + "language_loss": 1.06023884, + "learning_rate": 3.999994861012598e-06, + "loss": 1.09874487, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 11.125, + "step": 14, + "time_per_iteration": 2.5761733055114746 + }, + { + "auxiliary_loss_clip": 0.02539715, + "auxiliary_loss_mlp": 0.01209442, + "balance_loss_clip": 1.07802558, + "balance_loss_mlp": 1.48428929, + "epoch": 0.00090184879001954, + "flos": 26394372362880.0, + "grad_norm": 2.5879242228297796, + "language_loss": 0.9828856, + "learning_rate": 3.999993968827733e-06, + "loss": 1.02037716, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 10.5625, + "step": 15, + "time_per_iteration": 2.602414846420288 + }, + { + "auxiliary_loss_clip": 0.02482357, + "auxiliary_loss_mlp": 0.01201394, + "balance_loss_clip": 1.07560468, + "balance_loss_mlp": 1.47833943, + "epoch": 0.0009619720426875094, + "flos": 24678391305600.0, + "grad_norm": 3.0687345466579785, + "language_loss": 0.99246669, + "learning_rate": 3.999993005268228e-06, + "loss": 1.02930415, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 10.0625, + "step": 16, + "time_per_iteration": 2.56613826751709 + }, + { + "auxiliary_loss_clip": 0.02431807, + "auxiliary_loss_mlp": 0.01216623, + "balance_loss_clip": 1.10113311, + "balance_loss_mlp": 1.47049284, + "epoch": 0.0010220952953554788, + "flos": 18623543990400.0, + "grad_norm": 3.1215760991906873, + "language_loss": 1.01441908, + "learning_rate": 3.999991970334118e-06, + "loss": 1.05090332, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 9.625, + "step": 17, + "time_per_iteration": 5.36155366897583 + }, + { + "auxiliary_loss_clip": 0.02306633, + "auxiliary_loss_mlp": 0.01184744, + "balance_loss_clip": 1.07888603, + "balance_loss_mlp": 1.46231151, + "epoch": 0.001082218548023448, + "flos": 26141286850560.0, + "grad_norm": 2.330774005332857, + "language_loss": 0.99674374, + "learning_rate": 3.999990864025439e-06, + "loss": 1.03165746, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 8.4375, + "step": 18, + "time_per_iteration": 2.5890016555786133 + }, + { + "auxiliary_loss_clip": 0.02248247, + "auxiliary_loss_mlp": 0.011974, + "balance_loss_clip": 1.07666516, + "balance_loss_mlp": 1.44878852, + "epoch": 0.0011423418006914173, + "flos": 19608758046720.0, + "grad_norm": 3.298295705619953, + "language_loss": 0.91169536, + "learning_rate": 3.99998968634223e-06, + "loss": 0.94615185, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 8.0, + "step": 19, + "time_per_iteration": 2.57875919342041 + }, + { + "auxiliary_loss_clip": 0.02162659, + "auxiliary_loss_mlp": 0.01173688, + "balance_loss_clip": 1.08027506, + "balance_loss_mlp": 1.44382668, + "epoch": 0.0012024650533593868, + "flos": 17894382912000.0, + "grad_norm": 2.4149043095773175, + "language_loss": 1.03630507, + "learning_rate": 3.999988437284535e-06, + "loss": 1.06966853, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 7.1875, + "step": 20, + "time_per_iteration": 2.589801788330078 + }, + { + "auxiliary_loss_clip": 0.02108563, + "auxiliary_loss_mlp": 0.01184161, + "balance_loss_clip": 1.08779252, + "balance_loss_mlp": 1.42771745, + "epoch": 0.001262588306027356, + "flos": 21250967667840.0, + "grad_norm": 2.4908713227019383, + "language_loss": 0.9450531, + "learning_rate": 3.999987116852396e-06, + "loss": 0.97798038, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 6.8125, + "step": 21, + "time_per_iteration": 2.6463260650634766 + }, + { + "auxiliary_loss_clip": 0.0205811, + "auxiliary_loss_mlp": 0.01174418, + "balance_loss_clip": 1.08300817, + "balance_loss_mlp": 1.41683114, + "epoch": 0.0013227115586953253, + "flos": 26102882488320.0, + "grad_norm": 2.5239968437472995, + "language_loss": 0.96045399, + "learning_rate": 3.999985725045861e-06, + "loss": 0.99277925, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 6.40625, + "step": 22, + "time_per_iteration": 2.61918306350708 + }, + { + "auxiliary_loss_clip": 0.020383, + "auxiliary_loss_mlp": 0.01197761, + "balance_loss_clip": 1.10830641, + "balance_loss_mlp": 1.41866231, + "epoch": 0.0013828348113632948, + "flos": 23950242656640.0, + "grad_norm": 1.8613463886433936, + "language_loss": 0.83544517, + "learning_rate": 3.999984261864982e-06, + "loss": 0.86780572, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 6.1875, + "step": 23, + "time_per_iteration": 2.5596020221710205 + }, + { + "auxiliary_loss_clip": 0.02009691, + "auxiliary_loss_mlp": 0.01183879, + "balance_loss_clip": 1.09533048, + "balance_loss_mlp": 1.41109812, + "epoch": 0.001442958064031264, + "flos": 15958972759680.0, + "grad_norm": 2.070711216257559, + "language_loss": 1.0156163, + "learning_rate": 3.999982727309807e-06, + "loss": 1.04755211, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 6.0, + "step": 24, + "time_per_iteration": 2.5257441997528076 + }, + { + "auxiliary_loss_clip": 0.01941478, + "auxiliary_loss_mlp": 0.01207248, + "balance_loss_clip": 1.11855638, + "balance_loss_mlp": 1.39769328, + "epoch": 0.0015030813166992333, + "flos": 18107527962240.0, + "grad_norm": 3.830467856535708, + "language_loss": 0.93178821, + "learning_rate": 3.999981121380394e-06, + "loss": 0.96327549, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 5.4375, + "step": 25, + "time_per_iteration": 2.5001440048217773 + }, + { + "auxiliary_loss_clip": 0.01920455, + "auxiliary_loss_mlp": 0.01198773, + "balance_loss_clip": 1.10831678, + "balance_loss_mlp": 1.39311361, + "epoch": 0.0015632045693672028, + "flos": 22233528460800.0, + "grad_norm": 2.107854477846936, + "language_loss": 1.00563216, + "learning_rate": 3.9999794440768e-06, + "loss": 1.03682446, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 5.28125, + "step": 26, + "time_per_iteration": 2.573878526687622 + }, + { + "auxiliary_loss_clip": 0.01911834, + "auxiliary_loss_mlp": 0.01198717, + "balance_loss_clip": 1.11126542, + "balance_loss_mlp": 1.38922739, + "epoch": 0.001623327822035172, + "flos": 23990706789120.0, + "grad_norm": 2.122014738041503, + "language_loss": 0.89966214, + "learning_rate": 3.999977695399084e-06, + "loss": 0.93076766, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 5.21875, + "step": 27, + "time_per_iteration": 2.562899351119995 + }, + { + "auxiliary_loss_clip": 0.01883569, + "auxiliary_loss_mlp": 0.01206261, + "balance_loss_clip": 1.12271953, + "balance_loss_mlp": 1.37992167, + "epoch": 0.0016834510747031415, + "flos": 19676769108480.0, + "grad_norm": 2.0938647514651083, + "language_loss": 0.99451423, + "learning_rate": 3.999975875347308e-06, + "loss": 1.02541244, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 5.03125, + "step": 28, + "time_per_iteration": 2.5451371669769287 + }, + { + "auxiliary_loss_clip": 0.01877581, + "auxiliary_loss_mlp": 0.01182278, + "balance_loss_clip": 1.09611368, + "balance_loss_mlp": 1.37079179, + "epoch": 0.0017435743273711108, + "flos": 20922749176320.0, + "grad_norm": 2.1468105105526787, + "language_loss": 0.96911222, + "learning_rate": 3.999973983921538e-06, + "loss": 0.99971074, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 5.0625, + "step": 29, + "time_per_iteration": 2.53570818901062 + }, + { + "auxiliary_loss_clip": 0.01867424, + "auxiliary_loss_mlp": 0.01180375, + "balance_loss_clip": 1.09459209, + "balance_loss_mlp": 1.36146772, + "epoch": 0.00180369758003908, + "flos": 19528178895360.0, + "grad_norm": 3.1842010867040664, + "language_loss": 1.11302865, + "learning_rate": 3.9999720211218405e-06, + "loss": 1.14350665, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 5.0625, + "step": 30, + "time_per_iteration": 2.5348143577575684 + }, + { + "auxiliary_loss_clip": 0.0182161, + "auxiliary_loss_mlp": 0.01175429, + "balance_loss_clip": 1.08797705, + "balance_loss_mlp": 1.35064387, + "epoch": 0.0018638208327070496, + "flos": 27451961400960.0, + "grad_norm": 2.3505293065017008, + "language_loss": 0.96483362, + "learning_rate": 3.999969986948286e-06, + "loss": 0.99480397, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 4.71875, + "step": 31, + "time_per_iteration": 2.5954768657684326 + }, + { + "auxiliary_loss_clip": 0.01802228, + "auxiliary_loss_mlp": 0.01171508, + "balance_loss_clip": 1.08448565, + "balance_loss_mlp": 1.33967805, + "epoch": 0.0019239440853750188, + "flos": 13588614489600.0, + "grad_norm": 2.0416038758443933, + "language_loss": 0.88528389, + "learning_rate": 3.999967881400949e-06, + "loss": 0.91502124, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 4.625, + "step": 32, + "time_per_iteration": 2.538137435913086 + }, + { + "auxiliary_loss_clip": 0.01803808, + "auxiliary_loss_mlp": 0.0116373, + "balance_loss_clip": 1.07575357, + "balance_loss_mlp": 1.33184898, + "epoch": 0.001984067338042988, + "flos": 11253099623040.0, + "grad_norm": 2.7298373076256124, + "language_loss": 0.87022352, + "learning_rate": 3.999965704479901e-06, + "loss": 0.89989889, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 4.71875, + "step": 33, + "time_per_iteration": 2.531190872192383 + }, + { + "auxiliary_loss_clip": 0.01769014, + "auxiliary_loss_mlp": 0.01165698, + "balance_loss_clip": 1.07939088, + "balance_loss_mlp": 1.32378912, + "epoch": 0.0020441905907109576, + "flos": 22385051228160.0, + "grad_norm": 2.060972244943001, + "language_loss": 0.86651742, + "learning_rate": 3.999963456185222e-06, + "loss": 0.89586449, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 4.4375, + "step": 34, + "time_per_iteration": 2.5855860710144043 + }, + { + "auxiliary_loss_clip": 0.01742666, + "auxiliary_loss_mlp": 0.0113303, + "balance_loss_clip": 1.04314601, + "balance_loss_mlp": 1.30449271, + "epoch": 0.0021043138433789266, + "flos": 49776858489600.0, + "grad_norm": 2.195255265685214, + "language_loss": 0.70702922, + "learning_rate": 3.999961136516991e-06, + "loss": 0.7357862, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 4.375, + "step": 35, + "time_per_iteration": 2.8236277103424072 + }, + { + "auxiliary_loss_clip": 0.0174311, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_clip": 1.05574691, + "balance_loss_mlp": 1.3051616, + "epoch": 0.002164437096046896, + "flos": 20556929283840.0, + "grad_norm": 2.0848739687058857, + "language_loss": 0.8459003, + "learning_rate": 3.999958745475293e-06, + "loss": 0.87476051, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 4.375, + "step": 36, + "time_per_iteration": 2.5762763023376465 + }, + { + "auxiliary_loss_clip": 0.01730108, + "auxiliary_loss_mlp": 0.01154336, + "balance_loss_clip": 1.06540632, + "balance_loss_mlp": 1.29269588, + "epoch": 0.0022245603487148656, + "flos": 26541077362560.0, + "grad_norm": 3.013775673998062, + "language_loss": 0.87594348, + "learning_rate": 3.999956283060211e-06, + "loss": 0.9047879, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 4.375, + "step": 37, + "time_per_iteration": 2.625054359436035 + }, + { + "auxiliary_loss_clip": 0.01715788, + "auxiliary_loss_mlp": 0.01164623, + "balance_loss_clip": 1.07306981, + "balance_loss_mlp": 1.29187417, + "epoch": 0.0022846836013828346, + "flos": 20337185986560.0, + "grad_norm": 1.8630027244681364, + "language_loss": 0.99655676, + "learning_rate": 3.9999537492718345e-06, + "loss": 1.02536082, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 4.25, + "step": 38, + "time_per_iteration": 2.559971332550049 + }, + { + "auxiliary_loss_clip": 0.01694284, + "auxiliary_loss_mlp": 0.01137003, + "balance_loss_clip": 1.04478264, + "balance_loss_mlp": 1.28287303, + "epoch": 0.002344806854050804, + "flos": 26246445465600.0, + "grad_norm": 2.21150037379473, + "language_loss": 0.81611729, + "learning_rate": 3.999951144110252e-06, + "loss": 0.84443015, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 4.125, + "step": 39, + "time_per_iteration": 2.6174371242523193 + }, + { + "auxiliary_loss_clip": 0.01697233, + "auxiliary_loss_mlp": 0.01149935, + "balance_loss_clip": 1.05900264, + "balance_loss_mlp": 1.27529538, + "epoch": 0.0024049301067187736, + "flos": 11800747209600.0, + "grad_norm": 2.8311092766346047, + "language_loss": 0.83641642, + "learning_rate": 3.999948467575558e-06, + "loss": 0.86488813, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 4.21875, + "step": 40, + "time_per_iteration": 2.546539783477783 + }, + { + "auxiliary_loss_clip": 0.01683985, + "auxiliary_loss_mlp": 0.01153351, + "balance_loss_clip": 1.06179821, + "balance_loss_mlp": 1.2719717, + "epoch": 0.0024650533593867426, + "flos": 20630456340480.0, + "grad_norm": 2.7020804722368825, + "language_loss": 0.88915122, + "learning_rate": 3.999945719667849e-06, + "loss": 0.91752458, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 4.125, + "step": 41, + "time_per_iteration": 2.5499825477600098 + }, + { + "auxiliary_loss_clip": 0.01660993, + "auxiliary_loss_mlp": 0.01141511, + "balance_loss_clip": 1.05691957, + "balance_loss_mlp": 1.26325691, + "epoch": 0.002525176612054712, + "flos": 18405127324800.0, + "grad_norm": 2.0646008580873874, + "language_loss": 0.92708147, + "learning_rate": 3.999942900387221e-06, + "loss": 0.9551065, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 3.96875, + "step": 42, + "time_per_iteration": 2.537623167037964 + }, + { + "auxiliary_loss_clip": 0.01651451, + "auxiliary_loss_mlp": 0.01167617, + "balance_loss_clip": 1.07568288, + "balance_loss_mlp": 1.2582351, + "epoch": 0.0025852998647226816, + "flos": 28182763313280.0, + "grad_norm": 5.185354719665505, + "language_loss": 0.93673301, + "learning_rate": 3.999940009733775e-06, + "loss": 0.96492368, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 3.9375, + "step": 43, + "time_per_iteration": 2.595491647720337 + }, + { + "auxiliary_loss_clip": 0.016468, + "auxiliary_loss_mlp": 0.01148236, + "balance_loss_clip": 1.06078386, + "balance_loss_mlp": 1.25160539, + "epoch": 0.0026454231173906506, + "flos": 14282233937280.0, + "grad_norm": 5.263731246682537, + "language_loss": 0.88909531, + "learning_rate": 3.9999370477076146e-06, + "loss": 0.91704565, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 3.953125, + "step": 44, + "time_per_iteration": 2.51104736328125 + }, + { + "auxiliary_loss_clip": 0.01636214, + "auxiliary_loss_mlp": 0.01135016, + "balance_loss_clip": 1.05443072, + "balance_loss_mlp": 1.24934185, + "epoch": 0.00270554637005862, + "flos": 22418114152320.0, + "grad_norm": 2.629764911890971, + "language_loss": 0.94870114, + "learning_rate": 3.9999340143088455e-06, + "loss": 0.97641337, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 3.875, + "step": 45, + "time_per_iteration": 2.719869613647461 + }, + { + "auxiliary_loss_clip": 0.01637104, + "auxiliary_loss_mlp": 0.01135423, + "balance_loss_clip": 1.05626822, + "balance_loss_mlp": 1.24332213, + "epoch": 0.0027656696227265896, + "flos": 23984702035200.0, + "grad_norm": 1.653387096013355, + "language_loss": 0.99931061, + "learning_rate": 3.999930909537576e-06, + "loss": 1.02703583, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 3.9375, + "step": 46, + "time_per_iteration": 2.7189016342163086 + }, + { + "auxiliary_loss_clip": 0.01620508, + "auxiliary_loss_mlp": 0.01147488, + "balance_loss_clip": 1.06122851, + "balance_loss_mlp": 1.23951733, + "epoch": 0.0028257928753945586, + "flos": 37668001731840.0, + "grad_norm": 2.1541038360868874, + "language_loss": 0.84128428, + "learning_rate": 3.999927733393916e-06, + "loss": 0.86896425, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 3.8125, + "step": 47, + "time_per_iteration": 2.7456581592559814 + }, + { + "auxiliary_loss_clip": 0.01600178, + "auxiliary_loss_mlp": 0.01142669, + "balance_loss_clip": 1.05531228, + "balance_loss_mlp": 1.23255134, + "epoch": 0.002885916128062528, + "flos": 22453481226240.0, + "grad_norm": 1.773987463822889, + "language_loss": 0.85155529, + "learning_rate": 3.99992448587798e-06, + "loss": 0.87898374, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 3.671875, + "step": 48, + "time_per_iteration": 2.7141005992889404 + }, + { + "auxiliary_loss_clip": 0.01593724, + "auxiliary_loss_mlp": 0.01130042, + "balance_loss_clip": 1.04640508, + "balance_loss_mlp": 1.22513652, + "epoch": 0.0029460393807304976, + "flos": 27011671845120.0, + "grad_norm": 3.2920442272937573, + "language_loss": 0.8657636, + "learning_rate": 3.999921166989884e-06, + "loss": 0.89300132, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 3.6875, + "step": 49, + "time_per_iteration": 2.6943843364715576 + }, + { + "auxiliary_loss_clip": 0.01589589, + "auxiliary_loss_mlp": 0.01155181, + "balance_loss_clip": 1.07488132, + "balance_loss_mlp": 1.2209065, + "epoch": 0.0030061626333984666, + "flos": 15850916501760.0, + "grad_norm": 2.2248531444274304, + "language_loss": 0.88049072, + "learning_rate": 3.999917776729746e-06, + "loss": 0.90793836, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 3.6875, + "step": 50, + "time_per_iteration": 2.5544683933258057 + }, + { + "auxiliary_loss_clip": 0.01584487, + "auxiliary_loss_mlp": 0.01129517, + "balance_loss_clip": 1.05141068, + "balance_loss_mlp": 1.21761751, + "epoch": 0.003066285886066436, + "flos": 31825845619200.0, + "grad_norm": 4.248983325462892, + "language_loss": 0.83911979, + "learning_rate": 3.999914315097687e-06, + "loss": 0.86625981, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 3.671875, + "step": 51, + "time_per_iteration": 2.647359609603882 + }, + { + "auxiliary_loss_clip": 0.01567797, + "auxiliary_loss_mlp": 0.01153382, + "balance_loss_clip": 1.07098413, + "balance_loss_mlp": 1.21582484, + "epoch": 0.0031264091387344056, + "flos": 41425878188160.0, + "grad_norm": 1.8429176982897937, + "language_loss": 0.91580838, + "learning_rate": 3.999910782093829e-06, + "loss": 0.94302016, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 3.53125, + "step": 52, + "time_per_iteration": 2.7222397327423096 + }, + { + "auxiliary_loss_clip": 0.01572168, + "auxiliary_loss_mlp": 0.01145655, + "balance_loss_clip": 1.06015766, + "balance_loss_mlp": 1.21593559, + "epoch": 0.0031865323914023747, + "flos": 23439812446080.0, + "grad_norm": 2.1818325670174197, + "language_loss": 0.88794315, + "learning_rate": 3.999907177718301e-06, + "loss": 0.91512132, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 3.5625, + "step": 53, + "time_per_iteration": 2.5715830326080322 + }, + { + "auxiliary_loss_clip": 0.01566318, + "auxiliary_loss_mlp": 0.01159099, + "balance_loss_clip": 1.07393575, + "balance_loss_mlp": 1.21152437, + "epoch": 0.003246655644070344, + "flos": 14428310532480.0, + "grad_norm": 2.336043391261303, + "language_loss": 0.79594576, + "learning_rate": 3.99990350197123e-06, + "loss": 0.82319993, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 3.546875, + "step": 54, + "time_per_iteration": 2.5533878803253174 + }, + { + "auxiliary_loss_clip": 0.01562585, + "auxiliary_loss_mlp": 0.01153723, + "balance_loss_clip": 1.06917977, + "balance_loss_mlp": 1.20680356, + "epoch": 0.0033067788967383136, + "flos": 35916793246080.0, + "grad_norm": 2.6399092445371815, + "language_loss": 0.77894688, + "learning_rate": 3.999899754852747e-06, + "loss": 0.80610991, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 3.5625, + "step": 55, + "time_per_iteration": 2.686530828475952 + }, + { + "auxiliary_loss_clip": 0.01560018, + "auxiliary_loss_mlp": 0.01133782, + "balance_loss_clip": 1.05348277, + "balance_loss_mlp": 1.20421016, + "epoch": 0.003366902149406283, + "flos": 22957836860160.0, + "grad_norm": 3.5583647739648487, + "language_loss": 0.83599192, + "learning_rate": 3.999895936362987e-06, + "loss": 0.86292994, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 3.5625, + "step": 56, + "time_per_iteration": 4.309218645095825 + }, + { + "auxiliary_loss_clip": 0.01562721, + "auxiliary_loss_mlp": 0.01154903, + "balance_loss_clip": 1.07555699, + "balance_loss_mlp": 1.19991803, + "epoch": 0.003427025402074252, + "flos": 26581506583680.0, + "grad_norm": 1.804282673250652, + "language_loss": 0.90663362, + "learning_rate": 3.9998920465020845e-06, + "loss": 0.93380976, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 3.625, + "step": 57, + "time_per_iteration": 4.142320156097412 + }, + { + "auxiliary_loss_clip": 0.01542008, + "auxiliary_loss_mlp": 0.01138251, + "balance_loss_clip": 1.05480409, + "balance_loss_mlp": 1.20625949, + "epoch": 0.0034871486547422216, + "flos": 23950068099840.0, + "grad_norm": 2.146597421329481, + "language_loss": 0.9681412, + "learning_rate": 3.999888085270179e-06, + "loss": 0.9949438, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 3.34375, + "step": 58, + "time_per_iteration": 2.59883451461792 + }, + { + "auxiliary_loss_clip": 0.01538591, + "auxiliary_loss_mlp": 0.01136056, + "balance_loss_clip": 1.0576638, + "balance_loss_mlp": 1.20107901, + "epoch": 0.003547271907410191, + "flos": 21213924848640.0, + "grad_norm": 2.2963116984050234, + "language_loss": 0.8566339, + "learning_rate": 3.9998840526674135e-06, + "loss": 0.88338029, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 3.375, + "step": 59, + "time_per_iteration": 2.588517189025879 + }, + { + "auxiliary_loss_clip": 0.01543832, + "auxiliary_loss_mlp": 0.01121869, + "balance_loss_clip": 1.04376316, + "balance_loss_mlp": 1.19997227, + "epoch": 0.00360739516007816, + "flos": 16504071816960.0, + "grad_norm": 3.6106206237954153, + "language_loss": 0.90589786, + "learning_rate": 3.999879948693929e-06, + "loss": 0.9325549, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 3.4375, + "step": 60, + "time_per_iteration": 2.5708141326904297 + }, + { + "auxiliary_loss_clip": 0.01536828, + "auxiliary_loss_mlp": 0.01128284, + "balance_loss_clip": 1.05480361, + "balance_loss_mlp": 1.19439209, + "epoch": 0.0036675184127461296, + "flos": 19463763703680.0, + "grad_norm": 3.0867057656717347, + "language_loss": 0.86699647, + "learning_rate": 3.999875773349874e-06, + "loss": 0.89364761, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 3.421875, + "step": 61, + "time_per_iteration": 2.733415365219116 + }, + { + "auxiliary_loss_clip": 0.01530636, + "auxiliary_loss_mlp": 0.01119644, + "balance_loss_clip": 1.0487386, + "balance_loss_mlp": 1.19737685, + "epoch": 0.003727641665414099, + "flos": 20956335770880.0, + "grad_norm": 1.9044505707105237, + "language_loss": 0.86141676, + "learning_rate": 3.999871526635397e-06, + "loss": 0.88791955, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.328125, + "step": 62, + "time_per_iteration": 2.644710063934326 + }, + { + "auxiliary_loss_clip": 0.01525647, + "auxiliary_loss_mlp": 0.01135761, + "balance_loss_clip": 1.06042099, + "balance_loss_mlp": 1.19628096, + "epoch": 0.003787764918082068, + "flos": 18405057502080.0, + "grad_norm": 1.984669322035565, + "language_loss": 0.94111091, + "learning_rate": 3.999867208550649e-06, + "loss": 0.96772498, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 3.296875, + "step": 63, + "time_per_iteration": 2.6939220428466797 + }, + { + "auxiliary_loss_clip": 0.01522687, + "auxiliary_loss_mlp": 0.01124747, + "balance_loss_clip": 1.05078948, + "balance_loss_mlp": 1.19448018, + "epoch": 0.0038478881707500376, + "flos": 12458406090240.0, + "grad_norm": 6.071362458483904, + "language_loss": 0.95735359, + "learning_rate": 3.999862819095785e-06, + "loss": 0.98382795, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 3.28125, + "step": 64, + "time_per_iteration": 2.6445887088775635 + }, + { + "auxiliary_loss_clip": 0.01532445, + "auxiliary_loss_mlp": 0.01135816, + "balance_loss_clip": 1.06204915, + "balance_loss_mlp": 1.19461608, + "epoch": 0.003908011423418007, + "flos": 13552479365760.0, + "grad_norm": 1.8832393227624737, + "language_loss": 0.82848072, + "learning_rate": 3.999858358270962e-06, + "loss": 0.85516334, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 3.375, + "step": 65, + "time_per_iteration": 2.6585493087768555 + }, + { + "auxiliary_loss_clip": 0.01523418, + "auxiliary_loss_mlp": 0.01124821, + "balance_loss_clip": 1.05129278, + "balance_loss_mlp": 1.19321489, + "epoch": 0.003968134676085976, + "flos": 18332473052160.0, + "grad_norm": 1.8421961963241207, + "language_loss": 0.83222592, + "learning_rate": 3.999853826076338e-06, + "loss": 0.85870826, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 3.296875, + "step": 66, + "time_per_iteration": 2.7048535346984863 + }, + { + "auxiliary_loss_clip": 0.01520742, + "auxiliary_loss_mlp": 0.01120062, + "balance_loss_clip": 1.03871369, + "balance_loss_mlp": 1.18489003, + "epoch": 0.004028257928753946, + "flos": 20484205188480.0, + "grad_norm": 2.219008712215282, + "language_loss": 0.94136697, + "learning_rate": 3.999849222512075e-06, + "loss": 0.96777511, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 3.359375, + "step": 67, + "time_per_iteration": 2.617933511734009 + }, + { + "auxiliary_loss_clip": 0.01507726, + "auxiliary_loss_mlp": 0.01130344, + "balance_loss_clip": 1.04932952, + "balance_loss_mlp": 1.18441331, + "epoch": 0.004088381181421915, + "flos": 18842833440000.0, + "grad_norm": 2.354307623528934, + "language_loss": 0.92092949, + "learning_rate": 3.9998445475783365e-06, + "loss": 0.94731021, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 3.234375, + "step": 68, + "time_per_iteration": 2.577284097671509 + }, + { + "auxiliary_loss_clip": 0.01497895, + "auxiliary_loss_mlp": 0.01131936, + "balance_loss_clip": 1.0565958, + "balance_loss_mlp": 1.18115139, + "epoch": 0.004148504434089885, + "flos": 19426790707200.0, + "grad_norm": 2.9681617808792784, + "language_loss": 0.9445743, + "learning_rate": 3.999839801275292e-06, + "loss": 0.97087264, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 3.171875, + "step": 69, + "time_per_iteration": 2.5266003608703613 + }, + { + "auxiliary_loss_clip": 0.01495954, + "auxiliary_loss_mlp": 0.01127572, + "balance_loss_clip": 1.05685711, + "balance_loss_mlp": 1.18270397, + "epoch": 0.004208627686757853, + "flos": 20810049707520.0, + "grad_norm": 2.724199016749798, + "language_loss": 0.96660733, + "learning_rate": 3.999834983603108e-06, + "loss": 0.99284261, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 3.125, + "step": 70, + "time_per_iteration": 2.590308666229248 + }, + { + "auxiliary_loss_clip": 0.01504126, + "auxiliary_loss_mlp": 0.01123484, + "balance_loss_clip": 1.04795337, + "balance_loss_mlp": 1.17364883, + "epoch": 0.004268750939425823, + "flos": 19097629608960.0, + "grad_norm": 6.624474777525524, + "language_loss": 0.91867542, + "learning_rate": 3.9998300945619576e-06, + "loss": 0.94495147, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 3.3125, + "step": 71, + "time_per_iteration": 2.5628535747528076 + }, + { + "auxiliary_loss_clip": 0.01797056, + "auxiliary_loss_mlp": 0.01431044, + "balance_loss_clip": 1.39118016, + "balance_loss_mlp": 1.42536378, + "epoch": 0.004328874192093792, + "flos": 52436889377280.0, + "grad_norm": 2.2573204514841265, + "language_loss": 0.65688801, + "learning_rate": 3.999825134152016e-06, + "loss": 0.68916899, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 3.71875, + "step": 72, + "time_per_iteration": 3.1397647857666016 + }, + { + "auxiliary_loss_clip": 0.01734601, + "auxiliary_loss_mlp": 0.01323334, + "balance_loss_clip": 1.28423309, + "balance_loss_mlp": 1.40884876, + "epoch": 0.004388997444761762, + "flos": 66469459115520.0, + "grad_norm": 2.081010780673298, + "language_loss": 0.63639885, + "learning_rate": 3.999820102373459e-06, + "loss": 0.66697824, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 3.25, + "step": 73, + "time_per_iteration": 3.1842033863067627 + }, + { + "auxiliary_loss_clip": 0.01489288, + "auxiliary_loss_mlp": 0.01117901, + "balance_loss_clip": 1.04642332, + "balance_loss_mlp": 1.17090213, + "epoch": 0.004449120697429731, + "flos": 18951971950080.0, + "grad_norm": 3.399614259340687, + "language_loss": 0.83614337, + "learning_rate": 3.999814999226467e-06, + "loss": 0.86221528, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 3.1875, + "step": 74, + "time_per_iteration": 2.5562846660614014 + }, + { + "auxiliary_loss_clip": 0.0150131, + "auxiliary_loss_mlp": 0.01129769, + "balance_loss_clip": 1.05662227, + "balance_loss_mlp": 1.17718089, + "epoch": 0.004509243950097701, + "flos": 21104437224960.0, + "grad_norm": 2.068454009829982, + "language_loss": 0.94926447, + "learning_rate": 3.999809824711222e-06, + "loss": 0.97557527, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 3.25, + "step": 75, + "time_per_iteration": 2.530402421951294 + }, + { + "auxiliary_loss_clip": 0.01481729, + "auxiliary_loss_mlp": 0.01138677, + "balance_loss_clip": 1.06629276, + "balance_loss_mlp": 1.17169082, + "epoch": 0.004569367202765669, + "flos": 20697838997760.0, + "grad_norm": 2.4761529519656147, + "language_loss": 0.86071384, + "learning_rate": 3.9998045788279075e-06, + "loss": 0.88691783, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 3.09375, + "step": 76, + "time_per_iteration": 2.5472307205200195 + }, + { + "auxiliary_loss_clip": 0.01492312, + "auxiliary_loss_mlp": 0.01127005, + "balance_loss_clip": 1.05362022, + "balance_loss_mlp": 1.17710352, + "epoch": 0.004629490455433639, + "flos": 28657198045440.0, + "grad_norm": 1.769193740793799, + "language_loss": 0.90393454, + "learning_rate": 3.9997992615767125e-06, + "loss": 0.93012774, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 3.15625, + "step": 77, + "time_per_iteration": 2.5652127265930176 + }, + { + "auxiliary_loss_clip": 0.01483891, + "auxiliary_loss_mlp": 0.01125642, + "balance_loss_clip": 1.05130339, + "balance_loss_mlp": 1.17786443, + "epoch": 0.004689613708101608, + "flos": 11071621042560.0, + "grad_norm": 3.8040509470686623, + "language_loss": 0.8998087, + "learning_rate": 3.9997938729578266e-06, + "loss": 0.92590404, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 3.0625, + "step": 78, + "time_per_iteration": 2.5184528827667236 + }, + { + "auxiliary_loss_clip": 0.01486718, + "auxiliary_loss_mlp": 0.01138278, + "balance_loss_clip": 1.06508303, + "balance_loss_mlp": 1.17505836, + "epoch": 0.004749736960769578, + "flos": 21798021761280.0, + "grad_norm": 7.844066050884662, + "language_loss": 0.80664903, + "learning_rate": 3.99978841297144e-06, + "loss": 0.83289897, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 3.125, + "step": 79, + "time_per_iteration": 2.558370351791382 + }, + { + "auxiliary_loss_clip": 0.01497252, + "auxiliary_loss_mlp": 0.01123498, + "balance_loss_clip": 1.05368876, + "balance_loss_mlp": 1.18235707, + "epoch": 0.004809860213437547, + "flos": 19791563258880.0, + "grad_norm": 3.243026160233554, + "language_loss": 0.90004849, + "learning_rate": 3.99978288161775e-06, + "loss": 0.92625594, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 3.15625, + "step": 80, + "time_per_iteration": 2.5560710430145264 + }, + { + "auxiliary_loss_clip": 0.01486012, + "auxiliary_loss_mlp": 0.01134552, + "balance_loss_clip": 1.06226373, + "balance_loss_mlp": 1.18250656, + "epoch": 0.004869983466105517, + "flos": 26573232591360.0, + "grad_norm": 2.165405405963283, + "language_loss": 0.93016237, + "learning_rate": 3.999777278896952e-06, + "loss": 0.95636809, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 3.03125, + "step": 81, + "time_per_iteration": 2.637070894241333 + }, + { + "auxiliary_loss_clip": 0.01490725, + "auxiliary_loss_mlp": 0.01150368, + "balance_loss_clip": 1.07927203, + "balance_loss_mlp": 1.18020296, + "epoch": 0.004930106718773485, + "flos": 12822550237440.0, + "grad_norm": 2.6227018187074114, + "language_loss": 0.94184101, + "learning_rate": 3.999771604809249e-06, + "loss": 0.96825194, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.09375, + "step": 82, + "time_per_iteration": 2.5250260829925537 + }, + { + "auxiliary_loss_clip": 0.01483064, + "auxiliary_loss_mlp": 0.01149186, + "balance_loss_clip": 1.07451355, + "balance_loss_mlp": 1.17058671, + "epoch": 0.004990229971441455, + "flos": 25773756301440.0, + "grad_norm": 2.157109061404475, + "language_loss": 0.85192108, + "learning_rate": 3.999765859354839e-06, + "loss": 0.87824357, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 3.125, + "step": 83, + "time_per_iteration": 2.627007484436035 + }, + { + "auxiliary_loss_clip": 0.0147668, + "auxiliary_loss_mlp": 0.01154582, + "balance_loss_clip": 1.07733428, + "balance_loss_mlp": 1.16956425, + "epoch": 0.005050353224109424, + "flos": 17456292771840.0, + "grad_norm": 2.7224666668920112, + "language_loss": 0.90668142, + "learning_rate": 3.999760042533931e-06, + "loss": 0.93299407, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 3.078125, + "step": 84, + "time_per_iteration": 2.4960057735443115 + }, + { + "auxiliary_loss_clip": 0.01597049, + "auxiliary_loss_mlp": 0.01233618, + "balance_loss_clip": 1.19604373, + "balance_loss_mlp": 1.38286448, + "epoch": 0.005110476476777394, + "flos": 69802269235200.0, + "grad_norm": 1.0747967940372232, + "language_loss": 0.61884308, + "learning_rate": 3.999754154346731e-06, + "loss": 0.64714968, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.140625, + "step": 85, + "time_per_iteration": 3.219843626022339 + }, + { + "auxiliary_loss_clip": 0.01462354, + "auxiliary_loss_mlp": 0.011176, + "balance_loss_clip": 1.04264069, + "balance_loss_mlp": 1.15787935, + "epoch": 0.005170599729445363, + "flos": 24788961181440.0, + "grad_norm": 2.109722333055802, + "language_loss": 0.89139509, + "learning_rate": 3.999748194793449e-06, + "loss": 0.9171946, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 3.046875, + "step": 86, + "time_per_iteration": 2.5646045207977295 + }, + { + "auxiliary_loss_clip": 0.01472103, + "auxiliary_loss_mlp": 0.01136714, + "balance_loss_clip": 1.06227946, + "balance_loss_mlp": 1.16680181, + "epoch": 0.005230722982113333, + "flos": 23256937411200.0, + "grad_norm": 2.3874485555726417, + "language_loss": 0.80222178, + "learning_rate": 3.999742163874298e-06, + "loss": 0.82830989, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 3.0625, + "step": 87, + "time_per_iteration": 2.568986177444458 + }, + { + "auxiliary_loss_clip": 0.01458642, + "auxiliary_loss_mlp": 0.01126635, + "balance_loss_clip": 1.0532496, + "balance_loss_mlp": 1.16615224, + "epoch": 0.005290846234781301, + "flos": 16726957136640.0, + "grad_norm": 1.8012755792208095, + "language_loss": 0.94010115, + "learning_rate": 3.999736061589492e-06, + "loss": 0.96595389, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 2.9375, + "step": 88, + "time_per_iteration": 2.5278608798980713 + }, + { + "auxiliary_loss_clip": 0.01458957, + "auxiliary_loss_mlp": 0.01112408, + "balance_loss_clip": 1.04107285, + "balance_loss_mlp": 1.15566182, + "epoch": 0.005350969487449271, + "flos": 20885043041280.0, + "grad_norm": 2.0496604196188133, + "language_loss": 0.84080839, + "learning_rate": 3.999729887939251e-06, + "loss": 0.86652201, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 3.03125, + "step": 89, + "time_per_iteration": 2.563391923904419 + }, + { + "auxiliary_loss_clip": 0.01449543, + "auxiliary_loss_mlp": 0.01111738, + "balance_loss_clip": 1.04006934, + "balance_loss_mlp": 1.15215576, + "epoch": 0.00541109274011724, + "flos": 26208878976000.0, + "grad_norm": 2.0047324414422962, + "language_loss": 0.89549929, + "learning_rate": 3.9997236429237945e-06, + "loss": 0.92111206, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 2.96875, + "step": 90, + "time_per_iteration": 2.5636699199676514 + }, + { + "auxiliary_loss_clip": 0.01434598, + "auxiliary_loss_mlp": 0.01115269, + "balance_loss_clip": 1.04503119, + "balance_loss_mlp": 1.14833903, + "epoch": 0.00547121599278521, + "flos": 21177510433920.0, + "grad_norm": 3.218059252263368, + "language_loss": 0.84463358, + "learning_rate": 3.999717326543345e-06, + "loss": 0.87013233, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 2.859375, + "step": 91, + "time_per_iteration": 2.5507023334503174 + }, + { + "auxiliary_loss_clip": 0.01435673, + "auxiliary_loss_mlp": 0.01108879, + "balance_loss_clip": 1.03992856, + "balance_loss_mlp": 1.14235711, + "epoch": 0.005531339245453179, + "flos": 19717791822720.0, + "grad_norm": 2.168802380764744, + "language_loss": 0.85291636, + "learning_rate": 3.9997109387981275e-06, + "loss": 0.87836194, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 2.9375, + "step": 92, + "time_per_iteration": 2.539388418197632 + }, + { + "auxiliary_loss_clip": 0.01438361, + "auxiliary_loss_mlp": 0.01139437, + "balance_loss_clip": 1.06667149, + "balance_loss_mlp": 1.14034557, + "epoch": 0.005591462498121149, + "flos": 17635222823040.0, + "grad_norm": 2.7323537714010415, + "language_loss": 0.89827538, + "learning_rate": 3.99970447968837e-06, + "loss": 0.92405343, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 2.984375, + "step": 93, + "time_per_iteration": 2.520329475402832 + }, + { + "auxiliary_loss_clip": 0.01444438, + "auxiliary_loss_mlp": 0.01130383, + "balance_loss_clip": 1.05375481, + "balance_loss_mlp": 1.14248276, + "epoch": 0.005651585750789117, + "flos": 20010189392640.0, + "grad_norm": 3.029902315947606, + "language_loss": 0.85671586, + "learning_rate": 3.9996979492143045e-06, + "loss": 0.88246405, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 3.015625, + "step": 94, + "time_per_iteration": 2.518771171569824 + }, + { + "auxiliary_loss_clip": 0.01454722, + "auxiliary_loss_mlp": 0.01115659, + "balance_loss_clip": 1.07884717, + "balance_loss_mlp": 1.27629483, + "epoch": 0.005711709003457087, + "flos": 59809917185280.0, + "grad_norm": 1.1802114848498737, + "language_loss": 0.6768719, + "learning_rate": 3.999691347376162e-06, + "loss": 0.7025758, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.78125, + "step": 95, + "time_per_iteration": 4.534137964248657 + }, + { + "auxiliary_loss_clip": 0.01433724, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_clip": 1.07719111, + "balance_loss_mlp": 1.137779, + "epoch": 0.005771832256125056, + "flos": 15558693488640.0, + "grad_norm": 3.5253256287489165, + "language_loss": 0.88525021, + "learning_rate": 3.99968467417418e-06, + "loss": 0.91106033, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 2.953125, + "step": 96, + "time_per_iteration": 4.071526050567627 + }, + { + "auxiliary_loss_clip": 0.01420394, + "auxiliary_loss_mlp": 0.01132117, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.1362958, + "epoch": 0.005831955508793026, + "flos": 22527287573760.0, + "grad_norm": 2.891057314816126, + "language_loss": 0.88203895, + "learning_rate": 3.999677929608596e-06, + "loss": 0.90756404, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 2.84375, + "step": 97, + "time_per_iteration": 4.145650863647461 + }, + { + "auxiliary_loss_clip": 0.01413049, + "auxiliary_loss_mlp": 0.01130992, + "balance_loss_clip": 1.0612781, + "balance_loss_mlp": 1.13330138, + "epoch": 0.005892078761460995, + "flos": 22048872946560.0, + "grad_norm": 2.0178304849889797, + "language_loss": 0.75365317, + "learning_rate": 3.99967111367965e-06, + "loss": 0.77909356, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 2.796875, + "step": 98, + "time_per_iteration": 2.5823192596435547 + }, + { + "auxiliary_loss_clip": 0.01407603, + "auxiliary_loss_mlp": 0.0104898, + "balance_loss_clip": 1.01293111, + "balance_loss_mlp": 1.23561692, + "epoch": 0.005952202014128965, + "flos": 68535689598720.0, + "grad_norm": 0.963339581518281, + "language_loss": 0.65151054, + "learning_rate": 3.999664226387586e-06, + "loss": 0.67607635, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.71875, + "step": 99, + "time_per_iteration": 3.2279751300811768 + }, + { + "auxiliary_loss_clip": 0.01421633, + "auxiliary_loss_mlp": 0.01152154, + "balance_loss_clip": 1.07628894, + "balance_loss_mlp": 1.13284802, + "epoch": 0.006012325266796933, + "flos": 22959931541760.0, + "grad_norm": 2.089015714107569, + "language_loss": 0.8961187, + "learning_rate": 3.999657267732648e-06, + "loss": 0.92185652, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 2.890625, + "step": 100, + "time_per_iteration": 2.543591022491455 + }, + { + "auxiliary_loss_clip": 0.01423763, + "auxiliary_loss_mlp": 0.01135114, + "balance_loss_clip": 1.06478059, + "balance_loss_mlp": 1.13237977, + "epoch": 0.006072448519464903, + "flos": 17346979704960.0, + "grad_norm": 2.0200724237283785, + "language_loss": 0.89709979, + "learning_rate": 3.999650237715088e-06, + "loss": 0.92268854, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 2.90625, + "step": 101, + "time_per_iteration": 2.6019370555877686 + }, + { + "auxiliary_loss_clip": 0.01417283, + "auxiliary_loss_mlp": 0.01138533, + "balance_loss_clip": 1.06633949, + "balance_loss_mlp": 1.13459754, + "epoch": 0.006132571772132872, + "flos": 24679962316800.0, + "grad_norm": 2.382432542633007, + "language_loss": 0.89427447, + "learning_rate": 3.9996431363351536e-06, + "loss": 0.91983271, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 2.828125, + "step": 102, + "time_per_iteration": 2.55599045753479 + }, + { + "auxiliary_loss_clip": 0.01412183, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_clip": 1.05390084, + "balance_loss_mlp": 1.13109529, + "epoch": 0.006192695024800842, + "flos": 21464741122560.0, + "grad_norm": 2.1856308121545287, + "language_loss": 0.86522692, + "learning_rate": 3.9996359635931e-06, + "loss": 0.89056349, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 2.8125, + "step": 103, + "time_per_iteration": 2.555086374282837 + }, + { + "auxiliary_loss_clip": 0.01414161, + "auxiliary_loss_mlp": 0.01133673, + "balance_loss_clip": 1.06314886, + "balance_loss_mlp": 1.13492, + "epoch": 0.006252818277468811, + "flos": 17419459420800.0, + "grad_norm": 2.0955448234560823, + "language_loss": 0.92823404, + "learning_rate": 3.999628719489181e-06, + "loss": 0.9537124, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 2.78125, + "step": 104, + "time_per_iteration": 2.5785515308380127 + }, + { + "auxiliary_loss_clip": 0.01407541, + "auxiliary_loss_mlp": 0.01126941, + "balance_loss_clip": 1.05708396, + "balance_loss_mlp": 1.12816191, + "epoch": 0.006312941530136781, + "flos": 19098537304320.0, + "grad_norm": 14.34565848265555, + "language_loss": 0.94836128, + "learning_rate": 3.999621404023658e-06, + "loss": 0.97370607, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 2.796875, + "step": 105, + "time_per_iteration": 2.543741226196289 + }, + { + "auxiliary_loss_clip": 0.01410998, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_clip": 1.06157172, + "balance_loss_mlp": 1.12886274, + "epoch": 0.006373064782804749, + "flos": 24059695368960.0, + "grad_norm": 2.226532575855215, + "language_loss": 0.85008109, + "learning_rate": 3.9996140171967904e-06, + "loss": 0.87556636, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 2.8125, + "step": 106, + "time_per_iteration": 2.576078414916992 + }, + { + "auxiliary_loss_clip": 0.01402094, + "auxiliary_loss_mlp": 0.01119371, + "balance_loss_clip": 1.04894197, + "balance_loss_mlp": 1.12047982, + "epoch": 0.006433188035472719, + "flos": 18331460622720.0, + "grad_norm": 2.3678014845405873, + "language_loss": 0.81457663, + "learning_rate": 3.9996065590088426e-06, + "loss": 0.8397913, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 2.8125, + "step": 107, + "time_per_iteration": 2.497641086578369 + }, + { + "auxiliary_loss_clip": 0.01341496, + "auxiliary_loss_mlp": 0.0110217, + "balance_loss_clip": 1.07508552, + "balance_loss_mlp": 1.18681765, + "epoch": 0.006493311288140688, + "flos": 62558907816960.0, + "grad_norm": 0.9461185842272075, + "language_loss": 0.64579511, + "learning_rate": 3.999599029460081e-06, + "loss": 0.67023176, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.546875, + "step": 108, + "time_per_iteration": 3.157834053039551 + }, + { + "auxiliary_loss_clip": 0.01398715, + "auxiliary_loss_mlp": 0.01113337, + "balance_loss_clip": 1.04224062, + "balance_loss_mlp": 1.12512159, + "epoch": 0.006553434540808658, + "flos": 19499130777600.0, + "grad_norm": 2.0634755752913914, + "language_loss": 0.96250588, + "learning_rate": 3.999591428550772e-06, + "loss": 0.98762637, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 2.734375, + "step": 109, + "time_per_iteration": 2.527080535888672 + }, + { + "auxiliary_loss_clip": 0.01397787, + "auxiliary_loss_mlp": 0.01115048, + "balance_loss_clip": 1.04652655, + "balance_loss_mlp": 1.12549496, + "epoch": 0.006613557793476627, + "flos": 21104088111360.0, + "grad_norm": 1.7137254663805555, + "language_loss": 0.83182019, + "learning_rate": 3.999583756281189e-06, + "loss": 0.85694849, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 2.71875, + "step": 110, + "time_per_iteration": 2.596374273300171 + }, + { + "auxiliary_loss_clip": 0.01399336, + "auxiliary_loss_mlp": 0.01111445, + "balance_loss_clip": 1.04158795, + "balance_loss_mlp": 1.12313271, + "epoch": 0.006673681046144597, + "flos": 26029564899840.0, + "grad_norm": 2.0155584080496984, + "language_loss": 0.81827509, + "learning_rate": 3.999576012651605e-06, + "loss": 0.84338289, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 2.75, + "step": 111, + "time_per_iteration": 2.590143918991089 + }, + { + "auxiliary_loss_clip": 0.01389427, + "auxiliary_loss_mlp": 0.0111467, + "balance_loss_clip": 1.04719806, + "balance_loss_mlp": 1.11860859, + "epoch": 0.006733804298812566, + "flos": 23146681737600.0, + "grad_norm": 2.370401313774829, + "language_loss": 0.92597079, + "learning_rate": 3.999568197662297e-06, + "loss": 0.95101178, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 2.703125, + "step": 112, + "time_per_iteration": 2.553640127182007 + }, + { + "auxiliary_loss_clip": 0.01399439, + "auxiliary_loss_mlp": 0.01106971, + "balance_loss_clip": 1.04140568, + "balance_loss_mlp": 1.12583899, + "epoch": 0.006793927551480535, + "flos": 11763669479040.0, + "grad_norm": 2.2048966935764076, + "language_loss": 0.77447867, + "learning_rate": 3.999560311313543e-06, + "loss": 0.79954273, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 2.734375, + "step": 113, + "time_per_iteration": 2.5330731868743896 + }, + { + "auxiliary_loss_clip": 0.01391044, + "auxiliary_loss_mlp": 0.01108191, + "balance_loss_clip": 1.03609276, + "balance_loss_mlp": 1.12234282, + "epoch": 0.006854050804148504, + "flos": 19170947197440.0, + "grad_norm": 1.788522558774708, + "language_loss": 0.91970974, + "learning_rate": 3.999552353605626e-06, + "loss": 0.94470197, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 2.6875, + "step": 114, + "time_per_iteration": 2.5473427772521973 + }, + { + "auxiliary_loss_clip": 0.0139664, + "auxiliary_loss_mlp": 0.01111945, + "balance_loss_clip": 1.04928839, + "balance_loss_mlp": 1.11947298, + "epoch": 0.006914174056816474, + "flos": 21980792062080.0, + "grad_norm": 2.5305389445574975, + "language_loss": 0.93227071, + "learning_rate": 3.999544324538829e-06, + "loss": 0.95735657, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 2.765625, + "step": 115, + "time_per_iteration": 2.5604898929595947 + }, + { + "auxiliary_loss_clip": 0.01389107, + "auxiliary_loss_mlp": 0.01104315, + "balance_loss_clip": 1.03507793, + "balance_loss_mlp": 1.11516285, + "epoch": 0.006974297309484443, + "flos": 16288238592000.0, + "grad_norm": 2.5654792929808563, + "language_loss": 0.80363703, + "learning_rate": 3.999536224113438e-06, + "loss": 0.82857126, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 2.734375, + "step": 116, + "time_per_iteration": 2.5479090213775635 + }, + { + "auxiliary_loss_clip": 0.01381443, + "auxiliary_loss_mlp": 0.01104956, + "balance_loss_clip": 1.0391047, + "balance_loss_mlp": 1.11800122, + "epoch": 0.007034420562152413, + "flos": 26102812665600.0, + "grad_norm": 3.7731086756359464, + "language_loss": 0.86852342, + "learning_rate": 3.9995280523297416e-06, + "loss": 0.89338744, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 2.625, + "step": 117, + "time_per_iteration": 2.556678056716919 + }, + { + "auxiliary_loss_clip": 0.01386747, + "auxiliary_loss_mlp": 0.0111943, + "balance_loss_clip": 1.04842925, + "balance_loss_mlp": 1.12321186, + "epoch": 0.007094543814820382, + "flos": 14203889112960.0, + "grad_norm": 2.1802516405551553, + "language_loss": 0.82808697, + "learning_rate": 3.9995198091880334e-06, + "loss": 0.8531487, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 2.640625, + "step": 118, + "time_per_iteration": 2.588897705078125 + }, + { + "auxiliary_loss_clip": 0.01394154, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_clip": 1.05520773, + "balance_loss_mlp": 1.11883044, + "epoch": 0.007154667067488351, + "flos": 14975120246400.0, + "grad_norm": 2.874811437563815, + "language_loss": 0.97947919, + "learning_rate": 3.999511494688606e-06, + "loss": 1.00465655, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 2.75, + "step": 119, + "time_per_iteration": 2.5415806770324707 + }, + { + "auxiliary_loss_clip": 0.01395516, + "auxiliary_loss_mlp": 0.01106237, + "balance_loss_clip": 1.03919351, + "balance_loss_mlp": 1.1161859, + "epoch": 0.00721479032015632, + "flos": 20192261466240.0, + "grad_norm": 2.3987781398524306, + "language_loss": 0.87784606, + "learning_rate": 3.999503108831758e-06, + "loss": 0.90286356, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 2.796875, + "step": 120, + "time_per_iteration": 2.533524513244629 + }, + { + "auxiliary_loss_clip": 0.01384108, + "auxiliary_loss_mlp": 0.01112804, + "balance_loss_clip": 1.04556966, + "balance_loss_mlp": 1.11985481, + "epoch": 0.00727491357282429, + "flos": 23146158067200.0, + "grad_norm": 1.8486992631891273, + "language_loss": 0.92068368, + "learning_rate": 3.999494651617787e-06, + "loss": 0.94565284, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 2.640625, + "step": 121, + "time_per_iteration": 2.5171191692352295 + }, + { + "auxiliary_loss_clip": 0.01389371, + "auxiliary_loss_mlp": 0.01132887, + "balance_loss_clip": 1.06698799, + "balance_loss_mlp": 1.12244916, + "epoch": 0.007335036825492259, + "flos": 15520812796800.0, + "grad_norm": 2.3026404931976407, + "language_loss": 0.88909745, + "learning_rate": 3.999486123046994e-06, + "loss": 0.91432005, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 2.671875, + "step": 122, + "time_per_iteration": 2.5436108112335205 + }, + { + "auxiliary_loss_clip": 0.01383648, + "auxiliary_loss_mlp": 0.01112574, + "balance_loss_clip": 1.04171658, + "balance_loss_mlp": 1.11539435, + "epoch": 0.007395160078160229, + "flos": 24242221290240.0, + "grad_norm": 3.7241053052078352, + "language_loss": 0.91549945, + "learning_rate": 3.999477523119686e-06, + "loss": 0.94046164, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 2.6875, + "step": 123, + "time_per_iteration": 2.5250282287597656 + }, + { + "auxiliary_loss_clip": 0.01382203, + "auxiliary_loss_mlp": 0.01103122, + "balance_loss_clip": 1.03736663, + "balance_loss_mlp": 1.11126471, + "epoch": 0.007455283330828198, + "flos": 31758428050560.0, + "grad_norm": 5.948156608363036, + "language_loss": 0.69979113, + "learning_rate": 3.999468851836168e-06, + "loss": 0.72464442, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 2.71875, + "step": 124, + "time_per_iteration": 2.675476551055908 + }, + { + "auxiliary_loss_clip": 0.01374967, + "auxiliary_loss_mlp": 0.01109007, + "balance_loss_clip": 1.04382336, + "balance_loss_mlp": 1.11420441, + "epoch": 0.007515406583496167, + "flos": 26613941103360.0, + "grad_norm": 2.139508831646684, + "language_loss": 0.87107795, + "learning_rate": 3.999460109196749e-06, + "loss": 0.89591765, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 2.609375, + "step": 125, + "time_per_iteration": 2.5488245487213135 + }, + { + "auxiliary_loss_clip": 0.01384091, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.04979801, + "balance_loss_mlp": 1.1116432, + "epoch": 0.007575529836164136, + "flos": 18222706137600.0, + "grad_norm": 2.6663440653428, + "language_loss": 0.80915189, + "learning_rate": 3.999451295201743e-06, + "loss": 0.83416933, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 2.71875, + "step": 126, + "time_per_iteration": 2.552436590194702 + }, + { + "auxiliary_loss_clip": 0.01383761, + "auxiliary_loss_mlp": 0.01104154, + "balance_loss_clip": 1.03906584, + "balance_loss_mlp": 1.11447549, + "epoch": 0.007635653088832106, + "flos": 21579325804800.0, + "grad_norm": 2.829396146717874, + "language_loss": 0.66536784, + "learning_rate": 3.999442409851463e-06, + "loss": 0.690247, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 2.6875, + "step": 127, + "time_per_iteration": 2.5131595134735107 + }, + { + "auxiliary_loss_clip": 0.01375587, + "auxiliary_loss_mlp": 0.0110898, + "balance_loss_clip": 1.04360604, + "balance_loss_mlp": 1.1157546, + "epoch": 0.007695776341500075, + "flos": 25373861055360.0, + "grad_norm": 3.1321009293481024, + "language_loss": 0.86887217, + "learning_rate": 3.999433453146227e-06, + "loss": 0.89371789, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 2.59375, + "step": 128, + "time_per_iteration": 2.658592700958252 + }, + { + "auxiliary_loss_clip": 0.01373952, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_clip": 1.04957247, + "balance_loss_mlp": 1.10953844, + "epoch": 0.007755899594168045, + "flos": 22342876439040.0, + "grad_norm": 1.8980833921998537, + "language_loss": 0.83853519, + "learning_rate": 3.9994244250863535e-06, + "loss": 0.86345422, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 2.640625, + "step": 129, + "time_per_iteration": 2.5144636631011963 + }, + { + "auxiliary_loss_clip": 0.01367964, + "auxiliary_loss_mlp": 0.01104102, + "balance_loss_clip": 1.03801274, + "balance_loss_mlp": 1.10930061, + "epoch": 0.007816022846836013, + "flos": 22637124311040.0, + "grad_norm": 2.1901717057431904, + "language_loss": 0.96096313, + "learning_rate": 3.999415325672166e-06, + "loss": 0.98568374, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 2.578125, + "step": 130, + "time_per_iteration": 2.513084888458252 + }, + { + "auxiliary_loss_clip": 0.01366017, + "auxiliary_loss_mlp": 0.0110484, + "balance_loss_clip": 1.04018116, + "balance_loss_mlp": 1.11058915, + "epoch": 0.007876146099503984, + "flos": 18182032536960.0, + "grad_norm": 2.2948658674142837, + "language_loss": 0.80744946, + "learning_rate": 3.9994061549039886e-06, + "loss": 0.83215797, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 2.5625, + "step": 131, + "time_per_iteration": 2.493565797805786 + }, + { + "auxiliary_loss_clip": 0.01374832, + "auxiliary_loss_mlp": 0.0110615, + "balance_loss_clip": 1.04330242, + "balance_loss_mlp": 1.11081851, + "epoch": 0.007936269352171952, + "flos": 27118436382720.0, + "grad_norm": 2.7280402886740327, + "language_loss": 0.82224703, + "learning_rate": 3.9993969127821485e-06, + "loss": 0.84705681, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 2.640625, + "step": 132, + "time_per_iteration": 2.5553696155548096 + }, + { + "auxiliary_loss_clip": 0.01363389, + "auxiliary_loss_mlp": 0.01108456, + "balance_loss_clip": 1.04355907, + "balance_loss_mlp": 1.10452795, + "epoch": 0.007996392604839923, + "flos": 19025324449920.0, + "grad_norm": 1.8894809484000292, + "language_loss": 0.94121611, + "learning_rate": 3.9993875993069755e-06, + "loss": 0.96593451, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 2.59375, + "step": 133, + "time_per_iteration": 2.4987971782684326 + }, + { + "auxiliary_loss_clip": 0.01359644, + "auxiliary_loss_mlp": 0.01107564, + "balance_loss_clip": 1.04538476, + "balance_loss_mlp": 1.11143434, + "epoch": 0.008056515857507891, + "flos": 25482964654080.0, + "grad_norm": 1.743450272302483, + "language_loss": 0.72378039, + "learning_rate": 3.9993782144788025e-06, + "loss": 0.74845243, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 2.484375, + "step": 134, + "time_per_iteration": 3.978874683380127 + }, + { + "auxiliary_loss_clip": 0.0136099, + "auxiliary_loss_mlp": 0.01107337, + "balance_loss_clip": 1.04010284, + "balance_loss_mlp": 1.10283303, + "epoch": 0.00811663911017586, + "flos": 20556545258880.0, + "grad_norm": 4.696246405269054, + "language_loss": 0.87248123, + "learning_rate": 3.999368758297964e-06, + "loss": 0.89716446, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 2.578125, + "step": 135, + "time_per_iteration": 5.2392988204956055 + }, + { + "auxiliary_loss_clip": 0.01362226, + "auxiliary_loss_mlp": 0.01106394, + "balance_loss_clip": 1.03935051, + "balance_loss_mlp": 1.10586715, + "epoch": 0.00817676236284383, + "flos": 18798947994240.0, + "grad_norm": 1.8825613314785814, + "language_loss": 0.87810934, + "learning_rate": 3.999359230764798e-06, + "loss": 0.90279549, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 2.5625, + "step": 136, + "time_per_iteration": 3.84043025970459 + }, + { + "auxiliary_loss_clip": 0.01359523, + "auxiliary_loss_mlp": 0.01091745, + "balance_loss_clip": 1.0304718, + "balance_loss_mlp": 1.10392106, + "epoch": 0.008236885615511799, + "flos": 23872596059520.0, + "grad_norm": 2.1865065382853994, + "language_loss": 0.82736731, + "learning_rate": 3.999349631879643e-06, + "loss": 0.85187995, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 2.5625, + "step": 137, + "time_per_iteration": 2.4926602840423584 + }, + { + "auxiliary_loss_clip": 0.01355187, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_clip": 1.03099298, + "balance_loss_mlp": 1.10456729, + "epoch": 0.00829700886817977, + "flos": 24642500561280.0, + "grad_norm": 1.8798885548199515, + "language_loss": 0.8933351, + "learning_rate": 3.9993399616428425e-06, + "loss": 0.91780961, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 2.5, + "step": 138, + "time_per_iteration": 2.527296543121338 + }, + { + "auxiliary_loss_clip": 0.01346701, + "auxiliary_loss_mlp": 0.01092938, + "balance_loss_clip": 1.02894676, + "balance_loss_mlp": 1.1021167, + "epoch": 0.008357132120847738, + "flos": 25260917207040.0, + "grad_norm": 2.235375654561318, + "language_loss": 0.9069339, + "learning_rate": 3.999330220054742e-06, + "loss": 0.93133026, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 2.4375, + "step": 139, + "time_per_iteration": 2.5068202018737793 + }, + { + "auxiliary_loss_clip": 0.01364767, + "auxiliary_loss_mlp": 0.01099099, + "balance_loss_clip": 1.03525043, + "balance_loss_mlp": 1.10740197, + "epoch": 0.008417255373515706, + "flos": 27343660763520.0, + "grad_norm": 2.359048254552134, + "language_loss": 0.84722054, + "learning_rate": 3.9993204071156894e-06, + "loss": 0.87185919, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 2.5625, + "step": 140, + "time_per_iteration": 2.596464157104492 + }, + { + "auxiliary_loss_clip": 0.01350993, + "auxiliary_loss_mlp": 0.01098609, + "balance_loss_clip": 1.03685844, + "balance_loss_mlp": 1.10390186, + "epoch": 0.008477378626183677, + "flos": 17638120465920.0, + "grad_norm": 7.5568190931205255, + "language_loss": 0.8278836, + "learning_rate": 3.999310522826034e-06, + "loss": 0.85237962, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 2.46875, + "step": 141, + "time_per_iteration": 2.5674993991851807 + }, + { + "auxiliary_loss_clip": 0.01353554, + "auxiliary_loss_mlp": 0.01096234, + "balance_loss_clip": 1.0332911, + "balance_loss_mlp": 1.10089064, + "epoch": 0.008537501878851645, + "flos": 13880488389120.0, + "grad_norm": 2.45829146302685, + "language_loss": 0.79976183, + "learning_rate": 3.999300567186129e-06, + "loss": 0.8242597, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 2.53125, + "step": 142, + "time_per_iteration": 2.4487762451171875 + }, + { + "auxiliary_loss_clip": 0.01293382, + "auxiliary_loss_mlp": 0.01083486, + "balance_loss_clip": 1.05849934, + "balance_loss_mlp": 1.14034057, + "epoch": 0.008597625131519616, + "flos": 71244320832000.0, + "grad_norm": 1.0288296118365807, + "language_loss": 0.66773266, + "learning_rate": 3.999290540196329e-06, + "loss": 0.69150138, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.53125, + "step": 143, + "time_per_iteration": 3.269559144973755 + }, + { + "auxiliary_loss_clip": 0.0135308, + "auxiliary_loss_mlp": 0.01103362, + "balance_loss_clip": 1.04156375, + "balance_loss_mlp": 1.10496008, + "epoch": 0.008657748384187584, + "flos": 17601880608000.0, + "grad_norm": 2.110396981770736, + "language_loss": 0.83264089, + "learning_rate": 3.999280441856992e-06, + "loss": 0.85720533, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 2.46875, + "step": 144, + "time_per_iteration": 2.4857771396636963 + }, + { + "auxiliary_loss_clip": 0.01341317, + "auxiliary_loss_mlp": 0.01097107, + "balance_loss_clip": 1.03240073, + "balance_loss_mlp": 1.09779024, + "epoch": 0.008717871636855555, + "flos": 19714405420800.0, + "grad_norm": 2.0613967466713885, + "language_loss": 0.87342119, + "learning_rate": 3.9992702721684805e-06, + "loss": 0.89780545, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 2.4375, + "step": 145, + "time_per_iteration": 2.5223348140716553 + }, + { + "auxiliary_loss_clip": 0.01353492, + "auxiliary_loss_mlp": 0.01098152, + "balance_loss_clip": 1.03301656, + "balance_loss_mlp": 1.0989728, + "epoch": 0.008777994889523523, + "flos": 24716271997440.0, + "grad_norm": 1.8762206084414534, + "language_loss": 0.85667378, + "learning_rate": 3.999260031131154e-06, + "loss": 0.88119018, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 2.546875, + "step": 146, + "time_per_iteration": 2.530491828918457 + }, + { + "auxiliary_loss_clip": 0.0127963, + "auxiliary_loss_mlp": 0.01026299, + "balance_loss_clip": 1.00312459, + "balance_loss_mlp": 1.14265537, + "epoch": 0.008838118142191492, + "flos": 70128916715520.0, + "grad_norm": 0.8141797365584135, + "language_loss": 0.59914416, + "learning_rate": 3.999249718745381e-06, + "loss": 0.62220341, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 147, + "time_per_iteration": 3.2129898071289062 + }, + { + "auxiliary_loss_clip": 0.01345115, + "auxiliary_loss_mlp": 0.01096029, + "balance_loss_clip": 1.03628087, + "balance_loss_mlp": 1.10191417, + "epoch": 0.008898241394859462, + "flos": 20043845809920.0, + "grad_norm": 1.8636165565969143, + "language_loss": 0.83679867, + "learning_rate": 3.999239335011527e-06, + "loss": 0.86121005, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 2.4375, + "step": 148, + "time_per_iteration": 2.566246747970581 + }, + { + "auxiliary_loss_clip": 0.01350047, + "auxiliary_loss_mlp": 0.01117175, + "balance_loss_clip": 1.05318344, + "balance_loss_mlp": 1.09972465, + "epoch": 0.008958364647527431, + "flos": 10742843969280.0, + "grad_norm": 2.214139356241396, + "language_loss": 0.87434661, + "learning_rate": 3.999228879929965e-06, + "loss": 0.89901882, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 2.5, + "step": 149, + "time_per_iteration": 2.482739210128784 + }, + { + "auxiliary_loss_clip": 0.01352019, + "auxiliary_loss_mlp": 0.01106148, + "balance_loss_clip": 1.04434991, + "balance_loss_mlp": 1.10022926, + "epoch": 0.009018487900195401, + "flos": 29126326250880.0, + "grad_norm": 2.052042896827704, + "language_loss": 0.92434806, + "learning_rate": 3.999218353501066e-06, + "loss": 0.94892967, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 2.515625, + "step": 150, + "time_per_iteration": 2.5825510025024414 + }, + { + "auxiliary_loss_clip": 0.01343467, + "auxiliary_loss_mlp": 0.01100108, + "balance_loss_clip": 1.03931189, + "balance_loss_mlp": 1.09753084, + "epoch": 0.00907861115286337, + "flos": 32962268240640.0, + "grad_norm": 2.0323430661128237, + "language_loss": 0.73467743, + "learning_rate": 3.999207755725208e-06, + "loss": 0.75911319, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 2.453125, + "step": 151, + "time_per_iteration": 2.621089458465576 + }, + { + "auxiliary_loss_clip": 0.01348765, + "auxiliary_loss_mlp": 0.01104451, + "balance_loss_clip": 1.0417949, + "balance_loss_mlp": 1.09938061, + "epoch": 0.009138734405531338, + "flos": 21761362967040.0, + "grad_norm": 2.3198228318377594, + "language_loss": 0.87541401, + "learning_rate": 3.999197086602766e-06, + "loss": 0.89994621, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 2.484375, + "step": 152, + "time_per_iteration": 2.5434858798980713 + }, + { + "auxiliary_loss_clip": 0.01342767, + "auxiliary_loss_mlp": 0.01094005, + "balance_loss_clip": 1.03306484, + "balance_loss_mlp": 1.10060859, + "epoch": 0.009198857658199309, + "flos": 20841681265920.0, + "grad_norm": 3.727614359588284, + "language_loss": 0.8170656, + "learning_rate": 3.9991863461341234e-06, + "loss": 0.84143329, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 2.421875, + "step": 153, + "time_per_iteration": 2.5529229640960693 + }, + { + "auxiliary_loss_clip": 0.01340483, + "auxiliary_loss_mlp": 0.01101178, + "balance_loss_clip": 1.04081035, + "balance_loss_mlp": 1.09398246, + "epoch": 0.009258980910867277, + "flos": 24826213468800.0, + "grad_norm": 2.0004419209962077, + "language_loss": 0.88920546, + "learning_rate": 3.999175534319662e-06, + "loss": 0.91362202, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 2.46875, + "step": 154, + "time_per_iteration": 2.498523712158203 + }, + { + "auxiliary_loss_clip": 0.0134595, + "auxiliary_loss_mlp": 0.01134348, + "balance_loss_clip": 1.07116711, + "balance_loss_mlp": 1.09875727, + "epoch": 0.009319104163535248, + "flos": 28766511112320.0, + "grad_norm": 2.1190994788647393, + "language_loss": 0.88754398, + "learning_rate": 3.999164651159769e-06, + "loss": 0.91234696, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 2.46875, + "step": 155, + "time_per_iteration": 2.5487866401672363 + }, + { + "auxiliary_loss_clip": 0.01347773, + "auxiliary_loss_mlp": 0.01106749, + "balance_loss_clip": 1.04685855, + "balance_loss_mlp": 1.09622169, + "epoch": 0.009379227416203216, + "flos": 16581055098240.0, + "grad_norm": 2.63691628375773, + "language_loss": 0.85156655, + "learning_rate": 3.999153696654832e-06, + "loss": 0.87611175, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 2.515625, + "step": 156, + "time_per_iteration": 2.453019380569458 + }, + { + "auxiliary_loss_clip": 0.01344155, + "auxiliary_loss_mlp": 0.01096222, + "balance_loss_clip": 1.03523493, + "balance_loss_mlp": 1.0993228, + "epoch": 0.009439350668871187, + "flos": 18329016827520.0, + "grad_norm": 2.1940432790252355, + "language_loss": 0.98876888, + "learning_rate": 3.9991426708052416e-06, + "loss": 1.01317263, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 2.4375, + "step": 157, + "time_per_iteration": 2.4774229526519775 + }, + { + "auxiliary_loss_clip": 0.01336484, + "auxiliary_loss_mlp": 0.01122648, + "balance_loss_clip": 1.06270957, + "balance_loss_mlp": 1.09593248, + "epoch": 0.009499473921539155, + "flos": 24348846182400.0, + "grad_norm": 4.935499386420861, + "language_loss": 0.87422907, + "learning_rate": 3.999131573611392e-06, + "loss": 0.8988204, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 2.40625, + "step": 158, + "time_per_iteration": 2.5238518714904785 + }, + { + "auxiliary_loss_clip": 0.01340118, + "auxiliary_loss_mlp": 0.01117905, + "balance_loss_clip": 1.05481958, + "balance_loss_mlp": 1.09583664, + "epoch": 0.009559597174207124, + "flos": 16398389531520.0, + "grad_norm": 2.8192939651199223, + "language_loss": 0.85176593, + "learning_rate": 3.999120405073679e-06, + "loss": 0.87634623, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 2.4375, + "step": 159, + "time_per_iteration": 2.479688882827759 + }, + { + "auxiliary_loss_clip": 0.0133384, + "auxiliary_loss_mlp": 0.01106715, + "balance_loss_clip": 1.04439223, + "balance_loss_mlp": 1.09363759, + "epoch": 0.009619720426875094, + "flos": 22855785356160.0, + "grad_norm": 1.9515699711388057, + "language_loss": 0.85391754, + "learning_rate": 3.9991091651925014e-06, + "loss": 0.87832308, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 2.40625, + "step": 160, + "time_per_iteration": 2.4945571422576904 + }, + { + "auxiliary_loss_clip": 0.01338913, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_clip": 1.04101801, + "balance_loss_mlp": 1.09515488, + "epoch": 0.009679843679543063, + "flos": 19134009112320.0, + "grad_norm": 2.5362498189638, + "language_loss": 0.90829933, + "learning_rate": 3.999097853968259e-06, + "loss": 0.93268418, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 2.4375, + "step": 161, + "time_per_iteration": 2.4981765747070312 + }, + { + "auxiliary_loss_clip": 0.01340961, + "auxiliary_loss_mlp": 0.01090779, + "balance_loss_clip": 1.03322458, + "balance_loss_mlp": 1.09601188, + "epoch": 0.009739966932211033, + "flos": 20301958558080.0, + "grad_norm": 2.448191667557215, + "language_loss": 0.90737391, + "learning_rate": 3.999086471401357e-06, + "loss": 0.93169141, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 2.453125, + "step": 162, + "time_per_iteration": 2.5141706466674805 + }, + { + "auxiliary_loss_clip": 0.01249292, + "auxiliary_loss_mlp": 0.01097223, + "balance_loss_clip": 1.0758605, + "balance_loss_mlp": 1.11713386, + "epoch": 0.009800090184879002, + "flos": 67031073112320.0, + "grad_norm": 1.1788821400625664, + "language_loss": 0.72151339, + "learning_rate": 3.9990750174922005e-06, + "loss": 0.74497843, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3203125, + "step": 163, + "time_per_iteration": 3.0873191356658936 + }, + { + "auxiliary_loss_clip": 0.01329569, + "auxiliary_loss_mlp": 0.01099409, + "balance_loss_clip": 1.0405674, + "balance_loss_mlp": 1.09513378, + "epoch": 0.00986021343754697, + "flos": 17163755556480.0, + "grad_norm": 2.5235900048992903, + "language_loss": 0.83601165, + "learning_rate": 3.9990634922412e-06, + "loss": 0.86030143, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 2.34375, + "step": 164, + "time_per_iteration": 2.525221824645996 + }, + { + "auxiliary_loss_clip": 0.01322732, + "auxiliary_loss_mlp": 0.01090181, + "balance_loss_clip": 1.03172088, + "balance_loss_mlp": 1.08707738, + "epoch": 0.00992033669021494, + "flos": 17748445962240.0, + "grad_norm": 2.0227093827211897, + "language_loss": 0.88425285, + "learning_rate": 3.9990518956487655e-06, + "loss": 0.908382, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 2.34375, + "step": 165, + "time_per_iteration": 2.4771060943603516 + }, + { + "auxiliary_loss_clip": 0.01331029, + "auxiliary_loss_mlp": 0.01106093, + "balance_loss_clip": 1.0435797, + "balance_loss_mlp": 1.09030557, + "epoch": 0.00998045994288291, + "flos": 25296109724160.0, + "grad_norm": 2.4914540557712663, + "language_loss": 0.7945292, + "learning_rate": 3.9990402277153105e-06, + "loss": 0.81890035, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 2.40625, + "step": 166, + "time_per_iteration": 2.5510506629943848 + }, + { + "auxiliary_loss_clip": 0.01329746, + "auxiliary_loss_mlp": 0.0110877, + "balance_loss_clip": 1.04873562, + "balance_loss_mlp": 1.08858144, + "epoch": 0.01004058319555088, + "flos": 32297801644800.0, + "grad_norm": 2.232365353759961, + "language_loss": 0.9095034, + "learning_rate": 3.999028488441252e-06, + "loss": 0.93388855, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 2.40625, + "step": 167, + "time_per_iteration": 2.5629281997680664 + }, + { + "auxiliary_loss_clip": 0.01327017, + "auxiliary_loss_mlp": 0.01124412, + "balance_loss_clip": 1.066715, + "balance_loss_mlp": 1.09082603, + "epoch": 0.010100706448218848, + "flos": 11319365116800.0, + "grad_norm": 5.029144748726258, + "language_loss": 0.89412969, + "learning_rate": 3.999016677827009e-06, + "loss": 0.91864395, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 2.359375, + "step": 168, + "time_per_iteration": 2.4647042751312256 + }, + { + "auxiliary_loss_clip": 0.01318749, + "auxiliary_loss_mlp": 0.01095637, + "balance_loss_clip": 1.03674769, + "balance_loss_mlp": 1.08649659, + "epoch": 0.010160829700886819, + "flos": 29718103662720.0, + "grad_norm": 1.7973544688290481, + "language_loss": 0.86207986, + "learning_rate": 3.999004795873003e-06, + "loss": 0.88622367, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 2.3125, + "step": 169, + "time_per_iteration": 2.584573984146118 + }, + { + "auxiliary_loss_clip": 0.01321291, + "auxiliary_loss_mlp": 0.01098267, + "balance_loss_clip": 1.03785205, + "balance_loss_mlp": 1.0876385, + "epoch": 0.010220952953554787, + "flos": 20411306536320.0, + "grad_norm": 1.9927921484930773, + "language_loss": 0.83916354, + "learning_rate": 3.998992842579657e-06, + "loss": 0.86335915, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 2.34375, + "step": 170, + "time_per_iteration": 2.49104642868042 + }, + { + "auxiliary_loss_clip": 0.01333332, + "auxiliary_loss_mlp": 0.01111543, + "balance_loss_clip": 1.05127096, + "balance_loss_mlp": 1.08931541, + "epoch": 0.010281076206222756, + "flos": 31283783850240.0, + "grad_norm": 2.252100548603337, + "language_loss": 0.89015782, + "learning_rate": 3.9989808179474e-06, + "loss": 0.91460657, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 2.4375, + "step": 171, + "time_per_iteration": 2.5218472480773926 + }, + { + "auxiliary_loss_clip": 0.01321781, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_clip": 1.04322505, + "balance_loss_mlp": 1.08822954, + "epoch": 0.010341199458890726, + "flos": 21981176087040.0, + "grad_norm": 2.44019530662096, + "language_loss": 0.88130593, + "learning_rate": 3.998968721976658e-06, + "loss": 0.90554798, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 2.328125, + "step": 172, + "time_per_iteration": 2.5287842750549316 + }, + { + "auxiliary_loss_clip": 0.01312446, + "auxiliary_loss_mlp": 0.01096913, + "balance_loss_clip": 1.03950167, + "balance_loss_mlp": 1.0837512, + "epoch": 0.010401322711558695, + "flos": 30809209472640.0, + "grad_norm": 1.803783117703605, + "language_loss": 0.80085742, + "learning_rate": 3.998956554667865e-06, + "loss": 0.82495105, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 2.28125, + "step": 173, + "time_per_iteration": 2.5845518112182617 + }, + { + "auxiliary_loss_clip": 0.01322341, + "auxiliary_loss_mlp": 0.01106042, + "balance_loss_clip": 1.04863095, + "balance_loss_mlp": 1.08603311, + "epoch": 0.010461445964226665, + "flos": 24714037670400.0, + "grad_norm": 1.8406425535950643, + "language_loss": 0.82000279, + "learning_rate": 3.998944316021455e-06, + "loss": 0.84428656, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 2.375, + "step": 174, + "time_per_iteration": 5.432522535324097 + }, + { + "auxiliary_loss_clip": 0.01321477, + "auxiliary_loss_mlp": 0.01108623, + "balance_loss_clip": 1.04925632, + "balance_loss_mlp": 1.08508635, + "epoch": 0.010521569216894634, + "flos": 27709096631040.0, + "grad_norm": 3.0811273337718794, + "language_loss": 0.72226876, + "learning_rate": 3.9989320060378634e-06, + "loss": 0.74656975, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 2.375, + "step": 175, + "time_per_iteration": 3.943849563598633 + }, + { + "auxiliary_loss_clip": 0.0132293, + "auxiliary_loss_mlp": 0.01111667, + "balance_loss_clip": 1.0503459, + "balance_loss_mlp": 1.08794677, + "epoch": 0.010581692469562603, + "flos": 12457533306240.0, + "grad_norm": 2.5844798526471795, + "language_loss": 0.96816218, + "learning_rate": 3.998919624717531e-06, + "loss": 0.99250817, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 2.359375, + "step": 176, + "time_per_iteration": 2.4978418350219727 + }, + { + "auxiliary_loss_clip": 0.013113, + "auxiliary_loss_mlp": 0.01106073, + "balance_loss_clip": 1.05104566, + "balance_loss_mlp": 1.0845108, + "epoch": 0.010641815722230573, + "flos": 19426581239040.0, + "grad_norm": 2.4681889646441357, + "language_loss": 0.76004493, + "learning_rate": 3.998907172060898e-06, + "loss": 0.78421861, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.265625, + "step": 177, + "time_per_iteration": 2.4572675228118896 + }, + { + "auxiliary_loss_clip": 0.01319403, + "auxiliary_loss_mlp": 0.0109398, + "balance_loss_clip": 1.03518641, + "balance_loss_mlp": 1.08404136, + "epoch": 0.010701938974898541, + "flos": 18331600268160.0, + "grad_norm": 2.4012411743357758, + "language_loss": 0.75504708, + "learning_rate": 3.9988946480684115e-06, + "loss": 0.77918088, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 2.359375, + "step": 178, + "time_per_iteration": 2.4346606731414795 + }, + { + "auxiliary_loss_clip": 0.01326096, + "auxiliary_loss_mlp": 0.01092432, + "balance_loss_clip": 1.0330658, + "balance_loss_mlp": 1.08672309, + "epoch": 0.010762062227566512, + "flos": 19203102426240.0, + "grad_norm": 2.2127760174419295, + "language_loss": 0.8330009, + "learning_rate": 3.998882052740516e-06, + "loss": 0.85718614, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 2.40625, + "step": 179, + "time_per_iteration": 2.4346044063568115 + }, + { + "auxiliary_loss_clip": 0.01315178, + "auxiliary_loss_mlp": 0.01085268, + "balance_loss_clip": 1.0242331, + "balance_loss_mlp": 1.07958055, + "epoch": 0.01082218548023448, + "flos": 31424239716480.0, + "grad_norm": 1.9845702506607297, + "language_loss": 0.83313191, + "learning_rate": 3.9988693860776616e-06, + "loss": 0.85713637, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 2.359375, + "step": 180, + "time_per_iteration": 2.625657558441162 + }, + { + "auxiliary_loss_clip": 0.01318714, + "auxiliary_loss_mlp": 0.01090594, + "balance_loss_clip": 1.03275371, + "balance_loss_mlp": 1.08474123, + "epoch": 0.01088230873290245, + "flos": 25045258538880.0, + "grad_norm": 2.7088351245404394, + "language_loss": 0.77022505, + "learning_rate": 3.998856648080301e-06, + "loss": 0.79431814, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 2.34375, + "step": 181, + "time_per_iteration": 2.4812734127044678 + }, + { + "auxiliary_loss_clip": 0.01312901, + "auxiliary_loss_mlp": 0.01094778, + "balance_loss_clip": 1.03638959, + "balance_loss_mlp": 1.0799551, + "epoch": 0.01094243198557042, + "flos": 22892304504960.0, + "grad_norm": 2.8413661282454994, + "language_loss": 0.75974607, + "learning_rate": 3.998843838748888e-06, + "loss": 0.78382289, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 2.328125, + "step": 182, + "time_per_iteration": 2.48538875579834 + }, + { + "auxiliary_loss_clip": 0.01313813, + "auxiliary_loss_mlp": 0.01107224, + "balance_loss_clip": 1.04876375, + "balance_loss_mlp": 1.08262396, + "epoch": 0.011002555238238388, + "flos": 17164104670080.0, + "grad_norm": 2.0731930225158735, + "language_loss": 0.86371708, + "learning_rate": 3.9988309580838796e-06, + "loss": 0.88792747, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 2.3125, + "step": 183, + "time_per_iteration": 2.426077365875244 + }, + { + "auxiliary_loss_clip": 0.01314354, + "auxiliary_loss_mlp": 0.01103889, + "balance_loss_clip": 1.04771781, + "balance_loss_mlp": 1.08373165, + "epoch": 0.011062678490906358, + "flos": 22309045464960.0, + "grad_norm": 2.040898168728987, + "language_loss": 0.85582656, + "learning_rate": 3.998818006085736e-06, + "loss": 0.88000894, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 2.3125, + "step": 184, + "time_per_iteration": 2.4764184951782227 + }, + { + "auxiliary_loss_clip": 0.01309698, + "auxiliary_loss_mlp": 0.0109481, + "balance_loss_clip": 1.0387814, + "balance_loss_mlp": 1.0818429, + "epoch": 0.011122801743574327, + "flos": 24387250544640.0, + "grad_norm": 1.9350126346676748, + "language_loss": 0.8281703, + "learning_rate": 3.99880498275492e-06, + "loss": 0.85221541, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.28125, + "step": 185, + "time_per_iteration": 2.5032644271850586 + }, + { + "auxiliary_loss_clip": 0.01317324, + "auxiliary_loss_mlp": 0.01090762, + "balance_loss_clip": 1.03339851, + "balance_loss_mlp": 1.08348095, + "epoch": 0.011182924996242297, + "flos": 18149283815040.0, + "grad_norm": 2.111643803506232, + "language_loss": 0.70618719, + "learning_rate": 3.9987918880918946e-06, + "loss": 0.73026806, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 2.34375, + "step": 186, + "time_per_iteration": 2.4431312084198 + }, + { + "auxiliary_loss_clip": 0.01311425, + "auxiliary_loss_mlp": 0.01093057, + "balance_loss_clip": 1.03919864, + "balance_loss_mlp": 1.07877183, + "epoch": 0.011243048248910266, + "flos": 15485899570560.0, + "grad_norm": 2.569855957246881, + "language_loss": 1.01050854, + "learning_rate": 3.9987787220971295e-06, + "loss": 1.03455341, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.328125, + "step": 187, + "time_per_iteration": 2.4374749660491943 + }, + { + "auxiliary_loss_clip": 0.01303492, + "auxiliary_loss_mlp": 0.01099116, + "balance_loss_clip": 1.04146647, + "balance_loss_mlp": 1.0788306, + "epoch": 0.011303171501578235, + "flos": 40915273420800.0, + "grad_norm": 2.548479597549839, + "language_loss": 0.7428776, + "learning_rate": 3.9987654847710925e-06, + "loss": 0.76690364, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 2.25, + "step": 188, + "time_per_iteration": 2.6361711025238037 + }, + { + "auxiliary_loss_clip": 0.01240898, + "auxiliary_loss_mlp": 0.01070704, + "balance_loss_clip": 1.04972339, + "balance_loss_mlp": 1.1219821, + "epoch": 0.011363294754246205, + "flos": 66299607884160.0, + "grad_norm": 0.7354062841985437, + "language_loss": 0.56136906, + "learning_rate": 3.998752176114257e-06, + "loss": 0.58448505, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 189, + "time_per_iteration": 3.160338878631592 + }, + { + "auxiliary_loss_clip": 0.01305183, + "auxiliary_loss_mlp": 0.01101284, + "balance_loss_clip": 1.04456365, + "balance_loss_mlp": 1.0788312, + "epoch": 0.011423418006914174, + "flos": 24899112120960.0, + "grad_norm": 2.1583810659975446, + "language_loss": 0.93964595, + "learning_rate": 3.998738796127097e-06, + "loss": 0.96371061, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 2.265625, + "step": 190, + "time_per_iteration": 2.4950332641601562 + }, + { + "auxiliary_loss_clip": 0.0130113, + "auxiliary_loss_mlp": 0.0108689, + "balance_loss_clip": 1.03446198, + "balance_loss_mlp": 1.07827342, + "epoch": 0.011483541259582144, + "flos": 19790865031680.0, + "grad_norm": 2.8798929290255653, + "language_loss": 0.84118086, + "learning_rate": 3.998725344810092e-06, + "loss": 0.8650611, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.234375, + "step": 191, + "time_per_iteration": 2.474900484085083 + }, + { + "auxiliary_loss_clip": 0.01305111, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_clip": 1.04456949, + "balance_loss_mlp": 1.0755477, + "epoch": 0.011543664512250112, + "flos": 26175746229120.0, + "grad_norm": 1.8549438389464499, + "language_loss": 0.91263413, + "learning_rate": 3.99871182216372e-06, + "loss": 0.93665814, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.296875, + "step": 192, + "time_per_iteration": 2.534113645553589 + }, + { + "auxiliary_loss_clip": 0.01302597, + "auxiliary_loss_mlp": 0.0110381, + "balance_loss_clip": 1.04594564, + "balance_loss_mlp": 1.07853484, + "epoch": 0.011603787764918083, + "flos": 23767856380800.0, + "grad_norm": 2.087142401840174, + "language_loss": 0.86185181, + "learning_rate": 3.998698228188465e-06, + "loss": 0.88591588, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 2.25, + "step": 193, + "time_per_iteration": 2.4675991535186768 + }, + { + "auxiliary_loss_clip": 0.01300011, + "auxiliary_loss_mlp": 0.01100296, + "balance_loss_clip": 1.04460144, + "balance_loss_mlp": 1.07354963, + "epoch": 0.011663911017586051, + "flos": 25953594048000.0, + "grad_norm": 2.5113253385065124, + "language_loss": 0.91893256, + "learning_rate": 3.9986845628848115e-06, + "loss": 0.94293571, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.265625, + "step": 194, + "time_per_iteration": 2.4987685680389404 + }, + { + "auxiliary_loss_clip": 0.01306807, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_clip": 1.0397315, + "balance_loss_mlp": 1.07947361, + "epoch": 0.01172403427025402, + "flos": 17894173443840.0, + "grad_norm": 2.357181387982265, + "language_loss": 0.88895011, + "learning_rate": 3.998670826253246e-06, + "loss": 0.91295552, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.28125, + "step": 195, + "time_per_iteration": 2.435065507888794 + }, + { + "auxiliary_loss_clip": 0.01300711, + "auxiliary_loss_mlp": 0.0109839, + "balance_loss_clip": 1.04267144, + "balance_loss_mlp": 1.07752693, + "epoch": 0.01178415752292199, + "flos": 17893579950720.0, + "grad_norm": 2.2137038281252814, + "language_loss": 0.84706283, + "learning_rate": 3.998657018294261e-06, + "loss": 0.87105381, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.234375, + "step": 196, + "time_per_iteration": 2.449734926223755 + }, + { + "auxiliary_loss_clip": 0.01303728, + "auxiliary_loss_mlp": 0.01095433, + "balance_loss_clip": 1.03992963, + "balance_loss_mlp": 1.07673109, + "epoch": 0.011844280775589959, + "flos": 22892444150400.0, + "grad_norm": 2.21390164074501, + "language_loss": 0.9224143, + "learning_rate": 3.998643139008348e-06, + "loss": 0.94640595, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.265625, + "step": 197, + "time_per_iteration": 2.449037551879883 + }, + { + "auxiliary_loss_clip": 0.0129758, + "auxiliary_loss_mlp": 0.01095406, + "balance_loss_clip": 1.04073656, + "balance_loss_mlp": 1.07462072, + "epoch": 0.01190440402825793, + "flos": 26979097680000.0, + "grad_norm": 1.9318876007715875, + "language_loss": 0.78542089, + "learning_rate": 3.998629188396002e-06, + "loss": 0.80935079, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.21875, + "step": 198, + "time_per_iteration": 2.5457894802093506 + }, + { + "auxiliary_loss_clip": 0.01295783, + "auxiliary_loss_mlp": 0.01092376, + "balance_loss_clip": 1.03916073, + "balance_loss_mlp": 1.07411599, + "epoch": 0.011964527280925898, + "flos": 20520549780480.0, + "grad_norm": 1.9546481104925915, + "language_loss": 0.87513494, + "learning_rate": 3.9986151664577225e-06, + "loss": 0.8990165, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.21875, + "step": 199, + "time_per_iteration": 2.43874454498291 + }, + { + "auxiliary_loss_clip": 0.01303553, + "auxiliary_loss_mlp": 0.01105744, + "balance_loss_clip": 1.04752231, + "balance_loss_mlp": 1.07472932, + "epoch": 0.012024650533593867, + "flos": 27744742995840.0, + "grad_norm": 1.9422627423945882, + "language_loss": 0.8069098, + "learning_rate": 3.998601073194007e-06, + "loss": 0.83100271, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 2.28125, + "step": 200, + "time_per_iteration": 2.496249198913574 + }, + { + "auxiliary_loss_clip": 0.01295899, + "auxiliary_loss_mlp": 0.01089628, + "balance_loss_clip": 1.033409, + "balance_loss_mlp": 1.07088518, + "epoch": 0.012084773786261837, + "flos": 10451249360640.0, + "grad_norm": 2.2723968814470914, + "language_loss": 0.86802953, + "learning_rate": 3.998586908605362e-06, + "loss": 0.89188486, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 2.25, + "step": 201, + "time_per_iteration": 2.4100606441497803 + }, + { + "auxiliary_loss_clip": 0.01300387, + "auxiliary_loss_mlp": 0.01099245, + "balance_loss_clip": 1.04326439, + "balance_loss_mlp": 1.07630789, + "epoch": 0.012144897038929806, + "flos": 23104821150720.0, + "grad_norm": 1.7247674530220773, + "language_loss": 0.83746284, + "learning_rate": 3.99857267269229e-06, + "loss": 0.86145914, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.25, + "step": 202, + "time_per_iteration": 2.495171546936035 + }, + { + "auxiliary_loss_clip": 0.01290128, + "auxiliary_loss_mlp": 0.01090998, + "balance_loss_clip": 1.03797424, + "balance_loss_mlp": 1.06840992, + "epoch": 0.012205020291597776, + "flos": 21032132065920.0, + "grad_norm": 1.8188762982178481, + "language_loss": 0.89072442, + "learning_rate": 3.9985583654553e-06, + "loss": 0.9145357, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.21875, + "step": 203, + "time_per_iteration": 2.4408295154571533 + }, + { + "auxiliary_loss_clip": 0.01245051, + "auxiliary_loss_mlp": 0.0107076, + "balance_loss_clip": 1.04958832, + "balance_loss_mlp": 1.12554657, + "epoch": 0.012265143544265745, + "flos": 68444846507520.0, + "grad_norm": 0.9907239168515048, + "language_loss": 0.61084068, + "learning_rate": 3.998543986894904e-06, + "loss": 0.63399887, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 204, + "time_per_iteration": 3.0198233127593994 + }, + { + "auxiliary_loss_clip": 0.01300788, + "auxiliary_loss_mlp": 0.01098754, + "balance_loss_clip": 1.04201007, + "balance_loss_mlp": 1.07181823, + "epoch": 0.012325266796933715, + "flos": 17018307365760.0, + "grad_norm": 2.2994236391340808, + "language_loss": 0.90849137, + "learning_rate": 3.9985295370116135e-06, + "loss": 0.93248677, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 2.28125, + "step": 205, + "time_per_iteration": 2.4178733825683594 + }, + { + "auxiliary_loss_clip": 0.0130442, + "auxiliary_loss_mlp": 0.01111671, + "balance_loss_clip": 1.0553323, + "balance_loss_mlp": 1.07182193, + "epoch": 0.012385390049601683, + "flos": 20189119443840.0, + "grad_norm": 2.3596882713470264, + "language_loss": 0.88420367, + "learning_rate": 3.998515015805945e-06, + "loss": 0.90836465, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 2.3125, + "step": 206, + "time_per_iteration": 2.469921588897705 + }, + { + "auxiliary_loss_clip": 0.01295143, + "auxiliary_loss_mlp": 0.01092373, + "balance_loss_clip": 1.03915834, + "balance_loss_mlp": 1.07029605, + "epoch": 0.012445513302269652, + "flos": 16252208202240.0, + "grad_norm": 2.085862417309902, + "language_loss": 0.94610721, + "learning_rate": 3.998500423278416e-06, + "loss": 0.96998239, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.25, + "step": 207, + "time_per_iteration": 2.4121739864349365 + }, + { + "auxiliary_loss_clip": 0.01296356, + "auxiliary_loss_mlp": 0.01095686, + "balance_loss_clip": 1.04142165, + "balance_loss_mlp": 1.07297158, + "epoch": 0.012505636554937622, + "flos": 23768240405760.0, + "grad_norm": 2.3786309079219756, + "language_loss": 0.74965876, + "learning_rate": 3.998485759429547e-06, + "loss": 0.77357912, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.234375, + "step": 208, + "time_per_iteration": 2.555402994155884 + }, + { + "auxiliary_loss_clip": 0.01286573, + "auxiliary_loss_mlp": 0.01087111, + "balance_loss_clip": 1.0330857, + "balance_loss_mlp": 1.06945479, + "epoch": 0.012565759807605591, + "flos": 30590234225280.0, + "grad_norm": 9.246201609274017, + "language_loss": 0.98260844, + "learning_rate": 3.998471024259863e-06, + "loss": 1.00634527, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.171875, + "step": 209, + "time_per_iteration": 2.5096023082733154 + }, + { + "auxiliary_loss_clip": 0.01297111, + "auxiliary_loss_mlp": 0.01103891, + "balance_loss_clip": 1.04996026, + "balance_loss_mlp": 1.07318473, + "epoch": 0.012625883060273561, + "flos": 40111956881280.0, + "grad_norm": 2.8645082772694743, + "language_loss": 0.84888291, + "learning_rate": 3.998456217769888e-06, + "loss": 0.87289298, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.234375, + "step": 210, + "time_per_iteration": 2.654449939727783 + }, + { + "auxiliary_loss_clip": 0.01288543, + "auxiliary_loss_mlp": 0.01107358, + "balance_loss_clip": 1.05447721, + "balance_loss_mlp": 1.06919765, + "epoch": 0.01268600631294153, + "flos": 27087956899200.0, + "grad_norm": 2.373016707420503, + "language_loss": 0.96118057, + "learning_rate": 3.998441339960152e-06, + "loss": 0.98513967, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.1875, + "step": 211, + "time_per_iteration": 2.4860005378723145 + }, + { + "auxiliary_loss_clip": 0.01300727, + "auxiliary_loss_mlp": 0.01111718, + "balance_loss_clip": 1.0554986, + "balance_loss_mlp": 1.07486534, + "epoch": 0.012746129565609499, + "flos": 16981823128320.0, + "grad_norm": 2.2582161522130466, + "language_loss": 0.94642508, + "learning_rate": 3.998426390831185e-06, + "loss": 0.97054946, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 2.25, + "step": 212, + "time_per_iteration": 2.4837257862091064 + }, + { + "auxiliary_loss_clip": 0.01289522, + "auxiliary_loss_mlp": 0.01092615, + "balance_loss_clip": 1.04164124, + "balance_loss_mlp": 1.0723896, + "epoch": 0.012806252818277469, + "flos": 46531786216320.0, + "grad_norm": 1.6722825749467651, + "language_loss": 0.75558621, + "learning_rate": 3.998411370383521e-06, + "loss": 0.7794075, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.171875, + "step": 213, + "time_per_iteration": 5.504590272903442 + }, + { + "auxiliary_loss_clip": 0.01289584, + "auxiliary_loss_mlp": 0.01098085, + "balance_loss_clip": 1.0462532, + "balance_loss_mlp": 1.06899095, + "epoch": 0.012866376070945438, + "flos": 14387846400000.0, + "grad_norm": 6.023317248147852, + "language_loss": 0.85730284, + "learning_rate": 3.9983962786176945e-06, + "loss": 0.88117963, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.203125, + "step": 214, + "time_per_iteration": 3.83119797706604 + }, + { + "auxiliary_loss_clip": 0.01287974, + "auxiliary_loss_mlp": 0.01113797, + "balance_loss_clip": 1.05791199, + "balance_loss_mlp": 1.07126224, + "epoch": 0.012926499323613408, + "flos": 26139611105280.0, + "grad_norm": 1.9864199504565263, + "language_loss": 0.76788223, + "learning_rate": 3.9983811155342465e-06, + "loss": 0.79189986, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.15625, + "step": 215, + "time_per_iteration": 3.860884666442871 + }, + { + "auxiliary_loss_clip": 0.01301141, + "auxiliary_loss_mlp": 0.01100734, + "balance_loss_clip": 1.04892564, + "balance_loss_mlp": 1.07581246, + "epoch": 0.012986622576281377, + "flos": 30115904227200.0, + "grad_norm": 2.076965771857895, + "language_loss": 0.89427274, + "learning_rate": 3.998365881133717e-06, + "loss": 0.91829151, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.25, + "step": 216, + "time_per_iteration": 2.5689918994903564 + }, + { + "auxiliary_loss_clip": 0.01289735, + "auxiliary_loss_mlp": 0.01099849, + "balance_loss_clip": 1.04475069, + "balance_loss_mlp": 1.06854296, + "epoch": 0.013046745828949347, + "flos": 13953177573120.0, + "grad_norm": 2.822835372352441, + "language_loss": 0.93123031, + "learning_rate": 3.998350575416648e-06, + "loss": 0.95512605, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.21875, + "step": 217, + "time_per_iteration": 2.40677809715271 + }, + { + "auxiliary_loss_clip": 0.0128734, + "auxiliary_loss_mlp": 0.01093811, + "balance_loss_clip": 1.03728223, + "balance_loss_mlp": 1.06867433, + "epoch": 0.013106869081617315, + "flos": 17346874970880.0, + "grad_norm": 1.96990105044481, + "language_loss": 0.92702591, + "learning_rate": 3.9983351983835885e-06, + "loss": 0.95083737, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 2.1875, + "step": 218, + "time_per_iteration": 2.459202766418457 + }, + { + "auxiliary_loss_clip": 0.01281061, + "auxiliary_loss_mlp": 0.01092205, + "balance_loss_clip": 1.0364871, + "balance_loss_mlp": 1.06463194, + "epoch": 0.013166992334285284, + "flos": 25883732684160.0, + "grad_norm": 2.0566708002092895, + "language_loss": 0.85948598, + "learning_rate": 3.998319750035087e-06, + "loss": 0.88321859, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.15625, + "step": 219, + "time_per_iteration": 2.4813380241394043 + }, + { + "auxiliary_loss_clip": 0.01284623, + "auxiliary_loss_mlp": 0.01085259, + "balance_loss_clip": 1.03380799, + "balance_loss_mlp": 1.06691563, + "epoch": 0.013227115586953254, + "flos": 31174610428800.0, + "grad_norm": 1.7846173857072796, + "language_loss": 0.87097883, + "learning_rate": 3.998304230371692e-06, + "loss": 0.8946777, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.171875, + "step": 220, + "time_per_iteration": 2.534933090209961 + }, + { + "auxiliary_loss_clip": 0.01278089, + "auxiliary_loss_mlp": 0.01087831, + "balance_loss_clip": 1.03773904, + "balance_loss_mlp": 1.06271708, + "epoch": 0.013287238839621223, + "flos": 20408513627520.0, + "grad_norm": 1.8386479521990724, + "language_loss": 0.86070645, + "learning_rate": 3.99828863939396e-06, + "loss": 0.88436568, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.15625, + "step": 221, + "time_per_iteration": 2.4281139373779297 + }, + { + "auxiliary_loss_clip": 0.01285994, + "auxiliary_loss_mlp": 0.01091569, + "balance_loss_clip": 1.03704226, + "balance_loss_mlp": 1.06290507, + "epoch": 0.013347362092289193, + "flos": 14136262076160.0, + "grad_norm": 2.093337358758933, + "language_loss": 0.91403848, + "learning_rate": 3.998272977102448e-06, + "loss": 0.93781406, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.21875, + "step": 222, + "time_per_iteration": 2.4241268634796143 + }, + { + "auxiliary_loss_clip": 0.01277546, + "auxiliary_loss_mlp": 0.01089054, + "balance_loss_clip": 1.03266788, + "balance_loss_mlp": 1.06376529, + "epoch": 0.013407485344957162, + "flos": 21796660218240.0, + "grad_norm": 2.2189139260236526, + "language_loss": 0.94726562, + "learning_rate": 3.998257243497712e-06, + "loss": 0.97093159, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 2.140625, + "step": 223, + "time_per_iteration": 2.4427013397216797 + }, + { + "auxiliary_loss_clip": 0.01275896, + "auxiliary_loss_mlp": 0.01088529, + "balance_loss_clip": 1.03576696, + "balance_loss_mlp": 1.06117606, + "epoch": 0.013467608597625132, + "flos": 18620716170240.0, + "grad_norm": 2.8381194576812163, + "language_loss": 0.87496227, + "learning_rate": 3.998241438580316e-06, + "loss": 0.89860654, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.15625, + "step": 224, + "time_per_iteration": 2.4462950229644775 + }, + { + "auxiliary_loss_clip": 0.01276996, + "auxiliary_loss_mlp": 0.01084642, + "balance_loss_clip": 1.02935266, + "balance_loss_mlp": 1.06108713, + "epoch": 0.013527731850293101, + "flos": 18551308654080.0, + "grad_norm": 2.1273599177350144, + "language_loss": 0.88692373, + "learning_rate": 3.998225562350823e-06, + "loss": 0.9105401, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.15625, + "step": 225, + "time_per_iteration": 2.4228098392486572 + }, + { + "auxiliary_loss_clip": 0.01272484, + "auxiliary_loss_mlp": 0.01101737, + "balance_loss_clip": 1.04432523, + "balance_loss_mlp": 1.06158555, + "epoch": 0.01358785510296107, + "flos": 19164558418560.0, + "grad_norm": 1.815503315808218, + "language_loss": 0.98583525, + "learning_rate": 3.998209614809799e-06, + "loss": 1.00957751, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 2.109375, + "step": 226, + "time_per_iteration": 2.455432176589966 + }, + { + "auxiliary_loss_clip": 0.01278348, + "auxiliary_loss_mlp": 0.01087459, + "balance_loss_clip": 1.03457808, + "balance_loss_mlp": 1.06498325, + "epoch": 0.01364797835562904, + "flos": 23328858545280.0, + "grad_norm": 2.7114359242763126, + "language_loss": 0.90125763, + "learning_rate": 3.9981935959578145e-06, + "loss": 0.92491573, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.125, + "step": 227, + "time_per_iteration": 2.450788736343384 + }, + { + "auxiliary_loss_clip": 0.01199205, + "auxiliary_loss_mlp": 0.01042575, + "balance_loss_clip": 1.0257901, + "balance_loss_mlp": 1.0826751, + "epoch": 0.013708101608297009, + "flos": 70989943599360.0, + "grad_norm": 0.9286409859088512, + "language_loss": 0.57483828, + "learning_rate": 3.99817750579544e-06, + "loss": 0.59725606, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 1.171875, + "step": 228, + "time_per_iteration": 3.1797547340393066 + }, + { + "auxiliary_loss_clip": 0.01271341, + "auxiliary_loss_mlp": 0.01092144, + "balance_loss_clip": 1.04128909, + "balance_loss_mlp": 1.06173563, + "epoch": 0.013768224860964979, + "flos": 16324268981760.0, + "grad_norm": 2.2875216044741458, + "language_loss": 0.86467117, + "learning_rate": 3.998161344323251e-06, + "loss": 0.88830602, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.09375, + "step": 229, + "time_per_iteration": 2.410629987716675 + }, + { + "auxiliary_loss_clip": 0.01274439, + "auxiliary_loss_mlp": 0.01091076, + "balance_loss_clip": 1.03631115, + "balance_loss_mlp": 1.05819178, + "epoch": 0.013828348113632948, + "flos": 20192017086720.0, + "grad_norm": 4.939246484573645, + "language_loss": 0.83541977, + "learning_rate": 3.998145111541823e-06, + "loss": 0.85907495, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.15625, + "step": 230, + "time_per_iteration": 2.4956488609313965 + }, + { + "auxiliary_loss_clip": 0.01271041, + "auxiliary_loss_mlp": 0.01091215, + "balance_loss_clip": 1.04019332, + "balance_loss_mlp": 1.05893278, + "epoch": 0.013888471366300916, + "flos": 20740013786880.0, + "grad_norm": 1.8456005596458809, + "language_loss": 0.89727223, + "learning_rate": 3.998128807451736e-06, + "loss": 0.9208948, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.125, + "step": 231, + "time_per_iteration": 2.4690020084381104 + }, + { + "auxiliary_loss_clip": 0.01272094, + "auxiliary_loss_mlp": 0.01096445, + "balance_loss_clip": 1.0455898, + "balance_loss_mlp": 1.05948091, + "epoch": 0.013948594618968886, + "flos": 22089546547200.0, + "grad_norm": 2.7312656935955193, + "language_loss": 0.83418334, + "learning_rate": 3.9981124320535715e-06, + "loss": 0.85786867, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.125, + "step": 232, + "time_per_iteration": 2.4867002964019775 + }, + { + "auxiliary_loss_clip": 0.01279185, + "auxiliary_loss_mlp": 0.01088507, + "balance_loss_clip": 1.03290725, + "balance_loss_mlp": 1.05728006, + "epoch": 0.014008717871636855, + "flos": 19062087978240.0, + "grad_norm": 3.2014549539474055, + "language_loss": 0.73482341, + "learning_rate": 3.998095985347915e-06, + "loss": 0.75850034, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.21875, + "step": 233, + "time_per_iteration": 2.4370038509368896 + }, + { + "auxiliary_loss_clip": 0.01278501, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_clip": 1.04537606, + "balance_loss_mlp": 1.06214762, + "epoch": 0.014068841124304825, + "flos": 14530152568320.0, + "grad_norm": 2.3160215159406676, + "language_loss": 0.84934628, + "learning_rate": 3.998079467335351e-06, + "loss": 0.87314326, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.15625, + "step": 234, + "time_per_iteration": 2.4662678241729736 + }, + { + "auxiliary_loss_clip": 0.01272153, + "auxiliary_loss_mlp": 0.01089142, + "balance_loss_clip": 1.03957438, + "balance_loss_mlp": 1.05987799, + "epoch": 0.014128964376972794, + "flos": 18076420074240.0, + "grad_norm": 2.50023724061121, + "language_loss": 0.88307524, + "learning_rate": 3.998062878016471e-06, + "loss": 0.90668821, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.125, + "step": 235, + "time_per_iteration": 2.4285151958465576 + }, + { + "auxiliary_loss_clip": 0.01270289, + "auxiliary_loss_mlp": 0.01100538, + "balance_loss_clip": 1.0480144, + "balance_loss_mlp": 1.06112003, + "epoch": 0.014189087629640764, + "flos": 25333257277440.0, + "grad_norm": 2.2535235264048796, + "language_loss": 0.85064286, + "learning_rate": 3.998046217391867e-06, + "loss": 0.87435114, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.09375, + "step": 236, + "time_per_iteration": 2.5147581100463867 + }, + { + "auxiliary_loss_clip": 0.01272199, + "auxiliary_loss_mlp": 0.01086088, + "balance_loss_clip": 1.03139448, + "balance_loss_mlp": 1.05876279, + "epoch": 0.014249210882308733, + "flos": 36138212288640.0, + "grad_norm": 1.9080197335876328, + "language_loss": 0.81960863, + "learning_rate": 3.9980294854621325e-06, + "loss": 0.8431915, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.140625, + "step": 237, + "time_per_iteration": 2.575861692428589 + }, + { + "auxiliary_loss_clip": 0.01267233, + "auxiliary_loss_mlp": 0.0110332, + "balance_loss_clip": 1.04819798, + "balance_loss_mlp": 1.05855203, + "epoch": 0.014309334134976702, + "flos": 12932142595200.0, + "grad_norm": 2.3071106764312312, + "language_loss": 0.76282841, + "learning_rate": 3.998012682227866e-06, + "loss": 0.78653395, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.09375, + "step": 238, + "time_per_iteration": 2.439086675643921 + }, + { + "auxiliary_loss_clip": 0.0126541, + "auxiliary_loss_mlp": 0.01091746, + "balance_loss_clip": 1.04174924, + "balance_loss_mlp": 1.05934358, + "epoch": 0.014369457387644672, + "flos": 20776463112960.0, + "grad_norm": 2.189294612300346, + "language_loss": 0.86273628, + "learning_rate": 3.9979958076896655e-06, + "loss": 0.88630784, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.0625, + "step": 239, + "time_per_iteration": 2.4491758346557617 + }, + { + "auxiliary_loss_clip": 0.01255269, + "auxiliary_loss_mlp": 0.0108271, + "balance_loss_clip": 1.03245175, + "balance_loss_mlp": 1.05461502, + "epoch": 0.01442958064031264, + "flos": 25847353180800.0, + "grad_norm": 2.0680005127153183, + "language_loss": 0.92302793, + "learning_rate": 3.997978861848135e-06, + "loss": 0.94640774, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.015625, + "step": 240, + "time_per_iteration": 2.511864185333252 + }, + { + "auxiliary_loss_clip": 0.01260894, + "auxiliary_loss_mlp": 0.01088302, + "balance_loss_clip": 1.03828204, + "balance_loss_mlp": 1.05558801, + "epoch": 0.014489703892980611, + "flos": 28218479500800.0, + "grad_norm": 2.0359378222038345, + "language_loss": 0.84616089, + "learning_rate": 3.997961844703877e-06, + "loss": 0.86965281, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.046875, + "step": 241, + "time_per_iteration": 2.5065231323242188 + }, + { + "auxiliary_loss_clip": 0.01259675, + "auxiliary_loss_mlp": 0.01095578, + "balance_loss_clip": 1.04260087, + "balance_loss_mlp": 1.06072092, + "epoch": 0.01454982714564858, + "flos": 22489860729600.0, + "grad_norm": 2.1281324971549664, + "language_loss": 0.87685394, + "learning_rate": 3.997944756257501e-06, + "loss": 0.90040648, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 1.984375, + "step": 242, + "time_per_iteration": 2.4625511169433594 + }, + { + "auxiliary_loss_clip": 0.01262409, + "auxiliary_loss_mlp": 0.01079613, + "balance_loss_clip": 1.02866244, + "balance_loss_mlp": 1.05539179, + "epoch": 0.014609950398316548, + "flos": 21652119722880.0, + "grad_norm": 2.0902873867775877, + "language_loss": 0.85707223, + "learning_rate": 3.997927596509616e-06, + "loss": 0.88049245, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.0625, + "step": 243, + "time_per_iteration": 2.4473350048065186 + }, + { + "auxiliary_loss_clip": 0.01269157, + "auxiliary_loss_mlp": 0.0109046, + "balance_loss_clip": 1.03746009, + "balance_loss_mlp": 1.05946577, + "epoch": 0.014670073650984519, + "flos": 21868965377280.0, + "grad_norm": 1.566170571801324, + "language_loss": 0.83990335, + "learning_rate": 3.997910365460834e-06, + "loss": 0.86349952, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.09375, + "step": 244, + "time_per_iteration": 2.5066041946411133 + }, + { + "auxiliary_loss_clip": 0.01268433, + "auxiliary_loss_mlp": 0.01101666, + "balance_loss_clip": 1.04601955, + "balance_loss_mlp": 1.05904102, + "epoch": 0.014730196903652487, + "flos": 23182642304640.0, + "grad_norm": 2.681066411928938, + "language_loss": 0.78249276, + "learning_rate": 3.9978930631117705e-06, + "loss": 0.80619383, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.09375, + "step": 245, + "time_per_iteration": 2.4403247833251953 + }, + { + "auxiliary_loss_clip": 0.01270861, + "auxiliary_loss_mlp": 0.01090869, + "balance_loss_clip": 1.03546047, + "balance_loss_mlp": 1.05506361, + "epoch": 0.014790320156320457, + "flos": 23221465603200.0, + "grad_norm": 1.9837610932174923, + "language_loss": 0.83586812, + "learning_rate": 3.997875689463043e-06, + "loss": 0.85948539, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.15625, + "step": 246, + "time_per_iteration": 2.513209581375122 + }, + { + "auxiliary_loss_clip": 0.01264415, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_clip": 1.03235722, + "balance_loss_mlp": 1.05463314, + "epoch": 0.014850443408988426, + "flos": 15814571909760.0, + "grad_norm": 2.458078485465398, + "language_loss": 0.89064759, + "learning_rate": 3.9978582445152705e-06, + "loss": 0.9141506, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.09375, + "step": 247, + "time_per_iteration": 2.411815881729126 + }, + { + "auxiliary_loss_clip": 0.01265272, + "auxiliary_loss_mlp": 0.01078643, + "balance_loss_clip": 1.02516592, + "balance_loss_mlp": 1.05134773, + "epoch": 0.014910566661656396, + "flos": 22780617465600.0, + "grad_norm": 2.05474741939736, + "language_loss": 0.77996743, + "learning_rate": 3.997840728269077e-06, + "loss": 0.8034066, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.140625, + "step": 248, + "time_per_iteration": 2.5023632049560547 + }, + { + "auxiliary_loss_clip": 0.01266455, + "auxiliary_loss_mlp": 0.01094696, + "balance_loss_clip": 1.04407978, + "balance_loss_mlp": 1.05812287, + "epoch": 0.014970689914324365, + "flos": 26863954416000.0, + "grad_norm": 1.9115641066417266, + "language_loss": 0.83001065, + "learning_rate": 3.997823140725088e-06, + "loss": 0.8536222, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.09375, + "step": 249, + "time_per_iteration": 2.513101100921631 + }, + { + "auxiliary_loss_clip": 0.01264516, + "auxiliary_loss_mlp": 0.01087825, + "balance_loss_clip": 1.03763819, + "balance_loss_mlp": 1.05710781, + "epoch": 0.015030813166992334, + "flos": 13984948776960.0, + "grad_norm": 3.3408272000276846, + "language_loss": 0.92655754, + "learning_rate": 3.997805481883929e-06, + "loss": 0.95008093, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.078125, + "step": 250, + "time_per_iteration": 2.465294361114502 + }, + { + "auxiliary_loss_clip": 0.01271166, + "auxiliary_loss_mlp": 0.01107342, + "balance_loss_clip": 1.05267227, + "balance_loss_mlp": 1.05898547, + "epoch": 0.015090936419660304, + "flos": 24716656022400.0, + "grad_norm": 2.6396424242306686, + "language_loss": 0.96257102, + "learning_rate": 3.997787751746231e-06, + "loss": 0.98635614, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.125, + "step": 251, + "time_per_iteration": 2.4686830043792725 + }, + { + "auxiliary_loss_clip": 0.01261888, + "auxiliary_loss_mlp": 0.01093252, + "balance_loss_clip": 1.04177761, + "balance_loss_mlp": 1.05475163, + "epoch": 0.015151059672328273, + "flos": 25737621177600.0, + "grad_norm": 2.3097383613973905, + "language_loss": 0.83784211, + "learning_rate": 3.997769950312628e-06, + "loss": 0.86139357, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.0625, + "step": 252, + "time_per_iteration": 3.93217134475708 + }, + { + "auxiliary_loss_clip": 0.01259553, + "auxiliary_loss_mlp": 0.01094354, + "balance_loss_clip": 1.04118657, + "balance_loss_mlp": 1.05521631, + "epoch": 0.015211182924996243, + "flos": 21870152363520.0, + "grad_norm": 2.096397039732292, + "language_loss": 0.97462344, + "learning_rate": 3.997752077583753e-06, + "loss": 0.99816239, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.046875, + "step": 253, + "time_per_iteration": 5.36123251914978 + }, + { + "auxiliary_loss_clip": 0.01183241, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.01310992, + "balance_loss_mlp": 1.07796979, + "epoch": 0.015271306177664212, + "flos": 66891734409600.0, + "grad_norm": 0.838537053241808, + "language_loss": 0.55493897, + "learning_rate": 3.997734133560246e-06, + "loss": 0.57706368, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 1.0546875, + "step": 254, + "time_per_iteration": 3.1195199489593506 + }, + { + "auxiliary_loss_clip": 0.01263656, + "auxiliary_loss_mlp": 0.01102612, + "balance_loss_clip": 1.04877758, + "balance_loss_mlp": 1.05328155, + "epoch": 0.01533142943033218, + "flos": 26832846528000.0, + "grad_norm": 2.146919372189757, + "language_loss": 0.89907759, + "learning_rate": 3.997716118242746e-06, + "loss": 0.92274028, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.109375, + "step": 255, + "time_per_iteration": 3.897091865539551 + }, + { + "auxiliary_loss_clip": 0.01261725, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_clip": 1.05206895, + "balance_loss_mlp": 1.05353701, + "epoch": 0.01539155268300015, + "flos": 20812702970880.0, + "grad_norm": 2.1854471015532435, + "language_loss": 0.84855503, + "learning_rate": 3.997698031631898e-06, + "loss": 0.87222087, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.078125, + "step": 256, + "time_per_iteration": 2.495079755783081 + }, + { + "auxiliary_loss_clip": 0.01264272, + "auxiliary_loss_mlp": 0.01096694, + "balance_loss_clip": 1.04481387, + "balance_loss_mlp": 1.05318999, + "epoch": 0.01545167593566812, + "flos": 15960927795840.0, + "grad_norm": 3.135350269790941, + "language_loss": 0.70954847, + "learning_rate": 3.997679873728344e-06, + "loss": 0.73315823, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.109375, + "step": 257, + "time_per_iteration": 2.3912618160247803 + }, + { + "auxiliary_loss_clip": 0.01261377, + "auxiliary_loss_mlp": 0.01097662, + "balance_loss_clip": 1.04547238, + "balance_loss_mlp": 1.05512738, + "epoch": 0.01551179918833609, + "flos": 22600640073600.0, + "grad_norm": 2.5275914771710566, + "language_loss": 0.94030905, + "learning_rate": 3.9976616445327355e-06, + "loss": 0.96389937, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.0625, + "step": 258, + "time_per_iteration": 2.523621082305908 + }, + { + "auxiliary_loss_clip": 0.0125721, + "auxiliary_loss_mlp": 0.01084906, + "balance_loss_clip": 1.03304982, + "balance_loss_mlp": 1.05085206, + "epoch": 0.015571922441004058, + "flos": 22815705248640.0, + "grad_norm": 2.701701695211177, + "language_loss": 0.92466164, + "learning_rate": 3.9976433440457205e-06, + "loss": 0.94808275, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.0625, + "step": 259, + "time_per_iteration": 2.45497465133667 + }, + { + "auxiliary_loss_clip": 0.01253468, + "auxiliary_loss_mlp": 0.0108324, + "balance_loss_clip": 1.03674841, + "balance_loss_mlp": 1.05464232, + "epoch": 0.015632045693672027, + "flos": 18946595600640.0, + "grad_norm": 1.7812673312303993, + "language_loss": 0.96986514, + "learning_rate": 3.997624972267954e-06, + "loss": 0.99323225, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.9921875, + "step": 260, + "time_per_iteration": 2.4638025760650635 + }, + { + "auxiliary_loss_clip": 0.01264476, + "auxiliary_loss_mlp": 0.01097613, + "balance_loss_clip": 1.04480374, + "balance_loss_mlp": 1.05541444, + "epoch": 0.015692168946339995, + "flos": 29970421125120.0, + "grad_norm": 2.0705749401091733, + "language_loss": 0.87201715, + "learning_rate": 3.99760652920009e-06, + "loss": 0.89563799, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.09375, + "step": 261, + "time_per_iteration": 2.5186092853546143 + }, + { + "auxiliary_loss_clip": 0.0126125, + "auxiliary_loss_mlp": 0.01086343, + "balance_loss_clip": 1.03577399, + "balance_loss_mlp": 1.05366397, + "epoch": 0.015752292199007967, + "flos": 19391039608320.0, + "grad_norm": 1.9911084105028154, + "language_loss": 0.66606891, + "learning_rate": 3.997588014842788e-06, + "loss": 0.68954486, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.078125, + "step": 262, + "time_per_iteration": 2.4510912895202637 + }, + { + "auxiliary_loss_clip": 0.01257533, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.0508666, + "balance_loss_mlp": 1.0543381, + "epoch": 0.015812415451675936, + "flos": 20338756997760.0, + "grad_norm": 2.204412624175132, + "language_loss": 0.6779955, + "learning_rate": 3.997569429196708e-06, + "loss": 0.70160139, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.03125, + "step": 263, + "time_per_iteration": 2.4451043605804443 + }, + { + "auxiliary_loss_clip": 0.01260264, + "auxiliary_loss_mlp": 0.01090615, + "balance_loss_clip": 1.03990364, + "balance_loss_mlp": 1.05124879, + "epoch": 0.015872538704343905, + "flos": 17524583124480.0, + "grad_norm": 2.9410747460535283, + "language_loss": 0.84258455, + "learning_rate": 3.997550772262513e-06, + "loss": 0.86609334, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.09375, + "step": 264, + "time_per_iteration": 2.4145796298980713 + }, + { + "auxiliary_loss_clip": 0.01264681, + "auxiliary_loss_mlp": 0.0108678, + "balance_loss_clip": 1.03683114, + "balance_loss_mlp": 1.05565, + "epoch": 0.015932661957011873, + "flos": 15259802405760.0, + "grad_norm": 3.7930459922362205, + "language_loss": 1.03443956, + "learning_rate": 3.997532044040869e-06, + "loss": 1.05795407, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.078125, + "step": 265, + "time_per_iteration": 2.4558472633361816 + }, + { + "auxiliary_loss_clip": 0.01265797, + "auxiliary_loss_mlp": 0.01091431, + "balance_loss_clip": 1.03757191, + "balance_loss_mlp": 1.05655909, + "epoch": 0.015992785209679845, + "flos": 20301504710400.0, + "grad_norm": 6.033841447363089, + "language_loss": 0.74710017, + "learning_rate": 3.997513244532445e-06, + "loss": 0.77067244, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.09375, + "step": 266, + "time_per_iteration": 2.552818775177002 + }, + { + "auxiliary_loss_clip": 0.01253089, + "auxiliary_loss_mlp": 0.01081126, + "balance_loss_clip": 1.03139186, + "balance_loss_mlp": 1.05281532, + "epoch": 0.016052908462347814, + "flos": 23361397799040.0, + "grad_norm": 1.8207170903870495, + "language_loss": 0.89983177, + "learning_rate": 3.997494373737912e-06, + "loss": 0.9231739, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 2.0, + "step": 267, + "time_per_iteration": 2.4479634761810303 + }, + { + "auxiliary_loss_clip": 0.01263384, + "auxiliary_loss_mlp": 0.01093478, + "balance_loss_clip": 1.04264736, + "balance_loss_mlp": 1.05432463, + "epoch": 0.016113031715015783, + "flos": 21285566691840.0, + "grad_norm": 2.3169364211275987, + "language_loss": 0.8484515, + "learning_rate": 3.997475431657943e-06, + "loss": 0.87202013, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.09375, + "step": 268, + "time_per_iteration": 2.439906120300293 + }, + { + "auxiliary_loss_clip": 0.01254088, + "auxiliary_loss_mlp": 0.01081637, + "balance_loss_clip": 1.03114021, + "balance_loss_mlp": 1.05426097, + "epoch": 0.01617315496768375, + "flos": 18913742144640.0, + "grad_norm": 2.45556176865787, + "language_loss": 0.88008893, + "learning_rate": 3.9974564182932135e-06, + "loss": 0.9034462, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.0, + "step": 269, + "time_per_iteration": 2.409527063369751 + }, + { + "auxiliary_loss_clip": 0.0126133, + "auxiliary_loss_mlp": 0.01089492, + "balance_loss_clip": 1.03787434, + "balance_loss_mlp": 1.05400348, + "epoch": 0.01623327822035172, + "flos": 16545513467520.0, + "grad_norm": 2.600771597384193, + "language_loss": 0.96567738, + "learning_rate": 3.997437333644403e-06, + "loss": 0.98918557, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.078125, + "step": 270, + "time_per_iteration": 2.4156806468963623 + }, + { + "auxiliary_loss_clip": 0.01258176, + "auxiliary_loss_mlp": 0.0109777, + "balance_loss_clip": 1.04743981, + "balance_loss_mlp": 1.05751896, + "epoch": 0.016293401473019692, + "flos": 23512361984640.0, + "grad_norm": 2.3087201569184472, + "language_loss": 0.85398507, + "learning_rate": 3.9974181777121915e-06, + "loss": 0.87754458, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.0, + "step": 271, + "time_per_iteration": 2.4505057334899902 + }, + { + "auxiliary_loss_clip": 0.01259898, + "auxiliary_loss_mlp": 0.01091065, + "balance_loss_clip": 1.03847027, + "balance_loss_mlp": 1.05401468, + "epoch": 0.01635352472568766, + "flos": 29014988325120.0, + "grad_norm": 8.633948262091137, + "language_loss": 0.80753708, + "learning_rate": 3.997398950497263e-06, + "loss": 0.8310467, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.0625, + "step": 272, + "time_per_iteration": 2.4930429458618164 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01091194, + "balance_loss_clip": 1.0410068, + "balance_loss_mlp": 1.05233335, + "epoch": 0.01641364797835563, + "flos": 13369674153600.0, + "grad_norm": 2.121148494629337, + "language_loss": 0.80297101, + "learning_rate": 3.9973796520003044e-06, + "loss": 0.82643253, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.03125, + "step": 273, + "time_per_iteration": 2.396026611328125 + }, + { + "auxiliary_loss_clip": 0.01254567, + "auxiliary_loss_mlp": 0.01091044, + "balance_loss_clip": 1.03914022, + "balance_loss_mlp": 1.05181062, + "epoch": 0.016473771231023598, + "flos": 18877292818560.0, + "grad_norm": 2.2415439484369513, + "language_loss": 0.90765822, + "learning_rate": 3.997360282222004e-06, + "loss": 0.93111444, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.03125, + "step": 274, + "time_per_iteration": 2.4306771755218506 + }, + { + "auxiliary_loss_clip": 0.01256585, + "auxiliary_loss_mlp": 0.010984, + "balance_loss_clip": 1.04537582, + "balance_loss_mlp": 1.05353796, + "epoch": 0.016533894483691566, + "flos": 22600535339520.0, + "grad_norm": 1.8461598322284212, + "language_loss": 0.87523705, + "learning_rate": 3.997340841163053e-06, + "loss": 0.8987869, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.03125, + "step": 275, + "time_per_iteration": 2.4361367225646973 + }, + { + "auxiliary_loss_clip": 0.01259522, + "auxiliary_loss_mlp": 0.01098736, + "balance_loss_clip": 1.04533052, + "balance_loss_mlp": 1.05604446, + "epoch": 0.01659401773635954, + "flos": 21506112950400.0, + "grad_norm": 1.694260442445138, + "language_loss": 0.80209416, + "learning_rate": 3.9973213288241445e-06, + "loss": 0.8256768, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.03125, + "step": 276, + "time_per_iteration": 2.4542930126190186 + }, + { + "auxiliary_loss_clip": 0.01250956, + "auxiliary_loss_mlp": 0.01088707, + "balance_loss_clip": 1.04045093, + "balance_loss_mlp": 1.05300093, + "epoch": 0.016654140989027507, + "flos": 32849673505920.0, + "grad_norm": 1.7758054657349884, + "language_loss": 0.80436337, + "learning_rate": 3.997301745205976e-06, + "loss": 0.82775998, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 1.984375, + "step": 277, + "time_per_iteration": 2.5271694660186768 + }, + { + "auxiliary_loss_clip": 0.01251341, + "auxiliary_loss_mlp": 0.01082503, + "balance_loss_clip": 1.02964544, + "balance_loss_mlp": 1.05135834, + "epoch": 0.016714264241695476, + "flos": 12305591602560.0, + "grad_norm": 2.840228512453406, + "language_loss": 0.79760599, + "learning_rate": 3.997282090309246e-06, + "loss": 0.82094443, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.0, + "step": 278, + "time_per_iteration": 2.4087443351745605 + }, + { + "auxiliary_loss_clip": 0.0125048, + "auxiliary_loss_mlp": 0.01082849, + "balance_loss_clip": 1.03437805, + "balance_loss_mlp": 1.05186558, + "epoch": 0.016774387494363444, + "flos": 27122625745920.0, + "grad_norm": 1.9624673467401972, + "language_loss": 0.90430892, + "learning_rate": 3.9972623641346555e-06, + "loss": 0.92764223, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 1.984375, + "step": 279, + "time_per_iteration": 2.4882657527923584 + }, + { + "auxiliary_loss_clip": 0.01252485, + "auxiliary_loss_mlp": 0.01087283, + "balance_loss_clip": 1.03421128, + "balance_loss_mlp": 1.05146337, + "epoch": 0.016834510747031413, + "flos": 20190515898240.0, + "grad_norm": 3.3592851885107806, + "language_loss": 0.93480706, + "learning_rate": 3.9972425666829085e-06, + "loss": 0.95820475, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.0, + "step": 280, + "time_per_iteration": 2.450284481048584 + }, + { + "auxiliary_loss_clip": 0.0125678, + "auxiliary_loss_mlp": 0.01087997, + "balance_loss_clip": 1.03652251, + "balance_loss_mlp": 1.0515883, + "epoch": 0.016894633999699385, + "flos": 27272961527040.0, + "grad_norm": 2.204190539641557, + "language_loss": 0.73594493, + "learning_rate": 3.997222697954712e-06, + "loss": 0.75939268, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.0625, + "step": 281, + "time_per_iteration": 2.5007758140563965 + }, + { + "auxiliary_loss_clip": 0.01255022, + "auxiliary_loss_mlp": 0.01095399, + "balance_loss_clip": 1.04506898, + "balance_loss_mlp": 1.05505633, + "epoch": 0.016954757252367354, + "flos": 14902081948800.0, + "grad_norm": 2.637264410239938, + "language_loss": 0.79733199, + "learning_rate": 3.997202757950775e-06, + "loss": 0.82083619, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.0, + "step": 282, + "time_per_iteration": 2.4414730072021484 + }, + { + "auxiliary_loss_clip": 0.01256517, + "auxiliary_loss_mlp": 0.01099689, + "balance_loss_clip": 1.04740334, + "balance_loss_mlp": 1.05429316, + "epoch": 0.017014880505035322, + "flos": 21357802028160.0, + "grad_norm": 2.070819037653251, + "language_loss": 0.77117169, + "learning_rate": 3.997182746671809e-06, + "loss": 0.79473376, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.015625, + "step": 283, + "time_per_iteration": 2.445768117904663 + }, + { + "auxiliary_loss_clip": 0.01257915, + "auxiliary_loss_mlp": 0.01084586, + "balance_loss_clip": 1.03706956, + "balance_loss_mlp": 1.05535746, + "epoch": 0.01707500375770329, + "flos": 35331753726720.0, + "grad_norm": 2.1373580986706613, + "language_loss": 0.83959854, + "learning_rate": 3.997162664118528e-06, + "loss": 0.86302352, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 2.03125, + "step": 284, + "time_per_iteration": 2.573779344558716 + }, + { + "auxiliary_loss_clip": 0.01247415, + "auxiliary_loss_mlp": 0.01085849, + "balance_loss_clip": 1.03487539, + "balance_loss_mlp": 1.04878318, + "epoch": 0.01713512701037126, + "flos": 23581071273600.0, + "grad_norm": 2.3576344067130917, + "language_loss": 0.96618634, + "learning_rate": 3.99714251029165e-06, + "loss": 0.989519, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 1.984375, + "step": 285, + "time_per_iteration": 2.444382667541504 + }, + { + "auxiliary_loss_clip": 0.0125138, + "auxiliary_loss_mlp": 0.01083147, + "balance_loss_clip": 1.0370605, + "balance_loss_mlp": 1.05309486, + "epoch": 0.01719525026303923, + "flos": 27633474892800.0, + "grad_norm": 8.623728045445985, + "language_loss": 0.93435287, + "learning_rate": 3.997122285191892e-06, + "loss": 0.95769811, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 1.984375, + "step": 286, + "time_per_iteration": 2.5120911598205566 + }, + { + "auxiliary_loss_clip": 0.01248755, + "auxiliary_loss_mlp": 0.01089959, + "balance_loss_clip": 1.03903317, + "balance_loss_mlp": 1.05168724, + "epoch": 0.0172553735157072, + "flos": 26978504186880.0, + "grad_norm": 2.0538479761604704, + "language_loss": 0.91652668, + "learning_rate": 3.997101988819976e-06, + "loss": 0.93991387, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 1.96875, + "step": 287, + "time_per_iteration": 2.463068723678589 + }, + { + "auxiliary_loss_clip": 0.01250456, + "auxiliary_loss_mlp": 0.01078526, + "balance_loss_clip": 1.03065181, + "balance_loss_mlp": 1.05379772, + "epoch": 0.01731549676837517, + "flos": 14055962215680.0, + "grad_norm": 3.2909414233324563, + "language_loss": 1.01652026, + "learning_rate": 3.997081621176629e-06, + "loss": 1.03981018, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 1.96875, + "step": 288, + "time_per_iteration": 2.422941207885742 + }, + { + "auxiliary_loss_clip": 0.01164027, + "auxiliary_loss_mlp": 0.01052735, + "balance_loss_clip": 1.03757143, + "balance_loss_mlp": 1.06516135, + "epoch": 0.017375620021043137, + "flos": 66506885959680.0, + "grad_norm": 0.9025926466434199, + "language_loss": 0.63966572, + "learning_rate": 3.997061182262575e-06, + "loss": 0.66183341, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.98828125, + "step": 289, + "time_per_iteration": 3.1877288818359375 + }, + { + "auxiliary_loss_clip": 0.01247348, + "auxiliary_loss_mlp": 0.01082034, + "balance_loss_clip": 1.03451705, + "balance_loss_mlp": 1.05231452, + "epoch": 0.01743574327371111, + "flos": 15224435331840.0, + "grad_norm": 3.2195614434280504, + "language_loss": 0.88081455, + "learning_rate": 3.997040672078545e-06, + "loss": 0.90410841, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 1.9453125, + "step": 290, + "time_per_iteration": 2.4281256198883057 + }, + { + "auxiliary_loss_clip": 0.01248587, + "auxiliary_loss_mlp": 0.01079803, + "balance_loss_clip": 1.03347826, + "balance_loss_mlp": 1.05308676, + "epoch": 0.017495866526379078, + "flos": 25372708980480.0, + "grad_norm": 2.006050207544469, + "language_loss": 0.83882666, + "learning_rate": 3.997020090625269e-06, + "loss": 0.86211061, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 1.953125, + "step": 291, + "time_per_iteration": 2.4570248126983643 + }, + { + "auxiliary_loss_clip": 0.0125354, + "auxiliary_loss_mlp": 0.01099016, + "balance_loss_clip": 1.04727936, + "balance_loss_mlp": 1.05850148, + "epoch": 0.017555989779047047, + "flos": 26358272150400.0, + "grad_norm": 1.7788976427156116, + "language_loss": 0.72319877, + "learning_rate": 3.996999437903485e-06, + "loss": 0.74672437, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 1.9453125, + "step": 292, + "time_per_iteration": 3.9080538749694824 + }, + { + "auxiliary_loss_clip": 0.01247077, + "auxiliary_loss_mlp": 0.01092912, + "balance_loss_clip": 1.04296315, + "balance_loss_mlp": 1.05412185, + "epoch": 0.017616113031715015, + "flos": 22337919025920.0, + "grad_norm": 2.160237997478328, + "language_loss": 0.86493468, + "learning_rate": 3.996978713913927e-06, + "loss": 0.88833451, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 1.9296875, + "step": 293, + "time_per_iteration": 5.241613388061523 + }, + { + "auxiliary_loss_clip": 0.01246615, + "auxiliary_loss_mlp": 0.01079838, + "balance_loss_clip": 1.03301287, + "balance_loss_mlp": 1.05239558, + "epoch": 0.017676236284382984, + "flos": 20155881962880.0, + "grad_norm": 3.2612184299665374, + "language_loss": 0.80483878, + "learning_rate": 3.996957918657335e-06, + "loss": 0.8281033, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 1.9375, + "step": 294, + "time_per_iteration": 3.8060572147369385 + }, + { + "auxiliary_loss_clip": 0.0124757, + "auxiliary_loss_mlp": 0.01089644, + "balance_loss_clip": 1.03969526, + "balance_loss_mlp": 1.05109572, + "epoch": 0.017736359537050956, + "flos": 25222303376640.0, + "grad_norm": 2.5812548654115948, + "language_loss": 0.83908248, + "learning_rate": 3.996937052134452e-06, + "loss": 0.86245465, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 1.96875, + "step": 295, + "time_per_iteration": 2.4810874462127686 + }, + { + "auxiliary_loss_clip": 0.01248576, + "auxiliary_loss_mlp": 0.01091159, + "balance_loss_clip": 1.04524004, + "balance_loss_mlp": 1.05808198, + "epoch": 0.017796482789718925, + "flos": 20337779479680.0, + "grad_norm": 2.0460008989613, + "language_loss": 0.83856666, + "learning_rate": 3.996916114346023e-06, + "loss": 0.86196399, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 1.90625, + "step": 296, + "time_per_iteration": 2.4240550994873047 + }, + { + "auxiliary_loss_clip": 0.01255662, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_clip": 1.03647757, + "balance_loss_mlp": 1.05740452, + "epoch": 0.017856606042386893, + "flos": 22378208601600.0, + "grad_norm": 2.4621250106449386, + "language_loss": 0.87520307, + "learning_rate": 3.996895105292794e-06, + "loss": 0.89860559, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 1.984375, + "step": 297, + "time_per_iteration": 2.4576117992401123 + }, + { + "auxiliary_loss_clip": 0.01250089, + "auxiliary_loss_mlp": 0.01076338, + "balance_loss_clip": 1.03039491, + "balance_loss_mlp": 1.05379272, + "epoch": 0.017916729295054862, + "flos": 20229024994560.0, + "grad_norm": 2.270374893224995, + "language_loss": 0.88099438, + "learning_rate": 3.996874024975515e-06, + "loss": 0.90425873, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 1.9609375, + "step": 298, + "time_per_iteration": 2.444200277328491 + }, + { + "auxiliary_loss_clip": 0.0124723, + "auxiliary_loss_mlp": 0.01087312, + "balance_loss_clip": 1.03714836, + "balance_loss_mlp": 1.05417967, + "epoch": 0.01797685254772283, + "flos": 19389957356160.0, + "grad_norm": 2.25462221963985, + "language_loss": 0.88106245, + "learning_rate": 3.996852873394939e-06, + "loss": 0.90440786, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 1.9296875, + "step": 299, + "time_per_iteration": 2.4332218170166016 + }, + { + "auxiliary_loss_clip": 0.01257521, + "auxiliary_loss_mlp": 0.01077923, + "balance_loss_clip": 1.02790248, + "balance_loss_mlp": 1.05698752, + "epoch": 0.018036975800390802, + "flos": 24424851945600.0, + "grad_norm": 3.01009565599283, + "language_loss": 0.63656032, + "learning_rate": 3.996831650551821e-06, + "loss": 0.65991479, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.0, + "step": 300, + "time_per_iteration": 2.512960433959961 + }, + { + "auxiliary_loss_clip": 0.01250638, + "auxiliary_loss_mlp": 0.01089998, + "balance_loss_clip": 1.04167092, + "balance_loss_mlp": 1.05774963, + "epoch": 0.01809709905305877, + "flos": 15778017849600.0, + "grad_norm": 2.705806939759899, + "language_loss": 0.87975717, + "learning_rate": 3.996810356446917e-06, + "loss": 0.90316349, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 1.9296875, + "step": 301, + "time_per_iteration": 2.449906826019287 + }, + { + "auxiliary_loss_clip": 0.01170518, + "auxiliary_loss_mlp": 0.01035771, + "balance_loss_clip": 1.02175176, + "balance_loss_mlp": 1.07485867, + "epoch": 0.01815722230572674, + "flos": 67344592055040.0, + "grad_norm": 0.9665553873254724, + "language_loss": 0.62200117, + "learning_rate": 3.996788991080988e-06, + "loss": 0.64406407, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.95703125, + "step": 302, + "time_per_iteration": 3.235015392303467 + }, + { + "auxiliary_loss_clip": 0.01245816, + "auxiliary_loss_mlp": 0.01094046, + "balance_loss_clip": 1.04636252, + "balance_loss_mlp": 1.04997766, + "epoch": 0.01821734555839471, + "flos": 15484747495680.0, + "grad_norm": 2.2391786324512637, + "language_loss": 0.89078534, + "learning_rate": 3.996767554454796e-06, + "loss": 0.91418391, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 1.9609375, + "step": 303, + "time_per_iteration": 2.4486289024353027 + }, + { + "auxiliary_loss_clip": 0.01252791, + "auxiliary_loss_mlp": 0.01097796, + "balance_loss_clip": 1.04799032, + "balance_loss_mlp": 1.05749726, + "epoch": 0.018277468811062677, + "flos": 24096284340480.0, + "grad_norm": 1.7268145183362635, + "language_loss": 0.79628664, + "learning_rate": 3.996746046569107e-06, + "loss": 0.81979251, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 1.953125, + "step": 304, + "time_per_iteration": 2.498051643371582 + }, + { + "auxiliary_loss_clip": 0.01242654, + "auxiliary_loss_mlp": 0.01073108, + "balance_loss_clip": 1.02733171, + "balance_loss_mlp": 1.05657101, + "epoch": 0.01833759206373065, + "flos": 20958290807040.0, + "grad_norm": 1.6486581714926711, + "language_loss": 0.82408345, + "learning_rate": 3.996724467424687e-06, + "loss": 0.84724116, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 1.859375, + "step": 305, + "time_per_iteration": 2.4394264221191406 + }, + { + "auxiliary_loss_clip": 0.0124937, + "auxiliary_loss_mlp": 0.01077638, + "balance_loss_clip": 1.03140867, + "balance_loss_mlp": 1.05194199, + "epoch": 0.018397715316398618, + "flos": 19389747888000.0, + "grad_norm": 1.952855863430056, + "language_loss": 0.90433657, + "learning_rate": 3.996702817022308e-06, + "loss": 0.9276067, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 1.9765625, + "step": 306, + "time_per_iteration": 2.4558498859405518 + }, + { + "auxiliary_loss_clip": 0.01239261, + "auxiliary_loss_mlp": 0.0108027, + "balance_loss_clip": 1.03404021, + "balance_loss_mlp": 1.04859948, + "epoch": 0.018457838569066586, + "flos": 29131248752640.0, + "grad_norm": 2.143075141284067, + "language_loss": 0.86084306, + "learning_rate": 3.996681095362741e-06, + "loss": 0.88403845, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 1.90625, + "step": 307, + "time_per_iteration": 2.537835121154785 + }, + { + "auxiliary_loss_clip": 0.01241134, + "auxiliary_loss_mlp": 0.01080609, + "balance_loss_clip": 1.03178144, + "balance_loss_mlp": 1.05186296, + "epoch": 0.018517961821734555, + "flos": 19207640903040.0, + "grad_norm": 2.3437069898355904, + "language_loss": 0.71195388, + "learning_rate": 3.996659302446762e-06, + "loss": 0.73517132, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 1.890625, + "step": 308, + "time_per_iteration": 2.4432265758514404 + }, + { + "auxiliary_loss_clip": 0.01246949, + "auxiliary_loss_mlp": 0.01087794, + "balance_loss_clip": 1.04046774, + "balance_loss_mlp": 1.05282617, + "epoch": 0.018578085074402523, + "flos": 19862053027200.0, + "grad_norm": 2.6018867667266163, + "language_loss": 0.91403347, + "learning_rate": 3.996637438275148e-06, + "loss": 0.93738091, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 1.9375, + "step": 309, + "time_per_iteration": 2.4650886058807373 + }, + { + "auxiliary_loss_clip": 0.0125455, + "auxiliary_loss_mlp": 0.01084848, + "balance_loss_clip": 1.03559089, + "balance_loss_mlp": 1.05105126, + "epoch": 0.018638208327070496, + "flos": 29605648573440.0, + "grad_norm": 1.9363741747675771, + "language_loss": 0.72133344, + "learning_rate": 3.99661550284868e-06, + "loss": 0.74472737, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.03125, + "step": 310, + "time_per_iteration": 2.515129566192627 + }, + { + "auxiliary_loss_clip": 0.01245354, + "auxiliary_loss_mlp": 0.01099865, + "balance_loss_clip": 1.05294418, + "balance_loss_mlp": 1.05505824, + "epoch": 0.018698331579738464, + "flos": 45729866131200.0, + "grad_norm": 2.0692631349636943, + "language_loss": 0.73453295, + "learning_rate": 3.996593496168141e-06, + "loss": 0.75798512, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 1.8984375, + "step": 311, + "time_per_iteration": 2.6794707775115967 + }, + { + "auxiliary_loss_clip": 0.01254028, + "auxiliary_loss_mlp": 0.01086933, + "balance_loss_clip": 1.04008377, + "balance_loss_mlp": 1.05406392, + "epoch": 0.018758454832406433, + "flos": 20482669088640.0, + "grad_norm": 3.6044382373216695, + "language_loss": 0.90822446, + "learning_rate": 3.996571418234316e-06, + "loss": 0.93163407, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.0, + "step": 312, + "time_per_iteration": 2.454665184020996 + }, + { + "auxiliary_loss_clip": 0.01253057, + "auxiliary_loss_mlp": 0.01093846, + "balance_loss_clip": 1.04518485, + "balance_loss_mlp": 1.05319011, + "epoch": 0.0188185780850744, + "flos": 15776900686080.0, + "grad_norm": 2.1685699705162365, + "language_loss": 0.89634204, + "learning_rate": 3.996549269047992e-06, + "loss": 0.91981101, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 2.0, + "step": 313, + "time_per_iteration": 2.413755178451538 + }, + { + "auxiliary_loss_clip": 0.01254911, + "auxiliary_loss_mlp": 0.01084852, + "balance_loss_clip": 1.03831267, + "balance_loss_mlp": 1.0554781, + "epoch": 0.018878701337742373, + "flos": 22454633301120.0, + "grad_norm": 2.2131754408423623, + "language_loss": 0.72605658, + "learning_rate": 3.996527048609961e-06, + "loss": 0.7494542, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.9921875, + "step": 314, + "time_per_iteration": 2.461585283279419 + }, + { + "auxiliary_loss_clip": 0.01247425, + "auxiliary_loss_mlp": 0.01093213, + "balance_loss_clip": 1.04669785, + "balance_loss_mlp": 1.0528543, + "epoch": 0.018938824590410342, + "flos": 30992189241600.0, + "grad_norm": 2.4088693693289045, + "language_loss": 0.88752794, + "learning_rate": 3.996504756921015e-06, + "loss": 0.91093433, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.9453125, + "step": 315, + "time_per_iteration": 2.527616024017334 + }, + { + "auxiliary_loss_clip": 0.01245421, + "auxiliary_loss_mlp": 0.0107802, + "balance_loss_clip": 1.03062224, + "balance_loss_mlp": 1.05360627, + "epoch": 0.01899894784307831, + "flos": 23257775283840.0, + "grad_norm": 1.883027643759866, + "language_loss": 0.80180895, + "learning_rate": 3.996482393981951e-06, + "loss": 0.82504332, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 1.921875, + "step": 316, + "time_per_iteration": 2.5028390884399414 + }, + { + "auxiliary_loss_clip": 0.01244454, + "auxiliary_loss_mlp": 0.01081262, + "balance_loss_clip": 1.03372157, + "balance_loss_mlp": 1.05157375, + "epoch": 0.01905907109574628, + "flos": 17456921176320.0, + "grad_norm": 2.341369116632892, + "language_loss": 0.89989537, + "learning_rate": 3.996459959793564e-06, + "loss": 0.92315257, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 1.9296875, + "step": 317, + "time_per_iteration": 2.4208786487579346 + }, + { + "auxiliary_loss_clip": 0.01239179, + "auxiliary_loss_mlp": 0.01081755, + "balance_loss_clip": 1.03349936, + "balance_loss_mlp": 1.04840732, + "epoch": 0.019119194348414248, + "flos": 14969499517440.0, + "grad_norm": 4.3113874763269395, + "language_loss": 0.90558648, + "learning_rate": 3.996437454356658e-06, + "loss": 0.92879575, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 1.90625, + "step": 318, + "time_per_iteration": 2.414809226989746 + }, + { + "auxiliary_loss_clip": 0.01241514, + "auxiliary_loss_mlp": 0.01070685, + "balance_loss_clip": 1.02605319, + "balance_loss_mlp": 1.0510509, + "epoch": 0.01917931760108222, + "flos": 25481672933760.0, + "grad_norm": 6.179517827759149, + "language_loss": 0.93067336, + "learning_rate": 3.996414877672034e-06, + "loss": 0.95379531, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 1.90625, + "step": 319, + "time_per_iteration": 2.4777655601501465 + }, + { + "auxiliary_loss_clip": 0.01141162, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.02130675, + "balance_loss_mlp": 1.05140138, + "epoch": 0.01923944085375019, + "flos": 71553722100480.0, + "grad_norm": 0.9027108154994729, + "language_loss": 0.59722847, + "learning_rate": 3.996392229740498e-06, + "loss": 0.6189819, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.8984375, + "step": 320, + "time_per_iteration": 3.004598379135132 + }, + { + "auxiliary_loss_clip": 0.0123998, + "auxiliary_loss_mlp": 0.01086659, + "balance_loss_clip": 1.03847432, + "balance_loss_mlp": 1.0485394, + "epoch": 0.019299564106418157, + "flos": 19681482142080.0, + "grad_norm": 3.3648702202179, + "language_loss": 0.89259684, + "learning_rate": 3.99636951056286e-06, + "loss": 0.91586322, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 1.9140625, + "step": 321, + "time_per_iteration": 2.4724040031433105 + }, + { + "auxiliary_loss_clip": 0.01247364, + "auxiliary_loss_mlp": 0.0109448, + "balance_loss_clip": 1.04455495, + "balance_loss_mlp": 1.05249262, + "epoch": 0.019359687359086126, + "flos": 24386063558400.0, + "grad_norm": 2.611546329556763, + "language_loss": 0.82608497, + "learning_rate": 3.996346720139928e-06, + "loss": 0.84950346, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 1.9453125, + "step": 322, + "time_per_iteration": 2.472501039505005 + }, + { + "auxiliary_loss_clip": 0.0124952, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_clip": 1.03673482, + "balance_loss_mlp": 1.05295157, + "epoch": 0.019419810611754094, + "flos": 23950242656640.0, + "grad_norm": 2.3185725465210574, + "language_loss": 0.72154129, + "learning_rate": 3.996323858472518e-06, + "loss": 0.74488103, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 1.96875, + "step": 323, + "time_per_iteration": 2.49354887008667 + }, + { + "auxiliary_loss_clip": 0.01238313, + "auxiliary_loss_mlp": 0.0107448, + "balance_loss_clip": 1.02827406, + "balance_loss_mlp": 1.04739833, + "epoch": 0.019479933864422067, + "flos": 22159233354240.0, + "grad_norm": 1.9805260952758794, + "language_loss": 0.92195767, + "learning_rate": 3.996300925561445e-06, + "loss": 0.94508559, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 1.90625, + "step": 324, + "time_per_iteration": 2.4647061824798584 + }, + { + "auxiliary_loss_clip": 0.01249478, + "auxiliary_loss_mlp": 0.01078997, + "balance_loss_clip": 1.03441286, + "balance_loss_mlp": 1.05450225, + "epoch": 0.019540057117090035, + "flos": 22235727876480.0, + "grad_norm": 5.209329635238033, + "language_loss": 0.64969045, + "learning_rate": 3.996277921407525e-06, + "loss": 0.67297518, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 1.9453125, + "step": 325, + "time_per_iteration": 2.4915707111358643 + }, + { + "auxiliary_loss_clip": 0.01247938, + "auxiliary_loss_mlp": 0.01081587, + "balance_loss_clip": 1.03504801, + "balance_loss_mlp": 1.05839956, + "epoch": 0.019600180369758004, + "flos": 23075633387520.0, + "grad_norm": 3.213050020436582, + "language_loss": 0.76240909, + "learning_rate": 3.996254846011582e-06, + "loss": 0.78570437, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.890625, + "step": 326, + "time_per_iteration": 2.4705817699432373 + }, + { + "auxiliary_loss_clip": 0.01243276, + "auxiliary_loss_mlp": 0.01090596, + "balance_loss_clip": 1.04548764, + "balance_loss_mlp": 1.05709124, + "epoch": 0.019660303622425972, + "flos": 25409681976960.0, + "grad_norm": 3.3797375509014422, + "language_loss": 0.79048991, + "learning_rate": 3.99623169937444e-06, + "loss": 0.81382859, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 1.8671875, + "step": 327, + "time_per_iteration": 2.4997310638427734 + }, + { + "auxiliary_loss_clip": 0.01250367, + "auxiliary_loss_mlp": 0.01083052, + "balance_loss_clip": 1.03579772, + "balance_loss_mlp": 1.05862069, + "epoch": 0.01972042687509394, + "flos": 23656448632320.0, + "grad_norm": 2.246546113724492, + "language_loss": 0.80648839, + "learning_rate": 3.996208481496923e-06, + "loss": 0.82982254, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 1.921875, + "step": 328, + "time_per_iteration": 2.4818077087402344 + }, + { + "auxiliary_loss_clip": 0.01246085, + "auxiliary_loss_mlp": 0.01091705, + "balance_loss_clip": 1.04483187, + "balance_loss_mlp": 1.05255795, + "epoch": 0.019780550127761913, + "flos": 18222496669440.0, + "grad_norm": 3.1820907475393647, + "language_loss": 0.93123943, + "learning_rate": 3.996185192379858e-06, + "loss": 0.95461732, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 1.9375, + "step": 329, + "time_per_iteration": 2.426272392272949 + }, + { + "auxiliary_loss_clip": 0.01251169, + "auxiliary_loss_mlp": 0.01090414, + "balance_loss_clip": 1.0451622, + "balance_loss_mlp": 1.05434823, + "epoch": 0.01984067338042988, + "flos": 22417695216000.0, + "grad_norm": 2.3763564534737656, + "language_loss": 0.74106705, + "learning_rate": 3.996161832024081e-06, + "loss": 0.76448292, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 1.96875, + "step": 330, + "time_per_iteration": 2.4598453044891357 + }, + { + "auxiliary_loss_clip": 0.01249553, + "auxiliary_loss_mlp": 0.01101458, + "balance_loss_clip": 1.05377388, + "balance_loss_mlp": 1.05429959, + "epoch": 0.01990079663309785, + "flos": 17054267932800.0, + "grad_norm": 2.746237667836226, + "language_loss": 0.92803168, + "learning_rate": 3.996138400430422e-06, + "loss": 0.95154178, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 1.953125, + "step": 331, + "time_per_iteration": 3.8557064533233643 + }, + { + "auxiliary_loss_clip": 0.01237676, + "auxiliary_loss_mlp": 0.01075151, + "balance_loss_clip": 1.03235459, + "balance_loss_mlp": 1.05177975, + "epoch": 0.01996091988576582, + "flos": 15960857973120.0, + "grad_norm": 3.6761560054958493, + "language_loss": 0.92356098, + "learning_rate": 3.996114897599718e-06, + "loss": 0.94668925, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 1.859375, + "step": 332, + "time_per_iteration": 3.852618455886841 + }, + { + "auxiliary_loss_clip": 0.01240775, + "auxiliary_loss_mlp": 0.01078938, + "balance_loss_clip": 1.0318985, + "balance_loss_mlp": 1.05442214, + "epoch": 0.02002104313843379, + "flos": 23585330459520.0, + "grad_norm": 2.542787881565984, + "language_loss": 0.74470538, + "learning_rate": 3.996091323532807e-06, + "loss": 0.76790249, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 1.8671875, + "step": 333, + "time_per_iteration": 3.8072381019592285 + }, + { + "auxiliary_loss_clip": 0.01244784, + "auxiliary_loss_mlp": 0.01077039, + "balance_loss_clip": 1.03154898, + "balance_loss_mlp": 1.05381489, + "epoch": 0.02008116639110176, + "flos": 34093454158080.0, + "grad_norm": 2.2161167871731506, + "language_loss": 0.78280437, + "learning_rate": 3.996067678230532e-06, + "loss": 0.80602264, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 1.90625, + "step": 334, + "time_per_iteration": 3.9226768016815186 + }, + { + "auxiliary_loss_clip": 0.01243757, + "auxiliary_loss_mlp": 0.0107732, + "balance_loss_clip": 1.03049493, + "balance_loss_mlp": 1.04973888, + "epoch": 0.020141289643769728, + "flos": 19682669128320.0, + "grad_norm": 1.8925571247576105, + "language_loss": 0.82887501, + "learning_rate": 3.996043961693736e-06, + "loss": 0.85208577, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 1.9375, + "step": 335, + "time_per_iteration": 2.5068249702453613 + }, + { + "auxiliary_loss_clip": 0.01240044, + "auxiliary_loss_mlp": 0.01073121, + "balance_loss_clip": 1.02844167, + "balance_loss_mlp": 1.05044055, + "epoch": 0.020201412896437697, + "flos": 20739525027840.0, + "grad_norm": 2.570482808050445, + "language_loss": 0.9190805, + "learning_rate": 3.996020173923266e-06, + "loss": 0.9422121, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 1.8984375, + "step": 336, + "time_per_iteration": 2.420025587081909 + }, + { + "auxiliary_loss_clip": 0.01243367, + "auxiliary_loss_mlp": 0.01077506, + "balance_loss_clip": 1.03118145, + "balance_loss_mlp": 1.0501318, + "epoch": 0.020261536149105665, + "flos": 20265474320640.0, + "grad_norm": 2.029482100880366, + "language_loss": 0.87759602, + "learning_rate": 3.99599631491997e-06, + "loss": 0.90080476, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 1.9296875, + "step": 337, + "time_per_iteration": 2.4157845973968506 + }, + { + "auxiliary_loss_clip": 0.01236485, + "auxiliary_loss_mlp": 0.01086757, + "balance_loss_clip": 1.04064679, + "balance_loss_mlp": 1.04916739, + "epoch": 0.020321659401773638, + "flos": 25847562648960.0, + "grad_norm": 1.5226922127191085, + "language_loss": 0.89615571, + "learning_rate": 3.995972384684699e-06, + "loss": 0.91938806, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 1.875, + "step": 338, + "time_per_iteration": 2.4866647720336914 + }, + { + "auxiliary_loss_clip": 0.01238625, + "auxiliary_loss_mlp": 0.01073964, + "balance_loss_clip": 1.02556515, + "balance_loss_mlp": 1.04621446, + "epoch": 0.020381782654441606, + "flos": 17494033818240.0, + "grad_norm": 2.4574020059875217, + "language_loss": 0.84838378, + "learning_rate": 3.995948383218309e-06, + "loss": 0.87150961, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 1.921875, + "step": 339, + "time_per_iteration": 2.436677932739258 + }, + { + "auxiliary_loss_clip": 0.01243425, + "auxiliary_loss_mlp": 0.01085429, + "balance_loss_clip": 1.0390569, + "balance_loss_mlp": 1.05143118, + "epoch": 0.020441905907109575, + "flos": 24242779872000.0, + "grad_norm": 1.9556810807536034, + "language_loss": 0.88591182, + "learning_rate": 3.995924310521655e-06, + "loss": 0.90920031, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 1.921875, + "step": 340, + "time_per_iteration": 2.4788992404937744 + }, + { + "auxiliary_loss_clip": 0.01240924, + "auxiliary_loss_mlp": 0.01079023, + "balance_loss_clip": 1.03181577, + "balance_loss_mlp": 1.05091405, + "epoch": 0.020502029159777543, + "flos": 22232306563200.0, + "grad_norm": 2.1547913145760376, + "language_loss": 0.87746286, + "learning_rate": 3.995900166595596e-06, + "loss": 0.9006623, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 1.8984375, + "step": 341, + "time_per_iteration": 2.4392197132110596 + }, + { + "auxiliary_loss_clip": 0.01244465, + "auxiliary_loss_mlp": 0.01084104, + "balance_loss_clip": 1.0366106, + "balance_loss_mlp": 1.05005693, + "epoch": 0.020562152412445512, + "flos": 23986726894080.0, + "grad_norm": 2.1928377266058137, + "language_loss": 0.79686862, + "learning_rate": 3.995875951440995e-06, + "loss": 0.82015431, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 1.9375, + "step": 342, + "time_per_iteration": 2.454228162765503 + }, + { + "auxiliary_loss_clip": 0.01231521, + "auxiliary_loss_mlp": 0.01077618, + "balance_loss_clip": 1.03029144, + "balance_loss_mlp": 1.04603922, + "epoch": 0.020622275665113484, + "flos": 26974210089600.0, + "grad_norm": 1.8821152658598543, + "language_loss": 0.8900106, + "learning_rate": 3.995851665058715e-06, + "loss": 0.91310197, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 1.8515625, + "step": 343, + "time_per_iteration": 2.4975695610046387 + }, + { + "auxiliary_loss_clip": 0.01247423, + "auxiliary_loss_mlp": 0.0108777, + "balance_loss_clip": 1.04194629, + "balance_loss_mlp": 1.05635118, + "epoch": 0.020682398917781453, + "flos": 22599627644160.0, + "grad_norm": 2.4061841581096366, + "language_loss": 0.77623147, + "learning_rate": 3.995827307449623e-06, + "loss": 0.79958338, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 1.90625, + "step": 344, + "time_per_iteration": 2.500241279602051 + }, + { + "auxiliary_loss_clip": 0.01228052, + "auxiliary_loss_mlp": 0.01089187, + "balance_loss_clip": 1.04031157, + "balance_loss_mlp": 1.04616785, + "epoch": 0.02074252217044942, + "flos": 15012686736000.0, + "grad_norm": 2.1031120440135336, + "language_loss": 0.74481457, + "learning_rate": 3.995802878614588e-06, + "loss": 0.76798695, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 1.8203125, + "step": 345, + "time_per_iteration": 2.4321231842041016 + }, + { + "auxiliary_loss_clip": 0.01236457, + "auxiliary_loss_mlp": 0.01083525, + "balance_loss_clip": 1.03290892, + "balance_loss_mlp": 1.05203187, + "epoch": 0.02080264542311739, + "flos": 25336783324800.0, + "grad_norm": 2.1070814705930667, + "language_loss": 0.89819229, + "learning_rate": 3.995778378554483e-06, + "loss": 0.92139214, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 1.84375, + "step": 346, + "time_per_iteration": 2.500624179840088 + }, + { + "auxiliary_loss_clip": 0.01233687, + "auxiliary_loss_mlp": 0.01080794, + "balance_loss_clip": 1.03635287, + "balance_loss_mlp": 1.04964137, + "epoch": 0.02086276867578536, + "flos": 24387669480960.0, + "grad_norm": 2.0655644820909558, + "language_loss": 0.78656721, + "learning_rate": 3.99575380727018e-06, + "loss": 0.80971205, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 1.84375, + "step": 347, + "time_per_iteration": 2.4814603328704834 + }, + { + "auxiliary_loss_clip": 0.01237051, + "auxiliary_loss_mlp": 0.01080866, + "balance_loss_clip": 1.03516102, + "balance_loss_mlp": 1.05103707, + "epoch": 0.02092289192845333, + "flos": 24461056892160.0, + "grad_norm": 1.8886253600904628, + "language_loss": 0.70518041, + "learning_rate": 3.995729164762559e-06, + "loss": 0.72835958, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 1.859375, + "step": 348, + "time_per_iteration": 2.498539447784424 + }, + { + "auxiliary_loss_clip": 0.01240629, + "auxiliary_loss_mlp": 0.0108724, + "balance_loss_clip": 1.04062903, + "balance_loss_mlp": 1.04997635, + "epoch": 0.0209830151811213, + "flos": 17450392752000.0, + "grad_norm": 9.238146016146256, + "language_loss": 0.76325005, + "learning_rate": 3.995704451032496e-06, + "loss": 0.78652877, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.90625, + "step": 349, + "time_per_iteration": 2.4045448303222656 + }, + { + "auxiliary_loss_clip": 0.01223683, + "auxiliary_loss_mlp": 0.01074173, + "balance_loss_clip": 1.03178263, + "balance_loss_mlp": 1.04728103, + "epoch": 0.021043138433789268, + "flos": 24572778842880.0, + "grad_norm": 1.7558642765462482, + "language_loss": 0.85043454, + "learning_rate": 3.995679666080876e-06, + "loss": 0.87341309, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 1.765625, + "step": 350, + "time_per_iteration": 2.5335793495178223 + }, + { + "auxiliary_loss_clip": 0.01231843, + "auxiliary_loss_mlp": 0.0107489, + "balance_loss_clip": 1.03364384, + "balance_loss_mlp": 1.05170834, + "epoch": 0.021103261686457236, + "flos": 24453132013440.0, + "grad_norm": 7.130091306898022, + "language_loss": 0.79452366, + "learning_rate": 3.995654809908581e-06, + "loss": 0.81759101, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 1.8046875, + "step": 351, + "time_per_iteration": 2.4338746070861816 + }, + { + "auxiliary_loss_clip": 0.01236511, + "auxiliary_loss_mlp": 0.01092286, + "balance_loss_clip": 1.0449841, + "balance_loss_mlp": 1.0526005, + "epoch": 0.021163384939125205, + "flos": 14682233917440.0, + "grad_norm": 3.84086245600335, + "language_loss": 0.87032181, + "learning_rate": 3.9956298825165005e-06, + "loss": 0.89360976, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 1.84375, + "step": 352, + "time_per_iteration": 2.449504852294922 + }, + { + "auxiliary_loss_clip": 0.01233454, + "auxiliary_loss_mlp": 0.01077861, + "balance_loss_clip": 1.03132141, + "balance_loss_mlp": 1.05080497, + "epoch": 0.021223508191793177, + "flos": 24492199691520.0, + "grad_norm": 1.8489810839612493, + "language_loss": 0.82099515, + "learning_rate": 3.995604883905522e-06, + "loss": 0.84410834, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.828125, + "step": 353, + "time_per_iteration": 2.4712696075439453 + }, + { + "auxiliary_loss_clip": 0.01229593, + "auxiliary_loss_mlp": 0.01073406, + "balance_loss_clip": 1.03025222, + "balance_loss_mlp": 1.0502218, + "epoch": 0.021283631444461146, + "flos": 24126030685440.0, + "grad_norm": 1.8250425149469043, + "language_loss": 0.80346203, + "learning_rate": 3.995579814076539e-06, + "loss": 0.82649195, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 1.796875, + "step": 354, + "time_per_iteration": 2.4686806201934814 + }, + { + "auxiliary_loss_clip": 0.01236871, + "auxiliary_loss_mlp": 0.01079676, + "balance_loss_clip": 1.03323174, + "balance_loss_mlp": 1.05005431, + "epoch": 0.021343754697129114, + "flos": 25191055843200.0, + "grad_norm": 3.0584252208007134, + "language_loss": 0.80488598, + "learning_rate": 3.9955546730304455e-06, + "loss": 0.82805151, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.875, + "step": 355, + "time_per_iteration": 2.4575672149658203 + }, + { + "auxiliary_loss_clip": 0.01232122, + "auxiliary_loss_mlp": 0.01078482, + "balance_loss_clip": 1.03416014, + "balance_loss_mlp": 1.04956889, + "epoch": 0.021403877949797083, + "flos": 17273243180160.0, + "grad_norm": 3.327945812803813, + "language_loss": 0.88961899, + "learning_rate": 3.995529460768139e-06, + "loss": 0.91272497, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 1.828125, + "step": 356, + "time_per_iteration": 2.4201087951660156 + }, + { + "auxiliary_loss_clip": 0.01229322, + "auxiliary_loss_mlp": 0.01077796, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.04959869, + "epoch": 0.021464001202465055, + "flos": 30916183478400.0, + "grad_norm": 2.6518396998699507, + "language_loss": 0.79755867, + "learning_rate": 3.995504177290519e-06, + "loss": 0.82062984, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 1.796875, + "step": 357, + "time_per_iteration": 2.510831356048584 + }, + { + "auxiliary_loss_clip": 0.0123213, + "auxiliary_loss_mlp": 0.01073983, + "balance_loss_clip": 1.03171182, + "balance_loss_mlp": 1.0482384, + "epoch": 0.021524124455133024, + "flos": 18185418938880.0, + "grad_norm": 2.8196932538713564, + "language_loss": 0.76050007, + "learning_rate": 3.995478822598488e-06, + "loss": 0.78356123, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.84375, + "step": 358, + "time_per_iteration": 2.4422390460968018 + }, + { + "auxiliary_loss_clip": 0.01229352, + "auxiliary_loss_mlp": 0.01079093, + "balance_loss_clip": 1.03245842, + "balance_loss_mlp": 1.04627466, + "epoch": 0.021584247707800992, + "flos": 13805006296320.0, + "grad_norm": 2.3266225841257038, + "language_loss": 0.88053858, + "learning_rate": 3.995453396692951e-06, + "loss": 0.90362304, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 1.828125, + "step": 359, + "time_per_iteration": 2.3980555534362793 + }, + { + "auxiliary_loss_clip": 0.01234026, + "auxiliary_loss_mlp": 0.01068698, + "balance_loss_clip": 1.02614033, + "balance_loss_mlp": 1.05061042, + "epoch": 0.02164437096046896, + "flos": 23293596205440.0, + "grad_norm": 4.100534030530065, + "language_loss": 0.7596643, + "learning_rate": 3.995427899574816e-06, + "loss": 0.78269148, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 1.828125, + "step": 360, + "time_per_iteration": 2.508373737335205 + }, + { + "auxiliary_loss_clip": 0.01130479, + "auxiliary_loss_mlp": 0.01015122, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.03843045, + "epoch": 0.02170449421313693, + "flos": 68896237875840.0, + "grad_norm": 0.8318827427639371, + "language_loss": 0.64908099, + "learning_rate": 3.99540233124499e-06, + "loss": 0.67053699, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.921875, + "step": 361, + "time_per_iteration": 3.1652159690856934 + }, + { + "auxiliary_loss_clip": 0.0122958, + "auxiliary_loss_mlp": 0.01074758, + "balance_loss_clip": 1.03010178, + "balance_loss_mlp": 1.04697037, + "epoch": 0.0217646174658049, + "flos": 25227365523840.0, + "grad_norm": 3.2735007447015194, + "language_loss": 0.77731925, + "learning_rate": 3.995376691704389e-06, + "loss": 0.80036259, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 1.828125, + "step": 362, + "time_per_iteration": 2.497349262237549 + }, + { + "auxiliary_loss_clip": 0.01232158, + "auxiliary_loss_mlp": 0.01070141, + "balance_loss_clip": 1.02698779, + "balance_loss_mlp": 1.04921007, + "epoch": 0.02182474071847287, + "flos": 22892025214080.0, + "grad_norm": 2.257508093659879, + "language_loss": 0.9193871, + "learning_rate": 3.995350980953926e-06, + "loss": 0.94241005, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 1.828125, + "step": 363, + "time_per_iteration": 2.4355170726776123 + }, + { + "auxiliary_loss_clip": 0.0122669, + "auxiliary_loss_mlp": 0.01077318, + "balance_loss_clip": 1.03614295, + "balance_loss_mlp": 1.04583764, + "epoch": 0.02188486397114084, + "flos": 23657879998080.0, + "grad_norm": 3.088695106876858, + "language_loss": 0.89338195, + "learning_rate": 3.99532519899452e-06, + "loss": 0.91642201, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 1.8125, + "step": 364, + "time_per_iteration": 2.4619293212890625 + }, + { + "auxiliary_loss_clip": 0.01231817, + "auxiliary_loss_mlp": 0.01075566, + "balance_loss_clip": 1.03019536, + "balance_loss_mlp": 1.04998064, + "epoch": 0.021944987223808807, + "flos": 21542562276480.0, + "grad_norm": 2.164351222240482, + "language_loss": 0.78897971, + "learning_rate": 3.99529934582709e-06, + "loss": 0.81205356, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 1.8203125, + "step": 365, + "time_per_iteration": 2.4631075859069824 + }, + { + "auxiliary_loss_clip": 0.0123104, + "auxiliary_loss_mlp": 0.01081015, + "balance_loss_clip": 1.03552461, + "balance_loss_mlp": 1.04984212, + "epoch": 0.022005110476476776, + "flos": 16069961571840.0, + "grad_norm": 2.7063772335893934, + "language_loss": 0.83782774, + "learning_rate": 3.995273421452558e-06, + "loss": 0.86094832, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 1.8125, + "step": 366, + "time_per_iteration": 2.45685076713562 + }, + { + "auxiliary_loss_clip": 0.01234878, + "auxiliary_loss_mlp": 0.01070655, + "balance_loss_clip": 1.02695298, + "balance_loss_mlp": 1.05081034, + "epoch": 0.022065233729144748, + "flos": 21432655716480.0, + "grad_norm": 2.1036187774214907, + "language_loss": 0.86579663, + "learning_rate": 3.995247425871851e-06, + "loss": 0.888852, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 1.84375, + "step": 367, + "time_per_iteration": 2.4273951053619385 + }, + { + "auxiliary_loss_clip": 0.01234498, + "auxiliary_loss_mlp": 0.01090663, + "balance_loss_clip": 1.04426718, + "balance_loss_mlp": 1.04885268, + "epoch": 0.022125356981812717, + "flos": 21542632099200.0, + "grad_norm": 2.387828187297824, + "language_loss": 0.84244931, + "learning_rate": 3.995221359085895e-06, + "loss": 0.86570096, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 1.859375, + "step": 368, + "time_per_iteration": 2.4499919414520264 + }, + { + "auxiliary_loss_clip": 0.01234171, + "auxiliary_loss_mlp": 0.01067317, + "balance_loss_clip": 1.02347231, + "balance_loss_mlp": 1.04798639, + "epoch": 0.022185480234480685, + "flos": 20703110613120.0, + "grad_norm": 2.3411944361606665, + "language_loss": 0.74964315, + "learning_rate": 3.995195221095621e-06, + "loss": 0.77265799, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 1.859375, + "step": 369, + "time_per_iteration": 2.4479618072509766 + }, + { + "auxiliary_loss_clip": 0.01229101, + "auxiliary_loss_mlp": 0.0107853, + "balance_loss_clip": 1.03704512, + "balance_loss_mlp": 1.04931593, + "epoch": 0.022245603487148654, + "flos": 25191998449920.0, + "grad_norm": 2.161917099101355, + "language_loss": 0.82162476, + "learning_rate": 3.995169011901963e-06, + "loss": 0.84470105, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.796875, + "step": 370, + "time_per_iteration": 3.883336305618286 + }, + { + "auxiliary_loss_clip": 0.01231991, + "auxiliary_loss_mlp": 0.0108399, + "balance_loss_clip": 1.04186118, + "balance_loss_mlp": 1.05016744, + "epoch": 0.022305726739816623, + "flos": 21394914670080.0, + "grad_norm": 2.3533588348566785, + "language_loss": 0.81352019, + "learning_rate": 3.995142731505854e-06, + "loss": 0.83667994, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.8203125, + "step": 371, + "time_per_iteration": 2.4541006088256836 + }, + { + "auxiliary_loss_clip": 0.01233079, + "auxiliary_loss_mlp": 0.0108394, + "balance_loss_clip": 1.03885484, + "balance_loss_mlp": 1.05017495, + "epoch": 0.022365849992484595, + "flos": 22491047715840.0, + "grad_norm": 2.63447056724904, + "language_loss": 0.83084446, + "learning_rate": 3.995116379908234e-06, + "loss": 0.85401469, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 1.828125, + "step": 372, + "time_per_iteration": 5.315749168395996 + }, + { + "auxiliary_loss_clip": 0.01225089, + "auxiliary_loss_mlp": 0.01074048, + "balance_loss_clip": 1.02908194, + "balance_loss_mlp": 1.04825759, + "epoch": 0.022425973245152563, + "flos": 17855664347520.0, + "grad_norm": 5.51553422257015, + "language_loss": 0.78102767, + "learning_rate": 3.995089957110041e-06, + "loss": 0.80401897, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 1.765625, + "step": 373, + "time_per_iteration": 3.8593719005584717 + }, + { + "auxiliary_loss_clip": 0.01229883, + "auxiliary_loss_mlp": 0.01077793, + "balance_loss_clip": 1.03256512, + "balance_loss_mlp": 1.04971766, + "epoch": 0.022486096497820532, + "flos": 15482233877760.0, + "grad_norm": 2.5033796908762147, + "language_loss": 0.76996267, + "learning_rate": 3.995063463112221e-06, + "loss": 0.79303944, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 1.8046875, + "step": 374, + "time_per_iteration": 2.419429302215576 + }, + { + "auxiliary_loss_clip": 0.01228989, + "auxiliary_loss_mlp": 0.01072966, + "balance_loss_clip": 1.0256635, + "balance_loss_mlp": 1.0454855, + "epoch": 0.0225462197504885, + "flos": 27782868067200.0, + "grad_norm": 1.936964279705477, + "language_loss": 0.85860884, + "learning_rate": 3.995036897915717e-06, + "loss": 0.88162833, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 1.8359375, + "step": 375, + "time_per_iteration": 2.4995079040527344 + }, + { + "auxiliary_loss_clip": 0.0123226, + "auxiliary_loss_mlp": 0.01084578, + "balance_loss_clip": 1.03849185, + "balance_loss_mlp": 1.05112147, + "epoch": 0.02260634300315647, + "flos": 19974438293760.0, + "grad_norm": 2.2991912097388605, + "language_loss": 0.88661456, + "learning_rate": 3.995010261521478e-06, + "loss": 0.90978289, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 1.8125, + "step": 376, + "time_per_iteration": 2.418308973312378 + }, + { + "auxiliary_loss_clip": 0.0122968, + "auxiliary_loss_mlp": 0.01071277, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.04656196, + "epoch": 0.02266646625582444, + "flos": 16027437669120.0, + "grad_norm": 2.4116974172152337, + "language_loss": 0.74843597, + "learning_rate": 3.9949835539304545e-06, + "loss": 0.77144551, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 1.828125, + "step": 377, + "time_per_iteration": 2.4013772010803223 + }, + { + "auxiliary_loss_clip": 0.01224565, + "auxiliary_loss_mlp": 0.01075962, + "balance_loss_clip": 1.03197408, + "balance_loss_mlp": 1.04923904, + "epoch": 0.02272658950849241, + "flos": 20403800593920.0, + "grad_norm": 2.256167411950087, + "language_loss": 0.9871459, + "learning_rate": 3.9949567751436e-06, + "loss": 1.01015115, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 1.75, + "step": 378, + "time_per_iteration": 2.4311933517456055 + }, + { + "auxiliary_loss_clip": 0.01114269, + "auxiliary_loss_mlp": 0.010195, + "balance_loss_clip": 1.0085324, + "balance_loss_mlp": 1.02366507, + "epoch": 0.02278671276116038, + "flos": 69843990176640.0, + "grad_norm": 0.9572279775789706, + "language_loss": 0.75515658, + "learning_rate": 3.99492992516187e-06, + "loss": 0.77649432, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.90625, + "step": 379, + "time_per_iteration": 3.109609842300415 + }, + { + "auxiliary_loss_clip": 0.01233634, + "auxiliary_loss_mlp": 0.01072328, + "balance_loss_clip": 1.02812481, + "balance_loss_mlp": 1.04682207, + "epoch": 0.022846836013828347, + "flos": 38507243927040.0, + "grad_norm": 2.251694170849771, + "language_loss": 0.78342873, + "learning_rate": 3.994903003986222e-06, + "loss": 0.80648834, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 1.8671875, + "step": 380, + "time_per_iteration": 2.5509915351867676 + }, + { + "auxiliary_loss_clip": 0.01224327, + "auxiliary_loss_mlp": 0.01077174, + "balance_loss_clip": 1.03294706, + "balance_loss_mlp": 1.0472827, + "epoch": 0.02290695926649632, + "flos": 20958430452480.0, + "grad_norm": 2.3818750916319096, + "language_loss": 0.95732051, + "learning_rate": 3.9948760116176174e-06, + "loss": 0.98033547, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 1.7734375, + "step": 381, + "time_per_iteration": 2.4247195720672607 + }, + { + "auxiliary_loss_clip": 0.01233709, + "auxiliary_loss_mlp": 0.01091844, + "balance_loss_clip": 1.04683065, + "balance_loss_mlp": 1.0478375, + "epoch": 0.022967082519164288, + "flos": 24021325918080.0, + "grad_norm": 5.971767421602091, + "language_loss": 0.87281406, + "learning_rate": 3.994848948057019e-06, + "loss": 0.89606953, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 1.859375, + "step": 382, + "time_per_iteration": 2.4535419940948486 + }, + { + "auxiliary_loss_clip": 0.01232721, + "auxiliary_loss_mlp": 0.01079831, + "balance_loss_clip": 1.03834581, + "balance_loss_mlp": 1.04840291, + "epoch": 0.023027205771832256, + "flos": 20996066764800.0, + "grad_norm": 2.0497498489302672, + "language_loss": 0.84413087, + "learning_rate": 3.994821813305394e-06, + "loss": 0.8672564, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.84375, + "step": 383, + "time_per_iteration": 2.427741765975952 + }, + { + "auxiliary_loss_clip": 0.01224827, + "auxiliary_loss_mlp": 0.01096017, + "balance_loss_clip": 1.05219579, + "balance_loss_mlp": 1.04934549, + "epoch": 0.023087329024500225, + "flos": 21359757064320.0, + "grad_norm": 2.510335613890191, + "language_loss": 0.82757276, + "learning_rate": 3.99479460736371e-06, + "loss": 0.8507812, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 1.7578125, + "step": 384, + "time_per_iteration": 2.416593074798584 + }, + { + "auxiliary_loss_clip": 0.01226601, + "auxiliary_loss_mlp": 0.0107713, + "balance_loss_clip": 1.03562117, + "balance_loss_mlp": 1.05061913, + "epoch": 0.023147452277168194, + "flos": 21871339349760.0, + "grad_norm": 2.094474581463722, + "language_loss": 0.88727117, + "learning_rate": 3.994767330232937e-06, + "loss": 0.91030848, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.7578125, + "step": 385, + "time_per_iteration": 2.4557037353515625 + }, + { + "auxiliary_loss_clip": 0.01231744, + "auxiliary_loss_mlp": 0.01084369, + "balance_loss_clip": 1.04207373, + "balance_loss_mlp": 1.05157328, + "epoch": 0.023207575529836166, + "flos": 18915697180800.0, + "grad_norm": 2.4968411771950567, + "language_loss": 0.69599569, + "learning_rate": 3.994739981914049e-06, + "loss": 0.71915674, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.796875, + "step": 386, + "time_per_iteration": 2.4219396114349365 + }, + { + "auxiliary_loss_clip": 0.01227764, + "auxiliary_loss_mlp": 0.01078979, + "balance_loss_clip": 1.03563499, + "balance_loss_mlp": 1.05063343, + "epoch": 0.023267698782504134, + "flos": 25044839602560.0, + "grad_norm": 9.04059514600862, + "language_loss": 0.87687516, + "learning_rate": 3.994712562408022e-06, + "loss": 0.89994264, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 1.765625, + "step": 387, + "time_per_iteration": 2.4649181365966797 + }, + { + "auxiliary_loss_clip": 0.01228789, + "auxiliary_loss_mlp": 0.01068427, + "balance_loss_clip": 1.02572608, + "balance_loss_mlp": 1.05026042, + "epoch": 0.023327822035172103, + "flos": 28877883949440.0, + "grad_norm": 1.990605289011327, + "language_loss": 0.83328348, + "learning_rate": 3.994685071715835e-06, + "loss": 0.85625565, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 1.7890625, + "step": 388, + "time_per_iteration": 2.5158395767211914 + }, + { + "auxiliary_loss_clip": 0.01225898, + "auxiliary_loss_mlp": 0.01072797, + "balance_loss_clip": 1.03352904, + "balance_loss_mlp": 1.04716516, + "epoch": 0.02338794528784007, + "flos": 27120426330240.0, + "grad_norm": 2.7565455513494936, + "language_loss": 0.9320004, + "learning_rate": 3.9946575098384686e-06, + "loss": 0.95498735, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 1.7890625, + "step": 389, + "time_per_iteration": 2.494342803955078 + }, + { + "auxiliary_loss_clip": 0.0122056, + "auxiliary_loss_mlp": 0.01076788, + "balance_loss_clip": 1.0343492, + "balance_loss_mlp": 1.04805517, + "epoch": 0.02344806854050804, + "flos": 21321352702080.0, + "grad_norm": 3.0699235696824085, + "language_loss": 0.87314248, + "learning_rate": 3.9946298767769065e-06, + "loss": 0.89611602, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 1.7265625, + "step": 390, + "time_per_iteration": 2.453216552734375 + }, + { + "auxiliary_loss_clip": 0.01223692, + "auxiliary_loss_mlp": 0.01069994, + "balance_loss_clip": 1.0310601, + "balance_loss_mlp": 1.04871178, + "epoch": 0.023508191793176012, + "flos": 24788856447360.0, + "grad_norm": 4.83313340351558, + "language_loss": 0.88527739, + "learning_rate": 3.994602172532135e-06, + "loss": 0.90821421, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 1.75, + "step": 391, + "time_per_iteration": 2.455122947692871 + }, + { + "auxiliary_loss_clip": 0.01220029, + "auxiliary_loss_mlp": 0.01064478, + "balance_loss_clip": 1.02511477, + "balance_loss_mlp": 1.04577327, + "epoch": 0.02356831504584398, + "flos": 25994162914560.0, + "grad_norm": 3.6976594815005357, + "language_loss": 0.86070114, + "learning_rate": 3.994574397105143e-06, + "loss": 0.88354623, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 1.7421875, + "step": 392, + "time_per_iteration": 2.462695837020874 + }, + { + "auxiliary_loss_clip": 0.01219756, + "auxiliary_loss_mlp": 0.01065758, + "balance_loss_clip": 1.02577543, + "balance_loss_mlp": 1.04501486, + "epoch": 0.02362843829851195, + "flos": 19061459573760.0, + "grad_norm": 1.9887120025248302, + "language_loss": 0.88842404, + "learning_rate": 3.994546550496921e-06, + "loss": 0.91127914, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 1.75, + "step": 393, + "time_per_iteration": 2.4153332710266113 + }, + { + "auxiliary_loss_clip": 0.01226983, + "auxiliary_loss_mlp": 0.01083042, + "balance_loss_clip": 1.03962576, + "balance_loss_mlp": 1.05139089, + "epoch": 0.023688561551179918, + "flos": 16070101217280.0, + "grad_norm": 3.2261350159623565, + "language_loss": 0.81036854, + "learning_rate": 3.994518632708464e-06, + "loss": 0.83346879, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 1.75, + "step": 394, + "time_per_iteration": 2.4195621013641357 + }, + { + "auxiliary_loss_clip": 0.01222919, + "auxiliary_loss_mlp": 0.0107631, + "balance_loss_clip": 1.03420556, + "balance_loss_mlp": 1.04789519, + "epoch": 0.023748684803847887, + "flos": 21723342629760.0, + "grad_norm": 1.9563544568378761, + "language_loss": 0.85760427, + "learning_rate": 3.994490643740766e-06, + "loss": 0.88059652, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.75, + "step": 395, + "time_per_iteration": 2.447772741317749 + }, + { + "auxiliary_loss_clip": 0.01105912, + "auxiliary_loss_mlp": 0.01019347, + "balance_loss_clip": 1.00837958, + "balance_loss_mlp": 1.0205375, + "epoch": 0.02380880805651586, + "flos": 61923175136640.0, + "grad_norm": 0.91434065681227, + "language_loss": 0.63803995, + "learning_rate": 3.994462583594828e-06, + "loss": 0.65929258, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.8515625, + "step": 396, + "time_per_iteration": 2.896740198135376 + }, + { + "auxiliary_loss_clip": 0.01220118, + "auxiliary_loss_mlp": 0.01063434, + "balance_loss_clip": 1.02218795, + "balance_loss_mlp": 1.04511356, + "epoch": 0.023868931309183827, + "flos": 20265299763840.0, + "grad_norm": 2.704976210508278, + "language_loss": 0.83204758, + "learning_rate": 3.994434452271651e-06, + "loss": 0.85488307, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.75, + "step": 397, + "time_per_iteration": 2.4413938522338867 + }, + { + "auxiliary_loss_clip": 0.0122405, + "auxiliary_loss_mlp": 0.01069104, + "balance_loss_clip": 1.02654648, + "balance_loss_mlp": 1.04862189, + "epoch": 0.023929054561851796, + "flos": 21138128553600.0, + "grad_norm": 2.4026621418980687, + "language_loss": 0.84061825, + "learning_rate": 3.994406249772239e-06, + "loss": 0.86354977, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 1.75, + "step": 398, + "time_per_iteration": 2.4305572509765625 + }, + { + "auxiliary_loss_clip": 0.01224106, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_clip": 1.03304994, + "balance_loss_mlp": 1.04774022, + "epoch": 0.023989177814519765, + "flos": 13697683176960.0, + "grad_norm": 3.4160456873041443, + "language_loss": 0.8576498, + "learning_rate": 3.994377976097598e-06, + "loss": 0.88066125, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 1.765625, + "step": 399, + "time_per_iteration": 2.400832176208496 + }, + { + "auxiliary_loss_clip": 0.01219711, + "auxiliary_loss_mlp": 0.01079695, + "balance_loss_clip": 1.03799558, + "balance_loss_mlp": 1.04665935, + "epoch": 0.024049301067187733, + "flos": 26320845306240.0, + "grad_norm": 2.980519631223286, + "language_loss": 0.85427976, + "learning_rate": 3.9943496312487365e-06, + "loss": 0.87727386, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 1.734375, + "step": 400, + "time_per_iteration": 2.48085618019104 + }, + { + "auxiliary_loss_clip": 0.01224102, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.02756393, + "balance_loss_mlp": 1.05105746, + "epoch": 0.024109424319855705, + "flos": 24424293363840.0, + "grad_norm": 1.8415507104795583, + "language_loss": 0.7897774, + "learning_rate": 3.994321215226667e-06, + "loss": 0.81271529, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.734375, + "step": 401, + "time_per_iteration": 2.4944117069244385 + }, + { + "auxiliary_loss_clip": 0.01219562, + "auxiliary_loss_mlp": 0.01078446, + "balance_loss_clip": 1.03874898, + "balance_loss_mlp": 1.04854548, + "epoch": 0.024169547572523674, + "flos": 29603169866880.0, + "grad_norm": 2.2638273233506365, + "language_loss": 0.78047067, + "learning_rate": 3.994292728032404e-06, + "loss": 0.80345076, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 1.703125, + "step": 402, + "time_per_iteration": 2.5082874298095703 + }, + { + "auxiliary_loss_clip": 0.0122181, + "auxiliary_loss_mlp": 0.01069983, + "balance_loss_clip": 1.02740133, + "balance_loss_mlp": 1.04404521, + "epoch": 0.024229670825191642, + "flos": 22600360782720.0, + "grad_norm": 2.746171059960839, + "language_loss": 0.94473672, + "learning_rate": 3.994264169666963e-06, + "loss": 0.96765459, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 1.7734375, + "step": 403, + "time_per_iteration": 2.4508306980133057 + }, + { + "auxiliary_loss_clip": 0.01228746, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_clip": 1.0266248, + "balance_loss_mlp": 1.04774153, + "epoch": 0.02428979407785961, + "flos": 18149283815040.0, + "grad_norm": 2.778395112563993, + "language_loss": 0.9934364, + "learning_rate": 3.994235540131364e-06, + "loss": 1.01643562, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 1.8125, + "step": 404, + "time_per_iteration": 2.392777919769287 + }, + { + "auxiliary_loss_clip": 0.01224341, + "auxiliary_loss_mlp": 0.01079139, + "balance_loss_clip": 1.03729606, + "balance_loss_mlp": 1.04792476, + "epoch": 0.024349917330527583, + "flos": 15304071876480.0, + "grad_norm": 3.1709684895581076, + "language_loss": 0.87440234, + "learning_rate": 3.994206839426627e-06, + "loss": 0.8974371, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 1.765625, + "step": 405, + "time_per_iteration": 2.414794445037842 + }, + { + "auxiliary_loss_clip": 0.01232461, + "auxiliary_loss_mlp": 0.01072855, + "balance_loss_clip": 1.02982068, + "balance_loss_mlp": 1.05070579, + "epoch": 0.024410040583195552, + "flos": 20772937065600.0, + "grad_norm": 3.0795526101990034, + "language_loss": 0.93019068, + "learning_rate": 3.994178067553779e-06, + "loss": 0.95324385, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 1.8203125, + "step": 406, + "time_per_iteration": 2.4340293407440186 + }, + { + "auxiliary_loss_clip": 0.01225436, + "auxiliary_loss_mlp": 0.01079018, + "balance_loss_clip": 1.03405201, + "balance_loss_mlp": 1.04839635, + "epoch": 0.02447016383586352, + "flos": 21797777381760.0, + "grad_norm": 4.538106572159414, + "language_loss": 0.86687589, + "learning_rate": 3.994149224513846e-06, + "loss": 0.88992041, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 1.765625, + "step": 407, + "time_per_iteration": 2.441693067550659 + }, + { + "auxiliary_loss_clip": 0.01223369, + "auxiliary_loss_mlp": 0.01074858, + "balance_loss_clip": 1.03272939, + "balance_loss_mlp": 1.04847205, + "epoch": 0.02453028708853149, + "flos": 33946714247040.0, + "grad_norm": 2.046892011184857, + "language_loss": 0.73118854, + "learning_rate": 3.994120310307856e-06, + "loss": 0.75417078, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 1.75, + "step": 408, + "time_per_iteration": 2.5407135486602783 + }, + { + "auxiliary_loss_clip": 0.01228314, + "auxiliary_loss_mlp": 0.01074045, + "balance_loss_clip": 1.03263187, + "balance_loss_mlp": 1.05185044, + "epoch": 0.024590410341199458, + "flos": 21792086830080.0, + "grad_norm": 2.7996715546795694, + "language_loss": 0.92269748, + "learning_rate": 3.994091324936841e-06, + "loss": 0.94572109, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.765625, + "step": 409, + "time_per_iteration": 3.8719043731689453 + }, + { + "auxiliary_loss_clip": 0.01225605, + "auxiliary_loss_mlp": 0.01066815, + "balance_loss_clip": 1.02556872, + "balance_loss_mlp": 1.0501914, + "epoch": 0.02465053359386743, + "flos": 35113371972480.0, + "grad_norm": 2.2476422238018245, + "language_loss": 0.81878775, + "learning_rate": 3.994062268401836e-06, + "loss": 0.84171194, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 1.7578125, + "step": 410, + "time_per_iteration": 2.5387113094329834 + }, + { + "auxiliary_loss_clip": 0.01226802, + "auxiliary_loss_mlp": 0.01076319, + "balance_loss_clip": 1.03333235, + "balance_loss_mlp": 1.04942226, + "epoch": 0.0247106568465354, + "flos": 27450250744320.0, + "grad_norm": 2.654348941693858, + "language_loss": 0.77659327, + "learning_rate": 3.994033140703878e-06, + "loss": 0.7996245, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 1.7734375, + "step": 411, + "time_per_iteration": 3.9723191261291504 + }, + { + "auxiliary_loss_clip": 0.0122833, + "auxiliary_loss_mlp": 0.0108574, + "balance_loss_clip": 1.04299116, + "balance_loss_mlp": 1.04967451, + "epoch": 0.024770780099203367, + "flos": 20702761499520.0, + "grad_norm": 2.2231464753261045, + "language_loss": 0.86391199, + "learning_rate": 3.994003941844007e-06, + "loss": 0.88705271, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 1.7890625, + "step": 412, + "time_per_iteration": 3.839991331100464 + }, + { + "auxiliary_loss_clip": 0.01101658, + "auxiliary_loss_mlp": 0.01014881, + "balance_loss_clip": 1.00386584, + "balance_loss_mlp": 1.02291083, + "epoch": 0.024830903351871336, + "flos": 69548625141120.0, + "grad_norm": 0.8306108238647311, + "language_loss": 0.5848062, + "learning_rate": 3.993974671823265e-06, + "loss": 0.60597157, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.7890625, + "step": 413, + "time_per_iteration": 4.496323585510254 + }, + { + "auxiliary_loss_clip": 0.01224728, + "auxiliary_loss_mlp": 0.0107864, + "balance_loss_clip": 1.03374577, + "balance_loss_mlp": 1.04808426, + "epoch": 0.024891026604539304, + "flos": 32269102640640.0, + "grad_norm": 2.8425881831129383, + "language_loss": 0.80029666, + "learning_rate": 3.9939453306426955e-06, + "loss": 0.8233304, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 1.765625, + "step": 414, + "time_per_iteration": 2.5428895950317383 + }, + { + "auxiliary_loss_clip": 0.01218806, + "auxiliary_loss_mlp": 0.01077296, + "balance_loss_clip": 1.03511941, + "balance_loss_mlp": 1.04687631, + "epoch": 0.024951149857207276, + "flos": 18839377215360.0, + "grad_norm": 2.7484104693387423, + "language_loss": 0.7967658, + "learning_rate": 3.9939159183033466e-06, + "loss": 0.81972682, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.71875, + "step": 415, + "time_per_iteration": 2.4227468967437744 + }, + { + "auxiliary_loss_clip": 0.01230682, + "auxiliary_loss_mlp": 0.01078942, + "balance_loss_clip": 1.03686094, + "balance_loss_mlp": 1.05102003, + "epoch": 0.025011273109875245, + "flos": 15376307212800.0, + "grad_norm": 3.0735378724622233, + "language_loss": 0.95692146, + "learning_rate": 3.9938864348062675e-06, + "loss": 0.98001772, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 1.796875, + "step": 416, + "time_per_iteration": 2.4133734703063965 + }, + { + "auxiliary_loss_clip": 0.01223109, + "auxiliary_loss_mlp": 0.01067797, + "balance_loss_clip": 1.02349901, + "balance_loss_mlp": 1.04716539, + "epoch": 0.025071396362543213, + "flos": 18914545105920.0, + "grad_norm": 2.036718056261979, + "language_loss": 0.77308404, + "learning_rate": 3.993856880152509e-06, + "loss": 0.79599309, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 1.7578125, + "step": 417, + "time_per_iteration": 2.454113245010376 + }, + { + "auxiliary_loss_clip": 0.01220784, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_clip": 1.03978431, + "balance_loss_mlp": 1.05053282, + "epoch": 0.025131519615211182, + "flos": 25336783324800.0, + "grad_norm": 1.654779187260334, + "language_loss": 0.76904714, + "learning_rate": 3.9938272543431286e-06, + "loss": 0.79207551, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.703125, + "step": 418, + "time_per_iteration": 2.50669002532959 + }, + { + "auxiliary_loss_clip": 0.0122798, + "auxiliary_loss_mlp": 0.01067283, + "balance_loss_clip": 1.02725244, + "balance_loss_mlp": 1.05104196, + "epoch": 0.02519164286787915, + "flos": 18952146506880.0, + "grad_norm": 4.729071356260811, + "language_loss": 0.81378472, + "learning_rate": 3.993797557379182e-06, + "loss": 0.83673733, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 1.765625, + "step": 419, + "time_per_iteration": 2.421539545059204 + }, + { + "auxiliary_loss_clip": 0.01219551, + "auxiliary_loss_mlp": 0.01074075, + "balance_loss_clip": 1.0309689, + "balance_loss_mlp": 1.04970813, + "epoch": 0.025251766120547123, + "flos": 17420122736640.0, + "grad_norm": 2.2544890987226625, + "language_loss": 0.7359246, + "learning_rate": 3.9937677892617295e-06, + "loss": 0.75886083, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 1.6953125, + "step": 420, + "time_per_iteration": 2.417229175567627 + }, + { + "auxiliary_loss_clip": 0.01220992, + "auxiliary_loss_mlp": 0.01075999, + "balance_loss_clip": 1.03506291, + "balance_loss_mlp": 1.04477525, + "epoch": 0.02531188937321509, + "flos": 25044281020800.0, + "grad_norm": 1.9282394845206294, + "language_loss": 0.78481078, + "learning_rate": 3.993737949991833e-06, + "loss": 0.80778074, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 1.7578125, + "step": 421, + "time_per_iteration": 2.5036356449127197 + }, + { + "auxiliary_loss_clip": 0.01219951, + "auxiliary_loss_mlp": 0.01077345, + "balance_loss_clip": 1.0353117, + "balance_loss_mlp": 1.05088842, + "epoch": 0.02537201262588306, + "flos": 30590897541120.0, + "grad_norm": 2.08196997858772, + "language_loss": 0.81678551, + "learning_rate": 3.993708039570557e-06, + "loss": 0.8397584, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 1.6875, + "step": 422, + "time_per_iteration": 2.503122568130493 + }, + { + "auxiliary_loss_clip": 0.01221051, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_clip": 1.03059411, + "balance_loss_mlp": 1.04643059, + "epoch": 0.02543213587855103, + "flos": 26064233746560.0, + "grad_norm": 1.8956315955091694, + "language_loss": 0.83241171, + "learning_rate": 3.99367805799897e-06, + "loss": 0.85531271, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 1.75, + "step": 423, + "time_per_iteration": 2.467223882675171 + }, + { + "auxiliary_loss_clip": 0.01215465, + "auxiliary_loss_mlp": 0.01064725, + "balance_loss_clip": 1.02540994, + "balance_loss_mlp": 1.04517615, + "epoch": 0.025492259131218997, + "flos": 36021498013440.0, + "grad_norm": 2.03259954828413, + "language_loss": 0.74068058, + "learning_rate": 3.993648005278142e-06, + "loss": 0.76348245, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 1.703125, + "step": 424, + "time_per_iteration": 2.521575689315796 + }, + { + "auxiliary_loss_clip": 0.01225442, + "auxiliary_loss_mlp": 0.01083442, + "balance_loss_clip": 1.04109931, + "balance_loss_mlp": 1.04883325, + "epoch": 0.02555238238388697, + "flos": 18587059752960.0, + "grad_norm": 2.815339780265551, + "language_loss": 0.8265295, + "learning_rate": 3.993617881409143e-06, + "loss": 0.84961832, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 1.765625, + "step": 425, + "time_per_iteration": 2.4312291145324707 + }, + { + "auxiliary_loss_clip": 0.01222589, + "auxiliary_loss_mlp": 0.01077014, + "balance_loss_clip": 1.03600645, + "balance_loss_mlp": 1.0447588, + "epoch": 0.025612505636554938, + "flos": 24242046733440.0, + "grad_norm": 3.8650408966719287, + "language_loss": 0.85035753, + "learning_rate": 3.993587686393052e-06, + "loss": 0.87335354, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 1.78125, + "step": 426, + "time_per_iteration": 2.4371635913848877 + }, + { + "auxiliary_loss_clip": 0.01098027, + "auxiliary_loss_mlp": 0.01017479, + "balance_loss_clip": 1.00841868, + "balance_loss_mlp": 1.0204736, + "epoch": 0.025672628889222907, + "flos": 60583661936640.0, + "grad_norm": 0.8871951760081042, + "language_loss": 0.57136494, + "learning_rate": 3.993557420230944e-06, + "loss": 0.59252, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.09082031, + "router_z_loss_mlp": 0.7734375, + "step": 427, + "time_per_iteration": 3.173518657684326 + }, + { + "auxiliary_loss_clip": 0.01222285, + "auxiliary_loss_mlp": 0.01077646, + "balance_loss_clip": 1.03778279, + "balance_loss_mlp": 1.04927766, + "epoch": 0.025732752141890875, + "flos": 19572238897920.0, + "grad_norm": 3.820220230684572, + "language_loss": 0.87785196, + "learning_rate": 3.9935270829239e-06, + "loss": 0.90085125, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 1.734375, + "step": 428, + "time_per_iteration": 2.404013156890869 + }, + { + "auxiliary_loss_clip": 0.01218422, + "auxiliary_loss_mlp": 0.01069172, + "balance_loss_clip": 1.02976179, + "balance_loss_mlp": 1.04609978, + "epoch": 0.025792875394558847, + "flos": 31282945977600.0, + "grad_norm": 1.7895895121794416, + "language_loss": 0.85488737, + "learning_rate": 3.993496674473002e-06, + "loss": 0.87776333, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 1.7265625, + "step": 429, + "time_per_iteration": 2.5000152587890625 + }, + { + "auxiliary_loss_clip": 0.01218331, + "auxiliary_loss_mlp": 0.01076159, + "balance_loss_clip": 1.0330286, + "balance_loss_mlp": 1.04326129, + "epoch": 0.025852998647226816, + "flos": 32378241150720.0, + "grad_norm": 3.5761364735444694, + "language_loss": 0.88163298, + "learning_rate": 3.993466194879335e-06, + "loss": 0.90457785, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 1.75, + "step": 430, + "time_per_iteration": 2.5054399967193604 + }, + { + "auxiliary_loss_clip": 0.01221007, + "auxiliary_loss_mlp": 0.01064797, + "balance_loss_clip": 1.02712727, + "balance_loss_mlp": 1.05101562, + "epoch": 0.025913121899894784, + "flos": 20192261466240.0, + "grad_norm": 2.0882154604930507, + "language_loss": 0.83566093, + "learning_rate": 3.993435644143989e-06, + "loss": 0.85851896, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 1.703125, + "step": 431, + "time_per_iteration": 2.413207769393921 + }, + { + "auxiliary_loss_clip": 0.01219247, + "auxiliary_loss_mlp": 0.01062272, + "balance_loss_clip": 1.02312338, + "balance_loss_mlp": 1.04578114, + "epoch": 0.025973245152562753, + "flos": 14719556027520.0, + "grad_norm": 4.11964093237474, + "language_loss": 0.86177897, + "learning_rate": 3.993405022268051e-06, + "loss": 0.8845942, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 1.734375, + "step": 432, + "time_per_iteration": 2.3806257247924805 + }, + { + "auxiliary_loss_clip": 0.01222087, + "auxiliary_loss_mlp": 0.01061348, + "balance_loss_clip": 1.02436972, + "balance_loss_mlp": 1.04628515, + "epoch": 0.02603336840523072, + "flos": 30991665571200.0, + "grad_norm": 2.6666100728953968, + "language_loss": 0.75972843, + "learning_rate": 3.993374329252616e-06, + "loss": 0.78256285, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.7578125, + "step": 433, + "time_per_iteration": 2.51416277885437 + }, + { + "auxiliary_loss_clip": 0.01221008, + "auxiliary_loss_mlp": 0.01080675, + "balance_loss_clip": 1.0364722, + "balance_loss_mlp": 1.04617822, + "epoch": 0.026093491657898694, + "flos": 17673347894400.0, + "grad_norm": 1.959187456798102, + "language_loss": 0.89468384, + "learning_rate": 3.993343565098778e-06, + "loss": 0.91770065, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 1.7421875, + "step": 434, + "time_per_iteration": 2.4000027179718018 + }, + { + "auxiliary_loss_clip": 0.01221612, + "auxiliary_loss_mlp": 0.01081696, + "balance_loss_clip": 1.04037833, + "balance_loss_mlp": 1.05102932, + "epoch": 0.026153614910566662, + "flos": 17856921156480.0, + "grad_norm": 2.096397488032798, + "language_loss": 0.79408079, + "learning_rate": 3.993312729807637e-06, + "loss": 0.81711388, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.703125, + "step": 435, + "time_per_iteration": 2.4256155490875244 + }, + { + "auxiliary_loss_clip": 0.01217634, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_clip": 1.03753078, + "balance_loss_mlp": 1.04609227, + "epoch": 0.02621373816323463, + "flos": 20010084658560.0, + "grad_norm": 2.5845390045116687, + "language_loss": 0.86184919, + "learning_rate": 3.993281823380292e-06, + "loss": 0.88481498, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.71875, + "step": 436, + "time_per_iteration": 2.4278886318206787 + }, + { + "auxiliary_loss_clip": 0.012181, + "auxiliary_loss_mlp": 0.01087453, + "balance_loss_clip": 1.04573023, + "balance_loss_mlp": 1.0470084, + "epoch": 0.0262738614159026, + "flos": 19280190441600.0, + "grad_norm": 4.714988608425289, + "language_loss": 0.74434, + "learning_rate": 3.993250845817845e-06, + "loss": 0.76739556, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 1.7109375, + "step": 437, + "time_per_iteration": 2.4319581985473633 + }, + { + "auxiliary_loss_clip": 0.01215749, + "auxiliary_loss_mlp": 0.01087014, + "balance_loss_clip": 1.04405141, + "balance_loss_mlp": 1.0473398, + "epoch": 0.026333984668570568, + "flos": 18806209557120.0, + "grad_norm": 5.086534458318834, + "language_loss": 0.91138661, + "learning_rate": 3.9932197971214026e-06, + "loss": 0.93441427, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 1.6875, + "step": 438, + "time_per_iteration": 2.4063069820404053 + }, + { + "auxiliary_loss_clip": 0.0109988, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.02302861, + "balance_loss_mlp": 1.02063978, + "epoch": 0.02639410792123854, + "flos": 64568403607680.0, + "grad_norm": 0.8590789451679222, + "language_loss": 0.62551713, + "learning_rate": 3.9931886772920735e-06, + "loss": 0.64683968, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.09326172, + "router_z_loss_mlp": 0.7890625, + "step": 439, + "time_per_iteration": 3.1644980907440186 + }, + { + "auxiliary_loss_clip": 0.01225435, + "auxiliary_loss_mlp": 0.01080469, + "balance_loss_clip": 1.03738666, + "balance_loss_mlp": 1.04984474, + "epoch": 0.02645423117390651, + "flos": 28472263240320.0, + "grad_norm": 6.737809188874736, + "language_loss": 0.75231874, + "learning_rate": 3.993157486330967e-06, + "loss": 0.77537781, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 1.7578125, + "step": 440, + "time_per_iteration": 2.4854986667633057 + }, + { + "auxiliary_loss_clip": 0.01219877, + "auxiliary_loss_mlp": 0.01067345, + "balance_loss_clip": 1.02543068, + "balance_loss_mlp": 1.04653168, + "epoch": 0.026514354426574478, + "flos": 18550261313280.0, + "grad_norm": 2.6045460134394824, + "language_loss": 0.82804596, + "learning_rate": 3.993126224239198e-06, + "loss": 0.85091817, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 1.734375, + "step": 441, + "time_per_iteration": 2.4888463020324707 + }, + { + "auxiliary_loss_clip": 0.01220228, + "auxiliary_loss_mlp": 0.01074593, + "balance_loss_clip": 1.0329653, + "balance_loss_mlp": 1.04525423, + "epoch": 0.026574477679242446, + "flos": 20666766021120.0, + "grad_norm": 2.4595969775424327, + "language_loss": 0.78507668, + "learning_rate": 3.99309489101788e-06, + "loss": 0.80802488, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 1.75, + "step": 442, + "time_per_iteration": 2.4291138648986816 + }, + { + "auxiliary_loss_clip": 0.01090726, + "auxiliary_loss_mlp": 0.01011347, + "balance_loss_clip": 1.00285959, + "balance_loss_mlp": 1.01481843, + "epoch": 0.026634600931910415, + "flos": 57953026414080.0, + "grad_norm": 0.9492683728905594, + "language_loss": 0.644611, + "learning_rate": 3.993063486668132e-06, + "loss": 0.66563171, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.08496094, + "router_z_loss_mlp": 0.7578125, + "step": 443, + "time_per_iteration": 2.9280450344085693 + }, + { + "auxiliary_loss_clip": 0.01222362, + "auxiliary_loss_mlp": 0.01079253, + "balance_loss_clip": 1.03905571, + "balance_loss_mlp": 1.05135286, + "epoch": 0.026694724184578387, + "flos": 15814222796160.0, + "grad_norm": 2.0330872338587667, + "language_loss": 0.82178068, + "learning_rate": 3.993032011191076e-06, + "loss": 0.8447969, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 1.7109375, + "step": 444, + "time_per_iteration": 2.4048736095428467 + }, + { + "auxiliary_loss_clip": 0.01221715, + "auxiliary_loss_mlp": 0.01070907, + "balance_loss_clip": 1.02789617, + "balance_loss_mlp": 1.04752612, + "epoch": 0.026754847437246355, + "flos": 23439149130240.0, + "grad_norm": 2.2354540032417507, + "language_loss": 0.95266509, + "learning_rate": 3.993000464587833e-06, + "loss": 0.9755913, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 1.7421875, + "step": 445, + "time_per_iteration": 2.434636354446411 + }, + { + "auxiliary_loss_clip": 0.01221157, + "auxiliary_loss_mlp": 0.01077582, + "balance_loss_clip": 1.03552508, + "balance_loss_mlp": 1.04775894, + "epoch": 0.026814970689914324, + "flos": 17341009862400.0, + "grad_norm": 2.175289182417039, + "language_loss": 0.91126347, + "learning_rate": 3.9929688468595305e-06, + "loss": 0.93425083, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 1.734375, + "step": 446, + "time_per_iteration": 2.3854146003723145 + }, + { + "auxiliary_loss_clip": 0.01220393, + "auxiliary_loss_mlp": 0.01072297, + "balance_loss_clip": 1.03069329, + "balance_loss_mlp": 1.04886007, + "epoch": 0.026875093942582293, + "flos": 17893754507520.0, + "grad_norm": 2.6742440873310374, + "language_loss": 0.79533404, + "learning_rate": 3.992937158007296e-06, + "loss": 0.81826091, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 1.71875, + "step": 447, + "time_per_iteration": 2.436859130859375 + }, + { + "auxiliary_loss_clip": 0.01217204, + "auxiliary_loss_mlp": 0.01061085, + "balance_loss_clip": 1.02317667, + "balance_loss_mlp": 1.0459522, + "epoch": 0.026935217195250265, + "flos": 21722958604800.0, + "grad_norm": 2.7414730375156515, + "language_loss": 0.86134863, + "learning_rate": 3.992905398032262e-06, + "loss": 0.88413143, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.7109375, + "step": 448, + "time_per_iteration": 2.4112770557403564 + }, + { + "auxiliary_loss_clip": 0.01212256, + "auxiliary_loss_mlp": 0.01073785, + "balance_loss_clip": 1.03573346, + "balance_loss_mlp": 1.04552698, + "epoch": 0.026995340447918233, + "flos": 23621570317440.0, + "grad_norm": 2.07756945998201, + "language_loss": 0.88353348, + "learning_rate": 3.992873566935559e-06, + "loss": 0.90639389, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.671875, + "step": 449, + "time_per_iteration": 3.893214225769043 + }, + { + "auxiliary_loss_clip": 0.01223196, + "auxiliary_loss_mlp": 0.01071437, + "balance_loss_clip": 1.02995205, + "balance_loss_mlp": 1.04884255, + "epoch": 0.027055463700586202, + "flos": 17930308567680.0, + "grad_norm": 2.1023873559554254, + "language_loss": 0.80007172, + "learning_rate": 3.992841664718326e-06, + "loss": 0.82301807, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.7421875, + "step": 450, + "time_per_iteration": 2.4293415546417236 + }, + { + "auxiliary_loss_clip": 0.01213427, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_clip": 1.02751112, + "balance_loss_mlp": 1.04947925, + "epoch": 0.02711558695325417, + "flos": 25117738254720.0, + "grad_norm": 1.6694600928474144, + "language_loss": 0.81280768, + "learning_rate": 3.9928096913817e-06, + "loss": 0.83562666, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 1.640625, + "step": 451, + "time_per_iteration": 5.31993293762207 + }, + { + "auxiliary_loss_clip": 0.01217688, + "auxiliary_loss_mlp": 0.01082696, + "balance_loss_clip": 1.03939891, + "balance_loss_mlp": 1.04973805, + "epoch": 0.02717571020592214, + "flos": 24238520686080.0, + "grad_norm": 1.8430085290678004, + "language_loss": 0.76597822, + "learning_rate": 3.992777646926822e-06, + "loss": 0.78898203, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 1.6796875, + "step": 452, + "time_per_iteration": 2.4252092838287354 + }, + { + "auxiliary_loss_clip": 0.01216012, + "auxiliary_loss_mlp": 0.01073324, + "balance_loss_clip": 1.03448617, + "balance_loss_mlp": 1.04734445, + "epoch": 0.02723583345859011, + "flos": 25117773166080.0, + "grad_norm": 1.8424704499023885, + "language_loss": 0.72687912, + "learning_rate": 3.992745531354836e-06, + "loss": 0.74977249, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 1.6875, + "step": 453, + "time_per_iteration": 3.840744733810425 + }, + { + "auxiliary_loss_clip": 0.01213416, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_clip": 1.03923905, + "balance_loss_mlp": 1.04579771, + "epoch": 0.02729595671125808, + "flos": 42739939140480.0, + "grad_norm": 1.9164462172076624, + "language_loss": 0.81865823, + "learning_rate": 3.992713344666888e-06, + "loss": 0.84157073, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 1.671875, + "step": 454, + "time_per_iteration": 2.601329803466797 + }, + { + "auxiliary_loss_clip": 0.01217129, + "auxiliary_loss_mlp": 0.01059361, + "balance_loss_clip": 1.02178609, + "balance_loss_mlp": 1.04818177, + "epoch": 0.02735607996392605, + "flos": 21430002453120.0, + "grad_norm": 1.8384435634960097, + "language_loss": 0.75141943, + "learning_rate": 3.992681086864125e-06, + "loss": 0.77418435, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 1.6875, + "step": 455, + "time_per_iteration": 2.434659481048584 + }, + { + "auxiliary_loss_clip": 0.01217381, + "auxiliary_loss_mlp": 0.01073987, + "balance_loss_clip": 1.03364635, + "balance_loss_mlp": 1.0450505, + "epoch": 0.027416203216594017, + "flos": 20850199637760.0, + "grad_norm": 3.601715071411152, + "language_loss": 0.80229902, + "learning_rate": 3.992648757947702e-06, + "loss": 0.82521272, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 1.71875, + "step": 456, + "time_per_iteration": 2.421617031097412 + }, + { + "auxiliary_loss_clip": 0.01210392, + "auxiliary_loss_mlp": 0.0107454, + "balance_loss_clip": 1.0342474, + "balance_loss_mlp": 1.04569876, + "epoch": 0.027476326469261986, + "flos": 13223667381120.0, + "grad_norm": 2.3636243778557464, + "language_loss": 0.81195503, + "learning_rate": 3.99261635791877e-06, + "loss": 0.83480436, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 1.6484375, + "step": 457, + "time_per_iteration": 2.4353220462799072 + }, + { + "auxiliary_loss_clip": 0.01215576, + "auxiliary_loss_mlp": 0.01069389, + "balance_loss_clip": 1.02940655, + "balance_loss_mlp": 1.04417586, + "epoch": 0.027536449721929958, + "flos": 24023385688320.0, + "grad_norm": 2.3286607514782713, + "language_loss": 0.9358151, + "learning_rate": 3.992583886778485e-06, + "loss": 0.95866472, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 1.71875, + "step": 458, + "time_per_iteration": 2.451904058456421 + }, + { + "auxiliary_loss_clip": 0.01211667, + "auxiliary_loss_mlp": 0.01075366, + "balance_loss_clip": 1.03721941, + "balance_loss_mlp": 1.04523611, + "epoch": 0.027596572974597926, + "flos": 13005215804160.0, + "grad_norm": 2.2284725754265655, + "language_loss": 0.78291839, + "learning_rate": 3.9925513445280075e-06, + "loss": 0.80578876, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 1.6640625, + "step": 459, + "time_per_iteration": 2.4427225589752197 + }, + { + "auxiliary_loss_clip": 0.0121507, + "auxiliary_loss_mlp": 0.01068839, + "balance_loss_clip": 1.02799821, + "balance_loss_mlp": 1.0481329, + "epoch": 0.027656696227265895, + "flos": 26141810520960.0, + "grad_norm": 1.873779291517176, + "language_loss": 0.8316347, + "learning_rate": 3.9925187311684975e-06, + "loss": 0.85447371, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 1.671875, + "step": 460, + "time_per_iteration": 2.481241226196289 + }, + { + "auxiliary_loss_clip": 0.01087725, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.02890778, + "balance_loss_mlp": 1.01300073, + "epoch": 0.027716819479933864, + "flos": 60693917610240.0, + "grad_norm": 1.5945212505311077, + "language_loss": 0.73599243, + "learning_rate": 3.9924860467011195e-06, + "loss": 0.75724554, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.08691406, + "router_z_loss_mlp": 0.75, + "step": 461, + "time_per_iteration": 3.049412488937378 + }, + { + "auxiliary_loss_clip": 0.01213652, + "auxiliary_loss_mlp": 0.01069088, + "balance_loss_clip": 1.03408813, + "balance_loss_mlp": 1.05053163, + "epoch": 0.027776942732601832, + "flos": 31210605907200.0, + "grad_norm": 2.472476215251796, + "language_loss": 0.8088612, + "learning_rate": 3.99245329112704e-06, + "loss": 0.83168852, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.6328125, + "step": 462, + "time_per_iteration": 2.4856672286987305 + }, + { + "auxiliary_loss_clip": 0.01214272, + "auxiliary_loss_mlp": 0.01065799, + "balance_loss_clip": 1.02932096, + "balance_loss_mlp": 1.0486722, + "epoch": 0.027837065985269804, + "flos": 22673538725760.0, + "grad_norm": 2.2505299356194177, + "language_loss": 0.89811778, + "learning_rate": 3.992420464447427e-06, + "loss": 0.92091846, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.65625, + "step": 463, + "time_per_iteration": 2.449388265609741 + }, + { + "auxiliary_loss_clip": 0.01083923, + "auxiliary_loss_mlp": 0.01013799, + "balance_loss_clip": 1.00612223, + "balance_loss_mlp": 1.01178098, + "epoch": 0.027897189237937773, + "flos": 62179437582720.0, + "grad_norm": 0.8732072034279693, + "language_loss": 0.5900414, + "learning_rate": 3.992387566663454e-06, + "loss": 0.6110186, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.07666016, + "router_z_loss_mlp": 0.72265625, + "step": 464, + "time_per_iteration": 3.076657295227051 + }, + { + "auxiliary_loss_clip": 0.01224219, + "auxiliary_loss_mlp": 0.01071273, + "balance_loss_clip": 1.02985942, + "balance_loss_mlp": 1.05110717, + "epoch": 0.02795731249060574, + "flos": 24492164780160.0, + "grad_norm": 2.8476587695432993, + "language_loss": 0.80872023, + "learning_rate": 3.992354597776293e-06, + "loss": 0.83167517, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 1.734375, + "step": 465, + "time_per_iteration": 2.460120916366577 + }, + { + "auxiliary_loss_clip": 0.01211628, + "auxiliary_loss_mlp": 0.01077742, + "balance_loss_clip": 1.03806865, + "balance_loss_mlp": 1.04559112, + "epoch": 0.02801743574327371, + "flos": 23731860902400.0, + "grad_norm": 2.0807330331238814, + "language_loss": 0.78305185, + "learning_rate": 3.992321557787121e-06, + "loss": 0.80594552, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 1.65625, + "step": 466, + "time_per_iteration": 2.440420627593994 + }, + { + "auxiliary_loss_clip": 0.01210946, + "auxiliary_loss_mlp": 0.01068143, + "balance_loss_clip": 1.02928114, + "balance_loss_mlp": 1.04549003, + "epoch": 0.02807755899594168, + "flos": 20628117279360.0, + "grad_norm": 1.807040788688562, + "language_loss": 0.87426627, + "learning_rate": 3.992288446697118e-06, + "loss": 0.89705718, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 1.65625, + "step": 467, + "time_per_iteration": 2.429304361343384 + }, + { + "auxiliary_loss_clip": 0.01218809, + "auxiliary_loss_mlp": 0.01091612, + "balance_loss_clip": 1.05005586, + "balance_loss_mlp": 1.04589593, + "epoch": 0.02813768224860965, + "flos": 19243566558720.0, + "grad_norm": 2.1615589645067237, + "language_loss": 0.86052179, + "learning_rate": 3.9922552645074644e-06, + "loss": 0.88362604, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 1.7265625, + "step": 468, + "time_per_iteration": 2.4182615280151367 + }, + { + "auxiliary_loss_clip": 0.01216098, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.03553975, + "balance_loss_mlp": 1.04683185, + "epoch": 0.02819780550127762, + "flos": 20812912439040.0, + "grad_norm": 2.4673660674462172, + "language_loss": 0.91542101, + "learning_rate": 3.992222011219346e-06, + "loss": 0.93833661, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 1.6953125, + "step": 469, + "time_per_iteration": 2.420708179473877 + }, + { + "auxiliary_loss_clip": 0.01208904, + "auxiliary_loss_mlp": 0.01087393, + "balance_loss_clip": 1.0482924, + "balance_loss_mlp": 1.04541838, + "epoch": 0.028257928753945588, + "flos": 19973111662080.0, + "grad_norm": 3.359520458405969, + "language_loss": 0.80823982, + "learning_rate": 3.992188686833948e-06, + "loss": 0.83120275, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 1.6328125, + "step": 470, + "time_per_iteration": 2.426966428756714 + }, + { + "auxiliary_loss_clip": 0.01214693, + "auxiliary_loss_mlp": 0.01074015, + "balance_loss_clip": 1.0333643, + "balance_loss_mlp": 1.0463984, + "epoch": 0.028318052006613557, + "flos": 20483472049920.0, + "grad_norm": 2.0266611816436004, + "language_loss": 0.92974067, + "learning_rate": 3.992155291352461e-06, + "loss": 0.95262778, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 1.6796875, + "step": 471, + "time_per_iteration": 2.417511224746704 + }, + { + "auxiliary_loss_clip": 0.01211363, + "auxiliary_loss_mlp": 0.01070777, + "balance_loss_clip": 1.03117526, + "balance_loss_mlp": 1.0441376, + "epoch": 0.02837817525928153, + "flos": 28513495422720.0, + "grad_norm": 2.0100640893231168, + "language_loss": 0.76147437, + "learning_rate": 3.992121824776075e-06, + "loss": 0.78429568, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 1.671875, + "step": 472, + "time_per_iteration": 2.49124813079834 + }, + { + "auxiliary_loss_clip": 0.01216394, + "auxiliary_loss_mlp": 0.01074437, + "balance_loss_clip": 1.03524113, + "balance_loss_mlp": 1.04516983, + "epoch": 0.028438298511949497, + "flos": 18550680249600.0, + "grad_norm": 2.9233453117850345, + "language_loss": 0.9328692, + "learning_rate": 3.9920882871059865e-06, + "loss": 0.95577747, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 1.7109375, + "step": 473, + "time_per_iteration": 2.430454730987549 + }, + { + "auxiliary_loss_clip": 0.01219842, + "auxiliary_loss_mlp": 0.01089857, + "balance_loss_clip": 1.05104256, + "balance_loss_mlp": 1.04805279, + "epoch": 0.028498421764617466, + "flos": 16909273589760.0, + "grad_norm": 3.6126159175986055, + "language_loss": 0.88592136, + "learning_rate": 3.992054678343391e-06, + "loss": 0.90901834, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 1.71875, + "step": 474, + "time_per_iteration": 2.3969032764434814 + }, + { + "auxiliary_loss_clip": 0.0121063, + "auxiliary_loss_mlp": 0.01080179, + "balance_loss_clip": 1.03936172, + "balance_loss_mlp": 1.04794836, + "epoch": 0.028558545017285435, + "flos": 27777561540480.0, + "grad_norm": 2.2069611979958164, + "language_loss": 0.78739357, + "learning_rate": 3.992020998489488e-06, + "loss": 0.81030166, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 1.625, + "step": 475, + "time_per_iteration": 2.4957873821258545 + }, + { + "auxiliary_loss_clip": 0.01078655, + "auxiliary_loss_mlp": 0.0105164, + "balance_loss_clip": 1.04286611, + "balance_loss_mlp": 1.01238871, + "epoch": 0.028618668269953403, + "flos": 65651060868480.0, + "grad_norm": 0.9168251895118754, + "language_loss": 0.66889834, + "learning_rate": 3.991987247545479e-06, + "loss": 0.69020128, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.6640625, + "step": 476, + "time_per_iteration": 2.975771188735962 + }, + { + "auxiliary_loss_clip": 0.01212109, + "auxiliary_loss_mlp": 0.01079319, + "balance_loss_clip": 1.03947902, + "balance_loss_mlp": 1.04676247, + "epoch": 0.028678791522621375, + "flos": 21936208389120.0, + "grad_norm": 2.333401231724457, + "language_loss": 0.83673292, + "learning_rate": 3.99195342551257e-06, + "loss": 0.85964721, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 1.65625, + "step": 477, + "time_per_iteration": 2.425384521484375 + }, + { + "auxiliary_loss_clip": 0.01219363, + "auxiliary_loss_mlp": 0.01076292, + "balance_loss_clip": 1.03509319, + "balance_loss_mlp": 1.04858422, + "epoch": 0.028738914775289344, + "flos": 24570963452160.0, + "grad_norm": 2.376657919351714, + "language_loss": 0.81632209, + "learning_rate": 3.991919532391967e-06, + "loss": 0.8392787, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 1.703125, + "step": 478, + "time_per_iteration": 2.463796854019165 + }, + { + "auxiliary_loss_clip": 0.01212562, + "auxiliary_loss_mlp": 0.01074185, + "balance_loss_clip": 1.03467917, + "balance_loss_mlp": 1.04589367, + "epoch": 0.028799038027957313, + "flos": 23256867588480.0, + "grad_norm": 1.98261751435751, + "language_loss": 0.8049897, + "learning_rate": 3.991885568184879e-06, + "loss": 0.82785714, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 1.6640625, + "step": 479, + "time_per_iteration": 2.42974591255188 + }, + { + "auxiliary_loss_clip": 0.01211035, + "auxiliary_loss_mlp": 0.01074508, + "balance_loss_clip": 1.03156877, + "balance_loss_mlp": 1.04642069, + "epoch": 0.02885916128062528, + "flos": 22163003781120.0, + "grad_norm": 2.75380698709829, + "language_loss": 0.7387349, + "learning_rate": 3.991851532892521e-06, + "loss": 0.76159036, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 1.6484375, + "step": 480, + "time_per_iteration": 2.4637868404388428 + }, + { + "auxiliary_loss_clip": 0.01211482, + "auxiliary_loss_mlp": 0.01066037, + "balance_loss_clip": 1.03067946, + "balance_loss_mlp": 1.04742777, + "epoch": 0.02891928453329325, + "flos": 22931651473920.0, + "grad_norm": 1.727787042430347, + "language_loss": 0.8761667, + "learning_rate": 3.991817426516103e-06, + "loss": 0.89894187, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.640625, + "step": 481, + "time_per_iteration": 2.4658827781677246 + }, + { + "auxiliary_loss_clip": 0.01075504, + "auxiliary_loss_mlp": 0.01015323, + "balance_loss_clip": 1.00783658, + "balance_loss_mlp": 1.01079535, + "epoch": 0.028979407785961222, + "flos": 57430202670720.0, + "grad_norm": 0.949522922299035, + "language_loss": 0.66014594, + "learning_rate": 3.991783249056846e-06, + "loss": 0.68105423, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.07470703, + "router_z_loss_mlp": 0.6484375, + "step": 482, + "time_per_iteration": 2.7990095615386963 + }, + { + "auxiliary_loss_clip": 0.01219808, + "auxiliary_loss_mlp": 0.01078013, + "balance_loss_clip": 1.03855503, + "balance_loss_mlp": 1.04768646, + "epoch": 0.02903953103862919, + "flos": 16721929900800.0, + "grad_norm": 2.632323154501168, + "language_loss": 0.78217971, + "learning_rate": 3.991749000515968e-06, + "loss": 0.80515796, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 1.71875, + "step": 483, + "time_per_iteration": 2.390429973602295 + }, + { + "auxiliary_loss_clip": 0.01213693, + "auxiliary_loss_mlp": 0.01084099, + "balance_loss_clip": 1.04392576, + "balance_loss_mlp": 1.04758871, + "epoch": 0.02909965429129716, + "flos": 16762708235520.0, + "grad_norm": 2.6496364320357797, + "language_loss": 0.74926507, + "learning_rate": 3.991714680894691e-06, + "loss": 0.77224296, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 1.65625, + "step": 484, + "time_per_iteration": 2.4088382720947266 + }, + { + "auxiliary_loss_clip": 0.0121332, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_clip": 1.03410459, + "balance_loss_mlp": 1.04610586, + "epoch": 0.029159777543965128, + "flos": 19784511164160.0, + "grad_norm": 2.0936797742723923, + "language_loss": 0.83411169, + "learning_rate": 3.991680290194241e-06, + "loss": 0.85698223, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 1.671875, + "step": 485, + "time_per_iteration": 2.4037091732025146 + }, + { + "auxiliary_loss_clip": 0.01216594, + "auxiliary_loss_mlp": 0.01070947, + "balance_loss_clip": 1.03299022, + "balance_loss_mlp": 1.05073392, + "epoch": 0.029219900796633096, + "flos": 19641751148160.0, + "grad_norm": 1.8682562352937333, + "language_loss": 0.83862162, + "learning_rate": 3.991645828415844e-06, + "loss": 0.86149704, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.65625, + "step": 486, + "time_per_iteration": 2.408877372741699 + }, + { + "auxiliary_loss_clip": 0.01212274, + "auxiliary_loss_mlp": 0.01086459, + "balance_loss_clip": 1.04642892, + "balance_loss_mlp": 1.04686427, + "epoch": 0.02928002404930107, + "flos": 25884500734080.0, + "grad_norm": 2.1830433578473407, + "language_loss": 0.88530236, + "learning_rate": 3.991611295560732e-06, + "loss": 0.90828967, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 1.65625, + "step": 487, + "time_per_iteration": 2.484448194503784 + }, + { + "auxiliary_loss_clip": 0.01215214, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_clip": 1.02649832, + "balance_loss_mlp": 1.04820597, + "epoch": 0.029340147301969037, + "flos": 20659399724160.0, + "grad_norm": 4.5142003549058325, + "language_loss": 0.87968355, + "learning_rate": 3.991576691630134e-06, + "loss": 0.90248334, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 1.671875, + "step": 488, + "time_per_iteration": 2.423797607421875 + }, + { + "auxiliary_loss_clip": 0.01209508, + "auxiliary_loss_mlp": 0.01070475, + "balance_loss_clip": 1.03206539, + "balance_loss_mlp": 1.04734719, + "epoch": 0.029400270554637006, + "flos": 24426806981760.0, + "grad_norm": 7.590058532803281, + "language_loss": 0.88534021, + "learning_rate": 3.991542016625289e-06, + "loss": 0.90814012, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 1.625, + "step": 489, + "time_per_iteration": 3.889488458633423 + }, + { + "auxiliary_loss_clip": 0.01204857, + "auxiliary_loss_mlp": 0.01073573, + "balance_loss_clip": 1.03483009, + "balance_loss_mlp": 1.04278088, + "epoch": 0.029460393807304974, + "flos": 20119851573120.0, + "grad_norm": 1.9148623633063457, + "language_loss": 0.88380492, + "learning_rate": 3.99150727054743e-06, + "loss": 0.90658921, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 1.625, + "step": 490, + "time_per_iteration": 5.217238187789917 + }, + { + "auxiliary_loss_clip": 0.012128, + "auxiliary_loss_mlp": 0.01068954, + "balance_loss_clip": 1.03185582, + "balance_loss_mlp": 1.04768872, + "epoch": 0.029520517059972943, + "flos": 17674953816960.0, + "grad_norm": 3.2853586318233647, + "language_loss": 0.91206759, + "learning_rate": 3.9914724533978e-06, + "loss": 0.93488508, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.6484375, + "step": 491, + "time_per_iteration": 2.4247934818267822 + }, + { + "auxiliary_loss_clip": 0.0120445, + "auxiliary_loss_mlp": 0.01066569, + "balance_loss_clip": 1.03044844, + "balance_loss_mlp": 1.04490709, + "epoch": 0.029580640312640915, + "flos": 18952181418240.0, + "grad_norm": 2.3581296560745586, + "language_loss": 0.85065138, + "learning_rate": 3.991437565177642e-06, + "loss": 0.87336159, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.59375, + "step": 492, + "time_per_iteration": 3.793816089630127 + }, + { + "auxiliary_loss_clip": 0.0121162, + "auxiliary_loss_mlp": 0.01078789, + "balance_loss_clip": 1.03990269, + "balance_loss_mlp": 1.047333, + "epoch": 0.029640763565308884, + "flos": 18725351114880.0, + "grad_norm": 4.274363010344045, + "language_loss": 0.83796686, + "learning_rate": 3.991402605888198e-06, + "loss": 0.8608709, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 1.640625, + "step": 493, + "time_per_iteration": 2.3795204162597656 + }, + { + "auxiliary_loss_clip": 0.01205968, + "auxiliary_loss_mlp": 0.0106367, + "balance_loss_clip": 1.02585661, + "balance_loss_mlp": 1.04183817, + "epoch": 0.029700886817976852, + "flos": 20594251393920.0, + "grad_norm": 1.7770761529375936, + "language_loss": 0.86436814, + "learning_rate": 3.991367575530719e-06, + "loss": 0.88706452, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.640625, + "step": 494, + "time_per_iteration": 2.4433693885803223 + }, + { + "auxiliary_loss_clip": 0.01209465, + "auxiliary_loss_mlp": 0.01067738, + "balance_loss_clip": 1.03328609, + "balance_loss_mlp": 1.04554904, + "epoch": 0.02976101007064482, + "flos": 22235762787840.0, + "grad_norm": 2.355404434060518, + "language_loss": 0.86683035, + "learning_rate": 3.9913324741064535e-06, + "loss": 0.88960236, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.6328125, + "step": 495, + "time_per_iteration": 2.4427053928375244 + }, + { + "auxiliary_loss_clip": 0.01204284, + "auxiliary_loss_mlp": 0.01065663, + "balance_loss_clip": 1.0279218, + "balance_loss_mlp": 1.04706717, + "epoch": 0.029821133323312793, + "flos": 23731511788800.0, + "grad_norm": 1.962976155669888, + "language_loss": 0.61746514, + "learning_rate": 3.991297301616653e-06, + "loss": 0.64016461, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 1.5703125, + "step": 496, + "time_per_iteration": 2.4484548568725586 + }, + { + "auxiliary_loss_clip": 0.01204454, + "auxiliary_loss_mlp": 0.01066235, + "balance_loss_clip": 1.03109229, + "balance_loss_mlp": 1.04732299, + "epoch": 0.02988125657598076, + "flos": 22418393443200.0, + "grad_norm": 1.729398170481444, + "language_loss": 0.8813799, + "learning_rate": 3.991262058062575e-06, + "loss": 0.90408683, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.5703125, + "step": 497, + "time_per_iteration": 2.5011401176452637 + }, + { + "auxiliary_loss_clip": 0.0120954, + "auxiliary_loss_mlp": 0.01069592, + "balance_loss_clip": 1.03247035, + "balance_loss_mlp": 1.04552197, + "epoch": 0.02994137982864873, + "flos": 13844248531200.0, + "grad_norm": 2.6030676199930602, + "language_loss": 0.84617573, + "learning_rate": 3.991226743445477e-06, + "loss": 0.86896706, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.640625, + "step": 498, + "time_per_iteration": 2.522249221801758 + }, + { + "auxiliary_loss_clip": 0.01206957, + "auxiliary_loss_mlp": 0.01070789, + "balance_loss_clip": 1.03540766, + "balance_loss_mlp": 1.04600763, + "epoch": 0.0300015030813167, + "flos": 23907404551680.0, + "grad_norm": 6.119065223903574, + "language_loss": 0.78805482, + "learning_rate": 3.991191357766617e-06, + "loss": 0.81083238, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.609375, + "step": 499, + "time_per_iteration": 2.5096781253814697 + }, + { + "auxiliary_loss_clip": 0.01208767, + "auxiliary_loss_mlp": 0.01069197, + "balance_loss_clip": 1.03391087, + "balance_loss_mlp": 1.04836917, + "epoch": 0.030061626333984667, + "flos": 22015740199680.0, + "grad_norm": 1.90305937173952, + "language_loss": 0.82357585, + "learning_rate": 3.991155901027261e-06, + "loss": 0.84635556, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.6015625, + "step": 500, + "time_per_iteration": 2.4628171920776367 + }, + { + "auxiliary_loss_clip": 0.01200767, + "auxiliary_loss_mlp": 0.01073662, + "balance_loss_clip": 1.0358727, + "balance_loss_mlp": 1.04500973, + "epoch": 0.03012174958665264, + "flos": 23038625479680.0, + "grad_norm": 2.535327279683379, + "language_loss": 0.8793115, + "learning_rate": 3.991120373228672e-06, + "loss": 0.90205586, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.5546875, + "step": 501, + "time_per_iteration": 2.4772112369537354 + }, + { + "auxiliary_loss_clip": 0.01207747, + "auxiliary_loss_mlp": 0.01063189, + "balance_loss_clip": 1.0281651, + "balance_loss_mlp": 1.04432535, + "epoch": 0.030181872839320608, + "flos": 18952251240960.0, + "grad_norm": 2.510197501519828, + "language_loss": 0.86130059, + "learning_rate": 3.991084774372118e-06, + "loss": 0.88400996, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.6328125, + "step": 502, + "time_per_iteration": 2.4320156574249268 + }, + { + "auxiliary_loss_clip": 0.01205823, + "auxiliary_loss_mlp": 0.01068997, + "balance_loss_clip": 1.0321852, + "balance_loss_mlp": 1.04956007, + "epoch": 0.030241996091988577, + "flos": 16727061870720.0, + "grad_norm": 2.3152219203109867, + "language_loss": 0.8469739, + "learning_rate": 3.991049104458871e-06, + "loss": 0.86972207, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.5625, + "step": 503, + "time_per_iteration": 2.4763457775115967 + }, + { + "auxiliary_loss_clip": 0.01199079, + "auxiliary_loss_mlp": 0.01069118, + "balance_loss_clip": 1.03326011, + "balance_loss_mlp": 1.04358077, + "epoch": 0.030302119344656545, + "flos": 28620015580800.0, + "grad_norm": 2.3787165165537334, + "language_loss": 0.88057989, + "learning_rate": 3.991013363490202e-06, + "loss": 0.90326184, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.5546875, + "step": 504, + "time_per_iteration": 2.498375654220581 + }, + { + "auxiliary_loss_clip": 0.01201017, + "auxiliary_loss_mlp": 0.01058491, + "balance_loss_clip": 1.02403963, + "balance_loss_mlp": 1.04350054, + "epoch": 0.030362242597324514, + "flos": 15668425491840.0, + "grad_norm": 2.3046057205140835, + "language_loss": 0.77504301, + "learning_rate": 3.9909775514673885e-06, + "loss": 0.79763812, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.578125, + "step": 505, + "time_per_iteration": 2.5052103996276855 + }, + { + "auxiliary_loss_clip": 0.01077902, + "auxiliary_loss_mlp": 0.01015402, + "balance_loss_clip": 1.0084877, + "balance_loss_mlp": 1.01223004, + "epoch": 0.030422365849992486, + "flos": 72122107034880.0, + "grad_norm": 0.841552000589698, + "language_loss": 0.54989272, + "learning_rate": 3.990941668391708e-06, + "loss": 0.57082576, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.06933594, + "router_z_loss_mlp": 0.65625, + "step": 506, + "time_per_iteration": 3.3168389797210693 + }, + { + "auxiliary_loss_clip": 0.01204627, + "auxiliary_loss_mlp": 0.01071427, + "balance_loss_clip": 1.03626084, + "balance_loss_mlp": 1.04464555, + "epoch": 0.030482489102660455, + "flos": 19426790707200.0, + "grad_norm": 2.1395024000496523, + "language_loss": 0.83131456, + "learning_rate": 3.99090571426444e-06, + "loss": 0.85407519, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.6015625, + "step": 507, + "time_per_iteration": 2.4338178634643555 + }, + { + "auxiliary_loss_clip": 0.01205381, + "auxiliary_loss_mlp": 0.01067995, + "balance_loss_clip": 1.03156519, + "balance_loss_mlp": 1.04393148, + "epoch": 0.030542612355328423, + "flos": 20374787387520.0, + "grad_norm": 2.278497143714966, + "language_loss": 0.87978184, + "learning_rate": 3.990869689086868e-06, + "loss": 0.90251565, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 1.609375, + "step": 508, + "time_per_iteration": 2.524904489517212 + }, + { + "auxiliary_loss_clip": 0.01206445, + "auxiliary_loss_mlp": 0.01057358, + "balance_loss_clip": 1.02202475, + "balance_loss_mlp": 1.04496431, + "epoch": 0.030602735607996392, + "flos": 34675945148160.0, + "grad_norm": 1.9490258310118795, + "language_loss": 0.71126789, + "learning_rate": 3.990833592860279e-06, + "loss": 0.73390591, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.6171875, + "step": 509, + "time_per_iteration": 2.56103777885437 + }, + { + "auxiliary_loss_clip": 0.01200486, + "auxiliary_loss_mlp": 0.01061738, + "balance_loss_clip": 1.02790678, + "balance_loss_mlp": 1.04403806, + "epoch": 0.03066285886066436, + "flos": 23657565795840.0, + "grad_norm": 2.0781282707932016, + "language_loss": 0.81924725, + "learning_rate": 3.990797425585959e-06, + "loss": 0.84186947, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.5625, + "step": 510, + "time_per_iteration": 2.517728805541992 + }, + { + "auxiliary_loss_clip": 0.01205482, + "auxiliary_loss_mlp": 0.01062984, + "balance_loss_clip": 1.02786517, + "balance_loss_mlp": 1.04849553, + "epoch": 0.030722982113332332, + "flos": 23001861951360.0, + "grad_norm": 2.2066502130873005, + "language_loss": 0.83270842, + "learning_rate": 3.9907611872652e-06, + "loss": 0.85539317, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.5703125, + "step": 511, + "time_per_iteration": 2.4717540740966797 + }, + { + "auxiliary_loss_clip": 0.01203057, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_clip": 1.03380466, + "balance_loss_mlp": 1.04298186, + "epoch": 0.0307831053660003, + "flos": 24749788769280.0, + "grad_norm": 2.1679052234875598, + "language_loss": 0.817909, + "learning_rate": 3.990724877899296e-06, + "loss": 0.84061021, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.6015625, + "step": 512, + "time_per_iteration": 2.5127251148223877 + }, + { + "auxiliary_loss_clip": 0.01199515, + "auxiliary_loss_mlp": 0.01066431, + "balance_loss_clip": 1.02985787, + "balance_loss_mlp": 1.04325438, + "epoch": 0.03084322861866827, + "flos": 26139680928000.0, + "grad_norm": 1.8918276302591983, + "language_loss": 0.86687189, + "learning_rate": 3.990688497489541e-06, + "loss": 0.88953137, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 1.5625, + "step": 513, + "time_per_iteration": 2.4929020404815674 + }, + { + "auxiliary_loss_clip": 0.01207001, + "auxiliary_loss_mlp": 0.0106846, + "balance_loss_clip": 1.03360355, + "balance_loss_mlp": 1.04672039, + "epoch": 0.03090335187133624, + "flos": 18770283901440.0, + "grad_norm": 1.5700776694139793, + "language_loss": 0.78757954, + "learning_rate": 3.990652046037234e-06, + "loss": 0.81033409, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.609375, + "step": 514, + "time_per_iteration": 2.515777111053467 + }, + { + "auxiliary_loss_clip": 0.01199241, + "auxiliary_loss_mlp": 0.0106353, + "balance_loss_clip": 1.0313673, + "balance_loss_mlp": 1.04487944, + "epoch": 0.030963475124004207, + "flos": 23220767376000.0, + "grad_norm": 3.2378638813729714, + "language_loss": 0.76955855, + "learning_rate": 3.990615523543677e-06, + "loss": 0.79218626, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.546875, + "step": 515, + "time_per_iteration": 2.475161075592041 + }, + { + "auxiliary_loss_clip": 0.01198662, + "auxiliary_loss_mlp": 0.01059837, + "balance_loss_clip": 1.02440774, + "balance_loss_mlp": 1.0403347, + "epoch": 0.03102359837667218, + "flos": 42523861536000.0, + "grad_norm": 3.4578561136515913, + "language_loss": 0.82421023, + "learning_rate": 3.990578930010171e-06, + "loss": 0.8467952, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.5859375, + "step": 516, + "time_per_iteration": 2.6455483436584473 + }, + { + "auxiliary_loss_clip": 0.01197751, + "auxiliary_loss_mlp": 0.01061516, + "balance_loss_clip": 1.0260396, + "balance_loss_mlp": 1.04530215, + "epoch": 0.031083721629340148, + "flos": 21175939422720.0, + "grad_norm": 1.725880235288346, + "language_loss": 0.78557986, + "learning_rate": 3.990542265438024e-06, + "loss": 0.80817252, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.5234375, + "step": 517, + "time_per_iteration": 2.499014139175415 + }, + { + "auxiliary_loss_clip": 0.01194293, + "auxiliary_loss_mlp": 0.01056726, + "balance_loss_clip": 1.02375305, + "balance_loss_mlp": 1.04368186, + "epoch": 0.031143844882008116, + "flos": 29714891817600.0, + "grad_norm": 1.5230128811737134, + "language_loss": 0.9046182, + "learning_rate": 3.990505529828544e-06, + "loss": 0.92712843, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.5078125, + "step": 518, + "time_per_iteration": 2.533905029296875 + }, + { + "auxiliary_loss_clip": 0.01208601, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_clip": 1.03586388, + "balance_loss_mlp": 1.04859257, + "epoch": 0.031203968134676085, + "flos": 23111349575040.0, + "grad_norm": 3.0270756451628125, + "language_loss": 0.86141729, + "learning_rate": 3.9904687231830424e-06, + "loss": 0.88422704, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 1.6015625, + "step": 519, + "time_per_iteration": 2.497805118560791 + }, + { + "auxiliary_loss_clip": 0.01199843, + "auxiliary_loss_mlp": 0.01066668, + "balance_loss_clip": 1.03123891, + "balance_loss_mlp": 1.04227221, + "epoch": 0.03126409138734405, + "flos": 20953473039360.0, + "grad_norm": 2.5245551099703847, + "language_loss": 0.86705911, + "learning_rate": 3.990431845502831e-06, + "loss": 0.88972425, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.578125, + "step": 520, + "time_per_iteration": 2.4443423748016357 + }, + { + "auxiliary_loss_clip": 0.01202682, + "auxiliary_loss_mlp": 0.01074112, + "balance_loss_clip": 1.03861165, + "balance_loss_mlp": 1.04245722, + "epoch": 0.031324214640012026, + "flos": 21649117345920.0, + "grad_norm": 1.7543559854752624, + "language_loss": 0.89257371, + "learning_rate": 3.990394896789228e-06, + "loss": 0.91534168, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.6015625, + "step": 521, + "time_per_iteration": 2.474047899246216 + }, + { + "auxiliary_loss_clip": 0.01196938, + "auxiliary_loss_mlp": 0.01070582, + "balance_loss_clip": 1.03491497, + "balance_loss_mlp": 1.04307437, + "epoch": 0.03138433789267999, + "flos": 23440196471040.0, + "grad_norm": 2.037463426747011, + "language_loss": 0.70534217, + "learning_rate": 3.9903578770435505e-06, + "loss": 0.72801739, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 1.5390625, + "step": 522, + "time_per_iteration": 2.466909170150757 + }, + { + "auxiliary_loss_clip": 0.01202988, + "auxiliary_loss_mlp": 0.0106513, + "balance_loss_clip": 1.03029704, + "balance_loss_mlp": 1.04272258, + "epoch": 0.03144446114534796, + "flos": 18981369181440.0, + "grad_norm": 4.602702188577186, + "language_loss": 0.84968263, + "learning_rate": 3.99032078626712e-06, + "loss": 0.87236381, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.6015625, + "step": 523, + "time_per_iteration": 2.43253231048584 + }, + { + "auxiliary_loss_clip": 0.01203881, + "auxiliary_loss_mlp": 0.01065043, + "balance_loss_clip": 1.03044844, + "balance_loss_mlp": 1.04492521, + "epoch": 0.031504584398015935, + "flos": 22636600640640.0, + "grad_norm": 2.931477636015078, + "language_loss": 0.89870876, + "learning_rate": 3.990283624461261e-06, + "loss": 0.92139804, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 1.59375, + "step": 524, + "time_per_iteration": 2.4421632289886475 + }, + { + "auxiliary_loss_clip": 0.01205348, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.03605592, + "balance_loss_mlp": 1.04450417, + "epoch": 0.0315647076506839, + "flos": 25296004990080.0, + "grad_norm": 3.1770234004138236, + "language_loss": 0.79840553, + "learning_rate": 3.9902463916273e-06, + "loss": 0.82117987, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.609375, + "step": 525, + "time_per_iteration": 2.512315034866333 + }, + { + "auxiliary_loss_clip": 0.01197829, + "auxiliary_loss_mlp": 0.01059955, + "balance_loss_clip": 1.02438259, + "balance_loss_mlp": 1.04082167, + "epoch": 0.03162483090335187, + "flos": 16981892951040.0, + "grad_norm": 1.974038699837512, + "language_loss": 0.80296195, + "learning_rate": 3.990209087766563e-06, + "loss": 0.82553983, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.5703125, + "step": 526, + "time_per_iteration": 2.4241766929626465 + }, + { + "auxiliary_loss_clip": 0.01206056, + "auxiliary_loss_mlp": 0.01059794, + "balance_loss_clip": 1.02348268, + "balance_loss_mlp": 1.04648757, + "epoch": 0.03168495415601984, + "flos": 18733485461760.0, + "grad_norm": 2.082035333094545, + "language_loss": 0.81417549, + "learning_rate": 3.990171712880383e-06, + "loss": 0.83683401, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.59375, + "step": 527, + "time_per_iteration": 2.393235921859741 + }, + { + "auxiliary_loss_clip": 0.01193488, + "auxiliary_loss_mlp": 0.01064555, + "balance_loss_clip": 1.03058052, + "balance_loss_mlp": 1.03917575, + "epoch": 0.03174507740868781, + "flos": 21213820114560.0, + "grad_norm": 1.9535105862033473, + "language_loss": 0.93562591, + "learning_rate": 3.990134266970095e-06, + "loss": 0.95820642, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.546875, + "step": 528, + "time_per_iteration": 3.870699644088745 + }, + { + "auxiliary_loss_clip": 0.01199748, + "auxiliary_loss_mlp": 0.01055606, + "balance_loss_clip": 1.02167869, + "balance_loss_mlp": 1.04302394, + "epoch": 0.03180520066135578, + "flos": 24786587208960.0, + "grad_norm": 1.9424774422442086, + "language_loss": 0.83959383, + "learning_rate": 3.9900967500370335e-06, + "loss": 0.86214739, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.5625, + "step": 529, + "time_per_iteration": 2.437253952026367 + }, + { + "auxiliary_loss_clip": 0.01198756, + "auxiliary_loss_mlp": 0.01064429, + "balance_loss_clip": 1.03158712, + "balance_loss_mlp": 1.04470348, + "epoch": 0.03186532391402375, + "flos": 24863081731200.0, + "grad_norm": 2.219900736369779, + "language_loss": 0.90270782, + "learning_rate": 3.990059162082539e-06, + "loss": 0.9253397, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.5390625, + "step": 530, + "time_per_iteration": 5.275681972503662 + }, + { + "auxiliary_loss_clip": 0.01192801, + "auxiliary_loss_mlp": 0.01061732, + "balance_loss_clip": 1.02620769, + "balance_loss_mlp": 1.03718972, + "epoch": 0.03192544716669172, + "flos": 21213994671360.0, + "grad_norm": 2.2176436304325775, + "language_loss": 0.76117861, + "learning_rate": 3.9900215031079515e-06, + "loss": 0.78372395, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.5546875, + "step": 531, + "time_per_iteration": 2.421377420425415 + }, + { + "auxiliary_loss_clip": 0.0119355, + "auxiliary_loss_mlp": 0.01057381, + "balance_loss_clip": 1.02307296, + "balance_loss_mlp": 1.04110742, + "epoch": 0.03198557041935969, + "flos": 24352058027520.0, + "grad_norm": 2.0795541844250462, + "language_loss": 0.904203, + "learning_rate": 3.989983773114616e-06, + "loss": 0.92671233, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.5234375, + "step": 532, + "time_per_iteration": 3.8595516681671143 + }, + { + "auxiliary_loss_clip": 0.01078112, + "auxiliary_loss_mlp": 0.0100629, + "balance_loss_clip": 0.99875575, + "balance_loss_mlp": 1.01723647, + "epoch": 0.032045693672027656, + "flos": 61824056186880.0, + "grad_norm": 0.7257353694709775, + "language_loss": 0.5794214, + "learning_rate": 3.989945972103877e-06, + "loss": 0.60026538, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.07519531, + "router_z_loss_mlp": 0.609375, + "step": 533, + "time_per_iteration": 3.104865550994873 + }, + { + "auxiliary_loss_clip": 0.01193289, + "auxiliary_loss_mlp": 0.01067067, + "balance_loss_clip": 1.03375959, + "balance_loss_mlp": 1.04044938, + "epoch": 0.03210581692469563, + "flos": 28399958081280.0, + "grad_norm": 1.6510738722888407, + "language_loss": 0.8620894, + "learning_rate": 3.989908100077087e-06, + "loss": 0.88469297, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.5234375, + "step": 534, + "time_per_iteration": 2.4982473850250244 + }, + { + "auxiliary_loss_clip": 0.01195723, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.02060103, + "balance_loss_mlp": 1.04258776, + "epoch": 0.03216594017736359, + "flos": 24716551288320.0, + "grad_norm": 2.2136679875308443, + "language_loss": 0.7724539, + "learning_rate": 3.989870157035594e-06, + "loss": 0.79497379, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.53125, + "step": 535, + "time_per_iteration": 2.4496872425079346 + }, + { + "auxiliary_loss_clip": 0.01195513, + "auxiliary_loss_mlp": 0.0106181, + "balance_loss_clip": 1.02802587, + "balance_loss_mlp": 1.04010534, + "epoch": 0.032226063430031565, + "flos": 31174121669760.0, + "grad_norm": 2.1996955541292476, + "language_loss": 0.80698258, + "learning_rate": 3.989832142980754e-06, + "loss": 0.82955575, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.5546875, + "step": 536, + "time_per_iteration": 2.5376601219177246 + }, + { + "auxiliary_loss_clip": 0.01194075, + "auxiliary_loss_mlp": 0.01062913, + "balance_loss_clip": 1.02779412, + "balance_loss_mlp": 1.04131877, + "epoch": 0.03228618668269954, + "flos": 32196832392960.0, + "grad_norm": 1.9626146690772939, + "language_loss": 0.69564807, + "learning_rate": 3.989794057913923e-06, + "loss": 0.71821791, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.53125, + "step": 537, + "time_per_iteration": 2.4927241802215576 + }, + { + "auxiliary_loss_clip": 0.01197973, + "auxiliary_loss_mlp": 0.0106739, + "balance_loss_clip": 1.03355789, + "balance_loss_mlp": 1.04507327, + "epoch": 0.0323463099353675, + "flos": 22669174805760.0, + "grad_norm": 2.1653692978086414, + "language_loss": 0.82236588, + "learning_rate": 3.9897559018364615e-06, + "loss": 0.84501946, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.53125, + "step": 538, + "time_per_iteration": 2.4447684288024902 + }, + { + "auxiliary_loss_clip": 0.01201374, + "auxiliary_loss_mlp": 0.01062495, + "balance_loss_clip": 1.02725673, + "balance_loss_mlp": 1.04155743, + "epoch": 0.032406433188035474, + "flos": 26903999612160.0, + "grad_norm": 1.7500385242252072, + "language_loss": 0.79262614, + "learning_rate": 3.98971767474973e-06, + "loss": 0.81526482, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.6015625, + "step": 539, + "time_per_iteration": 2.4402785301208496 + }, + { + "auxiliary_loss_clip": 0.0119521, + "auxiliary_loss_mlp": 0.01064609, + "balance_loss_clip": 1.02927542, + "balance_loss_mlp": 1.04309082, + "epoch": 0.03246655644070344, + "flos": 31502584540800.0, + "grad_norm": 3.1364036052751243, + "language_loss": 0.77135301, + "learning_rate": 3.989679376655092e-06, + "loss": 0.79395115, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.5234375, + "step": 540, + "time_per_iteration": 2.5234434604644775 + }, + { + "auxiliary_loss_clip": 0.01202534, + "auxiliary_loss_mlp": 0.01065013, + "balance_loss_clip": 1.02903533, + "balance_loss_mlp": 1.04551792, + "epoch": 0.03252667969337141, + "flos": 23217311151360.0, + "grad_norm": 2.7523998804954344, + "language_loss": 0.85123587, + "learning_rate": 3.989641007553916e-06, + "loss": 0.87391126, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.5703125, + "step": 541, + "time_per_iteration": 2.4256014823913574 + }, + { + "auxiliary_loss_clip": 0.01195641, + "auxiliary_loss_mlp": 0.01061551, + "balance_loss_clip": 1.02438188, + "balance_loss_mlp": 1.04368758, + "epoch": 0.032586802946039384, + "flos": 14756563935360.0, + "grad_norm": 2.2643202941631486, + "language_loss": 0.88175774, + "learning_rate": 3.989602567447569e-06, + "loss": 0.90432966, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.515625, + "step": 542, + "time_per_iteration": 2.4088408946990967 + }, + { + "auxiliary_loss_clip": 0.01196528, + "auxiliary_loss_mlp": 0.01065616, + "balance_loss_clip": 1.0302583, + "balance_loss_mlp": 1.04263008, + "epoch": 0.03264692619870735, + "flos": 24279508488960.0, + "grad_norm": 1.975439138667125, + "language_loss": 0.70890611, + "learning_rate": 3.989564056337426e-06, + "loss": 0.73152757, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.5390625, + "step": 543, + "time_per_iteration": 2.4294607639312744 + }, + { + "auxiliary_loss_clip": 0.0119393, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_clip": 1.0262711, + "balance_loss_mlp": 1.03910398, + "epoch": 0.03270704945137532, + "flos": 22892060125440.0, + "grad_norm": 2.6781412261206756, + "language_loss": 0.91309845, + "learning_rate": 3.989525474224858e-06, + "loss": 0.93564469, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.546875, + "step": 544, + "time_per_iteration": 2.438074827194214 + }, + { + "auxiliary_loss_clip": 0.01197985, + "auxiliary_loss_mlp": 0.01053669, + "balance_loss_clip": 1.02126789, + "balance_loss_mlp": 1.04320121, + "epoch": 0.032767172704043286, + "flos": 18040040570880.0, + "grad_norm": 3.2598794138189326, + "language_loss": 0.65982533, + "learning_rate": 3.989486821111244e-06, + "loss": 0.68234193, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.546875, + "step": 545, + "time_per_iteration": 2.3907058238983154 + }, + { + "auxiliary_loss_clip": 0.01197121, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_clip": 1.02453566, + "balance_loss_mlp": 1.04238844, + "epoch": 0.03282729595671126, + "flos": 22527636687360.0, + "grad_norm": 2.2970789725101652, + "language_loss": 0.91792428, + "learning_rate": 3.9894480969979635e-06, + "loss": 0.94047862, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.546875, + "step": 546, + "time_per_iteration": 2.4523770809173584 + }, + { + "auxiliary_loss_clip": 0.01193368, + "auxiliary_loss_mlp": 0.01060539, + "balance_loss_clip": 1.02456117, + "balance_loss_mlp": 1.03936839, + "epoch": 0.03288741920937923, + "flos": 20409630791040.0, + "grad_norm": 3.55375524184824, + "language_loss": 0.81702125, + "learning_rate": 3.989409301886398e-06, + "loss": 0.83956033, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.5390625, + "step": 547, + "time_per_iteration": 2.4157586097717285 + }, + { + "auxiliary_loss_clip": 0.01195076, + "auxiliary_loss_mlp": 0.01056152, + "balance_loss_clip": 1.02089, + "balance_loss_mlp": 1.0420146, + "epoch": 0.032947542462047195, + "flos": 20776916960640.0, + "grad_norm": 1.8755243921553955, + "language_loss": 0.80964327, + "learning_rate": 3.989370435777931e-06, + "loss": 0.83215559, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.53125, + "step": 548, + "time_per_iteration": 2.4845027923583984 + }, + { + "auxiliary_loss_clip": 0.01196537, + "auxiliary_loss_mlp": 0.01059157, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.04265714, + "epoch": 0.03300766571471517, + "flos": 19900247921280.0, + "grad_norm": 4.870889560058078, + "language_loss": 0.67086864, + "learning_rate": 3.989331498673951e-06, + "loss": 0.69342566, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 1.5390625, + "step": 549, + "time_per_iteration": 2.459944009780884 + }, + { + "auxiliary_loss_clip": 0.01191409, + "auxiliary_loss_mlp": 0.01063525, + "balance_loss_clip": 1.02819109, + "balance_loss_mlp": 1.04134452, + "epoch": 0.03306778896738313, + "flos": 17966792805120.0, + "grad_norm": 2.04638296747371, + "language_loss": 0.85720515, + "learning_rate": 3.9892924905758475e-06, + "loss": 0.87975454, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.5, + "step": 550, + "time_per_iteration": 2.4711291790008545 + }, + { + "auxiliary_loss_clip": 0.01194971, + "auxiliary_loss_mlp": 0.01069821, + "balance_loss_clip": 1.03541744, + "balance_loss_mlp": 1.04545546, + "epoch": 0.033127912220051105, + "flos": 21652294279680.0, + "grad_norm": 1.7071244062597453, + "language_loss": 0.81147861, + "learning_rate": 3.989253411485011e-06, + "loss": 0.83412647, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.5, + "step": 551, + "time_per_iteration": 2.4221384525299072 + }, + { + "auxiliary_loss_clip": 0.01197454, + "auxiliary_loss_mlp": 0.01065646, + "balance_loss_clip": 1.03043151, + "balance_loss_mlp": 1.04241097, + "epoch": 0.03318803547271908, + "flos": 30187127134080.0, + "grad_norm": 2.8232966739904155, + "language_loss": 0.86134279, + "learning_rate": 3.989214261402838e-06, + "loss": 0.88397378, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.5546875, + "step": 552, + "time_per_iteration": 2.518507719039917 + }, + { + "auxiliary_loss_clip": 0.01197266, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_clip": 1.02642334, + "balance_loss_mlp": 1.04250526, + "epoch": 0.03324815872538704, + "flos": 20374996855680.0, + "grad_norm": 2.3913134793234097, + "language_loss": 0.92376202, + "learning_rate": 3.989175040330724e-06, + "loss": 0.9463737, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 1.546875, + "step": 553, + "time_per_iteration": 2.4046833515167236 + }, + { + "auxiliary_loss_clip": 0.01197331, + "auxiliary_loss_mlp": 0.01065382, + "balance_loss_clip": 1.02694941, + "balance_loss_mlp": 1.04425418, + "epoch": 0.033308281978055014, + "flos": 24493526323200.0, + "grad_norm": 2.3784682061349676, + "language_loss": 0.78795719, + "learning_rate": 3.98913574827007e-06, + "loss": 0.81058431, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 1.53125, + "step": 554, + "time_per_iteration": 2.4820733070373535 + }, + { + "auxiliary_loss_clip": 0.0119128, + "auxiliary_loss_mlp": 0.01069872, + "balance_loss_clip": 1.03301299, + "balance_loss_mlp": 1.04178536, + "epoch": 0.03336840523072298, + "flos": 23399313402240.0, + "grad_norm": 2.431197399805768, + "language_loss": 0.81781608, + "learning_rate": 3.989096385222278e-06, + "loss": 0.84042764, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.4921875, + "step": 555, + "time_per_iteration": 2.4461193084716797 + }, + { + "auxiliary_loss_clip": 0.01195652, + "auxiliary_loss_mlp": 0.01064587, + "balance_loss_clip": 1.02787089, + "balance_loss_mlp": 1.04220772, + "epoch": 0.03342852848339095, + "flos": 30549386067840.0, + "grad_norm": 2.9516103816303603, + "language_loss": 0.88176799, + "learning_rate": 3.989056951188753e-06, + "loss": 0.90437037, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.53125, + "step": 556, + "time_per_iteration": 2.528106689453125 + }, + { + "auxiliary_loss_clip": 0.01194919, + "auxiliary_loss_mlp": 0.01068536, + "balance_loss_clip": 1.03270173, + "balance_loss_mlp": 1.04206967, + "epoch": 0.03348865173605892, + "flos": 22892199770880.0, + "grad_norm": 1.9707480879266779, + "language_loss": 0.83577824, + "learning_rate": 3.989017446170901e-06, + "loss": 0.85841274, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.53125, + "step": 557, + "time_per_iteration": 2.4847426414489746 + }, + { + "auxiliary_loss_clip": 0.0119705, + "auxiliary_loss_mlp": 0.0105911, + "balance_loss_clip": 1.02418184, + "balance_loss_mlp": 1.04479933, + "epoch": 0.03354877498872689, + "flos": 17675058551040.0, + "grad_norm": 3.8172882719549515, + "language_loss": 0.93698788, + "learning_rate": 3.988977870170133e-06, + "loss": 0.95954949, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.5234375, + "step": 558, + "time_per_iteration": 2.411505699157715 + }, + { + "auxiliary_loss_clip": 0.01193043, + "auxiliary_loss_mlp": 0.01063056, + "balance_loss_clip": 1.02886689, + "balance_loss_mlp": 1.04174602, + "epoch": 0.03360889824139486, + "flos": 21651910254720.0, + "grad_norm": 6.270277986351199, + "language_loss": 0.76974529, + "learning_rate": 3.988938223187861e-06, + "loss": 0.7923063, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.515625, + "step": 559, + "time_per_iteration": 2.419795036315918 + }, + { + "auxiliary_loss_clip": 0.01193975, + "auxiliary_loss_mlp": 0.01067361, + "balance_loss_clip": 1.03369641, + "balance_loss_mlp": 1.04131722, + "epoch": 0.033669021494062826, + "flos": 21794740093440.0, + "grad_norm": 2.815050991405286, + "language_loss": 0.87094873, + "learning_rate": 3.9888985052255005e-06, + "loss": 0.89356208, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.53125, + "step": 560, + "time_per_iteration": 2.4539504051208496 + }, + { + "auxiliary_loss_clip": 0.01190911, + "auxiliary_loss_mlp": 0.01056281, + "balance_loss_clip": 1.02352214, + "balance_loss_mlp": 1.04093623, + "epoch": 0.0337291447467308, + "flos": 21865299684480.0, + "grad_norm": 3.150492713638745, + "language_loss": 0.80860865, + "learning_rate": 3.988858716284468e-06, + "loss": 0.83108056, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.5, + "step": 561, + "time_per_iteration": 2.392932891845703 + }, + { + "auxiliary_loss_clip": 0.01192948, + "auxiliary_loss_mlp": 0.01064501, + "balance_loss_clip": 1.0318135, + "balance_loss_mlp": 1.04135442, + "epoch": 0.03378926799939877, + "flos": 24244734908160.0, + "grad_norm": 1.7802178247380682, + "language_loss": 0.81872559, + "learning_rate": 3.988818856366184e-06, + "loss": 0.84130007, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.515625, + "step": 562, + "time_per_iteration": 2.4758028984069824 + }, + { + "auxiliary_loss_clip": 0.01197868, + "auxiliary_loss_mlp": 0.01073311, + "balance_loss_clip": 1.03945541, + "balance_loss_mlp": 1.0440259, + "epoch": 0.033849391252066735, + "flos": 16506899637120.0, + "grad_norm": 1.9287515962020005, + "language_loss": 0.83921456, + "learning_rate": 3.9887789254720704e-06, + "loss": 0.86192638, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.5390625, + "step": 563, + "time_per_iteration": 2.3996310234069824 + }, + { + "auxiliary_loss_clip": 0.01195139, + "auxiliary_loss_mlp": 0.01065566, + "balance_loss_clip": 1.02963662, + "balance_loss_mlp": 1.04248786, + "epoch": 0.03390951450473471, + "flos": 15668390580480.0, + "grad_norm": 2.33407814831408, + "language_loss": 0.93336153, + "learning_rate": 3.988738923603553e-06, + "loss": 0.95596856, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.5234375, + "step": 564, + "time_per_iteration": 2.3867733478546143 + }, + { + "auxiliary_loss_clip": 0.01194909, + "auxiliary_loss_mlp": 0.01066091, + "balance_loss_clip": 1.03147316, + "balance_loss_mlp": 1.03992391, + "epoch": 0.03396963775740267, + "flos": 22673678371200.0, + "grad_norm": 3.090948314160313, + "language_loss": 0.94309002, + "learning_rate": 3.98869885076206e-06, + "loss": 0.96570009, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.546875, + "step": 565, + "time_per_iteration": 2.473637104034424 + }, + { + "auxiliary_loss_clip": 0.01080434, + "auxiliary_loss_mlp": 0.01023652, + "balance_loss_clip": 1.01535463, + "balance_loss_mlp": 1.01858997, + "epoch": 0.034029761010070644, + "flos": 64388984797440.0, + "grad_norm": 1.148770634874019, + "language_loss": 0.54870236, + "learning_rate": 3.9886587069490195e-06, + "loss": 0.56974322, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.08300781, + "router_z_loss_mlp": 0.6171875, + "step": 566, + "time_per_iteration": 3.11896014213562 + }, + { + "auxiliary_loss_clip": 0.01195853, + "auxiliary_loss_mlp": 0.01062324, + "balance_loss_clip": 1.0245589, + "balance_loss_mlp": 1.04374218, + "epoch": 0.034089884262738616, + "flos": 25003188483840.0, + "grad_norm": 2.358639810820114, + "language_loss": 0.76279438, + "learning_rate": 3.988618492165865e-06, + "loss": 0.78537619, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.515625, + "step": 567, + "time_per_iteration": 3.895256280899048 + }, + { + "auxiliary_loss_clip": 0.01189289, + "auxiliary_loss_mlp": 0.01067985, + "balance_loss_clip": 1.03274679, + "balance_loss_mlp": 1.04122317, + "epoch": 0.03415000751540658, + "flos": 28437838773120.0, + "grad_norm": 2.024071973675408, + "language_loss": 0.80621415, + "learning_rate": 3.988578206414032e-06, + "loss": 0.82878685, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.484375, + "step": 568, + "time_per_iteration": 2.4694976806640625 + }, + { + "auxiliary_loss_clip": 0.01193456, + "auxiliary_loss_mlp": 0.01061731, + "balance_loss_clip": 1.02859068, + "balance_loss_mlp": 1.0454756, + "epoch": 0.034210130768074554, + "flos": 21467708588160.0, + "grad_norm": 1.9569454038555405, + "language_loss": 0.8628267, + "learning_rate": 3.988537849694959e-06, + "loss": 0.8853786, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.484375, + "step": 569, + "time_per_iteration": 3.8459970951080322 + }, + { + "auxiliary_loss_clip": 0.0119573, + "auxiliary_loss_mlp": 0.0106184, + "balance_loss_clip": 1.02598178, + "balance_loss_mlp": 1.04251552, + "epoch": 0.03427025402074252, + "flos": 18696512465280.0, + "grad_norm": 1.8131955516308138, + "language_loss": 0.95423174, + "learning_rate": 3.988497422010084e-06, + "loss": 0.97680748, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.53125, + "step": 570, + "time_per_iteration": 3.7499139308929443 + }, + { + "auxiliary_loss_clip": 0.01191599, + "auxiliary_loss_mlp": 0.01061289, + "balance_loss_clip": 1.02357149, + "balance_loss_mlp": 1.03753138, + "epoch": 0.03433037727341049, + "flos": 20848942828800.0, + "grad_norm": 2.495821873687206, + "language_loss": 0.79018605, + "learning_rate": 3.988456923360852e-06, + "loss": 0.81271493, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 1.5390625, + "step": 571, + "time_per_iteration": 3.85359787940979 + }, + { + "auxiliary_loss_clip": 0.01195057, + "auxiliary_loss_mlp": 0.01068013, + "balance_loss_clip": 1.03172517, + "balance_loss_mlp": 1.04233098, + "epoch": 0.03439050052607846, + "flos": 25409123395200.0, + "grad_norm": 2.40142219818747, + "language_loss": 0.80008596, + "learning_rate": 3.988416353748707e-06, + "loss": 0.82271665, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.53125, + "step": 572, + "time_per_iteration": 2.4332334995269775 + }, + { + "auxiliary_loss_clip": 0.01198195, + "auxiliary_loss_mlp": 0.01057882, + "balance_loss_clip": 1.02295363, + "balance_loss_mlp": 1.04672611, + "epoch": 0.03445062377874643, + "flos": 17639167806720.0, + "grad_norm": 2.9297505078403385, + "language_loss": 0.84247696, + "learning_rate": 3.988375713175097e-06, + "loss": 0.86503768, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.515625, + "step": 573, + "time_per_iteration": 2.392399787902832 + }, + { + "auxiliary_loss_clip": 0.01193016, + "auxiliary_loss_mlp": 0.01064463, + "balance_loss_clip": 1.0300827, + "balance_loss_mlp": 1.04029369, + "epoch": 0.0345107470314144, + "flos": 16763546108160.0, + "grad_norm": 2.3845255873691547, + "language_loss": 0.76166523, + "learning_rate": 3.988335001641473e-06, + "loss": 0.78423995, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.53125, + "step": 574, + "time_per_iteration": 2.3899247646331787 + }, + { + "auxiliary_loss_clip": 0.01194761, + "auxiliary_loss_mlp": 0.01054424, + "balance_loss_clip": 1.02185607, + "balance_loss_mlp": 1.04313254, + "epoch": 0.03457087028408237, + "flos": 14683560549120.0, + "grad_norm": 2.8759034033766717, + "language_loss": 0.83530688, + "learning_rate": 3.988294219149287e-06, + "loss": 0.8577987, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.515625, + "step": 575, + "time_per_iteration": 2.404356002807617 + }, + { + "auxiliary_loss_clip": 0.01191842, + "auxiliary_loss_mlp": 0.01066171, + "balance_loss_clip": 1.03207731, + "balance_loss_mlp": 1.04431152, + "epoch": 0.03463099353675034, + "flos": 20010259215360.0, + "grad_norm": 2.1975776036879133, + "language_loss": 0.83930761, + "learning_rate": 3.9882533656999945e-06, + "loss": 0.86188769, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.4765625, + "step": 576, + "time_per_iteration": 2.404324769973755 + }, + { + "auxiliary_loss_clip": 0.01192762, + "auxiliary_loss_mlp": 0.01065608, + "balance_loss_clip": 1.03306341, + "balance_loss_mlp": 1.04672194, + "epoch": 0.03469111678941831, + "flos": 25299984885120.0, + "grad_norm": 2.136277743417185, + "language_loss": 0.86451602, + "learning_rate": 3.988212441295054e-06, + "loss": 0.88709974, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.4609375, + "step": 577, + "time_per_iteration": 2.4424991607666016 + }, + { + "auxiliary_loss_clip": 0.01193216, + "auxiliary_loss_mlp": 0.0106795, + "balance_loss_clip": 1.03554869, + "balance_loss_mlp": 1.04519367, + "epoch": 0.034751240042086275, + "flos": 23258264042880.0, + "grad_norm": 2.0062231336852197, + "language_loss": 0.72245854, + "learning_rate": 3.9881714459359255e-06, + "loss": 0.74507022, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.484375, + "step": 578, + "time_per_iteration": 2.41383695602417 + }, + { + "auxiliary_loss_clip": 0.01192622, + "auxiliary_loss_mlp": 0.010612, + "balance_loss_clip": 1.02734423, + "balance_loss_mlp": 1.04283249, + "epoch": 0.03481136329475425, + "flos": 23768100760320.0, + "grad_norm": 1.9369968368063495, + "language_loss": 0.77471632, + "learning_rate": 3.988130379624073e-06, + "loss": 0.7972545, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.5, + "step": 579, + "time_per_iteration": 2.420255422592163 + }, + { + "auxiliary_loss_clip": 0.01192129, + "auxiliary_loss_mlp": 0.01062744, + "balance_loss_clip": 1.02752972, + "balance_loss_mlp": 1.04205263, + "epoch": 0.03487148654742222, + "flos": 20156475456000.0, + "grad_norm": 2.55101409008302, + "language_loss": 0.86368865, + "learning_rate": 3.988089242360961e-06, + "loss": 0.88623732, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.5, + "step": 580, + "time_per_iteration": 2.4236578941345215 + }, + { + "auxiliary_loss_clip": 0.01196551, + "auxiliary_loss_mlp": 0.01058375, + "balance_loss_clip": 1.02592683, + "balance_loss_mlp": 1.04238605, + "epoch": 0.034931609800090184, + "flos": 15668669871360.0, + "grad_norm": 2.332495529883519, + "language_loss": 0.82363093, + "learning_rate": 3.988048034148057e-06, + "loss": 0.8461802, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.5390625, + "step": 581, + "time_per_iteration": 2.393260955810547 + }, + { + "auxiliary_loss_clip": 0.0119314, + "auxiliary_loss_mlp": 0.01060886, + "balance_loss_clip": 1.02884281, + "balance_loss_mlp": 1.04386783, + "epoch": 0.034991733052758156, + "flos": 16361451446400.0, + "grad_norm": 2.557332275981054, + "language_loss": 0.76911843, + "learning_rate": 3.988006754986834e-06, + "loss": 0.7916587, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.4921875, + "step": 582, + "time_per_iteration": 2.378066062927246 + }, + { + "auxiliary_loss_clip": 0.01193353, + "auxiliary_loss_mlp": 0.01065852, + "balance_loss_clip": 1.03054273, + "balance_loss_mlp": 1.04665709, + "epoch": 0.03505185630542612, + "flos": 19386396397440.0, + "grad_norm": 2.182466588591563, + "language_loss": 0.87432832, + "learning_rate": 3.987965404878763e-06, + "loss": 0.89692038, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.46875, + "step": 583, + "time_per_iteration": 2.4018166065216064 + }, + { + "auxiliary_loss_clip": 0.0119538, + "auxiliary_loss_mlp": 0.01065533, + "balance_loss_clip": 1.03060496, + "balance_loss_mlp": 1.0420208, + "epoch": 0.03511197955809409, + "flos": 21322784067840.0, + "grad_norm": 2.4167703716027362, + "language_loss": 0.80618572, + "learning_rate": 3.987923983825321e-06, + "loss": 0.82879484, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.53125, + "step": 584, + "time_per_iteration": 2.3943240642547607 + }, + { + "auxiliary_loss_clip": 0.01192112, + "auxiliary_loss_mlp": 0.01060208, + "balance_loss_clip": 1.02647233, + "balance_loss_mlp": 1.04246902, + "epoch": 0.035172102810762065, + "flos": 14135738405760.0, + "grad_norm": 5.747430825665412, + "language_loss": 0.92533493, + "learning_rate": 3.9878824918279845e-06, + "loss": 0.9478581, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.5, + "step": 585, + "time_per_iteration": 2.3824782371520996 + }, + { + "auxiliary_loss_clip": 0.01193938, + "auxiliary_loss_mlp": 0.01061532, + "balance_loss_clip": 1.02808166, + "balance_loss_mlp": 1.04445136, + "epoch": 0.03523222606343003, + "flos": 20296023626880.0, + "grad_norm": 2.282708667245842, + "language_loss": 0.87457907, + "learning_rate": 3.9878409288882364e-06, + "loss": 0.89713371, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.5, + "step": 586, + "time_per_iteration": 2.3860182762145996 + }, + { + "auxiliary_loss_clip": 0.01198949, + "auxiliary_loss_mlp": 0.01060182, + "balance_loss_clip": 1.02730393, + "balance_loss_mlp": 1.04620695, + "epoch": 0.035292349316098, + "flos": 20374787387520.0, + "grad_norm": 1.9012974248253003, + "language_loss": 0.76167411, + "learning_rate": 3.987799295007558e-06, + "loss": 0.7842654, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.53125, + "step": 587, + "time_per_iteration": 2.4081978797912598 + }, + { + "auxiliary_loss_clip": 0.01193189, + "auxiliary_loss_mlp": 0.01059531, + "balance_loss_clip": 1.02417374, + "balance_loss_mlp": 1.04124045, + "epoch": 0.03535247256876597, + "flos": 21467848233600.0, + "grad_norm": 1.754797954220294, + "language_loss": 0.78395927, + "learning_rate": 3.987757590187436e-06, + "loss": 0.80648649, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.5234375, + "step": 588, + "time_per_iteration": 2.410745620727539 + }, + { + "auxiliary_loss_clip": 0.01197837, + "auxiliary_loss_mlp": 0.01064675, + "balance_loss_clip": 1.02633774, + "balance_loss_mlp": 1.04193711, + "epoch": 0.03541259582143394, + "flos": 23621919431040.0, + "grad_norm": 2.7924246346076744, + "language_loss": 0.93870485, + "learning_rate": 3.987715814429359e-06, + "loss": 0.96132994, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 1.5625, + "step": 589, + "time_per_iteration": 2.423064947128296 + }, + { + "auxiliary_loss_clip": 0.01199126, + "auxiliary_loss_mlp": 0.01063146, + "balance_loss_clip": 1.02962434, + "balance_loss_mlp": 1.04581523, + "epoch": 0.03547271907410191, + "flos": 33725050824960.0, + "grad_norm": 2.876152200613965, + "language_loss": 0.83852893, + "learning_rate": 3.987673967734818e-06, + "loss": 0.86115164, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.5390625, + "step": 590, + "time_per_iteration": 2.5066776275634766 + }, + { + "auxiliary_loss_clip": 0.01190277, + "auxiliary_loss_mlp": 0.01061021, + "balance_loss_clip": 1.02862036, + "balance_loss_mlp": 1.04210639, + "epoch": 0.03553284232676988, + "flos": 21141619689600.0, + "grad_norm": 2.041992487210075, + "language_loss": 0.86693615, + "learning_rate": 3.987632050105306e-06, + "loss": 0.88944912, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.484375, + "step": 591, + "time_per_iteration": 2.4259727001190186 + }, + { + "auxiliary_loss_clip": 0.01194508, + "auxiliary_loss_mlp": 0.01070402, + "balance_loss_clip": 1.034091, + "balance_loss_mlp": 1.04161382, + "epoch": 0.03559296557943785, + "flos": 20045591377920.0, + "grad_norm": 2.0630956610298865, + "language_loss": 0.82878077, + "learning_rate": 3.987590061542319e-06, + "loss": 0.85142994, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.53125, + "step": 592, + "time_per_iteration": 2.3929755687713623 + }, + { + "auxiliary_loss_clip": 0.01072431, + "auxiliary_loss_mlp": 0.01007916, + "balance_loss_clip": 1.00109756, + "balance_loss_mlp": 1.01578867, + "epoch": 0.035653088832105814, + "flos": 60331239740160.0, + "grad_norm": 0.8880340868301633, + "language_loss": 0.59840667, + "learning_rate": 3.987548002047354e-06, + "loss": 0.61921012, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.06835938, + "router_z_loss_mlp": 0.56640625, + "step": 593, + "time_per_iteration": 3.1164820194244385 + }, + { + "auxiliary_loss_clip": 0.01193037, + "auxiliary_loss_mlp": 0.01062465, + "balance_loss_clip": 1.02639222, + "balance_loss_mlp": 1.04370463, + "epoch": 0.035713212084773786, + "flos": 20112310719360.0, + "grad_norm": 2.1609314140189433, + "language_loss": 0.8677175, + "learning_rate": 3.987505871621915e-06, + "loss": 0.89027262, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.4921875, + "step": 594, + "time_per_iteration": 2.3931171894073486 + }, + { + "auxiliary_loss_clip": 0.01194122, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.02932084, + "balance_loss_mlp": 1.04368234, + "epoch": 0.03577333533744176, + "flos": 26284605448320.0, + "grad_norm": 1.9805413200314534, + "language_loss": 0.84035844, + "learning_rate": 3.987463670267502e-06, + "loss": 0.86293095, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.5078125, + "step": 595, + "time_per_iteration": 2.455754280090332 + }, + { + "auxiliary_loss_clip": 0.01191518, + "auxiliary_loss_mlp": 0.01065451, + "balance_loss_clip": 1.03123832, + "balance_loss_mlp": 1.04367185, + "epoch": 0.035833458590109724, + "flos": 10888955475840.0, + "grad_norm": 2.8540243898721607, + "language_loss": 0.9549948, + "learning_rate": 3.987421397985625e-06, + "loss": 0.97756451, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.4765625, + "step": 596, + "time_per_iteration": 2.373399257659912 + }, + { + "auxiliary_loss_clip": 0.0119328, + "auxiliary_loss_mlp": 0.01066329, + "balance_loss_clip": 1.0362879, + "balance_loss_mlp": 1.04383075, + "epoch": 0.035893581842777696, + "flos": 22089127610880.0, + "grad_norm": 7.13882265388366, + "language_loss": 0.82787955, + "learning_rate": 3.98737905477779e-06, + "loss": 0.85047561, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.5, + "step": 597, + "time_per_iteration": 2.412506580352783 + }, + { + "auxiliary_loss_clip": 0.01191588, + "auxiliary_loss_mlp": 0.0106389, + "balance_loss_clip": 1.02886677, + "balance_loss_mlp": 1.04515767, + "epoch": 0.03595370509544566, + "flos": 23037263936640.0, + "grad_norm": 1.9765483286078758, + "language_loss": 0.81232685, + "learning_rate": 3.987336640645508e-06, + "loss": 0.83488166, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.46875, + "step": 598, + "time_per_iteration": 2.4125797748565674 + }, + { + "auxiliary_loss_clip": 0.01189158, + "auxiliary_loss_mlp": 0.01063772, + "balance_loss_clip": 1.02927327, + "balance_loss_mlp": 1.04330945, + "epoch": 0.03601382834811363, + "flos": 20776672581120.0, + "grad_norm": 1.919076803637372, + "language_loss": 0.81268477, + "learning_rate": 3.987294155590295e-06, + "loss": 0.83521414, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.453125, + "step": 599, + "time_per_iteration": 2.4201183319091797 + }, + { + "auxiliary_loss_clip": 0.01195125, + "auxiliary_loss_mlp": 0.01064769, + "balance_loss_clip": 1.03124774, + "balance_loss_mlp": 1.04239631, + "epoch": 0.036073951600781605, + "flos": 23950487036160.0, + "grad_norm": 2.7999164737974818, + "language_loss": 0.85811245, + "learning_rate": 3.987251599613664e-06, + "loss": 0.88071138, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.53125, + "step": 600, + "time_per_iteration": 2.4171955585479736 + }, + { + "auxiliary_loss_clip": 0.01190231, + "auxiliary_loss_mlp": 0.01068489, + "balance_loss_clip": 1.03244054, + "balance_loss_mlp": 1.04157639, + "epoch": 0.03613407485344957, + "flos": 18911403083520.0, + "grad_norm": 2.2791175764803575, + "language_loss": 0.81738359, + "learning_rate": 3.987208972717135e-06, + "loss": 0.83997083, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.484375, + "step": 601, + "time_per_iteration": 2.4082608222961426 + }, + { + "auxiliary_loss_clip": 0.01187035, + "auxiliary_loss_mlp": 0.01053517, + "balance_loss_clip": 1.02047205, + "balance_loss_mlp": 1.04106665, + "epoch": 0.03619419810611754, + "flos": 23037438493440.0, + "grad_norm": 2.7534643093390185, + "language_loss": 0.75187588, + "learning_rate": 3.987166274902231e-06, + "loss": 0.77428138, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.4609375, + "step": 602, + "time_per_iteration": 2.407188892364502 + }, + { + "auxiliary_loss_clip": 0.01185516, + "auxiliary_loss_mlp": 0.01061491, + "balance_loss_clip": 1.02737272, + "balance_loss_mlp": 1.04045725, + "epoch": 0.03625432135878551, + "flos": 29456569601280.0, + "grad_norm": 2.065777789073131, + "language_loss": 0.79639304, + "learning_rate": 3.987123506170473e-06, + "loss": 0.81886303, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.453125, + "step": 603, + "time_per_iteration": 2.4654366970062256 + }, + { + "auxiliary_loss_clip": 0.01189171, + "auxiliary_loss_mlp": 0.01056694, + "balance_loss_clip": 1.02481782, + "balance_loss_mlp": 1.04381037, + "epoch": 0.03631444461145348, + "flos": 23507544216960.0, + "grad_norm": 1.8258687398138511, + "language_loss": 0.86671007, + "learning_rate": 3.987080666523389e-06, + "loss": 0.88916874, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.453125, + "step": 604, + "time_per_iteration": 2.4170491695404053 + }, + { + "auxiliary_loss_clip": 0.01192464, + "auxiliary_loss_mlp": 0.01061029, + "balance_loss_clip": 1.02710176, + "balance_loss_mlp": 1.04573047, + "epoch": 0.03637456786412145, + "flos": 16617190222080.0, + "grad_norm": 2.399624764457191, + "language_loss": 0.80515403, + "learning_rate": 3.987037755962506e-06, + "loss": 0.82768893, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.46875, + "step": 605, + "time_per_iteration": 2.4204325675964355 + }, + { + "auxiliary_loss_clip": 0.01190658, + "auxiliary_loss_mlp": 0.01063163, + "balance_loss_clip": 1.03068995, + "balance_loss_mlp": 1.04383564, + "epoch": 0.03643469111678942, + "flos": 15850916501760.0, + "grad_norm": 2.413621551612539, + "language_loss": 0.85129428, + "learning_rate": 3.986994774489359e-06, + "loss": 0.87383258, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.46875, + "step": 606, + "time_per_iteration": 3.8434371948242188 + }, + { + "auxiliary_loss_clip": 0.01193657, + "auxiliary_loss_mlp": 0.01066992, + "balance_loss_clip": 1.03149116, + "balance_loss_mlp": 1.04402304, + "epoch": 0.03649481436945739, + "flos": 23619335990400.0, + "grad_norm": 5.737261281953924, + "language_loss": 0.8204093, + "learning_rate": 3.986951722105479e-06, + "loss": 0.84301579, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.4921875, + "step": 607, + "time_per_iteration": 2.426976442337036 + }, + { + "auxiliary_loss_clip": 0.01191488, + "auxiliary_loss_mlp": 0.01062892, + "balance_loss_clip": 1.03022838, + "balance_loss_mlp": 1.04630244, + "epoch": 0.036554937622125354, + "flos": 21754694897280.0, + "grad_norm": 3.23599896243447, + "language_loss": 0.83184808, + "learning_rate": 3.986908598812402e-06, + "loss": 0.85439193, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.453125, + "step": 608, + "time_per_iteration": 2.4064114093780518 + }, + { + "auxiliary_loss_clip": 0.01190389, + "auxiliary_loss_mlp": 0.01057344, + "balance_loss_clip": 1.02160549, + "balance_loss_mlp": 1.04478228, + "epoch": 0.036615060874793326, + "flos": 17818865907840.0, + "grad_norm": 2.8337069348270045, + "language_loss": 0.81716424, + "learning_rate": 3.986865404611669e-06, + "loss": 0.83964157, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 1.453125, + "step": 609, + "time_per_iteration": 3.824737548828125 + }, + { + "auxiliary_loss_clip": 0.0119741, + "auxiliary_loss_mlp": 0.0107586, + "balance_loss_clip": 1.04396009, + "balance_loss_mlp": 1.0502708, + "epoch": 0.0366751841274613, + "flos": 26752791047040.0, + "grad_norm": 1.9162090268784777, + "language_loss": 0.79127526, + "learning_rate": 3.98682213950482e-06, + "loss": 0.814008, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.46875, + "step": 610, + "time_per_iteration": 2.4709088802337646 + }, + { + "auxiliary_loss_clip": 0.0119466, + "auxiliary_loss_mlp": 0.01063045, + "balance_loss_clip": 1.02954769, + "balance_loss_mlp": 1.04573941, + "epoch": 0.03673530738012926, + "flos": 22195961971200.0, + "grad_norm": 2.3790511540834864, + "language_loss": 0.87558019, + "learning_rate": 3.986778803493401e-06, + "loss": 0.89815724, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.484375, + "step": 611, + "time_per_iteration": 3.8363935947418213 + }, + { + "auxiliary_loss_clip": 0.01190653, + "auxiliary_loss_mlp": 0.01072803, + "balance_loss_clip": 1.03725433, + "balance_loss_mlp": 1.04390585, + "epoch": 0.036795430632797235, + "flos": 24680485987200.0, + "grad_norm": 2.235179936131584, + "language_loss": 0.72158015, + "learning_rate": 3.986735396578956e-06, + "loss": 0.74421477, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.46875, + "step": 612, + "time_per_iteration": 2.440546989440918 + }, + { + "auxiliary_loss_clip": 0.01189836, + "auxiliary_loss_mlp": 0.01059669, + "balance_loss_clip": 1.02497888, + "balance_loss_mlp": 1.04274333, + "epoch": 0.0368555538854652, + "flos": 17747957203200.0, + "grad_norm": 3.122582402275691, + "language_loss": 0.79163623, + "learning_rate": 3.986691918763034e-06, + "loss": 0.81413126, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.46875, + "step": 613, + "time_per_iteration": 2.392113208770752 + }, + { + "auxiliary_loss_clip": 0.01188545, + "auxiliary_loss_mlp": 0.01073452, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.04238963, + "epoch": 0.03691567713813317, + "flos": 20593518255360.0, + "grad_norm": 1.9597988037603629, + "language_loss": 0.93362963, + "learning_rate": 3.98664837004719e-06, + "loss": 0.95624959, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.4609375, + "step": 614, + "time_per_iteration": 2.461533308029175 + }, + { + "auxiliary_loss_clip": 0.01193101, + "auxiliary_loss_mlp": 0.01070644, + "balance_loss_clip": 1.03383267, + "balance_loss_mlp": 1.04623306, + "epoch": 0.036975800390801145, + "flos": 33649149795840.0, + "grad_norm": 2.718999563947092, + "language_loss": 0.73057652, + "learning_rate": 3.986604750432974e-06, + "loss": 0.753214, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.46875, + "step": 615, + "time_per_iteration": 2.5033388137817383 + }, + { + "auxiliary_loss_clip": 0.01193439, + "auxiliary_loss_mlp": 0.01063049, + "balance_loss_clip": 1.02983749, + "balance_loss_mlp": 1.04255581, + "epoch": 0.03703592364346911, + "flos": 28292425493760.0, + "grad_norm": 2.5644905920844607, + "language_loss": 0.81399232, + "learning_rate": 3.986561059921947e-06, + "loss": 0.83655715, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.5078125, + "step": 616, + "time_per_iteration": 2.496835231781006 + }, + { + "auxiliary_loss_clip": 0.0118991, + "auxiliary_loss_mlp": 0.01069732, + "balance_loss_clip": 1.03745008, + "balance_loss_mlp": 1.04359233, + "epoch": 0.03709604689613708, + "flos": 31502863831680.0, + "grad_norm": 2.289703141175505, + "language_loss": 0.67923647, + "learning_rate": 3.986517298515664e-06, + "loss": 0.70183289, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.4609375, + "step": 617, + "time_per_iteration": 2.4922900199890137 + }, + { + "auxiliary_loss_clip": 0.01193598, + "auxiliary_loss_mlp": 0.01065017, + "balance_loss_clip": 1.02799022, + "balance_loss_mlp": 1.04717958, + "epoch": 0.03715617014880505, + "flos": 19608374021760.0, + "grad_norm": 2.4143698713390025, + "language_loss": 0.79980433, + "learning_rate": 3.9864734662156884e-06, + "loss": 0.82239044, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.46875, + "step": 618, + "time_per_iteration": 2.4274299144744873 + }, + { + "auxiliary_loss_clip": 0.01195582, + "auxiliary_loss_mlp": 0.01072538, + "balance_loss_clip": 1.03458214, + "balance_loss_mlp": 1.04337478, + "epoch": 0.03721629340147302, + "flos": 15923291483520.0, + "grad_norm": 2.799342678041712, + "language_loss": 0.91307116, + "learning_rate": 3.986429563023585e-06, + "loss": 0.93575239, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.5234375, + "step": 619, + "time_per_iteration": 2.3912317752838135 + }, + { + "auxiliary_loss_clip": 0.01192505, + "auxiliary_loss_mlp": 0.01069496, + "balance_loss_clip": 1.03688025, + "balance_loss_mlp": 1.04688096, + "epoch": 0.03727641665414099, + "flos": 21103075681920.0, + "grad_norm": 2.9199639193863978, + "language_loss": 0.94099218, + "learning_rate": 3.986385588940921e-06, + "loss": 0.9636122, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.453125, + "step": 620, + "time_per_iteration": 2.392190933227539 + }, + { + "auxiliary_loss_clip": 0.0118776, + "auxiliary_loss_mlp": 0.0106845, + "balance_loss_clip": 1.03087556, + "balance_loss_mlp": 1.04218006, + "epoch": 0.037336539906808956, + "flos": 24130604073600.0, + "grad_norm": 1.7018149861947345, + "language_loss": 0.76863194, + "learning_rate": 3.986341543969264e-06, + "loss": 0.79119402, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 1.453125, + "step": 621, + "time_per_iteration": 2.441282033920288 + }, + { + "auxiliary_loss_clip": 0.01191084, + "auxiliary_loss_mlp": 0.01061931, + "balance_loss_clip": 1.02786076, + "balance_loss_mlp": 1.04571021, + "epoch": 0.03739666315947693, + "flos": 22345285322880.0, + "grad_norm": 2.809079720400529, + "language_loss": 0.8644613, + "learning_rate": 3.986297428110187e-06, + "loss": 0.88699144, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.453125, + "step": 622, + "time_per_iteration": 2.4315948486328125 + }, + { + "auxiliary_loss_clip": 0.01194275, + "auxiliary_loss_mlp": 0.01059122, + "balance_loss_clip": 1.02452755, + "balance_loss_mlp": 1.04649282, + "epoch": 0.0374567864121449, + "flos": 20448454089600.0, + "grad_norm": 2.2307814029927964, + "language_loss": 0.89798784, + "learning_rate": 3.986253241365264e-06, + "loss": 0.9205218, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 1.484375, + "step": 623, + "time_per_iteration": 2.4165964126586914 + }, + { + "auxiliary_loss_clip": 0.01193625, + "auxiliary_loss_mlp": 0.01071854, + "balance_loss_clip": 1.03711653, + "balance_loss_mlp": 1.04731357, + "epoch": 0.037516909664812866, + "flos": 19207047409920.0, + "grad_norm": 1.8172144217237507, + "language_loss": 0.84119725, + "learning_rate": 3.986208983736073e-06, + "loss": 0.86385202, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.4609375, + "step": 624, + "time_per_iteration": 2.416917562484741 + }, + { + "auxiliary_loss_clip": 0.01191925, + "auxiliary_loss_mlp": 0.01053709, + "balance_loss_clip": 1.01808918, + "balance_loss_mlp": 1.04193032, + "epoch": 0.03757703291748084, + "flos": 35003814526080.0, + "grad_norm": 3.2844482048489367, + "language_loss": 0.66283631, + "learning_rate": 3.986164655224191e-06, + "loss": 0.68529266, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.5, + "step": 625, + "time_per_iteration": 2.5336241722106934 + }, + { + "auxiliary_loss_clip": 0.01185041, + "auxiliary_loss_mlp": 0.01067365, + "balance_loss_clip": 1.03231764, + "balance_loss_mlp": 1.04417443, + "epoch": 0.0376371561701488, + "flos": 25482720274560.0, + "grad_norm": 2.101032365646545, + "language_loss": 0.7704007, + "learning_rate": 3.986120255831202e-06, + "loss": 0.79292476, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.40625, + "step": 626, + "time_per_iteration": 2.439168930053711 + }, + { + "auxiliary_loss_clip": 0.0118929, + "auxiliary_loss_mlp": 0.01064691, + "balance_loss_clip": 1.02992964, + "balance_loss_mlp": 1.04543984, + "epoch": 0.037697279422816775, + "flos": 18184685800320.0, + "grad_norm": 1.7948574546239324, + "language_loss": 0.81407958, + "learning_rate": 3.986075785558691e-06, + "loss": 0.83661938, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.4375, + "step": 627, + "time_per_iteration": 2.3719449043273926 + }, + { + "auxiliary_loss_clip": 0.01192957, + "auxiliary_loss_mlp": 0.01069615, + "balance_loss_clip": 1.03344655, + "balance_loss_mlp": 1.04659152, + "epoch": 0.03775740267548475, + "flos": 24643128965760.0, + "grad_norm": 1.630872127286863, + "language_loss": 0.88502806, + "learning_rate": 3.986031244408243e-06, + "loss": 0.90765381, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.4609375, + "step": 628, + "time_per_iteration": 2.4616754055023193 + }, + { + "auxiliary_loss_clip": 0.01187756, + "auxiliary_loss_mlp": 0.01060824, + "balance_loss_clip": 1.02670586, + "balance_loss_mlp": 1.04001284, + "epoch": 0.03781752592815271, + "flos": 21287137703040.0, + "grad_norm": 3.1648139815741545, + "language_loss": 0.79559755, + "learning_rate": 3.985986632381449e-06, + "loss": 0.81808335, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.4765625, + "step": 629, + "time_per_iteration": 2.4048831462860107 + }, + { + "auxiliary_loss_clip": 0.0118929, + "auxiliary_loss_mlp": 0.01059391, + "balance_loss_clip": 1.02613187, + "balance_loss_mlp": 1.04272497, + "epoch": 0.037877649180820684, + "flos": 22088569029120.0, + "grad_norm": 4.483336565305342, + "language_loss": 0.76847458, + "learning_rate": 3.9859419494799e-06, + "loss": 0.79096138, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.46875, + "step": 630, + "time_per_iteration": 2.4224445819854736 + }, + { + "auxiliary_loss_clip": 0.0119296, + "auxiliary_loss_mlp": 0.01068971, + "balance_loss_clip": 1.03406703, + "balance_loss_mlp": 1.04508269, + "epoch": 0.03793777243348865, + "flos": 14500476046080.0, + "grad_norm": 3.327037065085722, + "language_loss": 0.91509634, + "learning_rate": 3.985897195705192e-06, + "loss": 0.93771565, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.484375, + "step": 631, + "time_per_iteration": 2.3660621643066406 + }, + { + "auxiliary_loss_clip": 0.0118907, + "auxiliary_loss_mlp": 0.01076706, + "balance_loss_clip": 1.04106295, + "balance_loss_mlp": 1.04433274, + "epoch": 0.03799789568615662, + "flos": 21907334828160.0, + "grad_norm": 1.6012248644307439, + "language_loss": 0.91935283, + "learning_rate": 3.985852371058921e-06, + "loss": 0.94201052, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.4453125, + "step": 632, + "time_per_iteration": 2.4410040378570557 + }, + { + "auxiliary_loss_clip": 0.01187046, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.02428651, + "balance_loss_mlp": 1.04275036, + "epoch": 0.03805801893882459, + "flos": 24825864355200.0, + "grad_norm": 1.8387130062056452, + "language_loss": 0.83061844, + "learning_rate": 3.985807475542687e-06, + "loss": 0.85307658, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.4453125, + "step": 633, + "time_per_iteration": 2.44438099861145 + }, + { + "auxiliary_loss_clip": 0.0118841, + "auxiliary_loss_mlp": 0.01060442, + "balance_loss_clip": 1.02737331, + "balance_loss_mlp": 1.04320788, + "epoch": 0.03811814219149256, + "flos": 30481619385600.0, + "grad_norm": 1.6646039138205775, + "language_loss": 0.69604558, + "learning_rate": 3.985762509158093e-06, + "loss": 0.71853411, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.453125, + "step": 634, + "time_per_iteration": 2.516108989715576 + }, + { + "auxiliary_loss_clip": 0.0107484, + "auxiliary_loss_mlp": 0.01026803, + "balance_loss_clip": 1.01931655, + "balance_loss_mlp": 1.01821148, + "epoch": 0.03817826544416053, + "flos": 66989561639040.0, + "grad_norm": 0.9040725694158229, + "language_loss": 0.61635339, + "learning_rate": 3.985717471906742e-06, + "loss": 0.63736987, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.07470703, + "router_z_loss_mlp": 0.56640625, + "step": 635, + "time_per_iteration": 3.0856924057006836 + }, + { + "auxiliary_loss_clip": 0.01185486, + "auxiliary_loss_mlp": 0.01057715, + "balance_loss_clip": 1.02440786, + "balance_loss_mlp": 1.04052567, + "epoch": 0.038238388696828496, + "flos": 20484309922560.0, + "grad_norm": 2.7305143207672726, + "language_loss": 0.83529603, + "learning_rate": 3.985672363790243e-06, + "loss": 0.857728, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.4453125, + "step": 636, + "time_per_iteration": 2.430511951446533 + }, + { + "auxiliary_loss_clip": 0.01187786, + "auxiliary_loss_mlp": 0.01063098, + "balance_loss_clip": 1.02938521, + "balance_loss_mlp": 1.04468215, + "epoch": 0.03829851194949647, + "flos": 17964977414400.0, + "grad_norm": 2.82889058687413, + "language_loss": 0.79160106, + "learning_rate": 3.985627184810206e-06, + "loss": 0.81410992, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.4296875, + "step": 637, + "time_per_iteration": 2.394228219985962 + }, + { + "auxiliary_loss_clip": 0.01189064, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_clip": 1.03841865, + "balance_loss_mlp": 1.04261327, + "epoch": 0.03835863520216444, + "flos": 22455401351040.0, + "grad_norm": 2.291592706612894, + "language_loss": 0.83631814, + "learning_rate": 3.985581934968241e-06, + "loss": 0.85895246, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.46875, + "step": 638, + "time_per_iteration": 2.4217000007629395 + }, + { + "auxiliary_loss_clip": 0.01196848, + "auxiliary_loss_mlp": 0.01064665, + "balance_loss_clip": 1.02909291, + "balance_loss_mlp": 1.04514432, + "epoch": 0.038418758454832405, + "flos": 22163317983360.0, + "grad_norm": 3.242665113678473, + "language_loss": 0.70392871, + "learning_rate": 3.985536614265964e-06, + "loss": 0.72654378, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.515625, + "step": 639, + "time_per_iteration": 2.434626579284668 + }, + { + "auxiliary_loss_clip": 0.01188056, + "auxiliary_loss_mlp": 0.01069986, + "balance_loss_clip": 1.03379369, + "balance_loss_mlp": 1.04202294, + "epoch": 0.03847888170750038, + "flos": 22746332643840.0, + "grad_norm": 5.606862574968034, + "language_loss": 0.84624588, + "learning_rate": 3.985491222704994e-06, + "loss": 0.86882633, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.4609375, + "step": 640, + "time_per_iteration": 2.431072235107422 + }, + { + "auxiliary_loss_clip": 0.01191819, + "auxiliary_loss_mlp": 0.01069098, + "balance_loss_clip": 1.03369319, + "balance_loss_mlp": 1.04466319, + "epoch": 0.03853900496016834, + "flos": 22710092785920.0, + "grad_norm": 2.7125576891372547, + "language_loss": 0.82238823, + "learning_rate": 3.985445760286949e-06, + "loss": 0.84499741, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.4765625, + "step": 641, + "time_per_iteration": 2.419487714767456 + }, + { + "auxiliary_loss_clip": 0.01068748, + "auxiliary_loss_mlp": 0.01012281, + "balance_loss_clip": 1.00498581, + "balance_loss_mlp": 1.01523471, + "epoch": 0.038599128212836314, + "flos": 70395652569600.0, + "grad_norm": 0.8889064780781849, + "language_loss": 0.65465635, + "learning_rate": 3.985400227013452e-06, + "loss": 0.67546666, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.07275391, + "router_z_loss_mlp": 0.53515625, + "step": 642, + "time_per_iteration": 3.0247628688812256 + }, + { + "auxiliary_loss_clip": 0.01191587, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.02100134, + "balance_loss_mlp": 1.04353416, + "epoch": 0.03865925146550429, + "flos": 23294015141760.0, + "grad_norm": 1.994719867029607, + "language_loss": 0.79217535, + "learning_rate": 3.985354622886128e-06, + "loss": 0.81462955, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.484375, + "step": 643, + "time_per_iteration": 2.430391788482666 + }, + { + "auxiliary_loss_clip": 0.01188071, + "auxiliary_loss_mlp": 0.01072142, + "balance_loss_clip": 1.03945529, + "balance_loss_mlp": 1.04187131, + "epoch": 0.03871937471817225, + "flos": 21429478782720.0, + "grad_norm": 1.7826475929274195, + "language_loss": 0.82554638, + "learning_rate": 3.985308947906604e-06, + "loss": 0.84814858, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.46875, + "step": 644, + "time_per_iteration": 2.419905185699463 + }, + { + "auxiliary_loss_clip": 0.01190707, + "auxiliary_loss_mlp": 0.01065234, + "balance_loss_clip": 1.02885127, + "balance_loss_mlp": 1.04173517, + "epoch": 0.038779497970840224, + "flos": 34275875345280.0, + "grad_norm": 2.5220844419508697, + "language_loss": 0.82106018, + "learning_rate": 3.985263202076511e-06, + "loss": 0.84361959, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.484375, + "step": 645, + "time_per_iteration": 3.919546604156494 + }, + { + "auxiliary_loss_clip": 0.01194109, + "auxiliary_loss_mlp": 0.01069637, + "balance_loss_clip": 1.03518534, + "balance_loss_mlp": 1.04296374, + "epoch": 0.03883962122350819, + "flos": 22747065782400.0, + "grad_norm": 2.5742157379080894, + "language_loss": 0.81492043, + "learning_rate": 3.985217385397481e-06, + "loss": 0.83755791, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.515625, + "step": 646, + "time_per_iteration": 2.4276058673858643 + }, + { + "auxiliary_loss_clip": 0.01192965, + "auxiliary_loss_mlp": 0.01076988, + "balance_loss_clip": 1.04070067, + "balance_loss_mlp": 1.04868424, + "epoch": 0.03889974447617616, + "flos": 21944726760960.0, + "grad_norm": 1.7673884490273624, + "language_loss": 0.81530958, + "learning_rate": 3.985171497871149e-06, + "loss": 0.83800912, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.4453125, + "step": 647, + "time_per_iteration": 2.4281909465789795 + }, + { + "auxiliary_loss_clip": 0.01189059, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_clip": 1.03259313, + "balance_loss_mlp": 1.04276097, + "epoch": 0.03895986772884413, + "flos": 31503457324800.0, + "grad_norm": 1.9542556086114053, + "language_loss": 0.8414427, + "learning_rate": 3.985125539499152e-06, + "loss": 0.86400187, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.46875, + "step": 648, + "time_per_iteration": 5.353910207748413 + }, + { + "auxiliary_loss_clip": 0.01189546, + "auxiliary_loss_mlp": 0.01059021, + "balance_loss_clip": 1.02609563, + "balance_loss_mlp": 1.04522252, + "epoch": 0.0390199909815121, + "flos": 19900003541760.0, + "grad_norm": 2.0317345177524047, + "language_loss": 0.84429526, + "learning_rate": 3.9850795102831315e-06, + "loss": 0.86678088, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.4375, + "step": 649, + "time_per_iteration": 2.4122958183288574 + }, + { + "auxiliary_loss_clip": 0.0118993, + "auxiliary_loss_mlp": 0.01065205, + "balance_loss_clip": 1.02891779, + "balance_loss_mlp": 1.04358125, + "epoch": 0.03908011423418007, + "flos": 21611515944960.0, + "grad_norm": 1.8540114561548637, + "language_loss": 0.79612905, + "learning_rate": 3.9850334102247295e-06, + "loss": 0.81868041, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.4609375, + "step": 650, + "time_per_iteration": 2.431824207305908 + }, + { + "auxiliary_loss_clip": 0.01184543, + "auxiliary_loss_mlp": 0.01065113, + "balance_loss_clip": 1.03082883, + "balance_loss_mlp": 1.04022264, + "epoch": 0.039140237486848035, + "flos": 18660412252800.0, + "grad_norm": 2.2190536894804413, + "language_loss": 0.78213829, + "learning_rate": 3.984987239325592e-06, + "loss": 0.80463487, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.4453125, + "step": 651, + "time_per_iteration": 3.855437994003296 + }, + { + "auxiliary_loss_clip": 0.01187028, + "auxiliary_loss_mlp": 0.01068847, + "balance_loss_clip": 1.0326786, + "balance_loss_mlp": 1.04069173, + "epoch": 0.03920036073951601, + "flos": 18660132961920.0, + "grad_norm": 3.7514277523504167, + "language_loss": 0.87278444, + "learning_rate": 3.984940997587364e-06, + "loss": 0.89534318, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.4609375, + "step": 652, + "time_per_iteration": 2.42366099357605 + }, + { + "auxiliary_loss_clip": 0.01177951, + "auxiliary_loss_mlp": 0.01062097, + "balance_loss_clip": 1.02807426, + "balance_loss_mlp": 1.0395844, + "epoch": 0.03926048399218398, + "flos": 31353226277760.0, + "grad_norm": 2.587133442199089, + "language_loss": 0.79192305, + "learning_rate": 3.984894685011699e-06, + "loss": 0.8143236, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.3828125, + "step": 653, + "time_per_iteration": 2.447777271270752 + }, + { + "auxiliary_loss_clip": 0.01190905, + "auxiliary_loss_mlp": 0.01072108, + "balance_loss_clip": 1.03312695, + "balance_loss_mlp": 1.04188657, + "epoch": 0.039320607244851945, + "flos": 29602297082880.0, + "grad_norm": 2.3614461284920583, + "language_loss": 0.8583045, + "learning_rate": 3.984848301600248e-06, + "loss": 0.8809346, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 1.4921875, + "step": 654, + "time_per_iteration": 2.454594612121582 + }, + { + "auxiliary_loss_clip": 0.01065384, + "auxiliary_loss_mlp": 0.01015938, + "balance_loss_clip": 1.00940573, + "balance_loss_mlp": 1.01152527, + "epoch": 0.03938073049751992, + "flos": 66531151221120.0, + "grad_norm": 0.7151695118935018, + "language_loss": 0.49906549, + "learning_rate": 3.984801847354667e-06, + "loss": 0.51987869, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.06542969, + "router_z_loss_mlp": 0.5390625, + "step": 655, + "time_per_iteration": 3.1038155555725098 + }, + { + "auxiliary_loss_clip": 0.01184583, + "auxiliary_loss_mlp": 0.01064343, + "balance_loss_clip": 1.03072572, + "balance_loss_mlp": 1.0431056, + "epoch": 0.03944085375018788, + "flos": 23366704325760.0, + "grad_norm": 2.3215592261136413, + "language_loss": 0.80955482, + "learning_rate": 3.984755322276614e-06, + "loss": 0.83204401, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.4140625, + "step": 656, + "time_per_iteration": 2.4134862422943115 + }, + { + "auxiliary_loss_clip": 0.01196188, + "auxiliary_loss_mlp": 0.01077883, + "balance_loss_clip": 1.03923488, + "balance_loss_mlp": 1.04710519, + "epoch": 0.039500977002855854, + "flos": 18547398581760.0, + "grad_norm": 2.5540636579470912, + "language_loss": 0.76357615, + "learning_rate": 3.9847087263677485e-06, + "loss": 0.78631687, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 1.484375, + "step": 657, + "time_per_iteration": 2.357095718383789 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01061956, + "balance_loss_clip": 1.02576399, + "balance_loss_mlp": 1.04402542, + "epoch": 0.039561100255523826, + "flos": 25336992792960.0, + "grad_norm": 1.9004998410713654, + "language_loss": 0.8134166, + "learning_rate": 3.984662059629734e-06, + "loss": 0.83596575, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.4921875, + "step": 658, + "time_per_iteration": 2.44077205657959 + }, + { + "auxiliary_loss_clip": 0.01183464, + "auxiliary_loss_mlp": 0.01061993, + "balance_loss_clip": 1.0252049, + "balance_loss_mlp": 1.04198444, + "epoch": 0.03962122350819179, + "flos": 18219005533440.0, + "grad_norm": 2.0504285700224885, + "language_loss": 0.9085809, + "learning_rate": 3.984615322064235e-06, + "loss": 0.93103546, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.4140625, + "step": 659, + "time_per_iteration": 2.4118030071258545 + }, + { + "auxiliary_loss_clip": 0.01187671, + "auxiliary_loss_mlp": 0.01062738, + "balance_loss_clip": 1.027619, + "balance_loss_mlp": 1.04179525, + "epoch": 0.03968134676085976, + "flos": 20521178184960.0, + "grad_norm": 2.7338367910896078, + "language_loss": 0.78944838, + "learning_rate": 3.9845685136729215e-06, + "loss": 0.81195241, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.453125, + "step": 660, + "time_per_iteration": 2.4234118461608887 + }, + { + "auxiliary_loss_clip": 0.01187967, + "auxiliary_loss_mlp": 0.01062019, + "balance_loss_clip": 1.02594614, + "balance_loss_mlp": 1.04496956, + "epoch": 0.03974147001352773, + "flos": 22421395820160.0, + "grad_norm": 1.6276707879309493, + "language_loss": 0.81347334, + "learning_rate": 3.984521634457461e-06, + "loss": 0.8359732, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.4296875, + "step": 661, + "time_per_iteration": 2.444849729537964 + }, + { + "auxiliary_loss_clip": 0.01060966, + "auxiliary_loss_mlp": 0.01010318, + "balance_loss_clip": 1.00388038, + "balance_loss_mlp": 1.00876069, + "epoch": 0.0398015932661957, + "flos": 71125267495680.0, + "grad_norm": 0.9225560296975938, + "language_loss": 0.69447446, + "learning_rate": 3.98447468441953e-06, + "loss": 0.71518731, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.06445312, + "router_z_loss_mlp": 0.5234375, + "step": 662, + "time_per_iteration": 3.17541766166687 + }, + { + "auxiliary_loss_clip": 0.01189675, + "auxiliary_loss_mlp": 0.01070304, + "balance_loss_clip": 1.03501832, + "balance_loss_mlp": 1.0454495, + "epoch": 0.03986171651886367, + "flos": 16799995434240.0, + "grad_norm": 1.8352192519331945, + "language_loss": 0.82945752, + "learning_rate": 3.984427663560801e-06, + "loss": 0.85205734, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.4453125, + "step": 663, + "time_per_iteration": 2.4322595596313477 + }, + { + "auxiliary_loss_clip": 0.01194363, + "auxiliary_loss_mlp": 0.01058858, + "balance_loss_clip": 1.02304792, + "balance_loss_mlp": 1.04646635, + "epoch": 0.03992183977153164, + "flos": 24533920632960.0, + "grad_norm": 2.3611688473755743, + "language_loss": 0.87116724, + "learning_rate": 3.984380571882954e-06, + "loss": 0.89369941, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.4765625, + "step": 664, + "time_per_iteration": 2.4412331581115723 + }, + { + "auxiliary_loss_clip": 0.01183807, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_clip": 1.0302825, + "balance_loss_mlp": 1.0417701, + "epoch": 0.03998196302419961, + "flos": 15595003169280.0, + "grad_norm": 2.1207161045014273, + "language_loss": 0.84756935, + "learning_rate": 3.984333409387668e-06, + "loss": 0.8700524, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.421875, + "step": 665, + "time_per_iteration": 2.3960518836975098 + }, + { + "auxiliary_loss_clip": 0.01195958, + "auxiliary_loss_mlp": 0.01069138, + "balance_loss_clip": 1.03170609, + "balance_loss_mlp": 1.04581308, + "epoch": 0.04004208627686758, + "flos": 25303790223360.0, + "grad_norm": 2.1601006873638107, + "language_loss": 0.81672788, + "learning_rate": 3.984286176076628e-06, + "loss": 0.83937883, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 1.5, + "step": 666, + "time_per_iteration": 2.4813365936279297 + }, + { + "auxiliary_loss_clip": 0.01185139, + "auxiliary_loss_mlp": 0.01059999, + "balance_loss_clip": 1.02240062, + "balance_loss_mlp": 1.04265499, + "epoch": 0.04010220952953555, + "flos": 23474760583680.0, + "grad_norm": 1.9467012559641645, + "language_loss": 0.86658657, + "learning_rate": 3.984238871951518e-06, + "loss": 0.88903797, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 1.421875, + "step": 667, + "time_per_iteration": 2.4209389686584473 + }, + { + "auxiliary_loss_clip": 0.01183022, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.02634835, + "balance_loss_mlp": 1.04454565, + "epoch": 0.04016233278220352, + "flos": 18616247516160.0, + "grad_norm": 2.0724305921822808, + "language_loss": 0.80607831, + "learning_rate": 3.984191497014026e-06, + "loss": 0.82851821, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 1.3828125, + "step": 668, + "time_per_iteration": 2.380335807800293 + }, + { + "auxiliary_loss_clip": 0.01057149, + "auxiliary_loss_mlp": 0.01008952, + "balance_loss_clip": 1.00287223, + "balance_loss_mlp": 1.00661552, + "epoch": 0.040222456034871484, + "flos": 70902801112320.0, + "grad_norm": 0.7811409650925238, + "language_loss": 0.6007818, + "learning_rate": 3.984144051265844e-06, + "loss": 0.62144279, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.06079102, + "router_z_loss_mlp": 0.5078125, + "step": 669, + "time_per_iteration": 3.1673338413238525 + }, + { + "auxiliary_loss_clip": 0.01183658, + "auxiliary_loss_mlp": 0.01063918, + "balance_loss_clip": 1.02872753, + "balance_loss_mlp": 1.04043889, + "epoch": 0.040282579287539456, + "flos": 23763701928960.0, + "grad_norm": 1.7426617425744348, + "language_loss": 0.86253875, + "learning_rate": 3.984096534708665e-06, + "loss": 0.88501447, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.4296875, + "step": 670, + "time_per_iteration": 2.4509003162384033 + }, + { + "auxiliary_loss_clip": 0.01184797, + "auxiliary_loss_mlp": 0.01061892, + "balance_loss_clip": 1.02713096, + "balance_loss_mlp": 1.04237092, + "epoch": 0.04034270254020743, + "flos": 18477537217920.0, + "grad_norm": 6.661894128042471, + "language_loss": 0.74786806, + "learning_rate": 3.9840489473441835e-06, + "loss": 0.77033496, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.4296875, + "step": 671, + "time_per_iteration": 2.4203484058380127 + }, + { + "auxiliary_loss_clip": 0.01189964, + "auxiliary_loss_mlp": 0.01067379, + "balance_loss_clip": 1.03228378, + "balance_loss_mlp": 1.04595947, + "epoch": 0.040402825792875394, + "flos": 17200903109760.0, + "grad_norm": 1.921792455658059, + "language_loss": 0.92102182, + "learning_rate": 3.984001289174099e-06, + "loss": 0.94359517, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.4375, + "step": 672, + "time_per_iteration": 2.3875322341918945 + }, + { + "auxiliary_loss_clip": 0.01188382, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_clip": 1.03127718, + "balance_loss_mlp": 1.04510617, + "epoch": 0.040462949045543366, + "flos": 19171156665600.0, + "grad_norm": 5.893165256633166, + "language_loss": 0.90170169, + "learning_rate": 3.983953560200113e-06, + "loss": 0.92426401, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 1.4375, + "step": 673, + "time_per_iteration": 2.4174492359161377 + }, + { + "auxiliary_loss_clip": 0.01184043, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_clip": 1.03382957, + "balance_loss_mlp": 1.04199457, + "epoch": 0.04052307229821133, + "flos": 24018812300160.0, + "grad_norm": 1.8851288699257294, + "language_loss": 0.74678195, + "learning_rate": 3.983905760423926e-06, + "loss": 0.76933217, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.421875, + "step": 674, + "time_per_iteration": 2.4453983306884766 + }, + { + "auxiliary_loss_clip": 0.01192146, + "auxiliary_loss_mlp": 0.01056601, + "balance_loss_clip": 1.01995611, + "balance_loss_mlp": 1.043841, + "epoch": 0.0405831955508793, + "flos": 16435641818880.0, + "grad_norm": 2.672433531864122, + "language_loss": 0.77962393, + "learning_rate": 3.983857889847247e-06, + "loss": 0.80211139, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.484375, + "step": 675, + "time_per_iteration": 2.4712469577789307 + }, + { + "auxiliary_loss_clip": 0.01188736, + "auxiliary_loss_mlp": 0.01067069, + "balance_loss_clip": 1.03259361, + "balance_loss_mlp": 1.04343069, + "epoch": 0.040643318803547275, + "flos": 24278775350400.0, + "grad_norm": 1.777188058958025, + "language_loss": 0.76703358, + "learning_rate": 3.983809948471783e-06, + "loss": 0.78959161, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.453125, + "step": 676, + "time_per_iteration": 2.42793607711792 + }, + { + "auxiliary_loss_clip": 0.01190334, + "auxiliary_loss_mlp": 0.01066163, + "balance_loss_clip": 1.03082967, + "balance_loss_mlp": 1.04389369, + "epoch": 0.04070344205621524, + "flos": 17711123852160.0, + "grad_norm": 2.550943853737293, + "language_loss": 0.84916627, + "learning_rate": 3.983761936299245e-06, + "loss": 0.87173128, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.4609375, + "step": 677, + "time_per_iteration": 2.4127068519592285 + }, + { + "auxiliary_loss_clip": 0.01185319, + "auxiliary_loss_mlp": 0.01062526, + "balance_loss_clip": 1.02757335, + "balance_loss_mlp": 1.04432964, + "epoch": 0.04076356530888321, + "flos": 26176444456320.0, + "grad_norm": 1.9381617410757228, + "language_loss": 0.76106936, + "learning_rate": 3.983713853331345e-06, + "loss": 0.78354776, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.40625, + "step": 678, + "time_per_iteration": 2.5064332485198975 + }, + { + "auxiliary_loss_clip": 0.01187, + "auxiliary_loss_mlp": 0.01064472, + "balance_loss_clip": 1.028018, + "balance_loss_mlp": 1.04322159, + "epoch": 0.04082368856155118, + "flos": 35771973459840.0, + "grad_norm": 1.9634592798462205, + "language_loss": 0.82002586, + "learning_rate": 3.9836656995698015e-06, + "loss": 0.84254062, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.4375, + "step": 679, + "time_per_iteration": 2.528810739517212 + }, + { + "auxiliary_loss_clip": 0.01190761, + "auxiliary_loss_mlp": 0.01058053, + "balance_loss_clip": 1.02450764, + "balance_loss_mlp": 1.04950869, + "epoch": 0.04088381181421915, + "flos": 28145406291840.0, + "grad_norm": 3.685127405500079, + "language_loss": 0.76211154, + "learning_rate": 3.983617475016331e-06, + "loss": 0.78459966, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.4140625, + "step": 680, + "time_per_iteration": 2.4731523990631104 + }, + { + "auxiliary_loss_clip": 0.01187026, + "auxiliary_loss_mlp": 0.01062935, + "balance_loss_clip": 1.02447796, + "balance_loss_mlp": 1.03947425, + "epoch": 0.04094393506688712, + "flos": 27596501896320.0, + "grad_norm": 1.9793853535666257, + "language_loss": 0.83050603, + "learning_rate": 3.9835691796726555e-06, + "loss": 0.85300565, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 1.4765625, + "step": 681, + "time_per_iteration": 2.4579238891601562 + }, + { + "auxiliary_loss_clip": 0.01188714, + "auxiliary_loss_mlp": 0.01065756, + "balance_loss_clip": 1.02758527, + "balance_loss_mlp": 1.04244184, + "epoch": 0.04100405831955509, + "flos": 23110930638720.0, + "grad_norm": 1.850859883141676, + "language_loss": 0.71165198, + "learning_rate": 3.9835208135404986e-06, + "loss": 0.73419666, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 1.4609375, + "step": 682, + "time_per_iteration": 2.428924798965454 + }, + { + "auxiliary_loss_clip": 0.01183112, + "auxiliary_loss_mlp": 0.01065471, + "balance_loss_clip": 1.0303278, + "balance_loss_mlp": 1.04071558, + "epoch": 0.04106418157222306, + "flos": 20155707406080.0, + "grad_norm": 1.6317417738527569, + "language_loss": 0.72059846, + "learning_rate": 3.9834723766215865e-06, + "loss": 0.74308419, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.421875, + "step": 683, + "time_per_iteration": 2.3904831409454346 + }, + { + "auxiliary_loss_clip": 0.01184685, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.03223693, + "balance_loss_mlp": 1.04592633, + "epoch": 0.041124304824891024, + "flos": 17419738711680.0, + "grad_norm": 2.182818138980505, + "language_loss": 0.81072485, + "learning_rate": 3.983423868917646e-06, + "loss": 0.83323312, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.3828125, + "step": 684, + "time_per_iteration": 2.383819818496704 + }, + { + "auxiliary_loss_clip": 0.01188177, + "auxiliary_loss_mlp": 0.01062482, + "balance_loss_clip": 1.02621865, + "balance_loss_mlp": 1.04425693, + "epoch": 0.041184428077558996, + "flos": 25778853360000.0, + "grad_norm": 1.6602237229884422, + "language_loss": 0.9059425, + "learning_rate": 3.983375290430411e-06, + "loss": 0.92844909, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.4375, + "step": 685, + "time_per_iteration": 3.942558526992798 + }, + { + "auxiliary_loss_clip": 0.01185363, + "auxiliary_loss_mlp": 0.01061075, + "balance_loss_clip": 1.0242871, + "balance_loss_mlp": 1.04252374, + "epoch": 0.04124455133022697, + "flos": 22963701968640.0, + "grad_norm": 2.020637774355877, + "language_loss": 0.88082665, + "learning_rate": 3.983326641161613e-06, + "loss": 0.90329105, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.4296875, + "step": 686, + "time_per_iteration": 2.41579270362854 + }, + { + "auxiliary_loss_clip": 0.01187174, + "auxiliary_loss_mlp": 0.01067066, + "balance_loss_clip": 1.02963471, + "balance_loss_mlp": 1.04243541, + "epoch": 0.04130467458289493, + "flos": 21287975575680.0, + "grad_norm": 1.8269890377548201, + "language_loss": 0.71391737, + "learning_rate": 3.9832779211129894e-06, + "loss": 0.73645979, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 1.453125, + "step": 687, + "time_per_iteration": 2.451077461242676 + }, + { + "auxiliary_loss_clip": 0.01183612, + "auxiliary_loss_mlp": 0.01060901, + "balance_loss_clip": 1.02704561, + "balance_loss_mlp": 1.046556, + "epoch": 0.041364797835562905, + "flos": 19973216396160.0, + "grad_norm": 1.5390372221479989, + "language_loss": 0.8611179, + "learning_rate": 3.983229130286278e-06, + "loss": 0.88356304, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.3671875, + "step": 688, + "time_per_iteration": 5.2253546714782715 + }, + { + "auxiliary_loss_clip": 0.01181434, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_clip": 1.03534508, + "balance_loss_mlp": 1.04390609, + "epoch": 0.04142492108823087, + "flos": 21905205235200.0, + "grad_norm": 1.8610843488901465, + "language_loss": 0.83315575, + "learning_rate": 3.98318026868322e-06, + "loss": 0.85567975, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.375, + "step": 689, + "time_per_iteration": 2.4173686504364014 + }, + { + "auxiliary_loss_clip": 0.01183212, + "auxiliary_loss_mlp": 0.01069871, + "balance_loss_clip": 1.03622985, + "balance_loss_mlp": 1.04125214, + "epoch": 0.04148504434089884, + "flos": 27638292660480.0, + "grad_norm": 2.3858573184890948, + "language_loss": 0.68026263, + "learning_rate": 3.9831313363055606e-06, + "loss": 0.70279348, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.421875, + "step": 690, + "time_per_iteration": 3.858558416366577 + }, + { + "auxiliary_loss_clip": 0.01178753, + "auxiliary_loss_mlp": 0.01064627, + "balance_loss_clip": 1.02952015, + "balance_loss_mlp": 1.03993392, + "epoch": 0.041545167593566815, + "flos": 20517442669440.0, + "grad_norm": 2.23165164324267, + "language_loss": 0.74733639, + "learning_rate": 3.9830823331550445e-06, + "loss": 0.76977026, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.390625, + "step": 691, + "time_per_iteration": 2.3846516609191895 + }, + { + "auxiliary_loss_clip": 0.01179734, + "auxiliary_loss_mlp": 0.01071205, + "balance_loss_clip": 1.03522789, + "balance_loss_mlp": 1.04067516, + "epoch": 0.04160529084623478, + "flos": 11868269512320.0, + "grad_norm": 2.2544470318593404, + "language_loss": 0.84076923, + "learning_rate": 3.983033259233421e-06, + "loss": 0.86327863, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.390625, + "step": 692, + "time_per_iteration": 2.367363214492798 + }, + { + "auxiliary_loss_clip": 0.01186203, + "auxiliary_loss_mlp": 0.01064559, + "balance_loss_clip": 1.02877247, + "balance_loss_mlp": 1.04257679, + "epoch": 0.04166541409890275, + "flos": 14827472640000.0, + "grad_norm": 2.7512087519687785, + "language_loss": 0.88303667, + "learning_rate": 3.982984114542442e-06, + "loss": 0.90554428, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 1.4375, + "step": 693, + "time_per_iteration": 2.3654260635375977 + }, + { + "auxiliary_loss_clip": 0.01184472, + "auxiliary_loss_mlp": 0.01060751, + "balance_loss_clip": 1.02789724, + "balance_loss_mlp": 1.04375386, + "epoch": 0.04172553735157072, + "flos": 25807063605120.0, + "grad_norm": 2.1493026141193754, + "language_loss": 0.81644607, + "learning_rate": 3.98293489908386e-06, + "loss": 0.8388983, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.40625, + "step": 694, + "time_per_iteration": 2.4572255611419678 + }, + { + "auxiliary_loss_clip": 0.01185629, + "auxiliary_loss_mlp": 0.01062164, + "balance_loss_clip": 1.02723527, + "balance_loss_mlp": 1.04134369, + "epoch": 0.04178566060423869, + "flos": 24278670616320.0, + "grad_norm": 1.9838667020409235, + "language_loss": 0.8338263, + "learning_rate": 3.982885612859432e-06, + "loss": 0.85630423, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.4375, + "step": 695, + "time_per_iteration": 2.423081398010254 + }, + { + "auxiliary_loss_clip": 0.01187447, + "auxiliary_loss_mlp": 0.0107011, + "balance_loss_clip": 1.03351307, + "balance_loss_mlp": 1.0442071, + "epoch": 0.04184578385690666, + "flos": 18221065303680.0, + "grad_norm": 2.0993753783977223, + "language_loss": 0.84214848, + "learning_rate": 3.982836255870918e-06, + "loss": 0.86472404, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.4296875, + "step": 696, + "time_per_iteration": 2.3784396648406982 + }, + { + "auxiliary_loss_clip": 0.01182217, + "auxiliary_loss_mlp": 0.01071365, + "balance_loss_clip": 1.03605509, + "balance_loss_mlp": 1.04098535, + "epoch": 0.041905907109574626, + "flos": 22775450584320.0, + "grad_norm": 2.124818166614912, + "language_loss": 0.9306224, + "learning_rate": 3.982786828120078e-06, + "loss": 0.95315826, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.4140625, + "step": 697, + "time_per_iteration": 2.40679669380188 + }, + { + "auxiliary_loss_clip": 0.01178436, + "auxiliary_loss_mlp": 0.01060785, + "balance_loss_clip": 1.02650046, + "balance_loss_mlp": 1.04041481, + "epoch": 0.0419660303622426, + "flos": 20155916874240.0, + "grad_norm": 2.274826421908768, + "language_loss": 0.8352983, + "learning_rate": 3.982737329608676e-06, + "loss": 0.85769051, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.3828125, + "step": 698, + "time_per_iteration": 2.4244003295898438 + }, + { + "auxiliary_loss_clip": 0.01183744, + "auxiliary_loss_mlp": 0.01074651, + "balance_loss_clip": 1.03903079, + "balance_loss_mlp": 1.04241085, + "epoch": 0.042026153614910564, + "flos": 23075249362560.0, + "grad_norm": 2.40989884291235, + "language_loss": 0.91279924, + "learning_rate": 3.98268776033848e-06, + "loss": 0.9353832, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.4140625, + "step": 699, + "time_per_iteration": 2.4093446731567383 + }, + { + "auxiliary_loss_clip": 0.0106356, + "auxiliary_loss_mlp": 0.01011575, + "balance_loss_clip": 1.00621021, + "balance_loss_mlp": 1.01348925, + "epoch": 0.042086276867578536, + "flos": 64491734528640.0, + "grad_norm": 0.8829531456366362, + "language_loss": 0.67870784, + "learning_rate": 3.9826381203112575e-06, + "loss": 0.6994592, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.5, + "step": 700, + "time_per_iteration": 3.081566333770752 + }, + { + "auxiliary_loss_clip": 0.01188067, + "auxiliary_loss_mlp": 0.01067494, + "balance_loss_clip": 1.02786875, + "balance_loss_mlp": 1.04447913, + "epoch": 0.04214640012024651, + "flos": 15486109038720.0, + "grad_norm": 2.4898711189618576, + "language_loss": 0.88516855, + "learning_rate": 3.98258840952878e-06, + "loss": 0.90772414, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 1.4375, + "step": 701, + "time_per_iteration": 2.3736653327941895 + }, + { + "auxiliary_loss_clip": 0.01184988, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_clip": 1.03656125, + "balance_loss_mlp": 1.04638183, + "epoch": 0.04220652337291447, + "flos": 23875947550080.0, + "grad_norm": 1.7072433387743238, + "language_loss": 0.67324317, + "learning_rate": 3.982538627992822e-06, + "loss": 0.69581294, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.390625, + "step": 702, + "time_per_iteration": 2.4374215602874756 + }, + { + "auxiliary_loss_clip": 0.0105983, + "auxiliary_loss_mlp": 0.01005343, + "balance_loss_clip": 0.99969298, + "balance_loss_mlp": 1.00805283, + "epoch": 0.042266646625582445, + "flos": 63792145238400.0, + "grad_norm": 0.83391300942417, + "language_loss": 0.60691524, + "learning_rate": 3.98248877570516e-06, + "loss": 0.62756693, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.515625, + "step": 703, + "time_per_iteration": 3.142807722091675 + }, + { + "auxiliary_loss_clip": 0.01056487, + "auxiliary_loss_mlp": 0.01007253, + "balance_loss_clip": 1.00181758, + "balance_loss_mlp": 1.00678504, + "epoch": 0.04232676987825041, + "flos": 50015521877760.0, + "grad_norm": 1.0100126866570873, + "language_loss": 0.57689762, + "learning_rate": 3.982438852667574e-06, + "loss": 0.59753501, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.05444336, + "router_z_loss_mlp": 0.49609375, + "step": 704, + "time_per_iteration": 2.9380300045013428 + }, + { + "auxiliary_loss_clip": 0.01187186, + "auxiliary_loss_mlp": 0.01060862, + "balance_loss_clip": 1.02455127, + "balance_loss_mlp": 1.04617953, + "epoch": 0.04238689313091838, + "flos": 21615041992320.0, + "grad_norm": 2.3619916673472745, + "language_loss": 0.87409616, + "learning_rate": 3.982388858881844e-06, + "loss": 0.89657664, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.40625, + "step": 705, + "time_per_iteration": 2.4390175342559814 + }, + { + "auxiliary_loss_clip": 0.01174888, + "auxiliary_loss_mlp": 0.01060965, + "balance_loss_clip": 1.02885032, + "balance_loss_mlp": 1.04063582, + "epoch": 0.042447016383586354, + "flos": 19134113846400.0, + "grad_norm": 1.8293907438004477, + "language_loss": 0.71343666, + "learning_rate": 3.982338794349755e-06, + "loss": 0.7357952, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.34375, + "step": 706, + "time_per_iteration": 2.386361598968506 + }, + { + "auxiliary_loss_clip": 0.01177439, + "auxiliary_loss_mlp": 0.01066396, + "balance_loss_clip": 1.02932167, + "balance_loss_mlp": 1.04061365, + "epoch": 0.04250713963625432, + "flos": 24424851945600.0, + "grad_norm": 2.0036013581762693, + "language_loss": 0.93354023, + "learning_rate": 3.982288659073094e-06, + "loss": 0.95597857, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.3671875, + "step": 707, + "time_per_iteration": 2.44316029548645 + }, + { + "auxiliary_loss_clip": 0.01181426, + "auxiliary_loss_mlp": 0.01070727, + "balance_loss_clip": 1.03467846, + "balance_loss_mlp": 1.04033101, + "epoch": 0.04256726288892229, + "flos": 30366231742080.0, + "grad_norm": 2.361387935515631, + "language_loss": 0.8126626, + "learning_rate": 3.98223845305365e-06, + "loss": 0.8351841, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.40625, + "step": 708, + "time_per_iteration": 2.48256254196167 + }, + { + "auxiliary_loss_clip": 0.01187485, + "auxiliary_loss_mlp": 0.01067734, + "balance_loss_clip": 1.03063631, + "balance_loss_mlp": 1.04343319, + "epoch": 0.04262738614159026, + "flos": 16361730737280.0, + "grad_norm": 2.66574885580616, + "language_loss": 0.81993365, + "learning_rate": 3.982188176293213e-06, + "loss": 0.84248579, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.4375, + "step": 709, + "time_per_iteration": 2.407764434814453 + }, + { + "auxiliary_loss_clip": 0.0118863, + "auxiliary_loss_mlp": 0.0105774, + "balance_loss_clip": 1.0218581, + "balance_loss_mlp": 1.04400229, + "epoch": 0.04268750939425823, + "flos": 20411341447680.0, + "grad_norm": 2.3436319116749598, + "language_loss": 0.84847897, + "learning_rate": 3.982137828793581e-06, + "loss": 0.87094259, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.4453125, + "step": 710, + "time_per_iteration": 2.4123568534851074 + }, + { + "auxiliary_loss_clip": 0.0118838, + "auxiliary_loss_mlp": 0.01067121, + "balance_loss_clip": 1.03183496, + "balance_loss_mlp": 1.04673469, + "epoch": 0.0427476326469262, + "flos": 20301923646720.0, + "grad_norm": 2.682782371000687, + "language_loss": 0.84520423, + "learning_rate": 3.982087410556547e-06, + "loss": 0.86775929, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.421875, + "step": 711, + "time_per_iteration": 2.4092376232147217 + }, + { + "auxiliary_loss_clip": 0.01179871, + "auxiliary_loss_mlp": 0.01063639, + "balance_loss_clip": 1.02687526, + "balance_loss_mlp": 1.04195905, + "epoch": 0.042807755899594166, + "flos": 21649780661760.0, + "grad_norm": 1.7518729085008558, + "language_loss": 0.85324287, + "learning_rate": 3.982036921583912e-06, + "loss": 0.875678, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.375, + "step": 712, + "time_per_iteration": 2.4427237510681152 + }, + { + "auxiliary_loss_clip": 0.01187882, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.02557182, + "balance_loss_mlp": 1.04309583, + "epoch": 0.04286787915226214, + "flos": 21433912525440.0, + "grad_norm": 3.1329119544886876, + "language_loss": 0.91045451, + "learning_rate": 3.981986361877479e-06, + "loss": 0.93292952, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.453125, + "step": 713, + "time_per_iteration": 2.4084465503692627 + }, + { + "auxiliary_loss_clip": 0.01058632, + "auxiliary_loss_mlp": 0.01018882, + "balance_loss_clip": 1.01318336, + "balance_loss_mlp": 1.00909543, + "epoch": 0.04292800240493011, + "flos": 66394256313600.0, + "grad_norm": 0.8888646137103391, + "language_loss": 0.63704062, + "learning_rate": 3.9819357314390494e-06, + "loss": 0.65781581, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.05688477, + "router_z_loss_mlp": 0.49609375, + "step": 714, + "time_per_iteration": 3.1690354347229004 + }, + { + "auxiliary_loss_clip": 0.01181426, + "auxiliary_loss_mlp": 0.01074384, + "balance_loss_clip": 1.04050469, + "balance_loss_mlp": 1.04453063, + "epoch": 0.042988125657598075, + "flos": 31648905515520.0, + "grad_norm": 2.1541672162311065, + "language_loss": 0.74600798, + "learning_rate": 3.981885030270432e-06, + "loss": 0.76856601, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.375, + "step": 715, + "time_per_iteration": 2.5548620223999023 + }, + { + "auxiliary_loss_clip": 0.01186135, + "auxiliary_loss_mlp": 0.01066759, + "balance_loss_clip": 1.02880299, + "balance_loss_mlp": 1.04575384, + "epoch": 0.04304824891026605, + "flos": 33247264602240.0, + "grad_norm": 1.8331696717597785, + "language_loss": 0.72439748, + "learning_rate": 3.981834258373437e-06, + "loss": 0.74692643, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.40625, + "step": 716, + "time_per_iteration": 2.5122344493865967 + }, + { + "auxiliary_loss_clip": 0.01051827, + "auxiliary_loss_mlp": 0.01005289, + "balance_loss_clip": 0.99956697, + "balance_loss_mlp": 1.00391102, + "epoch": 0.04310837216293401, + "flos": 64061080508160.0, + "grad_norm": 0.9005979170358152, + "language_loss": 0.65497255, + "learning_rate": 3.981783415749874e-06, + "loss": 0.67554367, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.05712891, + "router_z_loss_mlp": 0.48046875, + "step": 717, + "time_per_iteration": 3.0917444229125977 + }, + { + "auxiliary_loss_clip": 0.01050883, + "auxiliary_loss_mlp": 0.01005029, + "balance_loss_clip": 0.99964118, + "balance_loss_mlp": 1.00407958, + "epoch": 0.043168495415601985, + "flos": 61340719057920.0, + "grad_norm": 0.9752943296857631, + "language_loss": 0.58790207, + "learning_rate": 3.9817325024015596e-06, + "loss": 0.6084612, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.05395508, + "router_z_loss_mlp": 0.46875, + "step": 718, + "time_per_iteration": 2.9039950370788574 + }, + { + "auxiliary_loss_clip": 0.01183786, + "auxiliary_loss_mlp": 0.01068357, + "balance_loss_clip": 1.03123498, + "balance_loss_mlp": 1.04652083, + "epoch": 0.04322861866826996, + "flos": 20703215347200.0, + "grad_norm": 1.9132710050399087, + "language_loss": 0.91328299, + "learning_rate": 3.9816815183303086e-06, + "loss": 0.93580437, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 1.375, + "step": 719, + "time_per_iteration": 2.425180435180664 + }, + { + "auxiliary_loss_clip": 0.01181178, + "auxiliary_loss_mlp": 0.01071679, + "balance_loss_clip": 1.03806162, + "balance_loss_mlp": 1.04153848, + "epoch": 0.04328874192093792, + "flos": 30372027027840.0, + "grad_norm": 1.6259532171414055, + "language_loss": 0.66515422, + "learning_rate": 3.981630463537942e-06, + "loss": 0.68768275, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.3984375, + "step": 720, + "time_per_iteration": 2.474658250808716 + }, + { + "auxiliary_loss_clip": 0.01180216, + "auxiliary_loss_mlp": 0.01068949, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.04478395, + "epoch": 0.043348865173605894, + "flos": 21943714331520.0, + "grad_norm": 2.3921306796946364, + "language_loss": 0.77202111, + "learning_rate": 3.981579338026282e-06, + "loss": 0.79451281, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.359375, + "step": 721, + "time_per_iteration": 2.426039457321167 + }, + { + "auxiliary_loss_clip": 0.01183751, + "auxiliary_loss_mlp": 0.01077762, + "balance_loss_clip": 1.04156971, + "balance_loss_mlp": 1.04415536, + "epoch": 0.04340898842627386, + "flos": 15263433187200.0, + "grad_norm": 2.822492763484581, + "language_loss": 0.88540536, + "learning_rate": 3.981528141797153e-06, + "loss": 0.9080205, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.390625, + "step": 722, + "time_per_iteration": 2.366525888442993 + }, + { + "auxiliary_loss_clip": 0.01190227, + "auxiliary_loss_mlp": 0.01067458, + "balance_loss_clip": 1.03372216, + "balance_loss_mlp": 1.04493773, + "epoch": 0.04346911167894183, + "flos": 27964172090880.0, + "grad_norm": 1.9351530887289412, + "language_loss": 0.84070444, + "learning_rate": 3.981476874852382e-06, + "loss": 0.86328125, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.453125, + "step": 723, + "time_per_iteration": 2.4779179096221924 + }, + { + "auxiliary_loss_clip": 0.01187966, + "auxiliary_loss_mlp": 0.01072437, + "balance_loss_clip": 1.03755665, + "balance_loss_mlp": 1.04717469, + "epoch": 0.0435292349316098, + "flos": 29240910933120.0, + "grad_norm": 1.9139788895422787, + "language_loss": 0.82327592, + "learning_rate": 3.981425537193796e-06, + "loss": 0.84587997, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.40625, + "step": 724, + "time_per_iteration": 3.8979430198669434 + }, + { + "auxiliary_loss_clip": 0.01182732, + "auxiliary_loss_mlp": 0.01068706, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.04488754, + "epoch": 0.04358935818427777, + "flos": 20557313308800.0, + "grad_norm": 1.8369900418297336, + "language_loss": 0.79121196, + "learning_rate": 3.981374128823232e-06, + "loss": 0.81372637, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 1.3828125, + "step": 725, + "time_per_iteration": 2.4161336421966553 + }, + { + "auxiliary_loss_clip": 0.01194352, + "auxiliary_loss_mlp": 0.01072789, + "balance_loss_clip": 1.03611982, + "balance_loss_mlp": 1.04817867, + "epoch": 0.04364948143694574, + "flos": 14464061631360.0, + "grad_norm": 2.12562964942191, + "language_loss": 0.86453843, + "learning_rate": 3.981322649742521e-06, + "loss": 0.88720989, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.4609375, + "step": 726, + "time_per_iteration": 2.3939080238342285 + }, + { + "auxiliary_loss_clip": 0.01053085, + "auxiliary_loss_mlp": 0.01004662, + "balance_loss_clip": 0.99972701, + "balance_loss_mlp": 1.00666237, + "epoch": 0.043709604689613706, + "flos": 50064610982400.0, + "grad_norm": 0.9096007450487622, + "language_loss": 0.55918157, + "learning_rate": 3.9812710999535005e-06, + "loss": 0.579759, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.04931641, + "router_z_loss_mlp": 0.46484375, + "step": 727, + "time_per_iteration": 4.51263689994812 + }, + { + "auxiliary_loss_clip": 0.01187576, + "auxiliary_loss_mlp": 0.01065319, + "balance_loss_clip": 1.02750552, + "balance_loss_mlp": 1.04635787, + "epoch": 0.04376972794228168, + "flos": 13990709151360.0, + "grad_norm": 1.8794382856077294, + "language_loss": 0.81984973, + "learning_rate": 3.981219479458012e-06, + "loss": 0.84237874, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 1.4140625, + "step": 728, + "time_per_iteration": 3.7680346965789795 + }, + { + "auxiliary_loss_clip": 0.01179165, + "auxiliary_loss_mlp": 0.01065663, + "balance_loss_clip": 1.03216577, + "balance_loss_mlp": 1.04445708, + "epoch": 0.04382985119494965, + "flos": 22009037218560.0, + "grad_norm": 2.383410342674767, + "language_loss": 0.76899624, + "learning_rate": 3.981167788257896e-06, + "loss": 0.79144454, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.34375, + "step": 729, + "time_per_iteration": 3.801180124282837 + }, + { + "auxiliary_loss_clip": 0.01184388, + "auxiliary_loss_mlp": 0.01062557, + "balance_loss_clip": 1.02722335, + "balance_loss_mlp": 1.04415679, + "epoch": 0.043889974447617615, + "flos": 24205387939200.0, + "grad_norm": 2.001423814994023, + "language_loss": 0.9496327, + "learning_rate": 3.9811160263549985e-06, + "loss": 0.97210211, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 1.40625, + "step": 730, + "time_per_iteration": 2.4248015880584717 + }, + { + "auxiliary_loss_clip": 0.01182056, + "auxiliary_loss_mlp": 0.0106814, + "balance_loss_clip": 1.03128076, + "balance_loss_mlp": 1.04220426, + "epoch": 0.04395009770028559, + "flos": 17273592293760.0, + "grad_norm": 2.3730318760653777, + "language_loss": 0.8861438, + "learning_rate": 3.981064193751166e-06, + "loss": 0.90864581, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.3984375, + "step": 731, + "time_per_iteration": 2.3986494541168213 + }, + { + "auxiliary_loss_clip": 0.01182096, + "auxiliary_loss_mlp": 0.01063611, + "balance_loss_clip": 1.02999365, + "balance_loss_mlp": 1.04369128, + "epoch": 0.04401022095295355, + "flos": 12309536586240.0, + "grad_norm": 2.8991978654045716, + "language_loss": 0.88705492, + "learning_rate": 3.981012290448247e-06, + "loss": 0.90951192, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.3828125, + "step": 732, + "time_per_iteration": 2.382936716079712 + }, + { + "auxiliary_loss_clip": 0.0118338, + "auxiliary_loss_mlp": 0.01062163, + "balance_loss_clip": 1.02690125, + "balance_loss_mlp": 1.04321599, + "epoch": 0.044070344205621524, + "flos": 20958605009280.0, + "grad_norm": 2.0845642292686395, + "language_loss": 0.86170357, + "learning_rate": 3.980960316448097e-06, + "loss": 0.88415903, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.40625, + "step": 733, + "time_per_iteration": 2.4018895626068115 + }, + { + "auxiliary_loss_clip": 0.01187338, + "auxiliary_loss_mlp": 0.01066768, + "balance_loss_clip": 1.03024256, + "balance_loss_mlp": 1.0461601, + "epoch": 0.044130467458289496, + "flos": 13844423088000.0, + "grad_norm": 4.290017182560329, + "language_loss": 0.90916038, + "learning_rate": 3.980908271752567e-06, + "loss": 0.93170148, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 1.4140625, + "step": 734, + "time_per_iteration": 2.3619699478149414 + }, + { + "auxiliary_loss_clip": 0.01180765, + "auxiliary_loss_mlp": 0.01060107, + "balance_loss_clip": 1.02715731, + "balance_loss_mlp": 1.04557741, + "epoch": 0.04419059071095746, + "flos": 28653881466240.0, + "grad_norm": 1.915025457554586, + "language_loss": 0.77842975, + "learning_rate": 3.980856156363518e-06, + "loss": 0.80083847, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.3515625, + "step": 735, + "time_per_iteration": 2.490703582763672 + }, + { + "auxiliary_loss_clip": 0.01177451, + "auxiliary_loss_mlp": 0.01060491, + "balance_loss_clip": 1.0279355, + "balance_loss_mlp": 1.04102802, + "epoch": 0.04425071396362543, + "flos": 28182065086080.0, + "grad_norm": 2.359563242556638, + "language_loss": 0.88532102, + "learning_rate": 3.980803970282806e-06, + "loss": 0.90770042, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.3671875, + "step": 736, + "time_per_iteration": 2.4679150581359863 + }, + { + "auxiliary_loss_clip": 0.01180427, + "auxiliary_loss_mlp": 0.01066654, + "balance_loss_clip": 1.03225017, + "balance_loss_mlp": 1.046525, + "epoch": 0.0443108372162934, + "flos": 23657356327680.0, + "grad_norm": 1.934699423655556, + "language_loss": 0.84254616, + "learning_rate": 3.980751713512298e-06, + "loss": 0.86501706, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.34375, + "step": 737, + "time_per_iteration": 2.4656014442443848 + }, + { + "auxiliary_loss_clip": 0.01185789, + "auxiliary_loss_mlp": 0.01069531, + "balance_loss_clip": 1.03195643, + "balance_loss_mlp": 1.04592919, + "epoch": 0.04437096046896137, + "flos": 33978590184960.0, + "grad_norm": 1.8838707245702677, + "language_loss": 0.84660316, + "learning_rate": 3.980699386053855e-06, + "loss": 0.86915642, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 1.3984375, + "step": 738, + "time_per_iteration": 2.5087547302246094 + }, + { + "auxiliary_loss_clip": 0.01054058, + "auxiliary_loss_mlp": 0.0100866, + "balance_loss_clip": 1.00396276, + "balance_loss_mlp": 1.00775146, + "epoch": 0.04443108372162934, + "flos": 67394379386880.0, + "grad_norm": 0.8607474266972598, + "language_loss": 0.59154689, + "learning_rate": 3.9806469879093465e-06, + "loss": 0.61217415, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.046875, + "router_z_loss_mlp": 0.46289062, + "step": 739, + "time_per_iteration": 3.008528470993042 + }, + { + "auxiliary_loss_clip": 0.01178647, + "auxiliary_loss_mlp": 0.01063742, + "balance_loss_clip": 1.03043461, + "balance_loss_mlp": 1.0452528, + "epoch": 0.04449120697429731, + "flos": 29751376055040.0, + "grad_norm": 2.074928094832132, + "language_loss": 0.90996939, + "learning_rate": 3.9805945190806415e-06, + "loss": 0.93239331, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.3359375, + "step": 740, + "time_per_iteration": 2.461949348449707 + }, + { + "auxiliary_loss_clip": 0.01184099, + "auxiliary_loss_mlp": 0.01063316, + "balance_loss_clip": 1.02836418, + "balance_loss_mlp": 1.04532051, + "epoch": 0.04455133022696528, + "flos": 36500645779200.0, + "grad_norm": 1.9530878257015465, + "language_loss": 0.71967971, + "learning_rate": 3.980541979569614e-06, + "loss": 0.74215388, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.390625, + "step": 741, + "time_per_iteration": 2.5428102016448975 + }, + { + "auxiliary_loss_clip": 0.01177469, + "auxiliary_loss_mlp": 0.01066288, + "balance_loss_clip": 1.03188419, + "balance_loss_mlp": 1.04174197, + "epoch": 0.044611453479633245, + "flos": 28802401856640.0, + "grad_norm": 1.9194179518673538, + "language_loss": 0.88805389, + "learning_rate": 3.980489369378136e-06, + "loss": 0.91049147, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.359375, + "step": 742, + "time_per_iteration": 2.461562395095825 + }, + { + "auxiliary_loss_clip": 0.01177628, + "auxiliary_loss_mlp": 0.01059539, + "balance_loss_clip": 1.0233475, + "balance_loss_mlp": 1.04239082, + "epoch": 0.04467157673230122, + "flos": 20009945013120.0, + "grad_norm": 1.8610295912199888, + "language_loss": 0.83681965, + "learning_rate": 3.980436688508087e-06, + "loss": 0.8591913, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 1.3515625, + "step": 743, + "time_per_iteration": 2.410418748855591 + }, + { + "auxiliary_loss_clip": 0.01180885, + "auxiliary_loss_mlp": 0.0106705, + "balance_loss_clip": 1.031955, + "balance_loss_mlp": 1.04375339, + "epoch": 0.04473169998496919, + "flos": 18003975269760.0, + "grad_norm": 2.0392217011253617, + "language_loss": 0.79766238, + "learning_rate": 3.980383936961348e-06, + "loss": 0.82014179, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.375, + "step": 744, + "time_per_iteration": 2.3963050842285156 + }, + { + "auxiliary_loss_clip": 0.01175583, + "auxiliary_loss_mlp": 0.01060617, + "balance_loss_clip": 1.02821624, + "balance_loss_mlp": 1.04345059, + "epoch": 0.044791823237637154, + "flos": 20630665808640.0, + "grad_norm": 2.062579014162858, + "language_loss": 0.85017085, + "learning_rate": 3.980331114739799e-06, + "loss": 0.87253284, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.3203125, + "step": 745, + "time_per_iteration": 2.41471791267395 + }, + { + "auxiliary_loss_clip": 0.01179399, + "auxiliary_loss_mlp": 0.01054924, + "balance_loss_clip": 1.0201149, + "balance_loss_mlp": 1.04224062, + "epoch": 0.04485194649030513, + "flos": 31174819896960.0, + "grad_norm": 1.8628480270544208, + "language_loss": 0.68768948, + "learning_rate": 3.980278221845328e-06, + "loss": 0.7100327, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.375, + "step": 746, + "time_per_iteration": 2.485377788543701 + }, + { + "auxiliary_loss_clip": 0.01184604, + "auxiliary_loss_mlp": 0.01069791, + "balance_loss_clip": 1.03326559, + "balance_loss_mlp": 1.04763985, + "epoch": 0.04491206974297309, + "flos": 26142019989120.0, + "grad_norm": 3.637756553710533, + "language_loss": 0.68110108, + "learning_rate": 3.98022525827982e-06, + "loss": 0.70364505, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 1.3671875, + "step": 747, + "time_per_iteration": 2.474304437637329 + }, + { + "auxiliary_loss_clip": 0.01187419, + "auxiliary_loss_mlp": 0.01082806, + "balance_loss_clip": 1.04728198, + "balance_loss_mlp": 1.04606819, + "epoch": 0.044972192995641064, + "flos": 20666626375680.0, + "grad_norm": 2.2752135269186105, + "language_loss": 0.66599447, + "learning_rate": 3.980172224045168e-06, + "loss": 0.68869674, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.4140625, + "step": 748, + "time_per_iteration": 2.415919303894043 + }, + { + "auxiliary_loss_clip": 0.01183825, + "auxiliary_loss_mlp": 0.0107098, + "balance_loss_clip": 1.03421593, + "balance_loss_mlp": 1.04668474, + "epoch": 0.045032316248309036, + "flos": 16105922138880.0, + "grad_norm": 3.0247764736933203, + "language_loss": 0.76647866, + "learning_rate": 3.980119119143262e-06, + "loss": 0.78902674, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.3671875, + "step": 749, + "time_per_iteration": 2.461630344390869 + }, + { + "auxiliary_loss_clip": 0.01184256, + "auxiliary_loss_mlp": 0.01067013, + "balance_loss_clip": 1.03275204, + "balance_loss_mlp": 1.04713106, + "epoch": 0.045092439500977, + "flos": 17857863763200.0, + "grad_norm": 1.9949543408587387, + "language_loss": 0.88806438, + "learning_rate": 3.980065943575998e-06, + "loss": 0.91057712, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.375, + "step": 750, + "time_per_iteration": 2.389357089996338 + }, + { + "auxiliary_loss_clip": 0.01185133, + "auxiliary_loss_mlp": 0.01076102, + "balance_loss_clip": 1.03704882, + "balance_loss_mlp": 1.0451926, + "epoch": 0.04515256275364497, + "flos": 24461650385280.0, + "grad_norm": 4.792346322114513, + "language_loss": 0.74504662, + "learning_rate": 3.9800126973452725e-06, + "loss": 0.76765895, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 1.3984375, + "step": 751, + "time_per_iteration": 2.466661214828491 + }, + { + "auxiliary_loss_clip": 0.01177467, + "auxiliary_loss_mlp": 0.01066612, + "balance_loss_clip": 1.02989578, + "balance_loss_mlp": 1.04091656, + "epoch": 0.04521268600631294, + "flos": 20915522524800.0, + "grad_norm": 1.8841999039596504, + "language_loss": 0.68607342, + "learning_rate": 3.979959380452989e-06, + "loss": 0.70851421, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 1.359375, + "step": 752, + "time_per_iteration": 2.3971285820007324 + }, + { + "auxiliary_loss_clip": 0.01178181, + "auxiliary_loss_mlp": 0.01054287, + "balance_loss_clip": 1.01993108, + "balance_loss_mlp": 1.04174387, + "epoch": 0.04527280925898091, + "flos": 13370512026240.0, + "grad_norm": 2.5533987417203603, + "language_loss": 0.9229058, + "learning_rate": 3.979905992901047e-06, + "loss": 0.94523054, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.359375, + "step": 753, + "time_per_iteration": 2.3955495357513428 + }, + { + "auxiliary_loss_clip": 0.01183752, + "auxiliary_loss_mlp": 0.01066287, + "balance_loss_clip": 1.03231251, + "balance_loss_mlp": 1.04663646, + "epoch": 0.04533293251164888, + "flos": 23253551009280.0, + "grad_norm": 1.9030511128020393, + "language_loss": 0.91005522, + "learning_rate": 3.979852534691353e-06, + "loss": 0.93255562, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.375, + "step": 754, + "time_per_iteration": 2.4177744388580322 + }, + { + "auxiliary_loss_clip": 0.01172809, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.03161073, + "balance_loss_mlp": 1.04385495, + "epoch": 0.04539305576431685, + "flos": 12421188714240.0, + "grad_norm": 2.3406486534664896, + "language_loss": 0.78643274, + "learning_rate": 3.979799005825816e-06, + "loss": 0.80882096, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.2890625, + "step": 755, + "time_per_iteration": 2.388094902038574 + }, + { + "auxiliary_loss_clip": 0.01182913, + "auxiliary_loss_mlp": 0.01077339, + "balance_loss_clip": 1.04102838, + "balance_loss_mlp": 1.04511786, + "epoch": 0.04545317901698482, + "flos": 16070066305920.0, + "grad_norm": 2.104218086446312, + "language_loss": 0.78481936, + "learning_rate": 3.979745406306345e-06, + "loss": 0.80742186, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 1.375, + "step": 756, + "time_per_iteration": 2.375627279281616 + }, + { + "auxiliary_loss_clip": 0.01054118, + "auxiliary_loss_mlp": 0.0100707, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00968564, + "epoch": 0.045513302269652785, + "flos": 66392475834240.0, + "grad_norm": 0.8078967859892556, + "language_loss": 0.62762362, + "learning_rate": 3.979691736134852e-06, + "loss": 0.6482355, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.05200195, + "router_z_loss_mlp": 0.4453125, + "step": 757, + "time_per_iteration": 3.08660626411438 + }, + { + "auxiliary_loss_clip": 0.01180996, + "auxiliary_loss_mlp": 0.01061264, + "balance_loss_clip": 1.02705073, + "balance_loss_mlp": 1.04657936, + "epoch": 0.04557342552232076, + "flos": 21470082560640.0, + "grad_norm": 2.0456393741536685, + "language_loss": 0.84172112, + "learning_rate": 3.979637995313254e-06, + "loss": 0.86414373, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.34375, + "step": 758, + "time_per_iteration": 2.445692539215088 + }, + { + "auxiliary_loss_clip": 0.01172712, + "auxiliary_loss_mlp": 0.01062261, + "balance_loss_clip": 1.02950215, + "balance_loss_mlp": 1.03983974, + "epoch": 0.04563354877498873, + "flos": 23731546700160.0, + "grad_norm": 1.9123058886883226, + "language_loss": 0.88420147, + "learning_rate": 3.979584183843468e-06, + "loss": 0.90655118, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.328125, + "step": 759, + "time_per_iteration": 2.4633443355560303 + }, + { + "auxiliary_loss_clip": 0.01183593, + "auxiliary_loss_mlp": 0.01063908, + "balance_loss_clip": 1.02807355, + "balance_loss_mlp": 1.04788387, + "epoch": 0.045693672027656694, + "flos": 25734653712000.0, + "grad_norm": 2.305370252748593, + "language_loss": 0.73975301, + "learning_rate": 3.979530301727414e-06, + "loss": 0.76222801, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 1.359375, + "step": 760, + "time_per_iteration": 2.4366261959075928 + }, + { + "auxiliary_loss_clip": 0.01179764, + "auxiliary_loss_mlp": 0.01055823, + "balance_loss_clip": 1.02196741, + "balance_loss_mlp": 1.0474813, + "epoch": 0.045753795280324666, + "flos": 19718001290880.0, + "grad_norm": 1.965163134473522, + "language_loss": 0.82210457, + "learning_rate": 3.979476348967016e-06, + "loss": 0.84446049, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.328125, + "step": 761, + "time_per_iteration": 2.4243874549865723 + }, + { + "auxiliary_loss_clip": 0.01175688, + "auxiliary_loss_mlp": 0.01062024, + "balance_loss_clip": 1.02773905, + "balance_loss_mlp": 1.04460287, + "epoch": 0.04581391853299264, + "flos": 23254737995520.0, + "grad_norm": 1.669881972747545, + "language_loss": 0.7976234, + "learning_rate": 3.979422325564199e-06, + "loss": 0.82000047, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.3125, + "step": 762, + "time_per_iteration": 2.4524641036987305 + }, + { + "auxiliary_loss_clip": 0.01048775, + "auxiliary_loss_mlp": 0.01005676, + "balance_loss_clip": 1.0008601, + "balance_loss_mlp": 1.00495434, + "epoch": 0.0458740417856606, + "flos": 64227896317440.0, + "grad_norm": 0.9980029867422425, + "language_loss": 0.58720791, + "learning_rate": 3.979368231520891e-06, + "loss": 0.60775238, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.0480957, + "router_z_loss_mlp": 0.4375, + "step": 763, + "time_per_iteration": 3.0900869369506836 + }, + { + "auxiliary_loss_clip": 0.01180184, + "auxiliary_loss_mlp": 0.01071259, + "balance_loss_clip": 1.03927493, + "balance_loss_mlp": 1.04385817, + "epoch": 0.045934165038328575, + "flos": 20769271372800.0, + "grad_norm": 2.0523051948717885, + "language_loss": 0.87536454, + "learning_rate": 3.979314066839022e-06, + "loss": 0.89787894, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.359375, + "step": 764, + "time_per_iteration": 3.8769683837890625 + }, + { + "auxiliary_loss_clip": 0.01179282, + "auxiliary_loss_mlp": 0.01075631, + "balance_loss_clip": 1.03967762, + "balance_loss_mlp": 1.04498005, + "epoch": 0.04599428829099654, + "flos": 30261596797440.0, + "grad_norm": 2.4330211626417233, + "language_loss": 0.85370469, + "learning_rate": 3.979259831520526e-06, + "loss": 0.87625384, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 1.34375, + "step": 765, + "time_per_iteration": 2.490962266921997 + }, + { + "auxiliary_loss_clip": 0.01181964, + "auxiliary_loss_mlp": 0.01070433, + "balance_loss_clip": 1.03176188, + "balance_loss_mlp": 1.04626715, + "epoch": 0.04605441154366451, + "flos": 23037822518400.0, + "grad_norm": 2.832774509546428, + "language_loss": 0.88183564, + "learning_rate": 3.979205525567337e-06, + "loss": 0.90435958, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 1.359375, + "step": 766, + "time_per_iteration": 2.4284579753875732 + }, + { + "auxiliary_loss_clip": 0.01174969, + "auxiliary_loss_mlp": 0.01072638, + "balance_loss_clip": 1.03797174, + "balance_loss_mlp": 1.04154038, + "epoch": 0.046114534796332485, + "flos": 22016333692800.0, + "grad_norm": 13.411650425654186, + "language_loss": 0.83985424, + "learning_rate": 3.979151148981395e-06, + "loss": 0.86233032, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.3359375, + "step": 767, + "time_per_iteration": 3.95444393157959 + }, + { + "auxiliary_loss_clip": 0.01178257, + "auxiliary_loss_mlp": 0.01061193, + "balance_loss_clip": 1.02736187, + "balance_loss_mlp": 1.04422903, + "epoch": 0.04617465804900045, + "flos": 29861073146880.0, + "grad_norm": 5.4049371492260905, + "language_loss": 0.8675254, + "learning_rate": 3.979096701764638e-06, + "loss": 0.88991988, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.34375, + "step": 768, + "time_per_iteration": 2.4905314445495605 + }, + { + "auxiliary_loss_clip": 0.01176369, + "auxiliary_loss_mlp": 0.01066782, + "balance_loss_clip": 1.0337851, + "balance_loss_mlp": 1.04094982, + "epoch": 0.04623478130166842, + "flos": 25628866692480.0, + "grad_norm": 2.37342504986601, + "language_loss": 0.75016659, + "learning_rate": 3.979042183919012e-06, + "loss": 0.77259809, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.3515625, + "step": 769, + "time_per_iteration": 3.8787219524383545 + }, + { + "auxiliary_loss_clip": 0.01175254, + "auxiliary_loss_mlp": 0.0106277, + "balance_loss_clip": 1.02986789, + "balance_loss_mlp": 1.04421747, + "epoch": 0.04629490455433639, + "flos": 20448035153280.0, + "grad_norm": 40.211569394658184, + "language_loss": 0.8622731, + "learning_rate": 3.97898759544646e-06, + "loss": 0.88465333, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.3046875, + "step": 770, + "time_per_iteration": 2.41145920753479 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.01064699, + "balance_loss_clip": 1.03155899, + "balance_loss_mlp": 1.0424161, + "epoch": 0.04635502780700436, + "flos": 23147624344320.0, + "grad_norm": 2.290435336935505, + "language_loss": 0.8721177, + "learning_rate": 3.978932936348932e-06, + "loss": 0.89455825, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.3671875, + "step": 771, + "time_per_iteration": 2.4312503337860107 + }, + { + "auxiliary_loss_clip": 0.01181348, + "auxiliary_loss_mlp": 0.0107427, + "balance_loss_clip": 1.03600359, + "balance_loss_mlp": 1.04299688, + "epoch": 0.04641515105967233, + "flos": 23290977853440.0, + "grad_norm": 2.206256766876312, + "language_loss": 0.83575541, + "learning_rate": 3.978878206628377e-06, + "loss": 0.85831153, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 1.375, + "step": 772, + "time_per_iteration": 2.4327445030212402 + }, + { + "auxiliary_loss_clip": 0.01177164, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.02746129, + "balance_loss_mlp": 1.04654682, + "epoch": 0.046475274312340296, + "flos": 25114142384640.0, + "grad_norm": 1.9569844693108625, + "language_loss": 0.73629689, + "learning_rate": 3.978823406286751e-06, + "loss": 0.75866747, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.3046875, + "step": 773, + "time_per_iteration": 2.450957775115967 + }, + { + "auxiliary_loss_clip": 0.0117497, + "auxiliary_loss_mlp": 0.01059978, + "balance_loss_clip": 1.02726662, + "balance_loss_mlp": 1.0445869, + "epoch": 0.04653539756500827, + "flos": 25263745027200.0, + "grad_norm": 2.052677547720233, + "language_loss": 0.78662962, + "learning_rate": 3.978768535326006e-06, + "loss": 0.80897909, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.3046875, + "step": 774, + "time_per_iteration": 2.43341326713562 + }, + { + "auxiliary_loss_clip": 0.01171919, + "auxiliary_loss_mlp": 0.01058004, + "balance_loss_clip": 1.02596056, + "balance_loss_mlp": 1.04186547, + "epoch": 0.046595520817676234, + "flos": 35402802076800.0, + "grad_norm": 2.122867568169163, + "language_loss": 0.73343658, + "learning_rate": 3.978713593748103e-06, + "loss": 0.75573587, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.296875, + "step": 775, + "time_per_iteration": 2.53930926322937 + }, + { + "auxiliary_loss_clip": 0.01176197, + "auxiliary_loss_mlp": 0.01062381, + "balance_loss_clip": 1.02788162, + "balance_loss_mlp": 1.04260957, + "epoch": 0.046655644070344206, + "flos": 18111577680000.0, + "grad_norm": 1.6785940587675907, + "language_loss": 0.76859474, + "learning_rate": 3.9786585815550015e-06, + "loss": 0.79098046, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 1.3359375, + "step": 776, + "time_per_iteration": 2.4250969886779785 + }, + { + "auxiliary_loss_clip": 0.0116975, + "auxiliary_loss_mlp": 0.01059828, + "balance_loss_clip": 1.02811885, + "balance_loss_mlp": 1.04102755, + "epoch": 0.04671576732301218, + "flos": 29204007759360.0, + "grad_norm": 4.3993855916972695, + "language_loss": 0.7100842, + "learning_rate": 3.978603498748664e-06, + "loss": 0.73238003, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.28125, + "step": 777, + "time_per_iteration": 2.4765748977661133 + }, + { + "auxiliary_loss_clip": 0.01172053, + "auxiliary_loss_mlp": 0.01070464, + "balance_loss_clip": 1.03491557, + "balance_loss_mlp": 1.04171491, + "epoch": 0.04677589057568014, + "flos": 30477115820160.0, + "grad_norm": 1.8863215391272792, + "language_loss": 0.7640267, + "learning_rate": 3.978548345331058e-06, + "loss": 0.78645194, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 1.296875, + "step": 778, + "time_per_iteration": 2.472606897354126 + }, + { + "auxiliary_loss_clip": 0.01171305, + "auxiliary_loss_mlp": 0.0106156, + "balance_loss_clip": 1.02784729, + "balance_loss_mlp": 1.04161787, + "epoch": 0.046836013828348115, + "flos": 20556649992960.0, + "grad_norm": 2.3938019710870857, + "language_loss": 0.78961205, + "learning_rate": 3.978493121304151e-06, + "loss": 0.81194067, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.296875, + "step": 779, + "time_per_iteration": 2.429203987121582 + }, + { + "auxiliary_loss_clip": 0.01165012, + "auxiliary_loss_mlp": 0.01053025, + "balance_loss_clip": 1.0218637, + "balance_loss_mlp": 1.03878808, + "epoch": 0.04689613708101608, + "flos": 25446201125760.0, + "grad_norm": 1.7069824197535406, + "language_loss": 0.77102339, + "learning_rate": 3.978437826669914e-06, + "loss": 0.79320371, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.265625, + "step": 780, + "time_per_iteration": 2.483863115310669 + }, + { + "auxiliary_loss_clip": 0.01170145, + "auxiliary_loss_mlp": 0.01059947, + "balance_loss_clip": 1.02910745, + "balance_loss_mlp": 1.04262638, + "epoch": 0.04695626033368405, + "flos": 23000325851520.0, + "grad_norm": 1.9470585388344062, + "language_loss": 0.76273519, + "learning_rate": 3.9783824614303195e-06, + "loss": 0.78503609, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.2734375, + "step": 781, + "time_per_iteration": 2.428377628326416 + }, + { + "auxiliary_loss_clip": 0.01176939, + "auxiliary_loss_mlp": 0.01072466, + "balance_loss_clip": 1.03796661, + "balance_loss_mlp": 1.04257929, + "epoch": 0.047016383586352024, + "flos": 29132051713920.0, + "grad_norm": 2.209148364136706, + "language_loss": 0.73881859, + "learning_rate": 3.978327025587344e-06, + "loss": 0.76131266, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.34375, + "step": 782, + "time_per_iteration": 2.456827402114868 + }, + { + "auxiliary_loss_clip": 0.01171062, + "auxiliary_loss_mlp": 0.01053215, + "balance_loss_clip": 1.02298379, + "balance_loss_mlp": 1.04179227, + "epoch": 0.04707650683901999, + "flos": 14975434448640.0, + "grad_norm": 3.168953013497748, + "language_loss": 0.80221462, + "learning_rate": 3.978271519142967e-06, + "loss": 0.82445741, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.296875, + "step": 783, + "time_per_iteration": 2.392314910888672 + }, + { + "auxiliary_loss_clip": 0.01166711, + "auxiliary_loss_mlp": 0.01056846, + "balance_loss_clip": 1.02725816, + "balance_loss_mlp": 1.04139662, + "epoch": 0.04713663009168796, + "flos": 21650094864000.0, + "grad_norm": 2.5535269215120957, + "language_loss": 0.81434727, + "learning_rate": 3.978215942099167e-06, + "loss": 0.8365829, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.25, + "step": 784, + "time_per_iteration": 2.4085748195648193 + }, + { + "auxiliary_loss_clip": 0.0117326, + "auxiliary_loss_mlp": 0.01058399, + "balance_loss_clip": 1.02728581, + "balance_loss_mlp": 1.0409857, + "epoch": 0.04719675334435593, + "flos": 21324320167680.0, + "grad_norm": 2.743894936329185, + "language_loss": 0.80728829, + "learning_rate": 3.9781602944579285e-06, + "loss": 0.82960492, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.3203125, + "step": 785, + "time_per_iteration": 2.4267308712005615 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01055203, + "balance_loss_clip": 1.02485251, + "balance_loss_mlp": 1.04502642, + "epoch": 0.0472568765970239, + "flos": 17930413301760.0, + "grad_norm": 1.9076731639279216, + "language_loss": 0.89660287, + "learning_rate": 3.978104576221238e-06, + "loss": 0.91889018, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.28125, + "step": 786, + "time_per_iteration": 2.4019699096679688 + }, + { + "auxiliary_loss_clip": 0.01171585, + "auxiliary_loss_mlp": 0.01058545, + "balance_loss_clip": 1.02435589, + "balance_loss_mlp": 1.03746784, + "epoch": 0.04731699984969187, + "flos": 18076350251520.0, + "grad_norm": 3.8833038203919887, + "language_loss": 0.76674724, + "learning_rate": 3.978048787391084e-06, + "loss": 0.78904855, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.34375, + "step": 787, + "time_per_iteration": 2.373610258102417 + }, + { + "auxiliary_loss_clip": 0.01176536, + "auxiliary_loss_mlp": 0.01058107, + "balance_loss_clip": 1.02644455, + "balance_loss_mlp": 1.04419088, + "epoch": 0.047377123102359836, + "flos": 23183968936320.0, + "grad_norm": 4.053649185534547, + "language_loss": 0.80823344, + "learning_rate": 3.9779929279694565e-06, + "loss": 0.83057988, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.328125, + "step": 788, + "time_per_iteration": 2.4188549518585205 + }, + { + "auxiliary_loss_clip": 0.01171514, + "auxiliary_loss_mlp": 0.01060823, + "balance_loss_clip": 1.02610922, + "balance_loss_mlp": 1.04387689, + "epoch": 0.04743724635502781, + "flos": 22746681757440.0, + "grad_norm": 2.0244616281489547, + "language_loss": 0.84739041, + "learning_rate": 3.977936997958349e-06, + "loss": 0.86971378, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.2734375, + "step": 789, + "time_per_iteration": 2.4152109622955322 + }, + { + "auxiliary_loss_clip": 0.01172058, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_clip": 1.03479958, + "balance_loss_mlp": 1.04249954, + "epoch": 0.04749736960769577, + "flos": 17237736460800.0, + "grad_norm": 2.4696918852654024, + "language_loss": 0.81907129, + "learning_rate": 3.977880997359758e-06, + "loss": 0.84143388, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.296875, + "step": 790, + "time_per_iteration": 2.41382098197937 + }, + { + "auxiliary_loss_clip": 0.01168927, + "auxiliary_loss_mlp": 0.01055267, + "balance_loss_clip": 1.02436805, + "balance_loss_mlp": 1.04008615, + "epoch": 0.047557492860363745, + "flos": 40477672039680.0, + "grad_norm": 2.1721899593907517, + "language_loss": 0.8778193, + "learning_rate": 3.977824926175682e-06, + "loss": 0.90006131, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.28125, + "step": 791, + "time_per_iteration": 2.5817065238952637 + }, + { + "auxiliary_loss_clip": 0.01172967, + "auxiliary_loss_mlp": 0.01058892, + "balance_loss_clip": 1.02744484, + "balance_loss_mlp": 1.04207683, + "epoch": 0.04761761611303172, + "flos": 18697001224320.0, + "grad_norm": 2.1744271782528704, + "language_loss": 0.90019238, + "learning_rate": 3.977768784408122e-06, + "loss": 0.92251098, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.3046875, + "step": 792, + "time_per_iteration": 2.4133920669555664 + }, + { + "auxiliary_loss_clip": 0.01170189, + "auxiliary_loss_mlp": 0.01068953, + "balance_loss_clip": 1.03831649, + "balance_loss_mlp": 1.038118, + "epoch": 0.04767773936569968, + "flos": 20920968696960.0, + "grad_norm": 1.9371333153222121, + "language_loss": 0.73367131, + "learning_rate": 3.977712572059081e-06, + "loss": 0.75606275, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.3203125, + "step": 793, + "time_per_iteration": 2.385101795196533 + }, + { + "auxiliary_loss_clip": 0.01173409, + "auxiliary_loss_mlp": 0.01053075, + "balance_loss_clip": 1.02229476, + "balance_loss_mlp": 1.04118943, + "epoch": 0.047737862618367655, + "flos": 23731546700160.0, + "grad_norm": 2.663201040336238, + "language_loss": 0.85657656, + "learning_rate": 3.977656289130567e-06, + "loss": 0.8788414, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.328125, + "step": 794, + "time_per_iteration": 2.423823833465576 + }, + { + "auxiliary_loss_clip": 0.01172468, + "auxiliary_loss_mlp": 0.01062428, + "balance_loss_clip": 1.03012252, + "balance_loss_mlp": 1.0396831, + "epoch": 0.04779798587103562, + "flos": 23694643526400.0, + "grad_norm": 2.6155256508860307, + "language_loss": 0.69553244, + "learning_rate": 3.977599935624586e-06, + "loss": 0.71788138, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.328125, + "step": 795, + "time_per_iteration": 2.4013278484344482 + }, + { + "auxiliary_loss_clip": 0.01169285, + "auxiliary_loss_mlp": 0.01065853, + "balance_loss_clip": 1.03307092, + "balance_loss_mlp": 1.04058623, + "epoch": 0.04785810912370359, + "flos": 23182572481920.0, + "grad_norm": 2.438329569382553, + "language_loss": 0.80910087, + "learning_rate": 3.977543511543151e-06, + "loss": 0.83145225, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.2890625, + "step": 796, + "time_per_iteration": 2.425485610961914 + }, + { + "auxiliary_loss_clip": 0.01169091, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.02528143, + "balance_loss_mlp": 1.04067898, + "epoch": 0.047918232376371564, + "flos": 18039656545920.0, + "grad_norm": 2.4290765407806587, + "language_loss": 0.81627935, + "learning_rate": 3.977487016888274e-06, + "loss": 0.83854878, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.28125, + "step": 797, + "time_per_iteration": 2.385831117630005 + }, + { + "auxiliary_loss_clip": 0.01053266, + "auxiliary_loss_mlp": 0.01004369, + "balance_loss_clip": 0.99902827, + "balance_loss_mlp": 1.01167202, + "epoch": 0.04797835562903953, + "flos": 62439400632960.0, + "grad_norm": 0.9135418856904373, + "language_loss": 0.64484239, + "learning_rate": 3.977430451661972e-06, + "loss": 0.66541874, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.41601562, + "step": 798, + "time_per_iteration": 2.9660699367523193 + }, + { + "auxiliary_loss_clip": 0.01174345, + "auxiliary_loss_mlp": 0.01058284, + "balance_loss_clip": 1.02762318, + "balance_loss_mlp": 1.03999674, + "epoch": 0.0480384788817075, + "flos": 21506217684480.0, + "grad_norm": 1.9946740548056843, + "language_loss": 0.90883076, + "learning_rate": 3.9773738158662655e-06, + "loss": 0.93115699, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.34375, + "step": 799, + "time_per_iteration": 2.409356117248535 + }, + { + "auxiliary_loss_clip": 0.01171554, + "auxiliary_loss_mlp": 0.01055537, + "balance_loss_clip": 1.02487683, + "balance_loss_mlp": 1.04478359, + "epoch": 0.048098602134375466, + "flos": 21725611868160.0, + "grad_norm": 2.0226049753764235, + "language_loss": 0.86634338, + "learning_rate": 3.977317109503172e-06, + "loss": 0.8886143, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.265625, + "step": 800, + "time_per_iteration": 2.443582773208618 + }, + { + "auxiliary_loss_clip": 0.01173783, + "auxiliary_loss_mlp": 0.01063135, + "balance_loss_clip": 1.03245056, + "balance_loss_mlp": 1.04335332, + "epoch": 0.04815872538704344, + "flos": 22929940817280.0, + "grad_norm": 3.5504707427095092, + "language_loss": 0.83880752, + "learning_rate": 3.977260332574718e-06, + "loss": 0.86117673, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.3046875, + "step": 801, + "time_per_iteration": 2.430955410003662 + }, + { + "auxiliary_loss_clip": 0.01170878, + "auxiliary_loss_mlp": 0.01060558, + "balance_loss_clip": 1.02977824, + "balance_loss_mlp": 1.04109502, + "epoch": 0.04821884863971141, + "flos": 43173176601600.0, + "grad_norm": 2.5459362418907205, + "language_loss": 0.79219079, + "learning_rate": 3.977203485082928e-06, + "loss": 0.81450516, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.296875, + "step": 802, + "time_per_iteration": 2.6055495738983154 + }, + { + "auxiliary_loss_clip": 0.01170224, + "auxiliary_loss_mlp": 0.01054641, + "balance_loss_clip": 1.02426648, + "balance_loss_mlp": 1.04124594, + "epoch": 0.048278971892379376, + "flos": 18619145159040.0, + "grad_norm": 1.745606544997716, + "language_loss": 0.86103964, + "learning_rate": 3.977146567029833e-06, + "loss": 0.88328832, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.2890625, + "step": 803, + "time_per_iteration": 3.791219711303711 + }, + { + "auxiliary_loss_clip": 0.0116548, + "auxiliary_loss_mlp": 0.0105238, + "balance_loss_clip": 1.02250648, + "balance_loss_mlp": 1.04170287, + "epoch": 0.04833909514504735, + "flos": 20229024994560.0, + "grad_norm": 2.2811621272757576, + "language_loss": 0.85222125, + "learning_rate": 3.977089578417462e-06, + "loss": 0.87439978, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.234375, + "step": 804, + "time_per_iteration": 2.4115700721740723 + }, + { + "auxiliary_loss_clip": 0.0116977, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_clip": 1.01893377, + "balance_loss_mlp": 1.04267776, + "epoch": 0.04839921839771532, + "flos": 24644001749760.0, + "grad_norm": 2.490447923729626, + "language_loss": 0.86260319, + "learning_rate": 3.9770325192478504e-06, + "loss": 0.88478678, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.265625, + "step": 805, + "time_per_iteration": 2.454223155975342 + }, + { + "auxiliary_loss_clip": 0.01163325, + "auxiliary_loss_mlp": 0.01052235, + "balance_loss_clip": 1.02326751, + "balance_loss_mlp": 1.03973639, + "epoch": 0.048459341650383285, + "flos": 24826283291520.0, + "grad_norm": 2.60907289230247, + "language_loss": 0.67868835, + "learning_rate": 3.9769753895230324e-06, + "loss": 0.70084393, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.234375, + "step": 806, + "time_per_iteration": 5.261777639389038 + }, + { + "auxiliary_loss_clip": 0.01165136, + "auxiliary_loss_mlp": 0.0105429, + "balance_loss_clip": 1.02570355, + "balance_loss_mlp": 1.04075348, + "epoch": 0.04851946490305126, + "flos": 22162130997120.0, + "grad_norm": 5.154972087774901, + "language_loss": 0.7642802, + "learning_rate": 3.976918189245049e-06, + "loss": 0.78647453, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.2421875, + "step": 807, + "time_per_iteration": 2.4211854934692383 + }, + { + "auxiliary_loss_clip": 0.01164869, + "auxiliary_loss_mlp": 0.0106269, + "balance_loss_clip": 1.0339365, + "balance_loss_mlp": 1.03880262, + "epoch": 0.04857958815571922, + "flos": 19791004677120.0, + "grad_norm": 2.46456303042586, + "language_loss": 0.86459714, + "learning_rate": 3.9768609184159405e-06, + "loss": 0.88687277, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.265625, + "step": 808, + "time_per_iteration": 3.7626330852508545 + }, + { + "auxiliary_loss_clip": 0.01170461, + "auxiliary_loss_mlp": 0.01053986, + "balance_loss_clip": 1.02507806, + "balance_loss_mlp": 1.04022026, + "epoch": 0.048639711408387194, + "flos": 18696966312960.0, + "grad_norm": 2.3646723308193276, + "language_loss": 0.89717674, + "learning_rate": 3.976803577037751e-06, + "loss": 0.91942126, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.296875, + "step": 809, + "time_per_iteration": 2.4033820629119873 + }, + { + "auxiliary_loss_clip": 0.01170753, + "auxiliary_loss_mlp": 0.01060366, + "balance_loss_clip": 1.02941978, + "balance_loss_mlp": 1.04319715, + "epoch": 0.048699834661055166, + "flos": 24862348592640.0, + "grad_norm": 1.9856387765986683, + "language_loss": 0.84460419, + "learning_rate": 3.976746165112527e-06, + "loss": 0.8669154, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.28125, + "step": 810, + "time_per_iteration": 2.501471757888794 + }, + { + "auxiliary_loss_clip": 0.01171227, + "auxiliary_loss_mlp": 0.01050112, + "balance_loss_clip": 1.02076244, + "balance_loss_mlp": 1.04236078, + "epoch": 0.04875995791372313, + "flos": 20702970967680.0, + "grad_norm": 5.847958811419739, + "language_loss": 0.80468845, + "learning_rate": 3.976688682642317e-06, + "loss": 0.82690179, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.2890625, + "step": 811, + "time_per_iteration": 2.4055933952331543 + }, + { + "auxiliary_loss_clip": 0.01161484, + "auxiliary_loss_mlp": 0.01058142, + "balance_loss_clip": 1.02891231, + "balance_loss_mlp": 1.03926706, + "epoch": 0.048820081166391104, + "flos": 18587304132480.0, + "grad_norm": 1.782692438004299, + "language_loss": 0.74147636, + "learning_rate": 3.976631129629173e-06, + "loss": 0.76367265, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.21875, + "step": 812, + "time_per_iteration": 2.411888360977173 + }, + { + "auxiliary_loss_clip": 0.01164119, + "auxiliary_loss_mlp": 0.01059981, + "balance_loss_clip": 1.03265858, + "balance_loss_mlp": 1.04155898, + "epoch": 0.04888020441905907, + "flos": 22706322359040.0, + "grad_norm": 1.956145964727686, + "language_loss": 0.89826584, + "learning_rate": 3.9765735060751475e-06, + "loss": 0.92050683, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.2265625, + "step": 813, + "time_per_iteration": 2.4079573154449463 + }, + { + "auxiliary_loss_clip": 0.011635, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.02287924, + "balance_loss_mlp": 1.04093742, + "epoch": 0.04894032767172704, + "flos": 22783235817600.0, + "grad_norm": 2.4332683364797165, + "language_loss": 0.74885005, + "learning_rate": 3.976515811982298e-06, + "loss": 0.77098519, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.2265625, + "step": 814, + "time_per_iteration": 2.4603400230407715 + }, + { + "auxiliary_loss_clip": 0.0116791, + "auxiliary_loss_mlp": 0.01061613, + "balance_loss_clip": 1.03182244, + "balance_loss_mlp": 1.04210234, + "epoch": 0.04900045092439501, + "flos": 25515084971520.0, + "grad_norm": 2.394562762664596, + "language_loss": 0.83616436, + "learning_rate": 3.976458047352684e-06, + "loss": 0.85845953, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.2578125, + "step": 815, + "time_per_iteration": 2.4485135078430176 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01055653, + "balance_loss_clip": 1.02430129, + "balance_loss_mlp": 1.03958774, + "epoch": 0.04906057417706298, + "flos": 25956945538560.0, + "grad_norm": 2.157888550916716, + "language_loss": 0.90636873, + "learning_rate": 3.976400212188366e-06, + "loss": 0.92859304, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.2734375, + "step": 816, + "time_per_iteration": 2.444671630859375 + }, + { + "auxiliary_loss_clip": 0.01168071, + "auxiliary_loss_mlp": 0.01055868, + "balance_loss_clip": 1.02784157, + "balance_loss_mlp": 1.04176772, + "epoch": 0.04912069742973095, + "flos": 18623648724480.0, + "grad_norm": 2.630038287340091, + "language_loss": 0.79744601, + "learning_rate": 3.976342306491408e-06, + "loss": 0.81968546, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.265625, + "step": 817, + "time_per_iteration": 2.4015910625457764 + }, + { + "auxiliary_loss_clip": 0.01165215, + "auxiliary_loss_mlp": 0.01062397, + "balance_loss_clip": 1.03328562, + "balance_loss_mlp": 1.04173994, + "epoch": 0.049180820682398915, + "flos": 23698553598720.0, + "grad_norm": 2.698144736986534, + "language_loss": 0.84772664, + "learning_rate": 3.976284330263878e-06, + "loss": 0.87000275, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.234375, + "step": 818, + "time_per_iteration": 2.4503378868103027 + }, + { + "auxiliary_loss_clip": 0.01169224, + "auxiliary_loss_mlp": 0.01056172, + "balance_loss_clip": 1.0257256, + "balance_loss_mlp": 1.04177284, + "epoch": 0.04924094393506689, + "flos": 22419266227200.0, + "grad_norm": 3.614464798815647, + "language_loss": 0.7506969, + "learning_rate": 3.976226283507843e-06, + "loss": 0.77295083, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.2734375, + "step": 819, + "time_per_iteration": 2.423330783843994 + }, + { + "auxiliary_loss_clip": 0.01167272, + "auxiliary_loss_mlp": 0.01055403, + "balance_loss_clip": 1.02767491, + "balance_loss_mlp": 1.04262042, + "epoch": 0.04930106718773486, + "flos": 15737448983040.0, + "grad_norm": 2.234062200713571, + "language_loss": 0.85044587, + "learning_rate": 3.976168166225375e-06, + "loss": 0.87267256, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.25, + "step": 820, + "time_per_iteration": 2.420631170272827 + }, + { + "auxiliary_loss_clip": 0.01168213, + "auxiliary_loss_mlp": 0.01054757, + "balance_loss_clip": 1.02502584, + "balance_loss_mlp": 1.04121029, + "epoch": 0.049361190440402825, + "flos": 26249412931200.0, + "grad_norm": 2.000199948064709, + "language_loss": 0.90914762, + "learning_rate": 3.976109978418549e-06, + "loss": 0.93137735, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.265625, + "step": 821, + "time_per_iteration": 2.4498400688171387 + }, + { + "auxiliary_loss_clip": 0.01167231, + "auxiliary_loss_mlp": 0.01064154, + "balance_loss_clip": 1.03509116, + "balance_loss_mlp": 1.0418961, + "epoch": 0.0494213136930708, + "flos": 21251281870080.0, + "grad_norm": 1.832087213668366, + "language_loss": 0.8943603, + "learning_rate": 3.976051720089441e-06, + "loss": 0.91667426, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.25, + "step": 822, + "time_per_iteration": 2.4395205974578857 + }, + { + "auxiliary_loss_clip": 0.01166594, + "auxiliary_loss_mlp": 0.01057127, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.0421176, + "epoch": 0.04948143694573876, + "flos": 27964241913600.0, + "grad_norm": 6.6433608209893436, + "language_loss": 0.67021036, + "learning_rate": 3.9759933912401304e-06, + "loss": 0.69244754, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.25, + "step": 823, + "time_per_iteration": 2.4612014293670654 + }, + { + "auxiliary_loss_clip": 0.01050969, + "auxiliary_loss_mlp": 0.01017236, + "balance_loss_clip": 1.01137078, + "balance_loss_mlp": 1.00644159, + "epoch": 0.049541560198406734, + "flos": 66178250398080.0, + "grad_norm": 1.3172251732625322, + "language_loss": 0.62187296, + "learning_rate": 3.975934991872698e-06, + "loss": 0.642555, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.4453125, + "step": 824, + "time_per_iteration": 3.1524150371551514 + }, + { + "auxiliary_loss_clip": 0.0116972, + "auxiliary_loss_mlp": 0.01061632, + "balance_loss_clip": 1.02976692, + "balance_loss_mlp": 1.04172587, + "epoch": 0.049601683451074706, + "flos": 22891606277760.0, + "grad_norm": 1.8254047317461848, + "language_loss": 0.90296292, + "learning_rate": 3.975876521989229e-06, + "loss": 0.9252764, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.28125, + "step": 825, + "time_per_iteration": 2.4061501026153564 + }, + { + "auxiliary_loss_clip": 0.01170339, + "auxiliary_loss_mlp": 0.0106166, + "balance_loss_clip": 1.02899635, + "balance_loss_mlp": 1.04297948, + "epoch": 0.04966180670374267, + "flos": 21432585893760.0, + "grad_norm": 2.234887316682884, + "language_loss": 0.66441983, + "learning_rate": 3.975817981591809e-06, + "loss": 0.6867398, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.2734375, + "step": 826, + "time_per_iteration": 2.4316608905792236 + }, + { + "auxiliary_loss_clip": 0.01170613, + "auxiliary_loss_mlp": 0.01059237, + "balance_loss_clip": 1.02803993, + "balance_loss_mlp": 1.04257441, + "epoch": 0.04972192995641064, + "flos": 23106392161920.0, + "grad_norm": 2.0648308008719636, + "language_loss": 0.78250402, + "learning_rate": 3.975759370682528e-06, + "loss": 0.80480254, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.28125, + "step": 827, + "time_per_iteration": 2.4437127113342285 + }, + { + "auxiliary_loss_clip": 0.01172813, + "auxiliary_loss_mlp": 0.01065139, + "balance_loss_clip": 1.03342938, + "balance_loss_mlp": 1.04325986, + "epoch": 0.04978205320907861, + "flos": 40404563919360.0, + "grad_norm": 1.6663251374112558, + "language_loss": 0.78703785, + "learning_rate": 3.975700689263477e-06, + "loss": 0.80941737, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.296875, + "step": 828, + "time_per_iteration": 2.6153178215026855 + }, + { + "auxiliary_loss_clip": 0.01162054, + "auxiliary_loss_mlp": 0.01054695, + "balance_loss_clip": 1.02622795, + "balance_loss_mlp": 1.0404613, + "epoch": 0.04984217646174658, + "flos": 25227365523840.0, + "grad_norm": 2.043692486008699, + "language_loss": 0.83223975, + "learning_rate": 3.97564193733675e-06, + "loss": 0.85440719, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.21875, + "step": 829, + "time_per_iteration": 2.4516046047210693 + }, + { + "auxiliary_loss_clip": 0.01167924, + "auxiliary_loss_mlp": 0.01064178, + "balance_loss_clip": 1.0296793, + "balance_loss_mlp": 1.03958869, + "epoch": 0.04990229971441455, + "flos": 15958763291520.0, + "grad_norm": 1.9552786395143507, + "language_loss": 0.75125033, + "learning_rate": 3.975583114904446e-06, + "loss": 0.77357137, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.28125, + "step": 830, + "time_per_iteration": 2.410106897354126 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01063958, + "balance_loss_clip": 1.03321362, + "balance_loss_mlp": 1.039922, + "epoch": 0.04996242296708252, + "flos": 18404149806720.0, + "grad_norm": 1.9775454545213287, + "language_loss": 0.79518765, + "learning_rate": 3.975524221968661e-06, + "loss": 0.81751066, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.28125, + "step": 831, + "time_per_iteration": 2.4046993255615234 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01065907, + "balance_loss_clip": 1.03642654, + "balance_loss_mlp": 1.04187751, + "epoch": 0.05002254621975049, + "flos": 17857095713280.0, + "grad_norm": 2.480317958948075, + "language_loss": 0.91368961, + "learning_rate": 3.975465258531499e-06, + "loss": 0.93604904, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.28125, + "step": 832, + "time_per_iteration": 2.4149022102355957 + }, + { + "auxiliary_loss_clip": 0.01162742, + "auxiliary_loss_mlp": 0.01063583, + "balance_loss_clip": 1.03423357, + "balance_loss_mlp": 1.04163289, + "epoch": 0.050082669472418455, + "flos": 45658538490240.0, + "grad_norm": 2.126591231557392, + "language_loss": 0.83265626, + "learning_rate": 3.9754062245950625e-06, + "loss": 0.85491955, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.2109375, + "step": 833, + "time_per_iteration": 2.5969738960266113 + }, + { + "auxiliary_loss_clip": 0.01164549, + "auxiliary_loss_mlp": 0.01053187, + "balance_loss_clip": 1.02181077, + "balance_loss_mlp": 1.0378592, + "epoch": 0.05014279272508643, + "flos": 37960538947200.0, + "grad_norm": 2.5697450136119495, + "language_loss": 0.82563829, + "learning_rate": 3.975347120161459e-06, + "loss": 0.84781563, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.265625, + "step": 834, + "time_per_iteration": 2.5611331462860107 + }, + { + "auxiliary_loss_clip": 0.0116794, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_clip": 1.02417338, + "balance_loss_mlp": 1.04004765, + "epoch": 0.0502029159777544, + "flos": 20995124158080.0, + "grad_norm": 2.2147712592351776, + "language_loss": 0.91696298, + "learning_rate": 3.975287945232799e-06, + "loss": 0.93920618, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.28125, + "step": 835, + "time_per_iteration": 2.3931682109832764 + }, + { + "auxiliary_loss_clip": 0.01169635, + "auxiliary_loss_mlp": 0.0106848, + "balance_loss_clip": 1.03572142, + "balance_loss_mlp": 1.03864908, + "epoch": 0.050263039230422364, + "flos": 15887156359680.0, + "grad_norm": 8.942841368746471, + "language_loss": 0.76724601, + "learning_rate": 3.975228699811193e-06, + "loss": 0.78962719, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.3125, + "step": 836, + "time_per_iteration": 2.4156110286712646 + }, + { + "auxiliary_loss_clip": 0.01163802, + "auxiliary_loss_mlp": 0.01066891, + "balance_loss_clip": 1.03853154, + "balance_loss_mlp": 1.04223275, + "epoch": 0.050323162483090336, + "flos": 23731616522880.0, + "grad_norm": 2.121859205210282, + "language_loss": 0.83580768, + "learning_rate": 3.975169383898755e-06, + "loss": 0.8581146, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.21875, + "step": 837, + "time_per_iteration": 2.415476083755493 + }, + { + "auxiliary_loss_clip": 0.01164628, + "auxiliary_loss_mlp": 0.01063927, + "balance_loss_clip": 1.03463697, + "balance_loss_mlp": 1.0413357, + "epoch": 0.0503832857357583, + "flos": 20265195029760.0, + "grad_norm": 2.4680140494286156, + "language_loss": 0.7328164, + "learning_rate": 3.975109997497604e-06, + "loss": 0.75510192, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.234375, + "step": 838, + "time_per_iteration": 2.435312271118164 + }, + { + "auxiliary_loss_clip": 0.01160911, + "auxiliary_loss_mlp": 0.01063132, + "balance_loss_clip": 1.03287721, + "balance_loss_mlp": 1.03826404, + "epoch": 0.05044340898842627, + "flos": 17784057415680.0, + "grad_norm": 2.914937148434028, + "language_loss": 0.83091825, + "learning_rate": 3.975050540609857e-06, + "loss": 0.85315871, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.2265625, + "step": 839, + "time_per_iteration": 2.4032273292541504 + }, + { + "auxiliary_loss_clip": 0.01159855, + "auxiliary_loss_mlp": 0.01056911, + "balance_loss_clip": 1.02784753, + "balance_loss_mlp": 1.04024374, + "epoch": 0.050503532241094246, + "flos": 22965412625280.0, + "grad_norm": 1.7597979747182033, + "language_loss": 0.8568148, + "learning_rate": 3.9749910132376355e-06, + "loss": 0.87898248, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.203125, + "step": 840, + "time_per_iteration": 2.453213691711426 + }, + { + "auxiliary_loss_clip": 0.0116538, + "auxiliary_loss_mlp": 0.01057513, + "balance_loss_clip": 1.02687609, + "balance_loss_mlp": 1.04037189, + "epoch": 0.05056365549376221, + "flos": 22776078988800.0, + "grad_norm": 1.9817718431560314, + "language_loss": 0.9464941, + "learning_rate": 3.974931415383066e-06, + "loss": 0.96872306, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.25, + "step": 841, + "time_per_iteration": 2.447728157043457 + }, + { + "auxiliary_loss_clip": 0.01166064, + "auxiliary_loss_mlp": 0.01059588, + "balance_loss_clip": 1.03023815, + "balance_loss_mlp": 1.03955817, + "epoch": 0.05062377874643018, + "flos": 30915729630720.0, + "grad_norm": 2.1937082555241596, + "language_loss": 0.77494878, + "learning_rate": 3.974871747048274e-06, + "loss": 0.79720527, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.265625, + "step": 842, + "time_per_iteration": 3.926527976989746 + }, + { + "auxiliary_loss_clip": 0.01172329, + "auxiliary_loss_mlp": 0.01070474, + "balance_loss_clip": 1.03757167, + "balance_loss_mlp": 1.04413319, + "epoch": 0.05068390199909815, + "flos": 19646115068160.0, + "grad_norm": 2.3304262601049843, + "language_loss": 0.78067744, + "learning_rate": 3.97481200823539e-06, + "loss": 0.80310547, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.28125, + "step": 843, + "time_per_iteration": 2.4005467891693115 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01048925, + "balance_loss_clip": 1.01936114, + "balance_loss_mlp": 1.04121172, + "epoch": 0.05074402525176612, + "flos": 37960573858560.0, + "grad_norm": 2.398313254478142, + "language_loss": 0.83207279, + "learning_rate": 3.974752198946545e-06, + "loss": 0.85423934, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.265625, + "step": 844, + "time_per_iteration": 2.590186595916748 + }, + { + "auxiliary_loss_clip": 0.01160503, + "auxiliary_loss_mlp": 0.01058272, + "balance_loss_clip": 1.02806437, + "balance_loss_mlp": 1.03811467, + "epoch": 0.05080414850443409, + "flos": 22053516157440.0, + "grad_norm": 2.354765899656259, + "language_loss": 0.76544082, + "learning_rate": 3.974692319183873e-06, + "loss": 0.78762859, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.2265625, + "step": 845, + "time_per_iteration": 2.4088919162750244 + }, + { + "auxiliary_loss_clip": 0.01161682, + "auxiliary_loss_mlp": 0.01059081, + "balance_loss_clip": 1.02811074, + "balance_loss_mlp": 1.03650367, + "epoch": 0.05086427175710206, + "flos": 20224870542720.0, + "grad_norm": 1.7049887007445408, + "language_loss": 0.87393314, + "learning_rate": 3.974632368949513e-06, + "loss": 0.89614075, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.25, + "step": 846, + "time_per_iteration": 3.855921506881714 + }, + { + "auxiliary_loss_clip": 0.01166303, + "auxiliary_loss_mlp": 0.01054591, + "balance_loss_clip": 1.02493167, + "balance_loss_mlp": 1.04270983, + "epoch": 0.05092439500977003, + "flos": 15158309483520.0, + "grad_norm": 2.118490048729354, + "language_loss": 0.87353724, + "learning_rate": 3.974572348245602e-06, + "loss": 0.89574617, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.234375, + "step": 847, + "time_per_iteration": 2.3899219036102295 + }, + { + "auxiliary_loss_clip": 0.01158489, + "auxiliary_loss_mlp": 0.01055071, + "balance_loss_clip": 1.0248158, + "balance_loss_mlp": 1.03820229, + "epoch": 0.050984518262437994, + "flos": 22054039827840.0, + "grad_norm": 2.2959085046618943, + "language_loss": 0.81882077, + "learning_rate": 3.974512257074284e-06, + "loss": 0.84095639, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.203125, + "step": 848, + "time_per_iteration": 3.773952007293701 + }, + { + "auxiliary_loss_clip": 0.01164005, + "auxiliary_loss_mlp": 0.01058961, + "balance_loss_clip": 1.02872968, + "balance_loss_mlp": 1.04196203, + "epoch": 0.05104464151510597, + "flos": 30224065219200.0, + "grad_norm": 2.2019890987313504, + "language_loss": 0.87174815, + "learning_rate": 3.974452095437701e-06, + "loss": 0.89397776, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.21875, + "step": 849, + "time_per_iteration": 2.490201711654663 + }, + { + "auxiliary_loss_clip": 0.01159582, + "auxiliary_loss_mlp": 0.01052927, + "balance_loss_clip": 1.02360129, + "balance_loss_mlp": 1.03816199, + "epoch": 0.05110476476777394, + "flos": 18331914470400.0, + "grad_norm": 2.0239864852213465, + "language_loss": 0.83400553, + "learning_rate": 3.974391863338003e-06, + "loss": 0.8561306, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.21875, + "step": 850, + "time_per_iteration": 2.468676805496216 + }, + { + "auxiliary_loss_clip": 0.01160646, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.02292764, + "balance_loss_mlp": 1.03897095, + "epoch": 0.051164888020441904, + "flos": 37997197741440.0, + "grad_norm": 1.9923005803260347, + "language_loss": 0.65049136, + "learning_rate": 3.974331560777338e-06, + "loss": 0.67261505, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.21875, + "step": 851, + "time_per_iteration": 2.573582172393799 + }, + { + "auxiliary_loss_clip": 0.01158165, + "auxiliary_loss_mlp": 0.0105297, + "balance_loss_clip": 1.02218974, + "balance_loss_mlp": 1.03646827, + "epoch": 0.051225011273109876, + "flos": 23037543227520.0, + "grad_norm": 2.750400779171418, + "language_loss": 0.83152038, + "learning_rate": 3.974271187757857e-06, + "loss": 0.85363173, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.21875, + "step": 852, + "time_per_iteration": 2.4141244888305664 + }, + { + "auxiliary_loss_clip": 0.01166429, + "auxiliary_loss_mlp": 0.0106459, + "balance_loss_clip": 1.03333354, + "balance_loss_mlp": 1.04123831, + "epoch": 0.05128513452577785, + "flos": 18258841261440.0, + "grad_norm": 2.0184171365603514, + "language_loss": 0.80007803, + "learning_rate": 3.974210744281717e-06, + "loss": 0.82238829, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.25, + "step": 853, + "time_per_iteration": 2.405571937561035 + }, + { + "auxiliary_loss_clip": 0.01160161, + "auxiliary_loss_mlp": 0.01055026, + "balance_loss_clip": 1.02531874, + "balance_loss_mlp": 1.03994238, + "epoch": 0.05134525777844581, + "flos": 27197723813760.0, + "grad_norm": 1.9780043386979285, + "language_loss": 0.75332499, + "learning_rate": 3.974150230351074e-06, + "loss": 0.77547681, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.203125, + "step": 854, + "time_per_iteration": 2.4725728034973145 + }, + { + "auxiliary_loss_clip": 0.01164939, + "auxiliary_loss_mlp": 0.01053258, + "balance_loss_clip": 1.02355087, + "balance_loss_mlp": 1.04014957, + "epoch": 0.051405381031113785, + "flos": 28361099871360.0, + "grad_norm": 2.159727043901847, + "language_loss": 0.81719911, + "learning_rate": 3.974089645968087e-06, + "loss": 0.83938104, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.25, + "step": 855, + "time_per_iteration": 2.4578967094421387 + }, + { + "auxiliary_loss_clip": 0.01051068, + "auxiliary_loss_mlp": 0.01009121, + "balance_loss_clip": 1.00416172, + "balance_loss_mlp": 1.01090991, + "epoch": 0.05146550428378175, + "flos": 65614855921920.0, + "grad_norm": 0.9758737055950394, + "language_loss": 0.65607464, + "learning_rate": 3.974028991134917e-06, + "loss": 0.67667657, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.04956055, + "router_z_loss_mlp": 0.40234375, + "step": 856, + "time_per_iteration": 3.051262855529785 + }, + { + "auxiliary_loss_clip": 0.01158072, + "auxiliary_loss_mlp": 0.01047891, + "balance_loss_clip": 1.01911426, + "balance_loss_mlp": 1.03875589, + "epoch": 0.05152562753644972, + "flos": 22053760536960.0, + "grad_norm": 3.191385805610801, + "language_loss": 0.76746464, + "learning_rate": 3.973968265853732e-06, + "loss": 0.7895242, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.1953125, + "step": 857, + "time_per_iteration": 2.4042773246765137 + }, + { + "auxiliary_loss_clip": 0.01162847, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_clip": 1.02392697, + "balance_loss_mlp": 1.03982496, + "epoch": 0.051585750789117694, + "flos": 18508714928640.0, + "grad_norm": 2.3876640997155048, + "language_loss": 0.88652521, + "learning_rate": 3.973907470126697e-06, + "loss": 0.9086858, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.234375, + "step": 858, + "time_per_iteration": 2.413517475128174 + }, + { + "auxiliary_loss_clip": 0.01161464, + "auxiliary_loss_mlp": 0.01053192, + "balance_loss_clip": 1.02322316, + "balance_loss_mlp": 1.03962851, + "epoch": 0.05164587404178566, + "flos": 23729172727680.0, + "grad_norm": 2.7678383094014634, + "language_loss": 0.75064158, + "learning_rate": 3.973846603955982e-06, + "loss": 0.77278817, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.21875, + "step": 859, + "time_per_iteration": 2.435861825942993 + }, + { + "auxiliary_loss_clip": 0.01168612, + "auxiliary_loss_mlp": 0.0106378, + "balance_loss_clip": 1.0315932, + "balance_loss_mlp": 1.04018474, + "epoch": 0.05170599729445363, + "flos": 16251963822720.0, + "grad_norm": 2.468489423187343, + "language_loss": 0.90703034, + "learning_rate": 3.973785667343758e-06, + "loss": 0.92935425, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.28125, + "step": 860, + "time_per_iteration": 2.389540672302246 + }, + { + "auxiliary_loss_clip": 0.01163009, + "auxiliary_loss_mlp": 0.01049808, + "balance_loss_clip": 1.02217507, + "balance_loss_mlp": 1.04159057, + "epoch": 0.0517661205471216, + "flos": 23984841680640.0, + "grad_norm": 1.99597550647492, + "language_loss": 0.82325977, + "learning_rate": 3.973724660292202e-06, + "loss": 0.84538794, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.21875, + "step": 861, + "time_per_iteration": 2.447273015975952 + }, + { + "auxiliary_loss_clip": 0.01163287, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_clip": 1.02100301, + "balance_loss_mlp": 1.04005361, + "epoch": 0.05182624379978957, + "flos": 29276452563840.0, + "grad_norm": 2.184600867013007, + "language_loss": 0.78252262, + "learning_rate": 3.973663582803489e-06, + "loss": 0.80465281, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.234375, + "step": 862, + "time_per_iteration": 2.4645631313323975 + }, + { + "auxiliary_loss_clip": 0.01160461, + "auxiliary_loss_mlp": 0.01060547, + "balance_loss_clip": 1.03124499, + "balance_loss_mlp": 1.04275537, + "epoch": 0.05188636705245754, + "flos": 24169671751680.0, + "grad_norm": 1.8707139286249292, + "language_loss": 0.89435291, + "learning_rate": 3.9736024348798e-06, + "loss": 0.91656297, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.171875, + "step": 863, + "time_per_iteration": 2.467291831970215 + }, + { + "auxiliary_loss_clip": 0.01164169, + "auxiliary_loss_mlp": 0.01061407, + "balance_loss_clip": 1.03003168, + "balance_loss_mlp": 1.04240823, + "epoch": 0.051946490305125506, + "flos": 26759494028160.0, + "grad_norm": 2.655600598569303, + "language_loss": 0.75558275, + "learning_rate": 3.973541216523316e-06, + "loss": 0.77783847, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.21875, + "step": 864, + "time_per_iteration": 2.4739580154418945 + }, + { + "auxiliary_loss_clip": 0.01165025, + "auxiliary_loss_mlp": 0.01055548, + "balance_loss_clip": 1.02510238, + "balance_loss_mlp": 1.04179323, + "epoch": 0.05200661355779348, + "flos": 21501574473600.0, + "grad_norm": 1.9851203173179528, + "language_loss": 0.78729963, + "learning_rate": 3.973479927736224e-06, + "loss": 0.80950534, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.234375, + "step": 865, + "time_per_iteration": 2.481898546218872 + }, + { + "auxiliary_loss_clip": 0.0116096, + "auxiliary_loss_mlp": 0.01056058, + "balance_loss_clip": 1.02567112, + "balance_loss_mlp": 1.03859043, + "epoch": 0.05206673681046144, + "flos": 18113497804800.0, + "grad_norm": 2.155310896039154, + "language_loss": 0.85959566, + "learning_rate": 3.973418568520709e-06, + "loss": 0.88176584, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.21875, + "step": 866, + "time_per_iteration": 2.3983001708984375 + }, + { + "auxiliary_loss_clip": 0.01165422, + "auxiliary_loss_mlp": 0.01058009, + "balance_loss_clip": 1.02902925, + "balance_loss_mlp": 1.04385662, + "epoch": 0.052126860063129415, + "flos": 17523396138240.0, + "grad_norm": 2.813766549619152, + "language_loss": 0.87160748, + "learning_rate": 3.973357138878961e-06, + "loss": 0.89384174, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.21875, + "step": 867, + "time_per_iteration": 2.3934857845306396 + }, + { + "auxiliary_loss_clip": 0.01154742, + "auxiliary_loss_mlp": 0.01062666, + "balance_loss_clip": 1.03545022, + "balance_loss_mlp": 1.0386498, + "epoch": 0.05218698331579739, + "flos": 32596692727680.0, + "grad_norm": 1.4836901961628903, + "language_loss": 0.6341002, + "learning_rate": 3.973295638813174e-06, + "loss": 0.65627426, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.15625, + "step": 868, + "time_per_iteration": 2.5175845623016357 + }, + { + "auxiliary_loss_clip": 0.01165022, + "auxiliary_loss_mlp": 0.01062688, + "balance_loss_clip": 1.03052521, + "balance_loss_mlp": 1.04034626, + "epoch": 0.05224710656846535, + "flos": 22126205341440.0, + "grad_norm": 4.559647925660619, + "language_loss": 0.83260775, + "learning_rate": 3.973234068325541e-06, + "loss": 0.85488486, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.25, + "step": 869, + "time_per_iteration": 2.4412894248962402 + }, + { + "auxiliary_loss_clip": 0.01163911, + "auxiliary_loss_mlp": 0.01054839, + "balance_loss_clip": 1.02630055, + "balance_loss_mlp": 1.04024363, + "epoch": 0.052307229821133325, + "flos": 11144310226560.0, + "grad_norm": 2.142193581806339, + "language_loss": 0.87373012, + "learning_rate": 3.973172427418259e-06, + "loss": 0.89591759, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.234375, + "step": 870, + "time_per_iteration": 2.4127585887908936 + }, + { + "auxiliary_loss_clip": 0.01164902, + "auxiliary_loss_mlp": 0.01054859, + "balance_loss_clip": 1.02622497, + "balance_loss_mlp": 1.04133844, + "epoch": 0.05236735307380129, + "flos": 19127271219840.0, + "grad_norm": 2.4752433989170615, + "language_loss": 0.80509758, + "learning_rate": 3.97311071609353e-06, + "loss": 0.82729518, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.234375, + "step": 871, + "time_per_iteration": 2.425719976425171 + }, + { + "auxiliary_loss_clip": 0.01161646, + "auxiliary_loss_mlp": 0.01046634, + "balance_loss_clip": 1.01799941, + "balance_loss_mlp": 1.04076588, + "epoch": 0.05242747632646926, + "flos": 20959582527360.0, + "grad_norm": 2.287777610954603, + "language_loss": 0.81213582, + "learning_rate": 3.973048934353554e-06, + "loss": 0.83421862, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.2109375, + "step": 872, + "time_per_iteration": 2.431184768676758 + }, + { + "auxiliary_loss_clip": 0.01048326, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.02783895, + "balance_loss_mlp": 1.00866389, + "epoch": 0.052487599579137234, + "flos": 65017632337920.0, + "grad_norm": 0.9004996555808237, + "language_loss": 0.61653852, + "learning_rate": 3.972987082200538e-06, + "loss": 0.63734716, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.046875, + "router_z_loss_mlp": 0.39648438, + "step": 873, + "time_per_iteration": 2.950486421585083 + }, + { + "auxiliary_loss_clip": 0.01160999, + "auxiliary_loss_mlp": 0.01047396, + "balance_loss_clip": 1.01950097, + "balance_loss_mlp": 1.03949547, + "epoch": 0.0525477228318052, + "flos": 23287905653760.0, + "grad_norm": 2.1513200790654716, + "language_loss": 0.88312685, + "learning_rate": 3.972925159636687e-06, + "loss": 0.90521085, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.21875, + "step": 874, + "time_per_iteration": 2.431154727935791 + }, + { + "auxiliary_loss_clip": 0.01162635, + "auxiliary_loss_mlp": 0.01057579, + "balance_loss_clip": 1.02794373, + "balance_loss_mlp": 1.03995252, + "epoch": 0.05260784608447317, + "flos": 32228952710400.0, + "grad_norm": 1.788658450643276, + "language_loss": 0.74017358, + "learning_rate": 3.972863166664212e-06, + "loss": 0.76237571, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.2265625, + "step": 875, + "time_per_iteration": 2.5350773334503174 + }, + { + "auxiliary_loss_clip": 0.01161192, + "auxiliary_loss_mlp": 0.0105587, + "balance_loss_clip": 1.0261867, + "balance_loss_mlp": 1.04077697, + "epoch": 0.052667969337141136, + "flos": 24462034410240.0, + "grad_norm": 2.113354624868253, + "language_loss": 0.9275443, + "learning_rate": 3.972801103285326e-06, + "loss": 0.9497149, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.203125, + "step": 876, + "time_per_iteration": 2.459304094314575 + }, + { + "auxiliary_loss_clip": 0.01048974, + "auxiliary_loss_mlp": 0.01004426, + "balance_loss_clip": 0.99989587, + "balance_loss_mlp": 1.01024711, + "epoch": 0.05272809258980911, + "flos": 57780938989440.0, + "grad_norm": 0.8418676385507674, + "language_loss": 0.62896293, + "learning_rate": 3.9727389695022434e-06, + "loss": 0.64949697, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.04541016, + "router_z_loss_mlp": 0.38671875, + "step": 877, + "time_per_iteration": 3.1060640811920166 + }, + { + "auxiliary_loss_clip": 0.01162697, + "auxiliary_loss_mlp": 0.01058139, + "balance_loss_clip": 1.02790785, + "balance_loss_mlp": 1.04045391, + "epoch": 0.05278821584247708, + "flos": 17419843445760.0, + "grad_norm": 2.6596222001650593, + "language_loss": 0.85823625, + "learning_rate": 3.972676765317181e-06, + "loss": 0.88044465, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.21875, + "step": 878, + "time_per_iteration": 2.431715726852417 + }, + { + "auxiliary_loss_clip": 0.01163295, + "auxiliary_loss_mlp": 0.01053228, + "balance_loss_clip": 1.02420092, + "balance_loss_mlp": 1.04162955, + "epoch": 0.052848339095145046, + "flos": 26136154880640.0, + "grad_norm": 1.9255771100967056, + "language_loss": 0.8295579, + "learning_rate": 3.97261449073236e-06, + "loss": 0.85172307, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.21875, + "step": 879, + "time_per_iteration": 2.469244956970215 + }, + { + "auxiliary_loss_clip": 0.01158954, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.03066969, + "balance_loss_mlp": 1.03983855, + "epoch": 0.05290846234781302, + "flos": 16471148538240.0, + "grad_norm": 2.0801283716945176, + "language_loss": 0.84291494, + "learning_rate": 3.9725521457500005e-06, + "loss": 0.86512297, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.1875, + "step": 880, + "time_per_iteration": 2.4176676273345947 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01052989, + "balance_loss_clip": 1.02311563, + "balance_loss_mlp": 1.03948319, + "epoch": 0.05296858560048098, + "flos": 19864147708800.0, + "grad_norm": 2.2089813172294055, + "language_loss": 0.86675858, + "learning_rate": 3.97248973037233e-06, + "loss": 0.88892066, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.234375, + "step": 881, + "time_per_iteration": 3.8267643451690674 + }, + { + "auxiliary_loss_clip": 0.01160676, + "auxiliary_loss_mlp": 0.01053967, + "balance_loss_clip": 1.02216184, + "balance_loss_mlp": 1.03855371, + "epoch": 0.053028708853148955, + "flos": 24387460012800.0, + "grad_norm": 1.9561697480028104, + "language_loss": 0.87807399, + "learning_rate": 3.972427244601574e-06, + "loss": 0.90022039, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.21875, + "step": 882, + "time_per_iteration": 2.4512674808502197 + }, + { + "auxiliary_loss_clip": 0.0116422, + "auxiliary_loss_mlp": 0.01053706, + "balance_loss_clip": 1.02218688, + "balance_loss_mlp": 1.03948426, + "epoch": 0.05308883210581693, + "flos": 36391681825920.0, + "grad_norm": 2.673534550737762, + "language_loss": 0.69085759, + "learning_rate": 3.972364688439964e-06, + "loss": 0.71303678, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.25, + "step": 883, + "time_per_iteration": 2.545847177505493 + }, + { + "auxiliary_loss_clip": 0.01158398, + "auxiliary_loss_mlp": 0.01055761, + "balance_loss_clip": 1.02661395, + "balance_loss_mlp": 1.04118681, + "epoch": 0.05314895535848489, + "flos": 22854039788160.0, + "grad_norm": 3.3655480070090205, + "language_loss": 0.76206219, + "learning_rate": 3.9723020618897325e-06, + "loss": 0.78420377, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.171875, + "step": 884, + "time_per_iteration": 2.4771573543548584 + }, + { + "auxiliary_loss_clip": 0.01158304, + "auxiliary_loss_mlp": 0.01053657, + "balance_loss_clip": 1.02576232, + "balance_loss_mlp": 1.04068482, + "epoch": 0.053209078611152864, + "flos": 12859488322560.0, + "grad_norm": 2.1558972408245882, + "language_loss": 0.8541072, + "learning_rate": 3.972239364953113e-06, + "loss": 0.87622678, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.1796875, + "step": 885, + "time_per_iteration": 5.304793834686279 + }, + { + "auxiliary_loss_clip": 0.01160873, + "auxiliary_loss_mlp": 0.01058031, + "balance_loss_clip": 1.02813315, + "balance_loss_mlp": 1.03778291, + "epoch": 0.05326920186382083, + "flos": 12163844016000.0, + "grad_norm": 2.5549299977840665, + "language_loss": 0.85519499, + "learning_rate": 3.9721765976323435e-06, + "loss": 0.87738407, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.234375, + "step": 886, + "time_per_iteration": 2.3972959518432617 + }, + { + "auxiliary_loss_clip": 0.01159147, + "auxiliary_loss_mlp": 0.01056786, + "balance_loss_clip": 1.02691174, + "balance_loss_mlp": 1.0391463, + "epoch": 0.0533293251164888, + "flos": 22703564361600.0, + "grad_norm": 2.058024973151939, + "language_loss": 0.88818395, + "learning_rate": 3.972113759929665e-06, + "loss": 0.91034329, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.203125, + "step": 887, + "time_per_iteration": 3.844373941421509 + }, + { + "auxiliary_loss_clip": 0.01159157, + "auxiliary_loss_mlp": 0.01071229, + "balance_loss_clip": 1.03975785, + "balance_loss_mlp": 1.0372448, + "epoch": 0.053389448369156774, + "flos": 26939785622400.0, + "grad_norm": 1.9006520640832827, + "language_loss": 0.74924183, + "learning_rate": 3.9720508518473186e-06, + "loss": 0.77154565, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.21875, + "step": 888, + "time_per_iteration": 2.4814414978027344 + }, + { + "auxiliary_loss_clip": 0.01159249, + "auxiliary_loss_mlp": 0.01061684, + "balance_loss_clip": 1.0303793, + "balance_loss_mlp": 1.03939033, + "epoch": 0.05344957162182474, + "flos": 25555165079040.0, + "grad_norm": 2.078289993135767, + "language_loss": 0.88061041, + "learning_rate": 3.97198787338755e-06, + "loss": 0.90281975, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.203125, + "step": 889, + "time_per_iteration": 2.435617685317993 + }, + { + "auxiliary_loss_clip": 0.01158143, + "auxiliary_loss_mlp": 0.01051812, + "balance_loss_clip": 1.02282035, + "balance_loss_mlp": 1.03846538, + "epoch": 0.05350969487449271, + "flos": 19718559872640.0, + "grad_norm": 2.57841427158968, + "language_loss": 0.88126409, + "learning_rate": 3.971924824552607e-06, + "loss": 0.9033637, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.1953125, + "step": 890, + "time_per_iteration": 2.4183461666107178 + }, + { + "auxiliary_loss_clip": 0.01160598, + "auxiliary_loss_mlp": 0.0106087, + "balance_loss_clip": 1.03183031, + "balance_loss_mlp": 1.03818941, + "epoch": 0.053569818127160676, + "flos": 27015128069760.0, + "grad_norm": 2.234304571881244, + "language_loss": 0.93175459, + "learning_rate": 3.97186170534474e-06, + "loss": 0.95396924, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.21875, + "step": 891, + "time_per_iteration": 2.4461190700531006 + }, + { + "auxiliary_loss_clip": 0.01161554, + "auxiliary_loss_mlp": 0.01060467, + "balance_loss_clip": 1.02894807, + "balance_loss_mlp": 1.03905725, + "epoch": 0.05362994137982865, + "flos": 13187497345920.0, + "grad_norm": 2.2302235095241234, + "language_loss": 0.71824193, + "learning_rate": 3.9717985157662e-06, + "loss": 0.74046212, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.2265625, + "step": 892, + "time_per_iteration": 2.4148337841033936 + }, + { + "auxiliary_loss_clip": 0.01162488, + "auxiliary_loss_mlp": 0.01068779, + "balance_loss_clip": 1.03904855, + "balance_loss_mlp": 1.03886676, + "epoch": 0.05369006463249662, + "flos": 28656744197760.0, + "grad_norm": 1.8684471729019887, + "language_loss": 0.82398784, + "learning_rate": 3.971735255819244e-06, + "loss": 0.8463006, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.234375, + "step": 893, + "time_per_iteration": 2.4615511894226074 + }, + { + "auxiliary_loss_clip": 0.01161726, + "auxiliary_loss_mlp": 0.01058905, + "balance_loss_clip": 1.02868581, + "balance_loss_mlp": 1.03838944, + "epoch": 0.053750187885164585, + "flos": 28911889480320.0, + "grad_norm": 2.5652337661280993, + "language_loss": 0.81720483, + "learning_rate": 3.971671925506129e-06, + "loss": 0.83941114, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.234375, + "step": 894, + "time_per_iteration": 2.492638349533081 + }, + { + "auxiliary_loss_clip": 0.01157092, + "auxiliary_loss_mlp": 0.01059332, + "balance_loss_clip": 1.02929115, + "balance_loss_mlp": 1.03694069, + "epoch": 0.05381031113783256, + "flos": 15157925458560.0, + "grad_norm": 3.843311202633951, + "language_loss": 0.75092781, + "learning_rate": 3.9716085248291125e-06, + "loss": 0.77309203, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.203125, + "step": 895, + "time_per_iteration": 2.380563974380493 + }, + { + "auxiliary_loss_clip": 0.0116928, + "auxiliary_loss_mlp": 0.01058747, + "balance_loss_clip": 1.02970707, + "balance_loss_mlp": 1.0438832, + "epoch": 0.05387043439050053, + "flos": 21834156885120.0, + "grad_norm": 2.4046720652318454, + "language_loss": 0.86494035, + "learning_rate": 3.97154505379046e-06, + "loss": 0.88722062, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.25, + "step": 896, + "time_per_iteration": 2.43613338470459 + }, + { + "auxiliary_loss_clip": 0.01164866, + "auxiliary_loss_mlp": 0.01054395, + "balance_loss_clip": 1.02225626, + "balance_loss_mlp": 1.03944147, + "epoch": 0.053930557643168495, + "flos": 17309378304000.0, + "grad_norm": 6.219713565318159, + "language_loss": 0.88008451, + "learning_rate": 3.971481512392438e-06, + "loss": 0.90227711, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.25, + "step": 897, + "time_per_iteration": 2.381504535675049 + }, + { + "auxiliary_loss_clip": 0.01159566, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.03080177, + "balance_loss_mlp": 1.03830087, + "epoch": 0.05399068089583647, + "flos": 17347503375360.0, + "grad_norm": 1.7870846423499203, + "language_loss": 0.90078026, + "learning_rate": 3.97141790063731e-06, + "loss": 0.92299581, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.21875, + "step": 898, + "time_per_iteration": 2.3894176483154297 + }, + { + "auxiliary_loss_clip": 0.01164325, + "auxiliary_loss_mlp": 0.01069351, + "balance_loss_clip": 1.03890502, + "balance_loss_mlp": 1.03923023, + "epoch": 0.05405080414850443, + "flos": 17486178762240.0, + "grad_norm": 2.6068244252625465, + "language_loss": 0.92166436, + "learning_rate": 3.971354218527349e-06, + "loss": 0.94400114, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.25, + "step": 899, + "time_per_iteration": 2.374274969100952 + }, + { + "auxiliary_loss_clip": 0.01159553, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.02880204, + "balance_loss_mlp": 1.03960335, + "epoch": 0.054110927401172404, + "flos": 24495690827520.0, + "grad_norm": 2.03080882462333, + "language_loss": 0.75723553, + "learning_rate": 3.971290466064827e-06, + "loss": 0.77941823, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.203125, + "step": 900, + "time_per_iteration": 2.5287563800811768 + }, + { + "auxiliary_loss_clip": 0.01161827, + "auxiliary_loss_mlp": 0.01055122, + "balance_loss_clip": 1.0256772, + "balance_loss_mlp": 1.03749549, + "epoch": 0.054171050653840376, + "flos": 22928928387840.0, + "grad_norm": 3.188934588277214, + "language_loss": 0.73738217, + "learning_rate": 3.971226643252019e-06, + "loss": 0.7595517, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.25, + "step": 901, + "time_per_iteration": 2.408686876296997 + }, + { + "auxiliary_loss_clip": 0.01154546, + "auxiliary_loss_mlp": 0.01061796, + "balance_loss_clip": 1.03520036, + "balance_loss_mlp": 1.03875566, + "epoch": 0.05423117390650834, + "flos": 12932352063360.0, + "grad_norm": 2.102878927305348, + "language_loss": 0.8485086, + "learning_rate": 3.971162750091202e-06, + "loss": 0.87067199, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.15625, + "step": 902, + "time_per_iteration": 2.3947722911834717 + }, + { + "auxiliary_loss_clip": 0.01158079, + "auxiliary_loss_mlp": 0.0105659, + "balance_loss_clip": 1.02650177, + "balance_loss_mlp": 1.03704238, + "epoch": 0.05429129715917631, + "flos": 19900317744000.0, + "grad_norm": 2.06799905004133, + "language_loss": 0.86127782, + "learning_rate": 3.971098786584657e-06, + "loss": 0.88342452, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.2109375, + "step": 903, + "time_per_iteration": 2.4634947776794434 + }, + { + "auxiliary_loss_clip": 0.01156319, + "auxiliary_loss_mlp": 0.01050202, + "balance_loss_clip": 1.02140117, + "balance_loss_mlp": 1.03751755, + "epoch": 0.05435142041184428, + "flos": 16907702578560.0, + "grad_norm": 2.4141723378563067, + "language_loss": 0.75000405, + "learning_rate": 3.971034752734668e-06, + "loss": 0.77206928, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.1875, + "step": 904, + "time_per_iteration": 2.384925365447998 + }, + { + "auxiliary_loss_clip": 0.01162563, + "auxiliary_loss_mlp": 0.01054817, + "balance_loss_clip": 1.02602839, + "balance_loss_mlp": 1.04079747, + "epoch": 0.05441154366451225, + "flos": 23947275191040.0, + "grad_norm": 2.6027161513202386, + "language_loss": 0.85758334, + "learning_rate": 3.970970648543517e-06, + "loss": 0.87975711, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.21875, + "step": 905, + "time_per_iteration": 2.423539161682129 + }, + { + "auxiliary_loss_clip": 0.01157356, + "auxiliary_loss_mlp": 0.01053976, + "balance_loss_clip": 1.02722573, + "balance_loss_mlp": 1.0414567, + "epoch": 0.05447166691718022, + "flos": 19974333559680.0, + "grad_norm": 3.568337735308201, + "language_loss": 0.74576402, + "learning_rate": 3.970906474013494e-06, + "loss": 0.76787734, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.15625, + "step": 906, + "time_per_iteration": 2.4099528789520264 + }, + { + "auxiliary_loss_clip": 0.0116229, + "auxiliary_loss_mlp": 0.01055882, + "balance_loss_clip": 1.02797496, + "balance_loss_mlp": 1.03734863, + "epoch": 0.05453179016984819, + "flos": 24935351978880.0, + "grad_norm": 1.9587767139692178, + "language_loss": 0.86923331, + "learning_rate": 3.97084222914689e-06, + "loss": 0.891415, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.25, + "step": 907, + "time_per_iteration": 2.467411994934082 + }, + { + "auxiliary_loss_clip": 0.0116156, + "auxiliary_loss_mlp": 0.01063412, + "balance_loss_clip": 1.03406227, + "balance_loss_mlp": 1.04133844, + "epoch": 0.05459191342251616, + "flos": 18114091297920.0, + "grad_norm": 3.51243039724905, + "language_loss": 0.86991906, + "learning_rate": 3.970777913945995e-06, + "loss": 0.89216876, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.203125, + "step": 908, + "time_per_iteration": 2.421147584915161 + }, + { + "auxiliary_loss_clip": 0.01160336, + "auxiliary_loss_mlp": 0.01057968, + "balance_loss_clip": 1.02871394, + "balance_loss_mlp": 1.04014111, + "epoch": 0.054652036675184125, + "flos": 19207291789440.0, + "grad_norm": 2.1778556300511402, + "language_loss": 0.89483535, + "learning_rate": 3.970713528413106e-06, + "loss": 0.91701841, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.203125, + "step": 909, + "time_per_iteration": 2.433941602706909 + }, + { + "auxiliary_loss_clip": 0.01161949, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03183877, + "balance_loss_mlp": 1.03985989, + "epoch": 0.0547121599278521, + "flos": 16324827563520.0, + "grad_norm": 4.50022788736104, + "language_loss": 0.71124053, + "learning_rate": 3.9706490725505205e-06, + "loss": 0.73349547, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.21875, + "step": 910, + "time_per_iteration": 2.3976662158966064 + }, + { + "auxiliary_loss_clip": 0.01156027, + "auxiliary_loss_mlp": 0.01053817, + "balance_loss_clip": 1.02468252, + "balance_loss_mlp": 1.0386641, + "epoch": 0.05477228318052007, + "flos": 20337988947840.0, + "grad_norm": 1.814052792527829, + "language_loss": 0.83245134, + "learning_rate": 3.970584546360539e-06, + "loss": 0.85454977, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.171875, + "step": 911, + "time_per_iteration": 2.4349710941314697 + }, + { + "auxiliary_loss_clip": 0.01159004, + "auxiliary_loss_mlp": 0.01054041, + "balance_loss_clip": 1.02214098, + "balance_loss_mlp": 1.03775454, + "epoch": 0.054832406433188034, + "flos": 21972238778880.0, + "grad_norm": 3.1255396403293156, + "language_loss": 0.75924587, + "learning_rate": 3.970519949845464e-06, + "loss": 0.78137636, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.2109375, + "step": 912, + "time_per_iteration": 2.39033842086792 + }, + { + "auxiliary_loss_clip": 0.01155644, + "auxiliary_loss_mlp": 0.01056618, + "balance_loss_clip": 1.02753103, + "balance_loss_mlp": 1.03938627, + "epoch": 0.054892529685856006, + "flos": 16398005506560.0, + "grad_norm": 2.468181012949965, + "language_loss": 0.82650316, + "learning_rate": 3.9704552830076005e-06, + "loss": 0.84862584, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.15625, + "step": 913, + "time_per_iteration": 2.4073081016540527 + }, + { + "auxiliary_loss_clip": 0.01157295, + "auxiliary_loss_mlp": 0.01052201, + "balance_loss_clip": 1.02399659, + "balance_loss_mlp": 1.04099143, + "epoch": 0.05495265293852397, + "flos": 23911279712640.0, + "grad_norm": 2.064636756581716, + "language_loss": 0.8323791, + "learning_rate": 3.9703905458492564e-06, + "loss": 0.85447407, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.15625, + "step": 914, + "time_per_iteration": 2.4616618156433105 + }, + { + "auxiliary_loss_clip": 0.01161332, + "auxiliary_loss_mlp": 0.01057481, + "balance_loss_clip": 1.02823925, + "balance_loss_mlp": 1.04124427, + "epoch": 0.055012776191191944, + "flos": 23585819218560.0, + "grad_norm": 3.304359884839736, + "language_loss": 0.8976059, + "learning_rate": 3.970325738372742e-06, + "loss": 0.91979396, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.203125, + "step": 915, + "time_per_iteration": 2.444795608520508 + }, + { + "auxiliary_loss_clip": 0.01158095, + "auxiliary_loss_mlp": 0.01058386, + "balance_loss_clip": 1.03014517, + "balance_loss_mlp": 1.03939843, + "epoch": 0.055072899443859916, + "flos": 17527585501440.0, + "grad_norm": 1.733987534023442, + "language_loss": 0.89628351, + "learning_rate": 3.970260860580371e-06, + "loss": 0.91844833, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.1875, + "step": 916, + "time_per_iteration": 2.412970542907715 + }, + { + "auxiliary_loss_clip": 0.01160748, + "auxiliary_loss_mlp": 0.01062094, + "balance_loss_clip": 1.03136241, + "balance_loss_mlp": 1.03984213, + "epoch": 0.05513302269652788, + "flos": 21686160165120.0, + "grad_norm": 4.549710784254671, + "language_loss": 0.79854846, + "learning_rate": 3.970195912474457e-06, + "loss": 0.82077694, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.2109375, + "step": 917, + "time_per_iteration": 2.4622411727905273 + }, + { + "auxiliary_loss_clip": 0.0116097, + "auxiliary_loss_mlp": 0.01055916, + "balance_loss_clip": 1.02758026, + "balance_loss_mlp": 1.03956866, + "epoch": 0.05519314594919585, + "flos": 21612353817600.0, + "grad_norm": 2.0877874916789505, + "language_loss": 0.79856837, + "learning_rate": 3.9701308940573195e-06, + "loss": 0.82073724, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.2109375, + "step": 918, + "time_per_iteration": 2.42089581489563 + }, + { + "auxiliary_loss_clip": 0.01158156, + "auxiliary_loss_mlp": 0.01048415, + "balance_loss_clip": 1.0192802, + "balance_loss_mlp": 1.03866529, + "epoch": 0.05525326920186382, + "flos": 21797498090880.0, + "grad_norm": 1.8702226836959464, + "language_loss": 0.88963503, + "learning_rate": 3.970065805331279e-06, + "loss": 0.91170073, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.1953125, + "step": 919, + "time_per_iteration": 2.4145870208740234 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01054277, + "balance_loss_clip": 1.02604783, + "balance_loss_mlp": 1.0378617, + "epoch": 0.05531339245453179, + "flos": 28438362443520.0, + "grad_norm": 2.341967609968428, + "language_loss": 0.86990917, + "learning_rate": 3.970000646298656e-06, + "loss": 0.89200282, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.171875, + "step": 920, + "time_per_iteration": 2.4621634483337402 + }, + { + "auxiliary_loss_clip": 0.0115865, + "auxiliary_loss_mlp": 0.01054321, + "balance_loss_clip": 1.02559161, + "balance_loss_mlp": 1.03921771, + "epoch": 0.05537351570719976, + "flos": 37373718948480.0, + "grad_norm": 2.1703236391249847, + "language_loss": 0.65769506, + "learning_rate": 3.969935416961778e-06, + "loss": 0.67982477, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.1953125, + "step": 921, + "time_per_iteration": 3.9598336219787598 + }, + { + "auxiliary_loss_clip": 0.01163449, + "auxiliary_loss_mlp": 0.01055815, + "balance_loss_clip": 1.0239861, + "balance_loss_mlp": 1.04239345, + "epoch": 0.05543363895986773, + "flos": 20083437158400.0, + "grad_norm": 4.7013371739042045, + "language_loss": 0.71693504, + "learning_rate": 3.969870117322973e-06, + "loss": 0.7391277, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.2109375, + "step": 922, + "time_per_iteration": 2.426201581954956 + }, + { + "auxiliary_loss_clip": 0.01161821, + "auxiliary_loss_mlp": 0.01062252, + "balance_loss_clip": 1.03109097, + "balance_loss_mlp": 1.03957999, + "epoch": 0.0554937622125357, + "flos": 24532105242240.0, + "grad_norm": 2.542450463504702, + "language_loss": 0.82041645, + "learning_rate": 3.96980474738457e-06, + "loss": 0.84265721, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.21875, + "step": 923, + "time_per_iteration": 2.434980869293213 + }, + { + "auxiliary_loss_clip": 0.01159792, + "auxiliary_loss_mlp": 0.01056812, + "balance_loss_clip": 1.02710509, + "balance_loss_mlp": 1.0386827, + "epoch": 0.055553885465203665, + "flos": 14319172022400.0, + "grad_norm": 2.0976337306139627, + "language_loss": 0.78356576, + "learning_rate": 3.969739307148902e-06, + "loss": 0.80573177, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.2109375, + "step": 924, + "time_per_iteration": 2.3935599327087402 + }, + { + "auxiliary_loss_clip": 0.01158258, + "auxiliary_loss_mlp": 0.01051702, + "balance_loss_clip": 1.0231396, + "balance_loss_mlp": 1.03932667, + "epoch": 0.05561400871787164, + "flos": 27379900621440.0, + "grad_norm": 1.9906794558315535, + "language_loss": 1.0172838, + "learning_rate": 3.969673796618306e-06, + "loss": 1.03938341, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.1875, + "step": 925, + "time_per_iteration": 5.295424461364746 + }, + { + "auxiliary_loss_clip": 0.01158902, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.01815462, + "balance_loss_mlp": 1.03811073, + "epoch": 0.05567413197053961, + "flos": 23219999326080.0, + "grad_norm": 1.9889175308308498, + "language_loss": 0.80755478, + "learning_rate": 3.969608215795117e-06, + "loss": 0.82962364, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.203125, + "step": 926, + "time_per_iteration": 2.4128670692443848 + }, + { + "auxiliary_loss_clip": 0.01166394, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_clip": 1.01919866, + "balance_loss_mlp": 1.04073811, + "epoch": 0.055734255223207574, + "flos": 25263779938560.0, + "grad_norm": 2.2196935082358995, + "language_loss": 0.7284615, + "learning_rate": 3.969542564681679e-06, + "loss": 0.75062764, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.2578125, + "step": 927, + "time_per_iteration": 3.88985538482666 + }, + { + "auxiliary_loss_clip": 0.01047593, + "auxiliary_loss_mlp": 0.01003839, + "balance_loss_clip": 0.99959511, + "balance_loss_mlp": 1.00975871, + "epoch": 0.055794378475875546, + "flos": 66499519662720.0, + "grad_norm": 0.7940120874990624, + "language_loss": 0.59834445, + "learning_rate": 3.969476843280333e-06, + "loss": 0.61885875, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.04248047, + "router_z_loss_mlp": 0.37890625, + "step": 928, + "time_per_iteration": 3.073519706726074 + }, + { + "auxiliary_loss_clip": 0.01161437, + "auxiliary_loss_mlp": 0.01059613, + "balance_loss_clip": 1.02945352, + "balance_loss_mlp": 1.04066682, + "epoch": 0.05585450172854351, + "flos": 25336469122560.0, + "grad_norm": 2.741727701465678, + "language_loss": 0.94735438, + "learning_rate": 3.969411051593424e-06, + "loss": 0.96956486, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.203125, + "step": 929, + "time_per_iteration": 2.4511215686798096 + }, + { + "auxiliary_loss_clip": 0.01159328, + "auxiliary_loss_mlp": 0.01056288, + "balance_loss_clip": 1.02419722, + "balance_loss_mlp": 1.03711009, + "epoch": 0.05591462498121148, + "flos": 33910334743680.0, + "grad_norm": 2.0145312235320367, + "language_loss": 0.71520591, + "learning_rate": 3.9693451896233e-06, + "loss": 0.73736215, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.21875, + "step": 930, + "time_per_iteration": 2.519730567932129 + }, + { + "auxiliary_loss_clip": 0.01162948, + "auxiliary_loss_mlp": 0.01054579, + "balance_loss_clip": 1.02428746, + "balance_loss_mlp": 1.04021239, + "epoch": 0.055974748233879455, + "flos": 17929924542720.0, + "grad_norm": 7.092155019963012, + "language_loss": 0.84803557, + "learning_rate": 3.969279257372313e-06, + "loss": 0.87021089, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.2265625, + "step": 931, + "time_per_iteration": 2.387993097305298 + }, + { + "auxiliary_loss_clip": 0.01163628, + "auxiliary_loss_mlp": 0.01061806, + "balance_loss_clip": 1.03116918, + "balance_loss_mlp": 1.03927064, + "epoch": 0.05603487148654742, + "flos": 24020906981760.0, + "grad_norm": 1.7785238580422558, + "language_loss": 0.83289844, + "learning_rate": 3.969213254842814e-06, + "loss": 0.85515279, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.25, + "step": 932, + "time_per_iteration": 2.460404634475708 + }, + { + "auxiliary_loss_clip": 0.01163613, + "auxiliary_loss_mlp": 0.01056253, + "balance_loss_clip": 1.02413821, + "balance_loss_mlp": 1.04108763, + "epoch": 0.05609499473921539, + "flos": 17306899597440.0, + "grad_norm": 2.233640975121556, + "language_loss": 0.73572028, + "learning_rate": 3.9691471820371594e-06, + "loss": 0.75791895, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.2265625, + "step": 933, + "time_per_iteration": 2.430478096008301 + }, + { + "auxiliary_loss_clip": 0.0115802, + "auxiliary_loss_mlp": 0.01060328, + "balance_loss_clip": 1.02950001, + "balance_loss_mlp": 1.03795409, + "epoch": 0.05615511799188336, + "flos": 20993727703680.0, + "grad_norm": 2.738979518074269, + "language_loss": 0.86471808, + "learning_rate": 3.969081038957708e-06, + "loss": 0.88690156, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.203125, + "step": 934, + "time_per_iteration": 2.3857738971710205 + }, + { + "auxiliary_loss_clip": 0.01153999, + "auxiliary_loss_mlp": 0.01058784, + "balance_loss_clip": 1.03041255, + "balance_loss_mlp": 1.03940582, + "epoch": 0.05621524124455133, + "flos": 17272614775680.0, + "grad_norm": 2.1381230663333164, + "language_loss": 0.80012619, + "learning_rate": 3.969014825606819e-06, + "loss": 0.82225406, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.1484375, + "step": 935, + "time_per_iteration": 2.3873655796051025 + }, + { + "auxiliary_loss_clip": 0.01043219, + "auxiliary_loss_mlp": 0.01005184, + "balance_loss_clip": 1.00051081, + "balance_loss_mlp": 1.00608182, + "epoch": 0.0562753644972193, + "flos": 58716332668800.0, + "grad_norm": 0.8227501343542768, + "language_loss": 0.69188774, + "learning_rate": 3.968948541986855e-06, + "loss": 0.71237177, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.04663086, + "router_z_loss_mlp": 0.37109375, + "step": 936, + "time_per_iteration": 2.932800531387329 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.01053648, + "balance_loss_clip": 1.02417982, + "balance_loss_mlp": 1.03832948, + "epoch": 0.05633548774988727, + "flos": 17456083303680.0, + "grad_norm": 2.9910462478789834, + "language_loss": 0.75406981, + "learning_rate": 3.968882188100183e-06, + "loss": 0.77619517, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.203125, + "step": 937, + "time_per_iteration": 2.412381410598755 + }, + { + "auxiliary_loss_clip": 0.01042286, + "auxiliary_loss_mlp": 0.01003793, + "balance_loss_clip": 0.99935871, + "balance_loss_mlp": 1.00569856, + "epoch": 0.05639561100255524, + "flos": 70651426256640.0, + "grad_norm": 0.8602694962622135, + "language_loss": 0.64379501, + "learning_rate": 3.9688157639491704e-06, + "loss": 0.66425586, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.04443359, + "router_z_loss_mlp": 0.3671875, + "step": 938, + "time_per_iteration": 3.023224353790283 + }, + { + "auxiliary_loss_clip": 0.01166904, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.02677608, + "balance_loss_mlp": 1.03905725, + "epoch": 0.056455734255223204, + "flos": 20484938327040.0, + "grad_norm": 2.812712479682215, + "language_loss": 0.79116201, + "learning_rate": 3.968749269536188e-06, + "loss": 0.81341136, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.28125, + "step": 939, + "time_per_iteration": 2.4041600227355957 + }, + { + "auxiliary_loss_clip": 0.01158136, + "auxiliary_loss_mlp": 0.01054269, + "balance_loss_clip": 1.02573061, + "balance_loss_mlp": 1.03815985, + "epoch": 0.056515857507891176, + "flos": 22052503728000.0, + "grad_norm": 1.798056398583246, + "language_loss": 0.73791158, + "learning_rate": 3.9686827048636074e-06, + "loss": 0.76003563, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.203125, + "step": 940, + "time_per_iteration": 2.4448323249816895 + }, + { + "auxiliary_loss_clip": 0.01159927, + "auxiliary_loss_mlp": 0.01061686, + "balance_loss_clip": 1.03264642, + "balance_loss_mlp": 1.04005516, + "epoch": 0.05657598076055915, + "flos": 24024153738240.0, + "grad_norm": 1.873028528359329, + "language_loss": 0.70337206, + "learning_rate": 3.968616069933806e-06, + "loss": 0.7255882, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.203125, + "step": 941, + "time_per_iteration": 2.443727731704712 + }, + { + "auxiliary_loss_clip": 0.01156185, + "auxiliary_loss_mlp": 0.01053917, + "balance_loss_clip": 1.02378058, + "balance_loss_mlp": 1.03895378, + "epoch": 0.05663610401322711, + "flos": 20479701623040.0, + "grad_norm": 1.8997356430322379, + "language_loss": 0.806705, + "learning_rate": 3.96854936474916e-06, + "loss": 0.82880604, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.171875, + "step": 942, + "time_per_iteration": 2.40793514251709 + }, + { + "auxiliary_loss_clip": 0.01156782, + "auxiliary_loss_mlp": 0.01056196, + "balance_loss_clip": 1.02615571, + "balance_loss_mlp": 1.03941846, + "epoch": 0.056696227265895086, + "flos": 21067987898880.0, + "grad_norm": 2.2031773806584423, + "language_loss": 0.880005, + "learning_rate": 3.968482589312052e-06, + "loss": 0.90213478, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.171875, + "step": 943, + "time_per_iteration": 2.4251973628997803 + }, + { + "auxiliary_loss_clip": 0.01160592, + "auxiliary_loss_mlp": 0.01055373, + "balance_loss_clip": 1.02628577, + "balance_loss_mlp": 1.04070008, + "epoch": 0.05675635051856306, + "flos": 17820367096320.0, + "grad_norm": 2.2903411620696725, + "language_loss": 0.74629074, + "learning_rate": 3.968415743624863e-06, + "loss": 0.76845038, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.1953125, + "step": 944, + "time_per_iteration": 2.4026761054992676 + }, + { + "auxiliary_loss_clip": 0.01155027, + "auxiliary_loss_mlp": 0.01055523, + "balance_loss_clip": 1.02712703, + "balance_loss_mlp": 1.03648019, + "epoch": 0.05681647377123102, + "flos": 23113758458880.0, + "grad_norm": 1.5914147172454032, + "language_loss": 0.79131436, + "learning_rate": 3.9683488276899794e-06, + "loss": 0.81341994, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.1875, + "step": 945, + "time_per_iteration": 2.5026559829711914 + }, + { + "auxiliary_loss_clip": 0.01158836, + "auxiliary_loss_mlp": 0.01053737, + "balance_loss_clip": 1.02420902, + "balance_loss_mlp": 1.03774977, + "epoch": 0.056876597023898995, + "flos": 16069612458240.0, + "grad_norm": 2.2531418065513664, + "language_loss": 0.82614088, + "learning_rate": 3.96828184150979e-06, + "loss": 0.8482666, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.2109375, + "step": 946, + "time_per_iteration": 2.393521785736084 + }, + { + "auxiliary_loss_clip": 0.01164953, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_clip": 1.0244925, + "balance_loss_mlp": 1.04142892, + "epoch": 0.05693672027656696, + "flos": 16834734103680.0, + "grad_norm": 1.9265319793675204, + "language_loss": 0.79115474, + "learning_rate": 3.968214785086684e-06, + "loss": 0.81336749, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.234375, + "step": 947, + "time_per_iteration": 2.4404330253601074 + }, + { + "auxiliary_loss_clip": 0.01163781, + "auxiliary_loss_mlp": 0.01061873, + "balance_loss_clip": 1.0308311, + "balance_loss_mlp": 1.04187322, + "epoch": 0.05699684352923493, + "flos": 21388281511680.0, + "grad_norm": 3.7219270590303255, + "language_loss": 0.8536315, + "learning_rate": 3.968147658423056e-06, + "loss": 0.87588805, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.21875, + "step": 948, + "time_per_iteration": 2.3871397972106934 + }, + { + "auxiliary_loss_clip": 0.01161505, + "auxiliary_loss_mlp": 0.01060613, + "balance_loss_clip": 1.0270915, + "balance_loss_mlp": 1.04082263, + "epoch": 0.057056966781902904, + "flos": 15559391715840.0, + "grad_norm": 1.8594148227814742, + "language_loss": 0.87232089, + "learning_rate": 3.9680804615213e-06, + "loss": 0.8945421, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.203125, + "step": 949, + "time_per_iteration": 2.4071102142333984 + }, + { + "auxiliary_loss_clip": 0.01155606, + "auxiliary_loss_mlp": 0.01054039, + "balance_loss_clip": 1.02651358, + "balance_loss_mlp": 1.03915536, + "epoch": 0.05711709003457087, + "flos": 19936836892800.0, + "grad_norm": 2.011604601070385, + "language_loss": 0.78427905, + "learning_rate": 3.968013194383815e-06, + "loss": 0.8063755, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.1640625, + "step": 950, + "time_per_iteration": 2.3976221084594727 + }, + { + "auxiliary_loss_clip": 0.0116531, + "auxiliary_loss_mlp": 0.01060047, + "balance_loss_clip": 1.03012562, + "balance_loss_mlp": 1.04249465, + "epoch": 0.05717721328723884, + "flos": 30331493072640.0, + "grad_norm": 2.233275216295547, + "language_loss": 0.82126546, + "learning_rate": 3.967945857013002e-06, + "loss": 0.84351903, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.2265625, + "step": 951, + "time_per_iteration": 2.5157175064086914 + }, + { + "auxiliary_loss_clip": 0.01157948, + "auxiliary_loss_mlp": 0.0106173, + "balance_loss_clip": 1.03192782, + "balance_loss_mlp": 1.03772712, + "epoch": 0.05723733653990681, + "flos": 23653376432640.0, + "grad_norm": 2.473968384898655, + "language_loss": 0.86654651, + "learning_rate": 3.967878449411263e-06, + "loss": 0.88874328, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.203125, + "step": 952, + "time_per_iteration": 2.41511607170105 + }, + { + "auxiliary_loss_clip": 0.0116263, + "auxiliary_loss_mlp": 0.01060063, + "balance_loss_clip": 1.02770972, + "balance_loss_mlp": 1.03811193, + "epoch": 0.05729745979257478, + "flos": 22054633320960.0, + "grad_norm": 1.921396913652021, + "language_loss": 0.79379117, + "learning_rate": 3.967810971581004e-06, + "loss": 0.81601816, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.2421875, + "step": 953, + "time_per_iteration": 2.4393908977508545 + }, + { + "auxiliary_loss_clip": 0.01162789, + "auxiliary_loss_mlp": 0.01057679, + "balance_loss_clip": 1.0272572, + "balance_loss_mlp": 1.04243231, + "epoch": 0.05735758304524275, + "flos": 19603486431360.0, + "grad_norm": 2.1109943663128177, + "language_loss": 0.86476898, + "learning_rate": 3.967743423524633e-06, + "loss": 0.88697374, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.203125, + "step": 954, + "time_per_iteration": 2.4184181690216064 + }, + { + "auxiliary_loss_clip": 0.01162818, + "auxiliary_loss_mlp": 0.01056128, + "balance_loss_clip": 1.02434659, + "balance_loss_mlp": 1.040133, + "epoch": 0.057417706297910716, + "flos": 19098013633920.0, + "grad_norm": 2.4544342404404973, + "language_loss": 0.87799019, + "learning_rate": 3.967675805244562e-06, + "loss": 0.90017962, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.2265625, + "step": 955, + "time_per_iteration": 2.424464225769043 + }, + { + "auxiliary_loss_clip": 0.01158728, + "auxiliary_loss_mlp": 0.01057114, + "balance_loss_clip": 1.0270493, + "balance_loss_mlp": 1.03928959, + "epoch": 0.05747782955057869, + "flos": 16653569725440.0, + "grad_norm": 2.2318971564074923, + "language_loss": 0.89087892, + "learning_rate": 3.967608116743202e-06, + "loss": 0.9130373, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.1953125, + "step": 956, + "time_per_iteration": 2.369446039199829 + }, + { + "auxiliary_loss_clip": 0.01159596, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.02831984, + "balance_loss_mlp": 1.0415616, + "epoch": 0.05753795280324665, + "flos": 14501174273280.0, + "grad_norm": 2.8091922156104077, + "language_loss": 0.75586867, + "learning_rate": 3.96754035802297e-06, + "loss": 0.7780264, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.1796875, + "step": 957, + "time_per_iteration": 2.4029459953308105 + }, + { + "auxiliary_loss_clip": 0.01164806, + "auxiliary_loss_mlp": 0.01067315, + "balance_loss_clip": 1.03374577, + "balance_loss_mlp": 1.04087114, + "epoch": 0.057598076055914625, + "flos": 18075372733440.0, + "grad_norm": 2.0366571588287563, + "language_loss": 0.79876363, + "learning_rate": 3.967472529086284e-06, + "loss": 0.82108486, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.234375, + "step": 958, + "time_per_iteration": 2.418729066848755 + }, + { + "auxiliary_loss_clip": 0.01158242, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_clip": 1.02119946, + "balance_loss_mlp": 1.03792882, + "epoch": 0.0576581993085826, + "flos": 22123586989440.0, + "grad_norm": 2.6994391636694663, + "language_loss": 0.88083041, + "learning_rate": 3.967404629935564e-06, + "loss": 0.90290332, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.203125, + "step": 959, + "time_per_iteration": 2.4110212326049805 + }, + { + "auxiliary_loss_clip": 0.01153443, + "auxiliary_loss_mlp": 0.01049098, + "balance_loss_clip": 1.01970053, + "balance_loss_mlp": 1.03854108, + "epoch": 0.05771832256125056, + "flos": 33180370704000.0, + "grad_norm": 10.099762558081204, + "language_loss": 0.7851907, + "learning_rate": 3.9673366605732335e-06, + "loss": 0.80721611, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.1484375, + "step": 960, + "time_per_iteration": 2.5175628662109375 + }, + { + "auxiliary_loss_clip": 0.01158291, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.0229001, + "balance_loss_mlp": 1.03849053, + "epoch": 0.057778445813918534, + "flos": 24169008435840.0, + "grad_norm": 2.056220829596911, + "language_loss": 0.93077898, + "learning_rate": 3.967268621001718e-06, + "loss": 0.95288843, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.1953125, + "step": 961, + "time_per_iteration": 3.873457193374634 + }, + { + "auxiliary_loss_clip": 0.01158597, + "auxiliary_loss_mlp": 0.01059004, + "balance_loss_clip": 1.02731872, + "balance_loss_mlp": 1.03795218, + "epoch": 0.0578385690665865, + "flos": 29641748785920.0, + "grad_norm": 2.839739402560139, + "language_loss": 0.68123364, + "learning_rate": 3.967200511223446e-06, + "loss": 0.70340973, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.2109375, + "step": 962, + "time_per_iteration": 2.462498664855957 + }, + { + "auxiliary_loss_clip": 0.01158783, + "auxiliary_loss_mlp": 0.01056919, + "balance_loss_clip": 1.02739072, + "balance_loss_mlp": 1.04080355, + "epoch": 0.05789869231925447, + "flos": 20884414636800.0, + "grad_norm": 2.6681272796667352, + "language_loss": 0.88147473, + "learning_rate": 3.967132331240848e-06, + "loss": 0.90363169, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.1796875, + "step": 963, + "time_per_iteration": 2.4187171459198 + }, + { + "auxiliary_loss_clip": 0.01162174, + "auxiliary_loss_mlp": 0.01047752, + "balance_loss_clip": 1.01761627, + "balance_loss_mlp": 1.04180944, + "epoch": 0.057958815571922444, + "flos": 26029914013440.0, + "grad_norm": 2.1959636755272665, + "language_loss": 0.8503716, + "learning_rate": 3.9670640810563575e-06, + "loss": 0.87247086, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.203125, + "step": 964, + "time_per_iteration": 3.964810609817505 + }, + { + "auxiliary_loss_clip": 0.01157798, + "auxiliary_loss_mlp": 0.01061291, + "balance_loss_clip": 1.03020108, + "balance_loss_mlp": 1.04076898, + "epoch": 0.05801893882459041, + "flos": 18076699365120.0, + "grad_norm": 2.6028477809575405, + "language_loss": 0.77876091, + "learning_rate": 3.96699576067241e-06, + "loss": 0.80095172, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.171875, + "step": 965, + "time_per_iteration": 3.8165664672851562 + }, + { + "auxiliary_loss_clip": 0.0115381, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_clip": 1.02453136, + "balance_loss_mlp": 1.03798401, + "epoch": 0.05807906207725838, + "flos": 17747922291840.0, + "grad_norm": 2.042103237634218, + "language_loss": 0.81013924, + "learning_rate": 3.966927370091442e-06, + "loss": 0.83219647, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.15625, + "step": 966, + "time_per_iteration": 2.4498939514160156 + }, + { + "auxiliary_loss_clip": 0.01156764, + "auxiliary_loss_mlp": 0.01054726, + "balance_loss_clip": 1.0254128, + "balance_loss_mlp": 1.03877592, + "epoch": 0.058139185329926346, + "flos": 18039412166400.0, + "grad_norm": 1.9158014889501103, + "language_loss": 0.76398164, + "learning_rate": 3.9668589093158975e-06, + "loss": 0.78609657, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.1796875, + "step": 967, + "time_per_iteration": 3.8259549140930176 + }, + { + "auxiliary_loss_clip": 0.01040902, + "auxiliary_loss_mlp": 0.01004826, + "balance_loss_clip": 1.00084472, + "balance_loss_mlp": 1.00524974, + "epoch": 0.05819930858259432, + "flos": 62360287758720.0, + "grad_norm": 1.1378855262618766, + "language_loss": 0.57294559, + "learning_rate": 3.966790378348217e-06, + "loss": 0.59340286, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.35546875, + "step": 968, + "time_per_iteration": 2.9690871238708496 + }, + { + "auxiliary_loss_clip": 0.01163325, + "auxiliary_loss_mlp": 0.01056222, + "balance_loss_clip": 1.0261575, + "balance_loss_mlp": 1.04513001, + "epoch": 0.05825943183526229, + "flos": 19134358225920.0, + "grad_norm": 1.995117496071628, + "language_loss": 0.82372129, + "learning_rate": 3.966721777190847e-06, + "loss": 0.84591675, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.1796875, + "step": 969, + "time_per_iteration": 2.413395643234253 + }, + { + "auxiliary_loss_clip": 0.01158774, + "auxiliary_loss_mlp": 0.0105649, + "balance_loss_clip": 1.02590108, + "balance_loss_mlp": 1.03951955, + "epoch": 0.058319555087930255, + "flos": 29021202547200.0, + "grad_norm": 2.6883084318848938, + "language_loss": 0.78030252, + "learning_rate": 3.966653105846237e-06, + "loss": 0.80245519, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.1953125, + "step": 970, + "time_per_iteration": 2.459301710128784 + }, + { + "auxiliary_loss_clip": 0.0116011, + "auxiliary_loss_mlp": 0.01059005, + "balance_loss_clip": 1.02681899, + "balance_loss_mlp": 1.03973424, + "epoch": 0.05837967834059823, + "flos": 18879003475200.0, + "grad_norm": 2.4646959244174207, + "language_loss": 0.92010236, + "learning_rate": 3.966584364316835e-06, + "loss": 0.94229347, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.203125, + "step": 971, + "time_per_iteration": 2.41672945022583 + }, + { + "auxiliary_loss_clip": 0.01153373, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_clip": 1.01958954, + "balance_loss_mlp": 1.03688502, + "epoch": 0.05843980159326619, + "flos": 25701870078720.0, + "grad_norm": 1.82764836766903, + "language_loss": 0.85977405, + "learning_rate": 3.966515552605096e-06, + "loss": 0.88177538, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.1640625, + "step": 972, + "time_per_iteration": 2.4400033950805664 + }, + { + "auxiliary_loss_clip": 0.01156008, + "auxiliary_loss_mlp": 0.01055734, + "balance_loss_clip": 1.02808905, + "balance_loss_mlp": 1.03966117, + "epoch": 0.058499924845934165, + "flos": 25551080449920.0, + "grad_norm": 2.605468634185153, + "language_loss": 0.87558317, + "learning_rate": 3.966446670713476e-06, + "loss": 0.89770055, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.1640625, + "step": 973, + "time_per_iteration": 2.4687106609344482 + }, + { + "auxiliary_loss_clip": 0.01156814, + "auxiliary_loss_mlp": 0.01056868, + "balance_loss_clip": 1.02561092, + "balance_loss_mlp": 1.03824723, + "epoch": 0.05856004809860214, + "flos": 16435222882560.0, + "grad_norm": 2.345147396479656, + "language_loss": 0.74578172, + "learning_rate": 3.9663777186444325e-06, + "loss": 0.76791859, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.1875, + "step": 974, + "time_per_iteration": 2.4109954833984375 + }, + { + "auxiliary_loss_clip": 0.01153303, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_clip": 1.02710593, + "balance_loss_mlp": 1.03830385, + "epoch": 0.0586201713512701, + "flos": 39457230554880.0, + "grad_norm": 2.0401418582323214, + "language_loss": 0.76616645, + "learning_rate": 3.966308696400426e-06, + "loss": 0.78826332, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.1484375, + "step": 975, + "time_per_iteration": 2.5860371589660645 + }, + { + "auxiliary_loss_clip": 0.01156982, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.02589536, + "balance_loss_mlp": 1.03819263, + "epoch": 0.058680294603938074, + "flos": 23364120885120.0, + "grad_norm": 2.3569782231287117, + "language_loss": 0.76396739, + "learning_rate": 3.96623960398392e-06, + "loss": 0.78609711, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.1875, + "step": 976, + "time_per_iteration": 2.426626443862915 + }, + { + "auxiliary_loss_clip": 0.01154381, + "auxiliary_loss_mlp": 0.01055215, + "balance_loss_clip": 1.02517378, + "balance_loss_mlp": 1.0368669, + "epoch": 0.05874041785660604, + "flos": 32230698278400.0, + "grad_norm": 1.9095241002161987, + "language_loss": 0.8741101, + "learning_rate": 3.9661704413973805e-06, + "loss": 0.89620602, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.171875, + "step": 977, + "time_per_iteration": 2.501909017562866 + }, + { + "auxiliary_loss_clip": 0.01155156, + "auxiliary_loss_mlp": 0.01054398, + "balance_loss_clip": 1.02616882, + "balance_loss_mlp": 1.03929973, + "epoch": 0.05880054110927401, + "flos": 22308940730880.0, + "grad_norm": 1.84656154429845, + "language_loss": 0.79312801, + "learning_rate": 3.966101208643276e-06, + "loss": 0.81522357, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.15625, + "step": 978, + "time_per_iteration": 2.425081729888916 + }, + { + "auxiliary_loss_clip": 0.01160624, + "auxiliary_loss_mlp": 0.01057484, + "balance_loss_clip": 1.02701402, + "balance_loss_mlp": 1.03972781, + "epoch": 0.05886066436194198, + "flos": 27379237305600.0, + "grad_norm": 2.6661522830253337, + "language_loss": 0.80714297, + "learning_rate": 3.966031905724076e-06, + "loss": 0.82932401, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.203125, + "step": 979, + "time_per_iteration": 2.495361328125 + }, + { + "auxiliary_loss_clip": 0.01042219, + "auxiliary_loss_mlp": 0.01005382, + "balance_loss_clip": 1.00101852, + "balance_loss_mlp": 1.00593567, + "epoch": 0.05892078761460995, + "flos": 59581725338880.0, + "grad_norm": 0.9253505532266535, + "language_loss": 0.59051669, + "learning_rate": 3.965962532642255e-06, + "loss": 0.61099267, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.04370117, + "router_z_loss_mlp": 0.36328125, + "step": 980, + "time_per_iteration": 2.9525704383850098 + }, + { + "auxiliary_loss_clip": 0.01152762, + "auxiliary_loss_mlp": 0.01056813, + "balance_loss_clip": 1.02776158, + "balance_loss_mlp": 1.0376246, + "epoch": 0.05898091086727792, + "flos": 15413175475200.0, + "grad_norm": 2.060047658786371, + "language_loss": 0.86316341, + "learning_rate": 3.9658930894002885e-06, + "loss": 0.88525915, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.1484375, + "step": 981, + "time_per_iteration": 2.410400152206421 + }, + { + "auxiliary_loss_clip": 0.01156705, + "auxiliary_loss_mlp": 0.01054771, + "balance_loss_clip": 1.02780557, + "balance_loss_mlp": 1.04009056, + "epoch": 0.059041034119945886, + "flos": 23654319039360.0, + "grad_norm": 2.1100833836001347, + "language_loss": 0.79749936, + "learning_rate": 3.965823576000653e-06, + "loss": 0.81961417, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.171875, + "step": 982, + "time_per_iteration": 2.4408376216888428 + }, + { + "auxiliary_loss_clip": 0.01163271, + "auxiliary_loss_mlp": 0.01055369, + "balance_loss_clip": 1.02616262, + "balance_loss_mlp": 1.04094839, + "epoch": 0.05910115737261386, + "flos": 24752930791680.0, + "grad_norm": 2.1948144071786544, + "language_loss": 0.84188688, + "learning_rate": 3.965753992445833e-06, + "loss": 0.86407328, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.2265625, + "step": 983, + "time_per_iteration": 2.453981637954712 + }, + { + "auxiliary_loss_clip": 0.01157224, + "auxiliary_loss_mlp": 0.01063252, + "balance_loss_clip": 1.03204322, + "balance_loss_mlp": 1.04023576, + "epoch": 0.05916128062528183, + "flos": 11727953291520.0, + "grad_norm": 1.9029363631781626, + "language_loss": 0.84873164, + "learning_rate": 3.9656843387383075e-06, + "loss": 0.87093639, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.171875, + "step": 984, + "time_per_iteration": 2.387319564819336 + }, + { + "auxiliary_loss_clip": 0.01152499, + "auxiliary_loss_mlp": 0.01055111, + "balance_loss_clip": 1.02781165, + "balance_loss_mlp": 1.04044604, + "epoch": 0.059221403877949795, + "flos": 21902063212800.0, + "grad_norm": 2.5293821744566185, + "language_loss": 0.77352715, + "learning_rate": 3.965614614880566e-06, + "loss": 0.79560328, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.125, + "step": 985, + "time_per_iteration": 2.4488773345947266 + }, + { + "auxiliary_loss_clip": 0.01159218, + "auxiliary_loss_mlp": 0.01057503, + "balance_loss_clip": 1.02842832, + "balance_loss_mlp": 1.04079485, + "epoch": 0.05928152713061777, + "flos": 20513742065280.0, + "grad_norm": 2.9893172371468024, + "language_loss": 0.90492582, + "learning_rate": 3.965544820875094e-06, + "loss": 0.92709303, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.1875, + "step": 986, + "time_per_iteration": 2.443754196166992 + }, + { + "auxiliary_loss_clip": 0.0115854, + "auxiliary_loss_mlp": 0.0106082, + "balance_loss_clip": 1.02829957, + "balance_loss_mlp": 1.03743863, + "epoch": 0.05934165038328574, + "flos": 24494084904960.0, + "grad_norm": 1.9787948727060412, + "language_loss": 0.75887883, + "learning_rate": 3.965474956724383e-06, + "loss": 0.78107238, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.2109375, + "step": 987, + "time_per_iteration": 2.464308977127075 + }, + { + "auxiliary_loss_clip": 0.01158062, + "auxiliary_loss_mlp": 0.0105224, + "balance_loss_clip": 1.02315331, + "balance_loss_mlp": 1.0372479, + "epoch": 0.059401773635953704, + "flos": 38726498465280.0, + "grad_norm": 2.1718172609657764, + "language_loss": 0.71649158, + "learning_rate": 3.965405022430928e-06, + "loss": 0.73859465, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.2109375, + "step": 988, + "time_per_iteration": 2.544602870941162 + }, + { + "auxiliary_loss_clip": 0.01040901, + "auxiliary_loss_mlp": 0.01004285, + "balance_loss_clip": 0.9999463, + "balance_loss_mlp": 1.00449657, + "epoch": 0.059461896888621676, + "flos": 58020618539520.0, + "grad_norm": 0.9235199678427388, + "language_loss": 0.71133971, + "learning_rate": 3.965335017997222e-06, + "loss": 0.73179162, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.04345703, + "router_z_loss_mlp": 0.36328125, + "step": 989, + "time_per_iteration": 2.8911666870117188 + }, + { + "auxiliary_loss_clip": 0.01161321, + "auxiliary_loss_mlp": 0.01062306, + "balance_loss_clip": 1.02914214, + "balance_loss_mlp": 1.03849137, + "epoch": 0.05952202014128964, + "flos": 22126659189120.0, + "grad_norm": 1.9788870334050774, + "language_loss": 0.77683198, + "learning_rate": 3.965264943425766e-06, + "loss": 0.79906827, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 1.2265625, + "step": 990, + "time_per_iteration": 2.424315929412842 + }, + { + "auxiliary_loss_clip": 0.01153705, + "auxiliary_loss_mlp": 0.01049102, + "balance_loss_clip": 1.01887035, + "balance_loss_mlp": 1.0374701, + "epoch": 0.059582143393957614, + "flos": 20444823308160.0, + "grad_norm": 2.493270788674315, + "language_loss": 0.85957623, + "learning_rate": 3.965194798719059e-06, + "loss": 0.88160431, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.1640625, + "step": 991, + "time_per_iteration": 2.4135589599609375 + }, + { + "auxiliary_loss_clip": 0.01158141, + "auxiliary_loss_mlp": 0.01059113, + "balance_loss_clip": 1.02890551, + "balance_loss_mlp": 1.03746819, + "epoch": 0.059642266646625586, + "flos": 20593832457600.0, + "grad_norm": 2.002948820668704, + "language_loss": 0.76866162, + "learning_rate": 3.965124583879604e-06, + "loss": 0.79083419, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.2109375, + "step": 992, + "time_per_iteration": 2.4235281944274902 + }, + { + "auxiliary_loss_clip": 0.01162354, + "auxiliary_loss_mlp": 0.01059023, + "balance_loss_clip": 1.03100932, + "balance_loss_mlp": 1.04212487, + "epoch": 0.05970238989929355, + "flos": 19351692639360.0, + "grad_norm": 2.425489978258854, + "language_loss": 0.74587756, + "learning_rate": 3.965054298909908e-06, + "loss": 0.76809132, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.203125, + "step": 993, + "time_per_iteration": 2.4192397594451904 + }, + { + "auxiliary_loss_clip": 0.01157142, + "auxiliary_loss_mlp": 0.01057147, + "balance_loss_clip": 1.02805996, + "balance_loss_mlp": 1.04095399, + "epoch": 0.05976251315196152, + "flos": 30262713960960.0, + "grad_norm": 3.039263163999806, + "language_loss": 0.79152131, + "learning_rate": 3.964983943812479e-06, + "loss": 0.8136642, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.15625, + "step": 994, + "time_per_iteration": 2.4724812507629395 + }, + { + "auxiliary_loss_clip": 0.01154745, + "auxiliary_loss_mlp": 0.0106191, + "balance_loss_clip": 1.032179, + "balance_loss_mlp": 1.03938246, + "epoch": 0.05982263640462949, + "flos": 23184038759040.0, + "grad_norm": 2.871359616757743, + "language_loss": 0.8020556, + "learning_rate": 3.964913518589827e-06, + "loss": 0.82422209, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.15625, + "step": 995, + "time_per_iteration": 2.4379522800445557 + }, + { + "auxiliary_loss_clip": 0.01155303, + "auxiliary_loss_mlp": 0.01063218, + "balance_loss_clip": 1.03384519, + "balance_loss_mlp": 1.03777981, + "epoch": 0.05988275965729746, + "flos": 27849761965440.0, + "grad_norm": 2.2098995933085894, + "language_loss": 0.8701334, + "learning_rate": 3.964843023244466e-06, + "loss": 0.89231861, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.171875, + "step": 996, + "time_per_iteration": 2.4437999725341797 + }, + { + "auxiliary_loss_clip": 0.01159118, + "auxiliary_loss_mlp": 0.01064769, + "balance_loss_clip": 1.03220057, + "balance_loss_mlp": 1.04127562, + "epoch": 0.05994288290996543, + "flos": 24678880064640.0, + "grad_norm": 3.8608888713812597, + "language_loss": 0.88007629, + "learning_rate": 3.964772457778912e-06, + "loss": 0.90231526, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.1796875, + "step": 997, + "time_per_iteration": 2.443021297454834 + }, + { + "auxiliary_loss_clip": 0.01038178, + "auxiliary_loss_mlp": 0.01003311, + "balance_loss_clip": 0.99923432, + "balance_loss_mlp": 1.003317, + "epoch": 0.0600030061626334, + "flos": 69925965782400.0, + "grad_norm": 1.0099586250694919, + "language_loss": 0.75391841, + "learning_rate": 3.964701822195683e-06, + "loss": 0.7743333, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.34765625, + "step": 998, + "time_per_iteration": 3.1126739978790283 + }, + { + "auxiliary_loss_clip": 0.01157325, + "auxiliary_loss_mlp": 0.01061226, + "balance_loss_clip": 1.03156662, + "balance_loss_mlp": 1.0397613, + "epoch": 0.06006312941530137, + "flos": 26538982680960.0, + "grad_norm": 2.049004464099992, + "language_loss": 0.75884998, + "learning_rate": 3.9646311164973e-06, + "loss": 0.78103548, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.171875, + "step": 999, + "time_per_iteration": 2.4973855018615723 + }, + { + "auxiliary_loss_clip": 0.01157772, + "auxiliary_loss_mlp": 0.01056214, + "balance_loss_clip": 1.02531469, + "balance_loss_mlp": 1.03844833, + "epoch": 0.060123252667969335, + "flos": 27342787979520.0, + "grad_norm": 1.7610372518845114, + "language_loss": 0.82862902, + "learning_rate": 3.9645603406862846e-06, + "loss": 0.85076886, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.1953125, + "step": 1000, + "time_per_iteration": 2.4534127712249756 + }, + { + "auxiliary_loss_clip": 0.01156995, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.02845311, + "balance_loss_mlp": 1.03943276, + "epoch": 0.06018337592063731, + "flos": 27015477183360.0, + "grad_norm": 4.36013563541894, + "language_loss": 0.85889578, + "learning_rate": 3.964489494765166e-06, + "loss": 0.88104439, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.171875, + "step": 1001, + "time_per_iteration": 3.905280113220215 + }, + { + "auxiliary_loss_clip": 0.01154827, + "auxiliary_loss_mlp": 0.01052253, + "balance_loss_clip": 1.02483511, + "balance_loss_mlp": 1.04005575, + "epoch": 0.06024349917330528, + "flos": 25591788961920.0, + "grad_norm": 2.18419448899442, + "language_loss": 0.74045211, + "learning_rate": 3.96441857873647e-06, + "loss": 0.76252288, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.1484375, + "step": 1002, + "time_per_iteration": 2.4591424465179443 + }, + { + "auxiliary_loss_clip": 0.01155412, + "auxiliary_loss_mlp": 0.01053095, + "balance_loss_clip": 1.02345991, + "balance_loss_mlp": 1.03853083, + "epoch": 0.060303622425973244, + "flos": 26132279719680.0, + "grad_norm": 2.3045699377531497, + "language_loss": 0.75484115, + "learning_rate": 3.964347592602728e-06, + "loss": 0.77692622, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.171875, + "step": 1003, + "time_per_iteration": 2.4360148906707764 + }, + { + "auxiliary_loss_clip": 0.01162091, + "auxiliary_loss_mlp": 0.01052213, + "balance_loss_clip": 1.0206461, + "balance_loss_mlp": 1.04050243, + "epoch": 0.060363745678641216, + "flos": 20376114019200.0, + "grad_norm": 2.4424858968040235, + "language_loss": 0.69722307, + "learning_rate": 3.964276536366473e-06, + "loss": 0.71936619, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.21875, + "step": 1004, + "time_per_iteration": 5.30731987953186 + }, + { + "auxiliary_loss_clip": 0.01162351, + "auxiliary_loss_mlp": 0.01062383, + "balance_loss_clip": 1.03300953, + "balance_loss_mlp": 1.04186904, + "epoch": 0.06042386893130918, + "flos": 17748201582720.0, + "grad_norm": 2.2180426128467743, + "language_loss": 0.83568144, + "learning_rate": 3.964205410030241e-06, + "loss": 0.85792875, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.203125, + "step": 1005, + "time_per_iteration": 2.4257004261016846 + }, + { + "auxiliary_loss_clip": 0.01037841, + "auxiliary_loss_mlp": 0.01008964, + "balance_loss_clip": 1.00495863, + "balance_loss_mlp": 1.00299001, + "epoch": 0.06048399218397715, + "flos": 68535061194240.0, + "grad_norm": 0.9146307501256149, + "language_loss": 0.59042352, + "learning_rate": 3.964134213596571e-06, + "loss": 0.61089152, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 3.080986261367798 + }, + { + "auxiliary_loss_clip": 0.0115253, + "auxiliary_loss_mlp": 0.01054203, + "balance_loss_clip": 1.02444792, + "balance_loss_mlp": 1.03542399, + "epoch": 0.060544115436645125, + "flos": 23257391258880.0, + "grad_norm": 4.936143666476675, + "language_loss": 0.74330884, + "learning_rate": 3.964062947068003e-06, + "loss": 0.76537621, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.171875, + "step": 1007, + "time_per_iteration": 3.882143497467041 + }, + { + "auxiliary_loss_clip": 0.01155834, + "auxiliary_loss_mlp": 0.01052496, + "balance_loss_clip": 1.02348018, + "balance_loss_mlp": 1.03754771, + "epoch": 0.06060423868931309, + "flos": 23877309093120.0, + "grad_norm": 1.740738530070099, + "language_loss": 0.80621183, + "learning_rate": 3.9639916104470804e-06, + "loss": 0.82829511, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.1875, + "step": 1008, + "time_per_iteration": 2.4477415084838867 + }, + { + "auxiliary_loss_clip": 0.01158471, + "auxiliary_loss_mlp": 0.01053834, + "balance_loss_clip": 1.0252471, + "balance_loss_mlp": 1.04050589, + "epoch": 0.06066436194198106, + "flos": 18727236328320.0, + "grad_norm": 1.7879989603434563, + "language_loss": 0.77816951, + "learning_rate": 3.9639202037363494e-06, + "loss": 0.80029255, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.1796875, + "step": 1009, + "time_per_iteration": 2.418851613998413 + }, + { + "auxiliary_loss_clip": 0.01152606, + "auxiliary_loss_mlp": 0.01048527, + "balance_loss_clip": 1.02015519, + "balance_loss_mlp": 1.03950274, + "epoch": 0.06072448519464903, + "flos": 24639428361600.0, + "grad_norm": 1.7972262110622221, + "language_loss": 0.92497772, + "learning_rate": 3.9638487269383575e-06, + "loss": 0.94698906, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.1328125, + "step": 1010, + "time_per_iteration": 2.451969623565674 + }, + { + "auxiliary_loss_clip": 0.01159785, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.0270505, + "balance_loss_mlp": 1.03740323, + "epoch": 0.060784608447317, + "flos": 17378017770240.0, + "grad_norm": 3.6712956960038796, + "language_loss": 0.71411031, + "learning_rate": 3.9637771800556576e-06, + "loss": 0.73628759, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.2265625, + "step": 1011, + "time_per_iteration": 2.4284327030181885 + }, + { + "auxiliary_loss_clip": 0.01158641, + "auxiliary_loss_mlp": 0.01067488, + "balance_loss_clip": 1.03451467, + "balance_loss_mlp": 1.03772664, + "epoch": 0.06084473169998497, + "flos": 23691187301760.0, + "grad_norm": 2.2199777315452334, + "language_loss": 0.8743695, + "learning_rate": 3.963705563090801e-06, + "loss": 0.89663088, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.2109375, + "step": 1012, + "time_per_iteration": 2.440599203109741 + }, + { + "auxiliary_loss_clip": 0.01152822, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.02074802, + "balance_loss_mlp": 1.03609371, + "epoch": 0.06090485495265294, + "flos": 23545320174720.0, + "grad_norm": 2.3187281216043703, + "language_loss": 0.76561666, + "learning_rate": 3.963633876046344e-06, + "loss": 0.78764266, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.171875, + "step": 1013, + "time_per_iteration": 2.475726366043091 + }, + { + "auxiliary_loss_clip": 0.01157383, + "auxiliary_loss_mlp": 0.01060122, + "balance_loss_clip": 1.02834129, + "balance_loss_mlp": 1.03816497, + "epoch": 0.06096497820532091, + "flos": 20338268238720.0, + "grad_norm": 2.384126716604103, + "language_loss": 0.85482019, + "learning_rate": 3.963562118924844e-06, + "loss": 0.87699521, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.1953125, + "step": 1014, + "time_per_iteration": 2.417125701904297 + }, + { + "auxiliary_loss_clip": 0.01160361, + "auxiliary_loss_mlp": 0.01056803, + "balance_loss_clip": 1.02313781, + "balance_loss_mlp": 1.03910649, + "epoch": 0.061025101457988874, + "flos": 26937935320320.0, + "grad_norm": 2.246085630291627, + "language_loss": 0.73297465, + "learning_rate": 3.963490291728864e-06, + "loss": 0.75514627, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 1.2109375, + "step": 1015, + "time_per_iteration": 2.483424186706543 + }, + { + "auxiliary_loss_clip": 0.0115166, + "auxiliary_loss_mlp": 0.0105429, + "balance_loss_clip": 1.02441573, + "balance_loss_mlp": 1.03619528, + "epoch": 0.061085224710656846, + "flos": 25373861055360.0, + "grad_norm": 1.6809549189182948, + "language_loss": 0.7901845, + "learning_rate": 3.963418394460966e-06, + "loss": 0.812244, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.15625, + "step": 1016, + "time_per_iteration": 2.439598798751831 + }, + { + "auxiliary_loss_clip": 0.01157869, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.01918364, + "balance_loss_mlp": 1.03926146, + "epoch": 0.06114534796332482, + "flos": 24823664939520.0, + "grad_norm": 1.7343598991326854, + "language_loss": 0.75973874, + "learning_rate": 3.9633464271237166e-06, + "loss": 0.78179586, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.1875, + "step": 1017, + "time_per_iteration": 2.4395880699157715 + }, + { + "auxiliary_loss_clip": 0.0116119, + "auxiliary_loss_mlp": 0.01061209, + "balance_loss_clip": 1.03021479, + "balance_loss_mlp": 1.04041672, + "epoch": 0.061205471215992784, + "flos": 20630386517760.0, + "grad_norm": 2.478866834537974, + "language_loss": 0.85801131, + "learning_rate": 3.963274389719682e-06, + "loss": 0.88023531, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.203125, + "step": 1018, + "time_per_iteration": 2.4153647422790527 + }, + { + "auxiliary_loss_clip": 0.01155674, + "auxiliary_loss_mlp": 0.01058413, + "balance_loss_clip": 1.02789545, + "balance_loss_mlp": 1.03959513, + "epoch": 0.061265594468660756, + "flos": 16507423307520.0, + "grad_norm": 7.811235902239468, + "language_loss": 0.76732063, + "learning_rate": 3.963202282251436e-06, + "loss": 0.78946149, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.15625, + "step": 1019, + "time_per_iteration": 2.393094062805176 + }, + { + "auxiliary_loss_clip": 0.01156278, + "auxiliary_loss_mlp": 0.01053328, + "balance_loss_clip": 1.02164245, + "balance_loss_mlp": 1.03888106, + "epoch": 0.06132571772132872, + "flos": 26245118833920.0, + "grad_norm": 2.2044878885481083, + "language_loss": 0.84023499, + "learning_rate": 3.96313010472155e-06, + "loss": 0.86233103, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.171875, + "step": 1020, + "time_per_iteration": 2.4407336711883545 + }, + { + "auxiliary_loss_clip": 0.01159264, + "auxiliary_loss_mlp": 0.01055516, + "balance_loss_clip": 1.02580905, + "balance_loss_mlp": 1.04091072, + "epoch": 0.06138584097399669, + "flos": 37413275385600.0, + "grad_norm": 2.1251693510396987, + "language_loss": 0.79392493, + "learning_rate": 3.963057857132601e-06, + "loss": 0.8160727, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.1875, + "step": 1021, + "time_per_iteration": 2.55255126953125 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01057299, + "balance_loss_clip": 1.03082263, + "balance_loss_mlp": 1.03812289, + "epoch": 0.061445964226664665, + "flos": 17419703800320.0, + "grad_norm": 1.8709309273080235, + "language_loss": 0.87560797, + "learning_rate": 3.962985539487165e-06, + "loss": 0.89772391, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.15625, + "step": 1022, + "time_per_iteration": 2.377963066101074 + }, + { + "auxiliary_loss_clip": 0.01157659, + "auxiliary_loss_mlp": 0.01053407, + "balance_loss_clip": 1.02510691, + "balance_loss_mlp": 1.04006875, + "epoch": 0.06150608747933263, + "flos": 22598964328320.0, + "grad_norm": 3.5561262995454856, + "language_loss": 0.82924676, + "learning_rate": 3.962913151787826e-06, + "loss": 0.85135746, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.171875, + "step": 1023, + "time_per_iteration": 2.4481008052825928 + }, + { + "auxiliary_loss_clip": 0.01039811, + "auxiliary_loss_mlp": 0.01006344, + "balance_loss_clip": 1.00257671, + "balance_loss_mlp": 1.0057925, + "epoch": 0.0615662107320006, + "flos": 56738712816000.0, + "grad_norm": 0.892435407282451, + "language_loss": 0.65076607, + "learning_rate": 3.962840694037165e-06, + "loss": 0.67122757, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.03759766, + "router_z_loss_mlp": 0.33984375, + "step": 1024, + "time_per_iteration": 3.0637269020080566 + }, + { + "auxiliary_loss_clip": 0.011566, + "auxiliary_loss_mlp": 0.01058846, + "balance_loss_clip": 1.027637, + "balance_loss_mlp": 1.03874087, + "epoch": 0.06162633398466857, + "flos": 22563701988480.0, + "grad_norm": 2.121991078512927, + "language_loss": 0.88018882, + "learning_rate": 3.962768166237768e-06, + "loss": 0.90234327, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.1796875, + "step": 1025, + "time_per_iteration": 2.4291317462921143 + }, + { + "auxiliary_loss_clip": 0.01158191, + "auxiliary_loss_mlp": 0.0104929, + "balance_loss_clip": 1.02213371, + "balance_loss_mlp": 1.04083061, + "epoch": 0.06168645723733654, + "flos": 25591928607360.0, + "grad_norm": 1.944980768345882, + "language_loss": 0.84539229, + "learning_rate": 3.9626955683922264e-06, + "loss": 0.86746705, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.171875, + "step": 1026, + "time_per_iteration": 2.4521775245666504 + }, + { + "auxiliary_loss_clip": 0.01161698, + "auxiliary_loss_mlp": 0.01052541, + "balance_loss_clip": 1.02338207, + "balance_loss_mlp": 1.04109931, + "epoch": 0.06174658049000451, + "flos": 15996993096960.0, + "grad_norm": 2.258709835405207, + "language_loss": 0.82325631, + "learning_rate": 3.962622900503127e-06, + "loss": 0.84539866, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.203125, + "step": 1027, + "time_per_iteration": 2.4000582695007324 + }, + { + "auxiliary_loss_clip": 0.01152861, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.02383435, + "balance_loss_mlp": 1.0380646, + "epoch": 0.06180670374267248, + "flos": 11285324674560.0, + "grad_norm": 2.537469324710815, + "language_loss": 0.84134269, + "learning_rate": 3.962550162573065e-06, + "loss": 0.86338931, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.1484375, + "step": 1028, + "time_per_iteration": 2.358281373977661 + }, + { + "auxiliary_loss_clip": 0.01037697, + "auxiliary_loss_mlp": 0.01007238, + "balance_loss_clip": 1.00339997, + "balance_loss_mlp": 1.00318575, + "epoch": 0.06186682699534045, + "flos": 65127224695680.0, + "grad_norm": 0.9629596283558131, + "language_loss": 0.60529995, + "learning_rate": 3.962477354604636e-06, + "loss": 0.62574935, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.03833008, + "router_z_loss_mlp": 0.34375, + "step": 1029, + "time_per_iteration": 2.864759683609009 + }, + { + "auxiliary_loss_clip": 0.01150987, + "auxiliary_loss_mlp": 0.01053206, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.03651297, + "epoch": 0.061926950248008414, + "flos": 21104681604480.0, + "grad_norm": 4.849598134920486, + "language_loss": 0.82339936, + "learning_rate": 3.962404476600438e-06, + "loss": 0.84544134, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.140625, + "step": 1030, + "time_per_iteration": 2.394225597381592 + }, + { + "auxiliary_loss_clip": 0.01161727, + "auxiliary_loss_mlp": 0.01062051, + "balance_loss_clip": 1.03181958, + "balance_loss_mlp": 1.0407145, + "epoch": 0.061987073500676386, + "flos": 17747503355520.0, + "grad_norm": 2.727803687845987, + "language_loss": 0.7986154, + "learning_rate": 3.962331528563072e-06, + "loss": 0.82085317, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.2109375, + "step": 1031, + "time_per_iteration": 2.4070053100585938 + }, + { + "auxiliary_loss_clip": 0.01156936, + "auxiliary_loss_mlp": 0.01061886, + "balance_loss_clip": 1.03142774, + "balance_loss_mlp": 1.03971457, + "epoch": 0.06204719675334436, + "flos": 21835134403200.0, + "grad_norm": 1.6632160897947257, + "language_loss": 0.77500129, + "learning_rate": 3.962258510495142e-06, + "loss": 0.79718953, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.171875, + "step": 1032, + "time_per_iteration": 2.404686450958252 + }, + { + "auxiliary_loss_clip": 0.01158514, + "auxiliary_loss_mlp": 0.01062509, + "balance_loss_clip": 1.0319314, + "balance_loss_mlp": 1.03803051, + "epoch": 0.06210732000601232, + "flos": 19352705068800.0, + "grad_norm": 2.252385851128305, + "language_loss": 0.88763595, + "learning_rate": 3.962185422399254e-06, + "loss": 0.90984619, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.203125, + "step": 1033, + "time_per_iteration": 2.3899827003479004 + }, + { + "auxiliary_loss_clip": 0.01156533, + "auxiliary_loss_mlp": 0.01060124, + "balance_loss_clip": 1.03165722, + "balance_loss_mlp": 1.03904891, + "epoch": 0.062167443258680295, + "flos": 24748357403520.0, + "grad_norm": 2.1112015481619135, + "language_loss": 0.85067034, + "learning_rate": 3.962112264278014e-06, + "loss": 0.87283695, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.171875, + "step": 1034, + "time_per_iteration": 2.417952299118042 + }, + { + "auxiliary_loss_clip": 0.01152025, + "auxiliary_loss_mlp": 0.0105143, + "balance_loss_clip": 1.02109075, + "balance_loss_mlp": 1.04026592, + "epoch": 0.06222756651134827, + "flos": 34457074634880.0, + "grad_norm": 2.0175474623906156, + "language_loss": 0.80539238, + "learning_rate": 3.962039036134035e-06, + "loss": 0.82742691, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.1171875, + "step": 1035, + "time_per_iteration": 2.5118720531463623 + }, + { + "auxiliary_loss_clip": 0.01158328, + "auxiliary_loss_mlp": 0.01052676, + "balance_loss_clip": 1.02084732, + "balance_loss_mlp": 1.0402739, + "epoch": 0.06228768976401623, + "flos": 25665281107200.0, + "grad_norm": 2.7436331301329893, + "language_loss": 0.78723359, + "learning_rate": 3.961965737969931e-06, + "loss": 0.8093437, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.1796875, + "step": 1036, + "time_per_iteration": 2.429211139678955 + }, + { + "auxiliary_loss_clip": 0.01154011, + "auxiliary_loss_mlp": 0.01055222, + "balance_loss_clip": 1.02690959, + "balance_loss_mlp": 1.03979087, + "epoch": 0.062347813016684205, + "flos": 25294608535680.0, + "grad_norm": 1.8662455074359048, + "language_loss": 0.86611468, + "learning_rate": 3.961892369788315e-06, + "loss": 0.88820702, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.140625, + "step": 1037, + "time_per_iteration": 2.4833195209503174 + }, + { + "auxiliary_loss_clip": 0.01152175, + "auxiliary_loss_mlp": 0.01053944, + "balance_loss_clip": 1.02109003, + "balance_loss_mlp": 1.03708446, + "epoch": 0.06240793626935217, + "flos": 26905815002880.0, + "grad_norm": 2.290920851884523, + "language_loss": 0.80359685, + "learning_rate": 3.961818931591808e-06, + "loss": 0.82565802, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.15625, + "step": 1038, + "time_per_iteration": 2.4463109970092773 + }, + { + "auxiliary_loss_clip": 0.01153569, + "auxiliary_loss_mlp": 0.01057785, + "balance_loss_clip": 1.02872229, + "balance_loss_mlp": 1.0402391, + "epoch": 0.06246805952202014, + "flos": 21614727790080.0, + "grad_norm": 3.0631812454019824, + "language_loss": 0.85687834, + "learning_rate": 3.961745423383028e-06, + "loss": 0.87899184, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.1328125, + "step": 1039, + "time_per_iteration": 2.4449520111083984 + }, + { + "auxiliary_loss_clip": 0.01155556, + "auxiliary_loss_mlp": 0.01059921, + "balance_loss_clip": 1.0290575, + "balance_loss_mlp": 1.03914809, + "epoch": 0.0625281827746881, + "flos": 19311053950080.0, + "grad_norm": 1.8935037623048254, + "language_loss": 0.80690914, + "learning_rate": 3.961671845164602e-06, + "loss": 0.82906389, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.1640625, + "step": 1040, + "time_per_iteration": 3.8361432552337646 + }, + { + "auxiliary_loss_clip": 0.01157663, + "auxiliary_loss_mlp": 0.01059855, + "balance_loss_clip": 1.03005266, + "balance_loss_mlp": 1.04205906, + "epoch": 0.06258830602735609, + "flos": 27744533527680.0, + "grad_norm": 8.969843761282052, + "language_loss": 0.69530857, + "learning_rate": 3.961598196939153e-06, + "loss": 0.71748376, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.15625, + "step": 1041, + "time_per_iteration": 2.4421133995056152 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01054046, + "balance_loss_clip": 1.02290881, + "balance_loss_mlp": 1.03592014, + "epoch": 0.06264842928002405, + "flos": 23221465603200.0, + "grad_norm": 2.1600309028167017, + "language_loss": 0.81277382, + "learning_rate": 3.961524478709311e-06, + "loss": 0.8348515, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.1796875, + "step": 1042, + "time_per_iteration": 2.4429678916931152 + }, + { + "auxiliary_loss_clip": 0.01155927, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.01816618, + "balance_loss_mlp": 1.03883386, + "epoch": 0.06270855253269202, + "flos": 38397965771520.0, + "grad_norm": 1.6556398191388253, + "language_loss": 0.76052594, + "learning_rate": 3.961450690477705e-06, + "loss": 0.78255963, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.171875, + "step": 1043, + "time_per_iteration": 3.930548906326294 + }, + { + "auxiliary_loss_clip": 0.0115033, + "auxiliary_loss_mlp": 0.01048839, + "balance_loss_clip": 1.02066958, + "balance_loss_mlp": 1.03881478, + "epoch": 0.06276867578535998, + "flos": 22452503708160.0, + "grad_norm": 2.1727494522463116, + "language_loss": 0.92467427, + "learning_rate": 3.961376832246969e-06, + "loss": 0.946666, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.1171875, + "step": 1044, + "time_per_iteration": 3.851454973220825 + }, + { + "auxiliary_loss_clip": 0.01153806, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.02408147, + "balance_loss_mlp": 1.03988838, + "epoch": 0.06282879903802796, + "flos": 22929312412800.0, + "grad_norm": 2.6175374391353987, + "language_loss": 0.86091137, + "learning_rate": 3.96130290401974e-06, + "loss": 0.88296872, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.140625, + "step": 1045, + "time_per_iteration": 2.4562478065490723 + }, + { + "auxiliary_loss_clip": 0.01147997, + "auxiliary_loss_mlp": 0.01058226, + "balance_loss_clip": 1.03123653, + "balance_loss_mlp": 1.0371809, + "epoch": 0.06288892229069593, + "flos": 34817937114240.0, + "grad_norm": 2.050724739218883, + "language_loss": 0.78363693, + "learning_rate": 3.961228905798655e-06, + "loss": 0.80569911, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.109375, + "step": 1046, + "time_per_iteration": 3.8668949604034424 + }, + { + "auxiliary_loss_clip": 0.01154558, + "auxiliary_loss_mlp": 0.01061772, + "balance_loss_clip": 1.03345942, + "balance_loss_mlp": 1.03891706, + "epoch": 0.06294904554336389, + "flos": 19426127391360.0, + "grad_norm": 2.827178720419603, + "language_loss": 0.77426672, + "learning_rate": 3.961154837586356e-06, + "loss": 0.79642999, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.15625, + "step": 1047, + "time_per_iteration": 2.403306007385254 + }, + { + "auxiliary_loss_clip": 0.01158847, + "auxiliary_loss_mlp": 0.0105585, + "balance_loss_clip": 1.02571368, + "balance_loss_mlp": 1.03932309, + "epoch": 0.06300916879603187, + "flos": 40660267783680.0, + "grad_norm": 2.2582086957955054, + "language_loss": 0.7676698, + "learning_rate": 3.961080699385484e-06, + "loss": 0.78981674, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.1953125, + "step": 1048, + "time_per_iteration": 2.585777521133423 + }, + { + "auxiliary_loss_clip": 0.01159709, + "auxiliary_loss_mlp": 0.01053575, + "balance_loss_clip": 1.02439284, + "balance_loss_mlp": 1.040627, + "epoch": 0.06306929204869983, + "flos": 23803048897920.0, + "grad_norm": 2.868288878903169, + "language_loss": 0.77440327, + "learning_rate": 3.961006491198688e-06, + "loss": 0.79653615, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.1953125, + "step": 1049, + "time_per_iteration": 2.448014259338379 + }, + { + "auxiliary_loss_clip": 0.01154987, + "auxiliary_loss_mlp": 0.0105744, + "balance_loss_clip": 1.02704167, + "balance_loss_mlp": 1.03871131, + "epoch": 0.0631294153013678, + "flos": 18914824396800.0, + "grad_norm": 2.1328334411159666, + "language_loss": 0.83224154, + "learning_rate": 3.960932213028614e-06, + "loss": 0.85436583, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.1640625, + "step": 1050, + "time_per_iteration": 2.428130626678467 + }, + { + "auxiliary_loss_clip": 0.01153997, + "auxiliary_loss_mlp": 0.0105438, + "balance_loss_clip": 1.02810645, + "balance_loss_mlp": 1.04045296, + "epoch": 0.06318953855403578, + "flos": 24279019729920.0, + "grad_norm": 2.0519250920100536, + "language_loss": 0.76973629, + "learning_rate": 3.960857864877913e-06, + "loss": 0.79182005, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.1328125, + "step": 1051, + "time_per_iteration": 2.432659864425659 + }, + { + "auxiliary_loss_clip": 0.01155369, + "auxiliary_loss_mlp": 0.01063899, + "balance_loss_clip": 1.03567064, + "balance_loss_mlp": 1.03871274, + "epoch": 0.06324966180670374, + "flos": 22527811244160.0, + "grad_norm": 2.0842928281458883, + "language_loss": 0.80101454, + "learning_rate": 3.960783446749239e-06, + "loss": 0.82320726, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.171875, + "step": 1052, + "time_per_iteration": 2.4301910400390625 + }, + { + "auxiliary_loss_clip": 0.01156457, + "auxiliary_loss_mlp": 0.01054979, + "balance_loss_clip": 1.0256772, + "balance_loss_mlp": 1.03852856, + "epoch": 0.06330978505937171, + "flos": 15777214888320.0, + "grad_norm": 2.4383696598633495, + "language_loss": 0.78276086, + "learning_rate": 3.960708958645247e-06, + "loss": 0.8048752, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.1796875, + "step": 1053, + "time_per_iteration": 2.3998453617095947 + }, + { + "auxiliary_loss_clip": 0.01154537, + "auxiliary_loss_mlp": 0.01047982, + "balance_loss_clip": 1.02014709, + "balance_loss_mlp": 1.03847611, + "epoch": 0.06336990831203967, + "flos": 21470012737920.0, + "grad_norm": 1.9432053338143196, + "language_loss": 0.84447843, + "learning_rate": 3.960634400568597e-06, + "loss": 0.86650366, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.15625, + "step": 1054, + "time_per_iteration": 2.4102094173431396 + }, + { + "auxiliary_loss_clip": 0.01153083, + "auxiliary_loss_mlp": 0.01058387, + "balance_loss_clip": 1.03110003, + "balance_loss_mlp": 1.03914165, + "epoch": 0.06343003156470765, + "flos": 18477886331520.0, + "grad_norm": 2.41561749478276, + "language_loss": 0.85629678, + "learning_rate": 3.9605597725219485e-06, + "loss": 0.87841147, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.140625, + "step": 1055, + "time_per_iteration": 2.3968770503997803 + }, + { + "auxiliary_loss_clip": 0.01155734, + "auxiliary_loss_mlp": 0.01060315, + "balance_loss_clip": 1.02896309, + "balance_loss_mlp": 1.03833985, + "epoch": 0.06349015481737562, + "flos": 25153733733120.0, + "grad_norm": 2.6139634882193867, + "language_loss": 0.8117063, + "learning_rate": 3.960485074507964e-06, + "loss": 0.83386678, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.171875, + "step": 1056, + "time_per_iteration": 2.4474260807037354 + }, + { + "auxiliary_loss_clip": 0.01159162, + "auxiliary_loss_mlp": 0.01056713, + "balance_loss_clip": 1.02288127, + "balance_loss_mlp": 1.03690875, + "epoch": 0.06355027807004358, + "flos": 26870517751680.0, + "grad_norm": 2.441038065041055, + "language_loss": 0.80776274, + "learning_rate": 3.960410306529311e-06, + "loss": 0.82992148, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.21875, + "step": 1057, + "time_per_iteration": 2.497753858566284 + }, + { + "auxiliary_loss_clip": 0.01145479, + "auxiliary_loss_mlp": 0.01053419, + "balance_loss_clip": 1.02721691, + "balance_loss_mlp": 1.03655159, + "epoch": 0.06361040132271156, + "flos": 21395647808640.0, + "grad_norm": 1.8377619202367705, + "language_loss": 0.83484435, + "learning_rate": 3.960335468588656e-06, + "loss": 0.85683334, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.09375, + "step": 1058, + "time_per_iteration": 2.4473915100097656 + }, + { + "auxiliary_loss_clip": 0.01150486, + "auxiliary_loss_mlp": 0.01053281, + "balance_loss_clip": 1.02176166, + "balance_loss_mlp": 1.03529727, + "epoch": 0.06367052457537953, + "flos": 25732733587200.0, + "grad_norm": 2.1521473809757206, + "language_loss": 0.87502033, + "learning_rate": 3.960260560688672e-06, + "loss": 0.89705795, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.1484375, + "step": 1059, + "time_per_iteration": 2.487631320953369 + }, + { + "auxiliary_loss_clip": 0.01157359, + "auxiliary_loss_mlp": 0.0105957, + "balance_loss_clip": 1.03045952, + "balance_loss_mlp": 1.04114223, + "epoch": 0.0637306478280475, + "flos": 17630684346240.0, + "grad_norm": 2.4567529624321938, + "language_loss": 0.91952676, + "learning_rate": 3.96018558283203e-06, + "loss": 0.94169605, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.15625, + "step": 1060, + "time_per_iteration": 2.4338083267211914 + }, + { + "auxiliary_loss_clip": 0.01153993, + "auxiliary_loss_mlp": 0.01054981, + "balance_loss_clip": 1.02560759, + "balance_loss_mlp": 1.03712416, + "epoch": 0.06379077108071547, + "flos": 13661757521280.0, + "grad_norm": 2.079161993849353, + "language_loss": 0.8758902, + "learning_rate": 3.960110535021406e-06, + "loss": 0.89797997, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.171875, + "step": 1061, + "time_per_iteration": 2.447174549102783 + }, + { + "auxiliary_loss_clip": 0.01159231, + "auxiliary_loss_mlp": 0.01055013, + "balance_loss_clip": 1.02487659, + "balance_loss_mlp": 1.03590798, + "epoch": 0.06385089433338344, + "flos": 28477499944320.0, + "grad_norm": 2.4121893917422734, + "language_loss": 0.7742179, + "learning_rate": 3.96003541725948e-06, + "loss": 0.79636031, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.234375, + "step": 1062, + "time_per_iteration": 2.530794620513916 + }, + { + "auxiliary_loss_clip": 0.01152508, + "auxiliary_loss_mlp": 0.01056336, + "balance_loss_clip": 1.02829826, + "balance_loss_mlp": 1.03505027, + "epoch": 0.0639110175860514, + "flos": 24310057795200.0, + "grad_norm": 3.397358567992743, + "language_loss": 0.8646583, + "learning_rate": 3.959960229548932e-06, + "loss": 0.88674676, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.171875, + "step": 1063, + "time_per_iteration": 2.493764638900757 + }, + { + "auxiliary_loss_clip": 0.01152675, + "auxiliary_loss_mlp": 0.01061486, + "balance_loss_clip": 1.03192258, + "balance_loss_mlp": 1.03749716, + "epoch": 0.06397114083871938, + "flos": 22089686192640.0, + "grad_norm": 1.8895788334406478, + "language_loss": 0.79841852, + "learning_rate": 3.9598849718924456e-06, + "loss": 0.82056022, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.1484375, + "step": 1064, + "time_per_iteration": 2.552570343017578 + }, + { + "auxiliary_loss_clip": 0.01156097, + "auxiliary_loss_mlp": 0.01062561, + "balance_loss_clip": 1.03197181, + "balance_loss_mlp": 1.03786206, + "epoch": 0.06403126409138735, + "flos": 19571819961600.0, + "grad_norm": 2.915547150127337, + "language_loss": 0.84240711, + "learning_rate": 3.9598096442927045e-06, + "loss": 0.86459368, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.1796875, + "step": 1065, + "time_per_iteration": 2.483699321746826 + }, + { + "auxiliary_loss_clip": 0.01153172, + "auxiliary_loss_mlp": 0.01055382, + "balance_loss_clip": 1.02670002, + "balance_loss_mlp": 1.03946292, + "epoch": 0.06409138734405531, + "flos": 40805820708480.0, + "grad_norm": 2.1064109549280228, + "language_loss": 0.68253148, + "learning_rate": 3.959734246752399e-06, + "loss": 0.70461702, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.140625, + "step": 1066, + "time_per_iteration": 2.613372564315796 + }, + { + "auxiliary_loss_clip": 0.01153307, + "auxiliary_loss_mlp": 0.01068058, + "balance_loss_clip": 1.03811312, + "balance_loss_mlp": 1.03984094, + "epoch": 0.06415151059672328, + "flos": 20440773590400.0, + "grad_norm": 2.221808413280424, + "language_loss": 0.9024362, + "learning_rate": 3.959658779274219e-06, + "loss": 0.92464983, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.1328125, + "step": 1067, + "time_per_iteration": 2.4078049659729004 + }, + { + "auxiliary_loss_clip": 0.01153334, + "auxiliary_loss_mlp": 0.01056416, + "balance_loss_clip": 1.02794874, + "balance_loss_mlp": 1.03792787, + "epoch": 0.06421163384939126, + "flos": 18071218281600.0, + "grad_norm": 2.0953299155703515, + "language_loss": 0.83557618, + "learning_rate": 3.959583241860859e-06, + "loss": 0.85767376, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.15625, + "step": 1068, + "time_per_iteration": 2.43798565864563 + }, + { + "auxiliary_loss_clip": 0.01149899, + "auxiliary_loss_mlp": 0.01055215, + "balance_loss_clip": 1.02734399, + "balance_loss_mlp": 1.03755999, + "epoch": 0.06427175710205922, + "flos": 25118261925120.0, + "grad_norm": 2.7684988106959607, + "language_loss": 0.89493138, + "learning_rate": 3.959507634515013e-06, + "loss": 0.91698253, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.125, + "step": 1069, + "time_per_iteration": 2.4779961109161377 + }, + { + "auxiliary_loss_clip": 0.01156154, + "auxiliary_loss_mlp": 0.01066524, + "balance_loss_clip": 1.03642344, + "balance_loss_mlp": 1.03853703, + "epoch": 0.06433188035472719, + "flos": 17379693515520.0, + "grad_norm": 2.6462331204119565, + "language_loss": 0.95468295, + "learning_rate": 3.95943195723938e-06, + "loss": 0.97690964, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.171875, + "step": 1070, + "time_per_iteration": 2.40043306350708 + }, + { + "auxiliary_loss_clip": 0.01154162, + "auxiliary_loss_mlp": 0.01050874, + "balance_loss_clip": 1.02147698, + "balance_loss_mlp": 1.03742683, + "epoch": 0.06439200360739517, + "flos": 23545250352000.0, + "grad_norm": 1.9726107770921453, + "language_loss": 0.88081366, + "learning_rate": 3.959356210036661e-06, + "loss": 0.90286404, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.171875, + "step": 1071, + "time_per_iteration": 2.4496536254882812 + }, + { + "auxiliary_loss_clip": 0.01149584, + "auxiliary_loss_mlp": 0.01051725, + "balance_loss_clip": 1.02471256, + "balance_loss_mlp": 1.03599989, + "epoch": 0.06445212686006313, + "flos": 21978732291840.0, + "grad_norm": 1.9189707447936222, + "language_loss": 0.76146531, + "learning_rate": 3.959280392909559e-06, + "loss": 0.78347838, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.140625, + "step": 1072, + "time_per_iteration": 2.4486169815063477 + }, + { + "auxiliary_loss_clip": 0.01156692, + "auxiliary_loss_mlp": 0.01058945, + "balance_loss_clip": 1.02768803, + "balance_loss_mlp": 1.03829575, + "epoch": 0.0645122501127311, + "flos": 25920112187520.0, + "grad_norm": 2.0618225953997027, + "language_loss": 0.80716658, + "learning_rate": 3.9592045058607785e-06, + "loss": 0.82932299, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.1875, + "step": 1073, + "time_per_iteration": 2.4605281352996826 + }, + { + "auxiliary_loss_clip": 0.01144171, + "auxiliary_loss_mlp": 0.01053222, + "balance_loss_clip": 1.02463543, + "balance_loss_mlp": 1.03527343, + "epoch": 0.06457237336539907, + "flos": 25624956620160.0, + "grad_norm": 1.6866784670733426, + "language_loss": 0.80415916, + "learning_rate": 3.95912854889303e-06, + "loss": 0.82613313, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.09375, + "step": 1074, + "time_per_iteration": 2.4511263370513916 + }, + { + "auxiliary_loss_clip": 0.01153531, + "auxiliary_loss_mlp": 0.01052702, + "balance_loss_clip": 1.0233289, + "balance_loss_mlp": 1.03641522, + "epoch": 0.06463249661806704, + "flos": 19462960742400.0, + "grad_norm": 2.532306893728656, + "language_loss": 0.78886366, + "learning_rate": 3.959052522009023e-06, + "loss": 0.81092602, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.171875, + "step": 1075, + "time_per_iteration": 2.4653217792510986 + }, + { + "auxiliary_loss_clip": 0.01157863, + "auxiliary_loss_mlp": 0.01054332, + "balance_loss_clip": 1.02711701, + "balance_loss_mlp": 1.03994346, + "epoch": 0.064692619870735, + "flos": 24496912725120.0, + "grad_norm": 5.248740749478744, + "language_loss": 0.87301528, + "learning_rate": 3.95897642521147e-06, + "loss": 0.89513719, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.1796875, + "step": 1076, + "time_per_iteration": 2.558877944946289 + }, + { + "auxiliary_loss_clip": 0.01149619, + "auxiliary_loss_mlp": 0.01046069, + "balance_loss_clip": 1.01819813, + "balance_loss_mlp": 1.03587496, + "epoch": 0.06475274312340297, + "flos": 17017748784000.0, + "grad_norm": 2.1191892004808404, + "language_loss": 0.80661476, + "learning_rate": 3.958900258503089e-06, + "loss": 0.82857162, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.140625, + "step": 1077, + "time_per_iteration": 2.5130233764648438 + }, + { + "auxiliary_loss_clip": 0.01156957, + "auxiliary_loss_mlp": 0.01062066, + "balance_loss_clip": 1.0318104, + "balance_loss_mlp": 1.03646731, + "epoch": 0.06481286637607095, + "flos": 24571207831680.0, + "grad_norm": 2.564596832680389, + "language_loss": 0.91844654, + "learning_rate": 3.958824021886595e-06, + "loss": 0.94063681, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.203125, + "step": 1078, + "time_per_iteration": 2.5395843982696533 + }, + { + "auxiliary_loss_clip": 0.01160052, + "auxiliary_loss_mlp": 0.01057451, + "balance_loss_clip": 1.02744603, + "balance_loss_mlp": 1.03943264, + "epoch": 0.06487298962873891, + "flos": 21104576870400.0, + "grad_norm": 2.0316399257948365, + "language_loss": 0.78641224, + "learning_rate": 3.9587477153647115e-06, + "loss": 0.80858719, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.203125, + "step": 1079, + "time_per_iteration": 2.4766886234283447 + }, + { + "auxiliary_loss_clip": 0.01151624, + "auxiliary_loss_mlp": 0.01057974, + "balance_loss_clip": 1.02979302, + "balance_loss_mlp": 1.0372957, + "epoch": 0.06493311288140688, + "flos": 24607028753280.0, + "grad_norm": 2.63909668044762, + "language_loss": 0.68948388, + "learning_rate": 3.95867133894016e-06, + "loss": 0.71157992, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.140625, + "step": 1080, + "time_per_iteration": 3.8689091205596924 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01055946, + "balance_loss_clip": 1.02595294, + "balance_loss_mlp": 1.03560901, + "epoch": 0.06499323613407486, + "flos": 25336818236160.0, + "grad_norm": 1.7999405252280114, + "language_loss": 0.72002423, + "learning_rate": 3.958594892615667e-06, + "loss": 0.7420975, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.15625, + "step": 1081, + "time_per_iteration": 2.4564967155456543 + }, + { + "auxiliary_loss_clip": 0.0114835, + "auxiliary_loss_mlp": 0.01051298, + "balance_loss_clip": 1.02166224, + "balance_loss_mlp": 1.03567648, + "epoch": 0.06505335938674282, + "flos": 20374682653440.0, + "grad_norm": 2.8791195051438643, + "language_loss": 0.84015405, + "learning_rate": 3.95851837639396e-06, + "loss": 0.86215043, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.125, + "step": 1082, + "time_per_iteration": 2.4006152153015137 + }, + { + "auxiliary_loss_clip": 0.01159429, + "auxiliary_loss_mlp": 0.01064826, + "balance_loss_clip": 1.03422475, + "balance_loss_mlp": 1.03717995, + "epoch": 0.06511348263941079, + "flos": 25336748413440.0, + "grad_norm": 5.4199843251700655, + "language_loss": 0.82377207, + "learning_rate": 3.9584417902777695e-06, + "loss": 0.84601462, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.21875, + "step": 1083, + "time_per_iteration": 3.9749627113342285 + }, + { + "auxiliary_loss_clip": 0.01156056, + "auxiliary_loss_mlp": 0.01057279, + "balance_loss_clip": 1.02689254, + "balance_loss_mlp": 1.0394218, + "epoch": 0.06517360589207877, + "flos": 20331949282560.0, + "grad_norm": 2.566877750876929, + "language_loss": 0.79550064, + "learning_rate": 3.95836513426983e-06, + "loss": 0.81763399, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.1640625, + "step": 1084, + "time_per_iteration": 2.466878890991211 + }, + { + "auxiliary_loss_clip": 0.01152333, + "auxiliary_loss_mlp": 0.01053444, + "balance_loss_clip": 1.02485764, + "balance_loss_mlp": 1.03761244, + "epoch": 0.06523372914474673, + "flos": 31680432339840.0, + "grad_norm": 5.835992372464286, + "language_loss": 0.66288763, + "learning_rate": 3.958288408372877e-06, + "loss": 0.68494546, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.1484375, + "step": 1085, + "time_per_iteration": 3.9148972034454346 + }, + { + "auxiliary_loss_clip": 0.01147609, + "auxiliary_loss_mlp": 0.0105211, + "balance_loss_clip": 1.02476358, + "balance_loss_mlp": 1.03516364, + "epoch": 0.0652938523974147, + "flos": 20777091517440.0, + "grad_norm": 2.121651067054262, + "language_loss": 0.76523113, + "learning_rate": 3.9582116125896474e-06, + "loss": 0.78722835, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.125, + "step": 1086, + "time_per_iteration": 2.4858388900756836 + }, + { + "auxiliary_loss_clip": 0.01148721, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.0208497, + "balance_loss_mlp": 1.03547835, + "epoch": 0.06535397565008266, + "flos": 16690053962880.0, + "grad_norm": 3.2443909718870723, + "language_loss": 0.85044527, + "learning_rate": 3.958134746922882e-06, + "loss": 0.87240976, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.1328125, + "step": 1087, + "time_per_iteration": 2.390460252761841 + }, + { + "auxiliary_loss_clip": 0.01147463, + "auxiliary_loss_mlp": 0.01053357, + "balance_loss_clip": 1.02568889, + "balance_loss_mlp": 1.0346911, + "epoch": 0.06541409890275064, + "flos": 26867061527040.0, + "grad_norm": 2.6455725952910427, + "language_loss": 0.77596116, + "learning_rate": 3.958057811375325e-06, + "loss": 0.7979694, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.125, + "step": 1088, + "time_per_iteration": 2.4677419662475586 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01057884, + "balance_loss_clip": 1.02992916, + "balance_loss_mlp": 1.03685796, + "epoch": 0.06547422215541861, + "flos": 20520584691840.0, + "grad_norm": 1.7317476980719246, + "language_loss": 0.71197081, + "learning_rate": 3.957980805949722e-06, + "loss": 0.73404896, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.125, + "step": 1089, + "time_per_iteration": 2.386992931365967 + }, + { + "auxiliary_loss_clip": 0.01147423, + "auxiliary_loss_mlp": 0.01050398, + "balance_loss_clip": 1.02330232, + "balance_loss_mlp": 1.0368315, + "epoch": 0.06553434540808657, + "flos": 22015565642880.0, + "grad_norm": 1.8712122613700142, + "language_loss": 0.85494733, + "learning_rate": 3.957903730648819e-06, + "loss": 0.87692559, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.109375, + "step": 1090, + "time_per_iteration": 2.4183759689331055 + }, + { + "auxiliary_loss_clip": 0.01153315, + "auxiliary_loss_mlp": 0.01055158, + "balance_loss_clip": 1.02670288, + "balance_loss_mlp": 1.03855705, + "epoch": 0.06559446866075455, + "flos": 24607482600960.0, + "grad_norm": 2.0463246774747117, + "language_loss": 0.71929127, + "learning_rate": 3.957826585475369e-06, + "loss": 0.74137598, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.1484375, + "step": 1091, + "time_per_iteration": 2.446702241897583 + }, + { + "auxiliary_loss_clip": 0.01148337, + "auxiliary_loss_mlp": 0.01053979, + "balance_loss_clip": 1.02652502, + "balance_loss_mlp": 1.03667808, + "epoch": 0.06565459191342252, + "flos": 24273678291840.0, + "grad_norm": 2.577704198220226, + "language_loss": 0.82610309, + "learning_rate": 3.957749370432124e-06, + "loss": 0.84812617, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.1171875, + "step": 1092, + "time_per_iteration": 2.3923897743225098 + }, + { + "auxiliary_loss_clip": 0.01152296, + "auxiliary_loss_mlp": 0.01055245, + "balance_loss_clip": 1.02546632, + "balance_loss_mlp": 1.03611541, + "epoch": 0.06571471516609048, + "flos": 24786063538560.0, + "grad_norm": 1.8958847964951662, + "language_loss": 0.7130363, + "learning_rate": 3.957672085521841e-06, + "loss": 0.73511177, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.15625, + "step": 1093, + "time_per_iteration": 2.4138355255126953 + }, + { + "auxiliary_loss_clip": 0.01149541, + "auxiliary_loss_mlp": 0.01052042, + "balance_loss_clip": 1.02276444, + "balance_loss_mlp": 1.03724301, + "epoch": 0.06577483841875846, + "flos": 26212858871040.0, + "grad_norm": 1.6711946765405614, + "language_loss": 0.87978733, + "learning_rate": 3.957594730747276e-06, + "loss": 0.90180314, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.125, + "step": 1094, + "time_per_iteration": 2.4436089992523193 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.0105448, + "balance_loss_clip": 1.02482069, + "balance_loss_mlp": 1.037292, + "epoch": 0.06583496167142643, + "flos": 25079683006080.0, + "grad_norm": 2.2043409811576806, + "language_loss": 0.81170315, + "learning_rate": 3.957517306111191e-06, + "loss": 0.8337642, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.140625, + "step": 1095, + "time_per_iteration": 2.4198005199432373 + }, + { + "auxiliary_loss_clip": 0.0114609, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.02168703, + "balance_loss_mlp": 1.03538322, + "epoch": 0.06589508492409439, + "flos": 25628622312960.0, + "grad_norm": 2.126074922761706, + "language_loss": 0.6998198, + "learning_rate": 3.957439811616349e-06, + "loss": 0.72176647, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.109375, + "step": 1096, + "time_per_iteration": 2.4462523460388184 + }, + { + "auxiliary_loss_clip": 0.01152339, + "auxiliary_loss_mlp": 0.01050908, + "balance_loss_clip": 1.02412224, + "balance_loss_mlp": 1.039469, + "epoch": 0.06595520817676236, + "flos": 23620173863040.0, + "grad_norm": 1.8544065519083277, + "language_loss": 0.77033997, + "learning_rate": 3.957362247265515e-06, + "loss": 0.79237241, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.125, + "step": 1097, + "time_per_iteration": 2.40742564201355 + }, + { + "auxiliary_loss_clip": 0.0115117, + "auxiliary_loss_mlp": 0.0106126, + "balance_loss_clip": 1.03356814, + "balance_loss_mlp": 1.03739858, + "epoch": 0.06601533142943034, + "flos": 33800323449600.0, + "grad_norm": 1.966380454277295, + "language_loss": 0.78213745, + "learning_rate": 3.957284613061456e-06, + "loss": 0.80426174, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.140625, + "step": 1098, + "time_per_iteration": 2.5256941318511963 + }, + { + "auxiliary_loss_clip": 0.01150204, + "auxiliary_loss_mlp": 0.01060173, + "balance_loss_clip": 1.03039408, + "balance_loss_mlp": 1.03755939, + "epoch": 0.0660754546820983, + "flos": 20258352403200.0, + "grad_norm": 4.2821213113645795, + "language_loss": 0.81474102, + "learning_rate": 3.957206909006945e-06, + "loss": 0.8368448, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.125, + "step": 1099, + "time_per_iteration": 2.382452964782715 + }, + { + "auxiliary_loss_clip": 0.01142698, + "auxiliary_loss_mlp": 0.01050563, + "balance_loss_clip": 1.02282298, + "balance_loss_mlp": 1.03261256, + "epoch": 0.06613557793476627, + "flos": 19353158916480.0, + "grad_norm": 3.173236474032098, + "language_loss": 0.82873213, + "learning_rate": 3.957129135104754e-06, + "loss": 0.85066473, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.1015625, + "step": 1100, + "time_per_iteration": 2.41461443901062 + }, + { + "auxiliary_loss_clip": 0.01149334, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_clip": 1.02933121, + "balance_loss_mlp": 1.03661847, + "epoch": 0.06619570118743424, + "flos": 13771698992640.0, + "grad_norm": 2.412003980507769, + "language_loss": 0.73175687, + "learning_rate": 3.957051291357658e-06, + "loss": 0.75381601, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.125, + "step": 1101, + "time_per_iteration": 2.445695638656616 + }, + { + "auxiliary_loss_clip": 0.01146847, + "auxiliary_loss_mlp": 0.01058545, + "balance_loss_clip": 1.02983987, + "balance_loss_mlp": 1.0356338, + "epoch": 0.06625582444010221, + "flos": 17856921156480.0, + "grad_norm": 2.432250688453231, + "language_loss": 0.85938394, + "learning_rate": 3.956973377768437e-06, + "loss": 0.8814379, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.109375, + "step": 1102, + "time_per_iteration": 2.3959429264068604 + }, + { + "auxiliary_loss_clip": 0.01148477, + "auxiliary_loss_mlp": 0.01051257, + "balance_loss_clip": 1.02208626, + "balance_loss_mlp": 1.03743041, + "epoch": 0.06631594769277017, + "flos": 11837894762880.0, + "grad_norm": 4.580157305379214, + "language_loss": 0.81804848, + "learning_rate": 3.956895394339869e-06, + "loss": 0.84004581, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.109375, + "step": 1103, + "time_per_iteration": 2.3681488037109375 + }, + { + "auxiliary_loss_clip": 0.01150511, + "auxiliary_loss_mlp": 0.01060027, + "balance_loss_clip": 1.03302574, + "balance_loss_mlp": 1.03946304, + "epoch": 0.06637607094543815, + "flos": 19792296397440.0, + "grad_norm": 1.8496289460402604, + "language_loss": 0.81953788, + "learning_rate": 3.956817341074738e-06, + "loss": 0.84164321, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.109375, + "step": 1104, + "time_per_iteration": 2.4122419357299805 + }, + { + "auxiliary_loss_clip": 0.01143497, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.0187993, + "balance_loss_mlp": 1.03398204, + "epoch": 0.06643619419810612, + "flos": 25484430931200.0, + "grad_norm": 1.8390153581956532, + "language_loss": 0.80658793, + "learning_rate": 3.95673921797583e-06, + "loss": 0.82849944, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.09375, + "step": 1105, + "time_per_iteration": 2.446779727935791 + }, + { + "auxiliary_loss_clip": 0.01144253, + "auxiliary_loss_mlp": 0.01052386, + "balance_loss_clip": 1.02591014, + "balance_loss_mlp": 1.0359776, + "epoch": 0.06649631745077408, + "flos": 16945583270400.0, + "grad_norm": 2.004951132118422, + "language_loss": 0.96369636, + "learning_rate": 3.956661025045933e-06, + "loss": 0.98566276, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0859375, + "step": 1106, + "time_per_iteration": 2.384737014770508 + }, + { + "auxiliary_loss_clip": 0.01149602, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_clip": 1.02033615, + "balance_loss_mlp": 1.03467488, + "epoch": 0.06655644070344206, + "flos": 17857619383680.0, + "grad_norm": 3.070621473217749, + "language_loss": 0.8192116, + "learning_rate": 3.9565827622878365e-06, + "loss": 0.84119731, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.15625, + "step": 1107, + "time_per_iteration": 2.3848822116851807 + }, + { + "auxiliary_loss_clip": 0.01043134, + "auxiliary_loss_mlp": 0.01023949, + "balance_loss_clip": 1.01925182, + "balance_loss_mlp": 1.00925303, + "epoch": 0.06661656395611003, + "flos": 61416236062080.0, + "grad_norm": 0.7916552539491276, + "language_loss": 0.56714582, + "learning_rate": 3.956504429704334e-06, + "loss": 0.5878166, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.046875, + "router_z_loss_mlp": 0.33984375, + "step": 1108, + "time_per_iteration": 2.987449884414673 + }, + { + "auxiliary_loss_clip": 0.01147292, + "auxiliary_loss_mlp": 0.01054553, + "balance_loss_clip": 1.0233202, + "balance_loss_mlp": 1.03442478, + "epoch": 0.066676687208778, + "flos": 20661948253440.0, + "grad_norm": 3.414117940188091, + "language_loss": 0.72846961, + "learning_rate": 3.956426027298221e-06, + "loss": 0.7504881, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.125, + "step": 1109, + "time_per_iteration": 2.4279117584228516 + }, + { + "auxiliary_loss_clip": 0.01147421, + "auxiliary_loss_mlp": 0.01054663, + "balance_loss_clip": 1.02586257, + "balance_loss_mlp": 1.03525794, + "epoch": 0.06673681046144596, + "flos": 20922225505920.0, + "grad_norm": 2.0648281320744832, + "language_loss": 0.7821449, + "learning_rate": 3.956347555072296e-06, + "loss": 0.80416572, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.125, + "step": 1110, + "time_per_iteration": 2.421630382537842 + }, + { + "auxiliary_loss_clip": 0.01148098, + "auxiliary_loss_mlp": 0.01051408, + "balance_loss_clip": 1.0239898, + "balance_loss_mlp": 1.03642201, + "epoch": 0.06679693371411394, + "flos": 31064494400640.0, + "grad_norm": 3.521541261242907, + "language_loss": 0.71108806, + "learning_rate": 3.95626901302936e-06, + "loss": 0.73308313, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.1171875, + "step": 1111, + "time_per_iteration": 2.4702768325805664 + }, + { + "auxiliary_loss_clip": 0.01152294, + "auxiliary_loss_mlp": 0.01052218, + "balance_loss_clip": 1.02451348, + "balance_loss_mlp": 1.03722906, + "epoch": 0.0668570569667819, + "flos": 21725053286400.0, + "grad_norm": 2.0572766136120872, + "language_loss": 0.78350592, + "learning_rate": 3.956190401172214e-06, + "loss": 0.80555105, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.1484375, + "step": 1112, + "time_per_iteration": 2.4014744758605957 + }, + { + "auxiliary_loss_clip": 0.01148717, + "auxiliary_loss_mlp": 0.01057763, + "balance_loss_clip": 1.0297966, + "balance_loss_mlp": 1.03698647, + "epoch": 0.06691718021944987, + "flos": 22746158087040.0, + "grad_norm": 2.2200154515730315, + "language_loss": 0.79009718, + "learning_rate": 3.956111719503664e-06, + "loss": 0.81216192, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.1171875, + "step": 1113, + "time_per_iteration": 2.4003682136535645 + }, + { + "auxiliary_loss_clip": 0.01143582, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.01901865, + "balance_loss_mlp": 1.03396714, + "epoch": 0.06697730347211785, + "flos": 16544675594880.0, + "grad_norm": 1.8213310860122236, + "language_loss": 0.82533109, + "learning_rate": 3.956032968026519e-06, + "loss": 0.84722418, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.09375, + "step": 1114, + "time_per_iteration": 2.3902881145477295 + }, + { + "auxiliary_loss_clip": 0.01045519, + "auxiliary_loss_mlp": 0.01003938, + "balance_loss_clip": 0.99950367, + "balance_loss_mlp": 1.0111022, + "epoch": 0.06703742672478581, + "flos": 59779123499520.0, + "grad_norm": 0.8243039787841735, + "language_loss": 0.58152986, + "learning_rate": 3.955954146743589e-06, + "loss": 0.60202444, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.04443359, + "router_z_loss_mlp": 0.34375, + "step": 1115, + "time_per_iteration": 2.975740432739258 + }, + { + "auxiliary_loss_clip": 0.01148229, + "auxiliary_loss_mlp": 0.01056833, + "balance_loss_clip": 1.02709055, + "balance_loss_mlp": 1.03565681, + "epoch": 0.06709754997745378, + "flos": 16799262295680.0, + "grad_norm": 3.130615164771175, + "language_loss": 0.9187237, + "learning_rate": 3.9558752556576874e-06, + "loss": 0.94077432, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.125, + "step": 1116, + "time_per_iteration": 2.3603169918060303 + }, + { + "auxiliary_loss_clip": 0.01154845, + "auxiliary_loss_mlp": 0.01060873, + "balance_loss_clip": 1.03114212, + "balance_loss_mlp": 1.03859985, + "epoch": 0.06715767323012176, + "flos": 22122923673600.0, + "grad_norm": 2.095241715439275, + "language_loss": 0.87228984, + "learning_rate": 3.955796294771628e-06, + "loss": 0.89444697, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.1640625, + "step": 1117, + "time_per_iteration": 2.437349319458008 + }, + { + "auxiliary_loss_clip": 0.01041756, + "auxiliary_loss_mlp": 0.01006534, + "balance_loss_clip": 1.00224292, + "balance_loss_mlp": 1.00733614, + "epoch": 0.06721779648278972, + "flos": 66615363020160.0, + "grad_norm": 0.8524745008189767, + "language_loss": 0.59762853, + "learning_rate": 3.95571726408823e-06, + "loss": 0.61811143, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.04296875, + "router_z_loss_mlp": 0.34375, + "step": 1118, + "time_per_iteration": 3.0528414249420166 + }, + { + "auxiliary_loss_clip": 0.01146149, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_clip": 1.01955914, + "balance_loss_mlp": 1.03465438, + "epoch": 0.06727791973545769, + "flos": 22381385535360.0, + "grad_norm": 6.487344051059983, + "language_loss": 0.82986391, + "learning_rate": 3.955638163610314e-06, + "loss": 0.85179293, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.1171875, + "step": 1119, + "time_per_iteration": 2.419816732406616 + }, + { + "auxiliary_loss_clip": 0.01144101, + "auxiliary_loss_mlp": 0.01050539, + "balance_loss_clip": 1.02450359, + "balance_loss_mlp": 1.03527403, + "epoch": 0.06733804298812565, + "flos": 24279054641280.0, + "grad_norm": 1.8616435906553814, + "language_loss": 0.8482362, + "learning_rate": 3.955558993340703e-06, + "loss": 0.87018257, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.09375, + "step": 1120, + "time_per_iteration": 3.8828959465026855 + }, + { + "auxiliary_loss_clip": 0.01156273, + "auxiliary_loss_mlp": 0.01058698, + "balance_loss_clip": 1.03104162, + "balance_loss_mlp": 1.0411582, + "epoch": 0.06739816624079363, + "flos": 15917496197760.0, + "grad_norm": 2.2660236821839623, + "language_loss": 0.78819853, + "learning_rate": 3.955479753282221e-06, + "loss": 0.81034827, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.1484375, + "step": 1121, + "time_per_iteration": 2.4081673622131348 + }, + { + "auxiliary_loss_clip": 0.01039947, + "auxiliary_loss_mlp": 0.01005861, + "balance_loss_clip": 1.00168824, + "balance_loss_mlp": 1.00650489, + "epoch": 0.0674582894934616, + "flos": 71394656613120.0, + "grad_norm": 0.7533590153096971, + "language_loss": 0.58349454, + "learning_rate": 3.955400443437696e-06, + "loss": 0.60395265, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.3359375, + "step": 1122, + "time_per_iteration": 4.462320566177368 + }, + { + "auxiliary_loss_clip": 0.0115262, + "auxiliary_loss_mlp": 0.01052663, + "balance_loss_clip": 1.02430344, + "balance_loss_mlp": 1.03925991, + "epoch": 0.06751841274612956, + "flos": 25263779938560.0, + "grad_norm": 2.039678532660783, + "language_loss": 0.71565163, + "learning_rate": 3.95532106380996e-06, + "loss": 0.7377044, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.1328125, + "step": 1123, + "time_per_iteration": 3.810786724090576 + }, + { + "auxiliary_loss_clip": 0.01150952, + "auxiliary_loss_mlp": 0.0105428, + "balance_loss_clip": 1.02476323, + "balance_loss_mlp": 1.03756046, + "epoch": 0.06757853599879754, + "flos": 23801687354880.0, + "grad_norm": 1.86152894512166, + "language_loss": 0.79015303, + "learning_rate": 3.9552416144018445e-06, + "loss": 0.81220531, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.1328125, + "step": 1124, + "time_per_iteration": 2.4354350566864014 + }, + { + "auxiliary_loss_clip": 0.01145399, + "auxiliary_loss_mlp": 0.01044236, + "balance_loss_clip": 1.01797402, + "balance_loss_mlp": 1.0362289, + "epoch": 0.0676386592514655, + "flos": 21032655736320.0, + "grad_norm": 2.7282165640532234, + "language_loss": 0.71316373, + "learning_rate": 3.955162095216186e-06, + "loss": 0.7350601, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.09375, + "step": 1125, + "time_per_iteration": 3.7452244758605957 + }, + { + "auxiliary_loss_clip": 0.01145681, + "auxiliary_loss_mlp": 0.01052374, + "balance_loss_clip": 1.02295339, + "balance_loss_mlp": 1.03685808, + "epoch": 0.06769878250413347, + "flos": 25555165079040.0, + "grad_norm": 2.6751849311736544, + "language_loss": 0.81167436, + "learning_rate": 3.95508250625582e-06, + "loss": 0.83365488, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.0859375, + "step": 1126, + "time_per_iteration": 2.447014331817627 + }, + { + "auxiliary_loss_clip": 0.01040105, + "auxiliary_loss_mlp": 0.01005089, + "balance_loss_clip": 1.00074983, + "balance_loss_mlp": 1.00653863, + "epoch": 0.06775890575680145, + "flos": 70651740458880.0, + "grad_norm": 0.7816526170598117, + "language_loss": 0.59801042, + "learning_rate": 3.95500284752359e-06, + "loss": 0.61846232, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.04345703, + "router_z_loss_mlp": 0.3359375, + "step": 1127, + "time_per_iteration": 3.066779851913452 + }, + { + "auxiliary_loss_clip": 0.01147014, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02029109, + "balance_loss_mlp": 1.03688431, + "epoch": 0.06781902900946941, + "flos": 24234575702400.0, + "grad_norm": 2.224979259672447, + "language_loss": 0.81246132, + "learning_rate": 3.954923119022337e-06, + "loss": 0.83441287, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.1015625, + "step": 1128, + "time_per_iteration": 2.4281606674194336 + }, + { + "auxiliary_loss_clip": 0.01154245, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.01767111, + "balance_loss_mlp": 1.03841734, + "epoch": 0.06787915226213738, + "flos": 22416473318400.0, + "grad_norm": 2.7069058363169156, + "language_loss": 0.75399923, + "learning_rate": 3.9548433207549065e-06, + "loss": 0.77600449, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.15625, + "step": 1129, + "time_per_iteration": 2.4316508769989014 + }, + { + "auxiliary_loss_clip": 0.01145738, + "auxiliary_loss_mlp": 0.01046612, + "balance_loss_clip": 1.01823974, + "balance_loss_mlp": 1.03559637, + "epoch": 0.06793927551480534, + "flos": 37705393664640.0, + "grad_norm": 1.7740383949338567, + "language_loss": 0.71722078, + "learning_rate": 3.954763452724146e-06, + "loss": 0.73914433, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.09375, + "step": 1130, + "time_per_iteration": 2.5574941635131836 + }, + { + "auxiliary_loss_clip": 0.01145337, + "auxiliary_loss_mlp": 0.01049854, + "balance_loss_clip": 1.02341366, + "balance_loss_mlp": 1.03672767, + "epoch": 0.06799939876747332, + "flos": 20630351606400.0, + "grad_norm": 2.5622001697638903, + "language_loss": 0.80953151, + "learning_rate": 3.954683514932906e-06, + "loss": 0.83148336, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0859375, + "step": 1131, + "time_per_iteration": 2.46608567237854 + }, + { + "auxiliary_loss_clip": 0.0114364, + "auxiliary_loss_mlp": 0.0105958, + "balance_loss_clip": 1.0310415, + "balance_loss_mlp": 1.0365063, + "epoch": 0.06805952202014129, + "flos": 14863921966080.0, + "grad_norm": 10.230628435090006, + "language_loss": 0.80578613, + "learning_rate": 3.95460350738404e-06, + "loss": 0.82781839, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.0703125, + "step": 1132, + "time_per_iteration": 2.406174421310425 + }, + { + "auxiliary_loss_clip": 0.01144118, + "auxiliary_loss_mlp": 0.01054145, + "balance_loss_clip": 1.02660751, + "balance_loss_mlp": 1.03547812, + "epoch": 0.06811964527280925, + "flos": 48907555747200.0, + "grad_norm": 1.5189189589491072, + "language_loss": 0.63690358, + "learning_rate": 3.954523430080402e-06, + "loss": 0.65888619, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.0859375, + "step": 1133, + "time_per_iteration": 2.6687326431274414 + }, + { + "auxiliary_loss_clip": 0.01150547, + "auxiliary_loss_mlp": 0.01053353, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.03586221, + "epoch": 0.06817976852547723, + "flos": 15376377035520.0, + "grad_norm": 2.2196303995449114, + "language_loss": 0.74988973, + "learning_rate": 3.9544432830248504e-06, + "loss": 0.77192879, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.1484375, + "step": 1134, + "time_per_iteration": 2.389263391494751 + }, + { + "auxiliary_loss_clip": 0.01144706, + "auxiliary_loss_mlp": 0.01052305, + "balance_loss_clip": 1.02647233, + "balance_loss_mlp": 1.03708625, + "epoch": 0.0682398917781452, + "flos": 20154694976640.0, + "grad_norm": 3.295505603148525, + "language_loss": 0.8708508, + "learning_rate": 3.954363066220246e-06, + "loss": 0.89282089, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.078125, + "step": 1135, + "time_per_iteration": 2.395012378692627 + }, + { + "auxiliary_loss_clip": 0.01146483, + "auxiliary_loss_mlp": 0.01045769, + "balance_loss_clip": 1.01824403, + "balance_loss_mlp": 1.03473854, + "epoch": 0.06830001503081316, + "flos": 23439498243840.0, + "grad_norm": 2.9357431467981527, + "language_loss": 0.77959895, + "learning_rate": 3.954282779669451e-06, + "loss": 0.80152142, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.1171875, + "step": 1136, + "time_per_iteration": 2.437164783477783 + }, + { + "auxiliary_loss_clip": 0.01149479, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.02828848, + "balance_loss_mlp": 1.0376966, + "epoch": 0.06836013828348114, + "flos": 34348389972480.0, + "grad_norm": 10.53788335757046, + "language_loss": 0.83737171, + "learning_rate": 3.95420242337533e-06, + "loss": 0.85943127, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.1171875, + "step": 1137, + "time_per_iteration": 2.509925127029419 + }, + { + "auxiliary_loss_clip": 0.01143737, + "auxiliary_loss_mlp": 0.01049197, + "balance_loss_clip": 1.02167106, + "balance_loss_mlp": 1.0355711, + "epoch": 0.06842026153614911, + "flos": 23147729078400.0, + "grad_norm": 2.548998429243754, + "language_loss": 0.78280199, + "learning_rate": 3.954121997340752e-06, + "loss": 0.80473137, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.078125, + "step": 1138, + "time_per_iteration": 2.451308012008667 + }, + { + "auxiliary_loss_clip": 0.01147673, + "auxiliary_loss_mlp": 0.01060586, + "balance_loss_clip": 1.03071165, + "balance_loss_mlp": 1.03628469, + "epoch": 0.06848038478881707, + "flos": 24607796803200.0, + "grad_norm": 2.4230119432478325, + "language_loss": 0.85318613, + "learning_rate": 3.9540415015685855e-06, + "loss": 0.8752687, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.109375, + "step": 1139, + "time_per_iteration": 2.418158769607544 + }, + { + "auxiliary_loss_clip": 0.01145708, + "auxiliary_loss_mlp": 0.01045786, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.0365721, + "epoch": 0.06854050804148504, + "flos": 40879382676480.0, + "grad_norm": 1.802646008694186, + "language_loss": 0.74583817, + "learning_rate": 3.953960936061706e-06, + "loss": 0.76775312, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.09375, + "step": 1140, + "time_per_iteration": 2.5871405601501465 + }, + { + "auxiliary_loss_clip": 0.01144081, + "auxiliary_loss_mlp": 0.01060288, + "balance_loss_clip": 1.02993786, + "balance_loss_mlp": 1.03546023, + "epoch": 0.06860063129415302, + "flos": 31685005728000.0, + "grad_norm": 2.353283521657733, + "language_loss": 0.6831162, + "learning_rate": 3.9538803008229845e-06, + "loss": 0.70515984, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.0859375, + "step": 1141, + "time_per_iteration": 2.4802167415618896 + }, + { + "auxiliary_loss_clip": 0.01147212, + "auxiliary_loss_mlp": 0.01055887, + "balance_loss_clip": 1.02670479, + "balance_loss_mlp": 1.03654337, + "epoch": 0.06866075454682098, + "flos": 26540798071680.0, + "grad_norm": 2.4042942679845396, + "language_loss": 0.78867722, + "learning_rate": 3.953799595855303e-06, + "loss": 0.81070817, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.109375, + "step": 1142, + "time_per_iteration": 2.4773917198181152 + }, + { + "auxiliary_loss_clip": 0.01144319, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.02454114, + "balance_loss_mlp": 1.03604782, + "epoch": 0.06872087779948895, + "flos": 29788453785600.0, + "grad_norm": 1.8654347049738194, + "language_loss": 0.6836428, + "learning_rate": 3.953718821161539e-06, + "loss": 0.70559537, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0859375, + "step": 1143, + "time_per_iteration": 2.5048325061798096 + }, + { + "auxiliary_loss_clip": 0.01138427, + "auxiliary_loss_mlp": 0.01050695, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.03550994, + "epoch": 0.06878100105215693, + "flos": 26939960179200.0, + "grad_norm": 1.722226436663597, + "language_loss": 0.74243826, + "learning_rate": 3.953637976744576e-06, + "loss": 0.76432949, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.03125, + "step": 1144, + "time_per_iteration": 2.498687267303467 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.01055603, + "balance_loss_clip": 1.02723074, + "balance_loss_mlp": 1.03431416, + "epoch": 0.06884112430482489, + "flos": 10669980228480.0, + "grad_norm": 4.7915259833189205, + "language_loss": 0.9168638, + "learning_rate": 3.953557062607299e-06, + "loss": 0.93888521, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.125, + "step": 1145, + "time_per_iteration": 2.358703136444092 + }, + { + "auxiliary_loss_clip": 0.01150496, + "auxiliary_loss_mlp": 0.01056676, + "balance_loss_clip": 1.02692151, + "balance_loss_mlp": 1.03589928, + "epoch": 0.06890124755749286, + "flos": 20192610579840.0, + "grad_norm": 2.3528228070886286, + "language_loss": 0.81935954, + "learning_rate": 3.953476078752595e-06, + "loss": 0.84143126, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.140625, + "step": 1146, + "time_per_iteration": 2.4289767742156982 + }, + { + "auxiliary_loss_clip": 0.01142062, + "auxiliary_loss_mlp": 0.01057507, + "balance_loss_clip": 1.0310905, + "balance_loss_mlp": 1.03638935, + "epoch": 0.06896137081016084, + "flos": 20448174798720.0, + "grad_norm": 2.3180790737616364, + "language_loss": 0.84927756, + "learning_rate": 3.953395025183355e-06, + "loss": 0.87127328, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.0625, + "step": 1147, + "time_per_iteration": 2.3913071155548096 + }, + { + "auxiliary_loss_clip": 0.01145703, + "auxiliary_loss_mlp": 0.01050701, + "balance_loss_clip": 1.02339029, + "balance_loss_mlp": 1.03590751, + "epoch": 0.0690214940628288, + "flos": 18367735392000.0, + "grad_norm": 1.9433216530482342, + "language_loss": 0.85627848, + "learning_rate": 3.9533139019024715e-06, + "loss": 0.87824255, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.09375, + "step": 1148, + "time_per_iteration": 2.3985259532928467 + }, + { + "auxiliary_loss_clip": 0.01142808, + "auxiliary_loss_mlp": 0.01051462, + "balance_loss_clip": 1.02472377, + "balance_loss_mlp": 1.0347476, + "epoch": 0.06908161731549677, + "flos": 20556999106560.0, + "grad_norm": 2.5802358550337074, + "language_loss": 0.69454765, + "learning_rate": 3.953232708912839e-06, + "loss": 0.71649039, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.078125, + "step": 1149, + "time_per_iteration": 2.38120436668396 + }, + { + "auxiliary_loss_clip": 0.01148519, + "auxiliary_loss_mlp": 0.01048599, + "balance_loss_clip": 1.02033401, + "balance_loss_mlp": 1.03614509, + "epoch": 0.06914174056816474, + "flos": 27562426542720.0, + "grad_norm": 2.0275636731663966, + "language_loss": 0.83030009, + "learning_rate": 3.953151446217356e-06, + "loss": 0.85227126, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.125, + "step": 1150, + "time_per_iteration": 2.4575576782226562 + }, + { + "auxiliary_loss_clip": 0.01150229, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.02748466, + "balance_loss_mlp": 1.03936958, + "epoch": 0.06920186382083271, + "flos": 15303129269760.0, + "grad_norm": 3.4158487911616255, + "language_loss": 0.8146646, + "learning_rate": 3.953070113818921e-06, + "loss": 0.83672404, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.109375, + "step": 1151, + "time_per_iteration": 2.356996774673462 + }, + { + "auxiliary_loss_clip": 0.01142556, + "auxiliary_loss_mlp": 0.01047837, + "balance_loss_clip": 1.0213964, + "balance_loss_mlp": 1.03563929, + "epoch": 0.06926198707350067, + "flos": 25190078325120.0, + "grad_norm": 2.1900678702636296, + "language_loss": 0.85472023, + "learning_rate": 3.952988711720439e-06, + "loss": 0.87662417, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0625, + "step": 1152, + "time_per_iteration": 2.472827196121216 + }, + { + "auxiliary_loss_clip": 0.01142016, + "auxiliary_loss_mlp": 0.01049989, + "balance_loss_clip": 1.02365518, + "balance_loss_mlp": 1.03515291, + "epoch": 0.06932211032616864, + "flos": 13255438584960.0, + "grad_norm": 1.9839593451496706, + "language_loss": 0.90736151, + "learning_rate": 3.952907239924813e-06, + "loss": 0.92928159, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.0703125, + "step": 1153, + "time_per_iteration": 2.3746461868286133 + }, + { + "auxiliary_loss_clip": 0.01144548, + "auxiliary_loss_mlp": 0.01048424, + "balance_loss_clip": 1.01949143, + "balance_loss_mlp": 1.035882, + "epoch": 0.06938223357883662, + "flos": 24826213468800.0, + "grad_norm": 2.2986110293559463, + "language_loss": 0.81671846, + "learning_rate": 3.95282569843495e-06, + "loss": 0.83864814, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.09375, + "step": 1154, + "time_per_iteration": 2.451007843017578 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01054789, + "balance_loss_clip": 1.02863431, + "balance_loss_mlp": 1.03702188, + "epoch": 0.06944235683150458, + "flos": 27266852039040.0, + "grad_norm": 1.8435908644227317, + "language_loss": 0.75050694, + "learning_rate": 3.952744087253762e-06, + "loss": 0.77247268, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.046875, + "step": 1155, + "time_per_iteration": 2.449211597442627 + }, + { + "auxiliary_loss_clip": 0.01141457, + "auxiliary_loss_mlp": 0.01045484, + "balance_loss_clip": 1.01813757, + "balance_loss_mlp": 1.0339638, + "epoch": 0.06950248008417255, + "flos": 25806993782400.0, + "grad_norm": 1.809875608216508, + "language_loss": 0.70478129, + "learning_rate": 3.952662406384161e-06, + "loss": 0.72665071, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.078125, + "step": 1156, + "time_per_iteration": 2.47178316116333 + }, + { + "auxiliary_loss_clip": 0.01144698, + "auxiliary_loss_mlp": 0.01056941, + "balance_loss_clip": 1.02679288, + "balance_loss_mlp": 1.03540492, + "epoch": 0.06956260333684053, + "flos": 22270501457280.0, + "grad_norm": 2.042998869215872, + "language_loss": 0.75011253, + "learning_rate": 3.952580655829061e-06, + "loss": 0.77212894, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.09375, + "step": 1157, + "time_per_iteration": 2.4180266857147217 + }, + { + "auxiliary_loss_clip": 0.01144855, + "auxiliary_loss_mlp": 0.01055636, + "balance_loss_clip": 1.02833712, + "balance_loss_mlp": 1.03508615, + "epoch": 0.0696227265895085, + "flos": 29680048414080.0, + "grad_norm": 1.9123471701501298, + "language_loss": 0.71525955, + "learning_rate": 3.952498835591381e-06, + "loss": 0.73726451, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.09375, + "step": 1158, + "time_per_iteration": 2.50199031829834 + }, + { + "auxiliary_loss_clip": 0.01144933, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.0223515, + "balance_loss_mlp": 1.03502059, + "epoch": 0.06968284984217646, + "flos": 25522276711680.0, + "grad_norm": 1.8495643318440533, + "language_loss": 0.79798836, + "learning_rate": 3.952416945674039e-06, + "loss": 0.81994659, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.09375, + "step": 1159, + "time_per_iteration": 3.928196668624878 + }, + { + "auxiliary_loss_clip": 0.01149889, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.02370465, + "balance_loss_mlp": 1.0370692, + "epoch": 0.06974297309484444, + "flos": 20697315327360.0, + "grad_norm": 2.9024267789836666, + "language_loss": 0.80438364, + "learning_rate": 3.952334986079957e-06, + "loss": 0.82642949, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.125, + "step": 1160, + "time_per_iteration": 2.3827168941497803 + }, + { + "auxiliary_loss_clip": 0.01143649, + "auxiliary_loss_mlp": 0.01054615, + "balance_loss_clip": 1.02514696, + "balance_loss_mlp": 1.03302932, + "epoch": 0.0698030963475124, + "flos": 26503999632000.0, + "grad_norm": 1.6930892433628664, + "language_loss": 0.756661, + "learning_rate": 3.9522529568120635e-06, + "loss": 0.77864367, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.109375, + "step": 1161, + "time_per_iteration": 2.4427478313446045 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.01054918, + "balance_loss_clip": 1.02531838, + "balance_loss_mlp": 1.03243947, + "epoch": 0.06986321960018037, + "flos": 23039288795520.0, + "grad_norm": 1.789627523228887, + "language_loss": 0.82873094, + "learning_rate": 3.952170857873283e-06, + "loss": 0.85068834, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.0859375, + "step": 1162, + "time_per_iteration": 5.221744537353516 + }, + { + "auxiliary_loss_clip": 0.01137981, + "auxiliary_loss_mlp": 0.01047741, + "balance_loss_clip": 1.01998901, + "balance_loss_mlp": 1.03151393, + "epoch": 0.06992334285284833, + "flos": 28583566254720.0, + "grad_norm": 2.1630598390116518, + "language_loss": 0.78933895, + "learning_rate": 3.952088689266547e-06, + "loss": 0.81119615, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0625, + "step": 1163, + "time_per_iteration": 2.46140193939209 + }, + { + "auxiliary_loss_clip": 0.01143545, + "auxiliary_loss_mlp": 0.01054695, + "balance_loss_clip": 1.02511919, + "balance_loss_mlp": 1.03399253, + "epoch": 0.06998346610551631, + "flos": 20594286305280.0, + "grad_norm": 2.0283146772607057, + "language_loss": 0.79181325, + "learning_rate": 3.952006450994786e-06, + "loss": 0.81379569, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.09375, + "step": 1164, + "time_per_iteration": 3.7580206394195557 + }, + { + "auxiliary_loss_clip": 0.01143306, + "auxiliary_loss_mlp": 0.01055278, + "balance_loss_clip": 1.02756214, + "balance_loss_mlp": 1.0347116, + "epoch": 0.07004358935818428, + "flos": 22527706510080.0, + "grad_norm": 1.5771235008123332, + "language_loss": 0.72730517, + "learning_rate": 3.951924143060937e-06, + "loss": 0.74929094, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0859375, + "step": 1165, + "time_per_iteration": 2.418278694152832 + }, + { + "auxiliary_loss_clip": 0.01142289, + "auxiliary_loss_mlp": 0.01051947, + "balance_loss_clip": 1.02405202, + "balance_loss_mlp": 1.03415227, + "epoch": 0.07010371261085224, + "flos": 28948722831360.0, + "grad_norm": 2.5634817613944993, + "language_loss": 0.80783445, + "learning_rate": 3.951841765467935e-06, + "loss": 0.82977676, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.078125, + "step": 1166, + "time_per_iteration": 2.481705904006958 + }, + { + "auxiliary_loss_clip": 0.0113909, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.01615429, + "balance_loss_mlp": 1.03347373, + "epoch": 0.07016383586352022, + "flos": 23658054554880.0, + "grad_norm": 2.060449597299873, + "language_loss": 0.84201783, + "learning_rate": 3.951759318218722e-06, + "loss": 0.86385179, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0546875, + "step": 1167, + "time_per_iteration": 2.4210996627807617 + }, + { + "auxiliary_loss_clip": 0.01144835, + "auxiliary_loss_mlp": 0.01052096, + "balance_loss_clip": 1.02310443, + "balance_loss_mlp": 1.03402662, + "epoch": 0.07022395911618819, + "flos": 19791109411200.0, + "grad_norm": 2.2660493238016324, + "language_loss": 0.89404839, + "learning_rate": 3.951676801316239e-06, + "loss": 0.91601771, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.109375, + "step": 1168, + "time_per_iteration": 2.4255263805389404 + }, + { + "auxiliary_loss_clip": 0.01145153, + "auxiliary_loss_mlp": 0.01062315, + "balance_loss_clip": 1.03074884, + "balance_loss_mlp": 1.03228617, + "epoch": 0.07028408236885615, + "flos": 21688080289920.0, + "grad_norm": 6.326228126873958, + "language_loss": 0.88479823, + "learning_rate": 3.951594214763431e-06, + "loss": 0.90687293, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.125, + "step": 1169, + "time_per_iteration": 2.4024498462677 + }, + { + "auxiliary_loss_clip": 0.01145541, + "auxiliary_loss_mlp": 0.01061234, + "balance_loss_clip": 1.03035831, + "balance_loss_mlp": 1.03636777, + "epoch": 0.07034420562152413, + "flos": 25629076160640.0, + "grad_norm": 3.762141607568952, + "language_loss": 0.83485639, + "learning_rate": 3.951511558563246e-06, + "loss": 0.85692418, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.09375, + "step": 1170, + "time_per_iteration": 2.454987049102783 + }, + { + "auxiliary_loss_clip": 0.01142319, + "auxiliary_loss_mlp": 0.01058716, + "balance_loss_clip": 1.02900887, + "balance_loss_mlp": 1.03398812, + "epoch": 0.0704043288741921, + "flos": 20809491125760.0, + "grad_norm": 2.0649427383890804, + "language_loss": 0.75835848, + "learning_rate": 3.951428832718633e-06, + "loss": 0.7803688, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.078125, + "step": 1171, + "time_per_iteration": 2.4012880325317383 + }, + { + "auxiliary_loss_clip": 0.01142165, + "auxiliary_loss_mlp": 0.0104635, + "balance_loss_clip": 1.01909852, + "balance_loss_mlp": 1.03408408, + "epoch": 0.07046445212686006, + "flos": 25591998430080.0, + "grad_norm": 1.8672122014430101, + "language_loss": 0.88891184, + "learning_rate": 3.951346037232546e-06, + "loss": 0.91079688, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.078125, + "step": 1172, + "time_per_iteration": 2.453601121902466 + }, + { + "auxiliary_loss_clip": 0.01143036, + "auxiliary_loss_mlp": 0.01048481, + "balance_loss_clip": 1.01798701, + "balance_loss_mlp": 1.03233933, + "epoch": 0.07052457537952803, + "flos": 25555793483520.0, + "grad_norm": 2.2028699019088385, + "language_loss": 0.82122999, + "learning_rate": 3.951263172107937e-06, + "loss": 0.84314519, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.109375, + "step": 1173, + "time_per_iteration": 2.4472427368164062 + }, + { + "auxiliary_loss_clip": 0.0114247, + "auxiliary_loss_mlp": 0.01051073, + "balance_loss_clip": 1.02131832, + "balance_loss_mlp": 1.03417015, + "epoch": 0.070584698632196, + "flos": 17967525943680.0, + "grad_norm": 56.39445814219136, + "language_loss": 0.8231191, + "learning_rate": 3.951180237347765e-06, + "loss": 0.84505451, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.078125, + "step": 1174, + "time_per_iteration": 2.4031693935394287 + }, + { + "auxiliary_loss_clip": 0.01142534, + "auxiliary_loss_mlp": 0.01051656, + "balance_loss_clip": 1.02391613, + "balance_loss_mlp": 1.03413618, + "epoch": 0.07064482188486397, + "flos": 25369811337600.0, + "grad_norm": 2.066140012195251, + "language_loss": 0.85233241, + "learning_rate": 3.951097232954989e-06, + "loss": 0.87427437, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.078125, + "step": 1175, + "time_per_iteration": 2.438138484954834 + }, + { + "auxiliary_loss_clip": 0.01145033, + "auxiliary_loss_mlp": 0.01059015, + "balance_loss_clip": 1.02916503, + "balance_loss_mlp": 1.03509188, + "epoch": 0.07070494513753194, + "flos": 24898693184640.0, + "grad_norm": 1.9238158226412332, + "language_loss": 0.83100969, + "learning_rate": 3.951014158932572e-06, + "loss": 0.85305011, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.1015625, + "step": 1176, + "time_per_iteration": 2.4777917861938477 + }, + { + "auxiliary_loss_clip": 0.01142268, + "auxiliary_loss_mlp": 0.01058178, + "balance_loss_clip": 1.02932918, + "balance_loss_mlp": 1.03444862, + "epoch": 0.07076506839019991, + "flos": 22337569912320.0, + "grad_norm": 4.217455463468714, + "language_loss": 0.74490559, + "learning_rate": 3.950931015283479e-06, + "loss": 0.76691002, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.078125, + "step": 1177, + "time_per_iteration": 2.4201037883758545 + }, + { + "auxiliary_loss_clip": 0.01147636, + "auxiliary_loss_mlp": 0.01051709, + "balance_loss_clip": 1.0224309, + "balance_loss_mlp": 1.03535318, + "epoch": 0.07082519164286788, + "flos": 18659818759680.0, + "grad_norm": 2.1383008413969153, + "language_loss": 0.86319709, + "learning_rate": 3.950847802010675e-06, + "loss": 0.88519061, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.125, + "step": 1178, + "time_per_iteration": 2.4174270629882812 + }, + { + "auxiliary_loss_clip": 0.01068847, + "auxiliary_loss_mlp": 0.01008342, + "balance_loss_clip": 1.00047445, + "balance_loss_mlp": 1.01997209, + "epoch": 0.07088531489553584, + "flos": 63650676942720.0, + "grad_norm": 0.8365517648153916, + "language_loss": 0.63280094, + "learning_rate": 3.950764519117132e-06, + "loss": 0.65357292, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.07861328, + "router_z_loss_mlp": 0.48828125, + "step": 1179, + "time_per_iteration": 3.1840317249298096 + }, + { + "auxiliary_loss_clip": 0.01145276, + "auxiliary_loss_mlp": 0.0105284, + "balance_loss_clip": 1.02395582, + "balance_loss_mlp": 1.0354408, + "epoch": 0.07094543814820382, + "flos": 21571819862400.0, + "grad_norm": 2.5128410300265416, + "language_loss": 0.83514106, + "learning_rate": 3.9506811666058215e-06, + "loss": 0.85712224, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.1015625, + "step": 1180, + "time_per_iteration": 2.446951389312744 + }, + { + "auxiliary_loss_clip": 0.011423, + "auxiliary_loss_mlp": 0.01053974, + "balance_loss_clip": 1.02662778, + "balance_loss_mlp": 1.03505707, + "epoch": 0.07100556140087179, + "flos": 22088883231360.0, + "grad_norm": 2.205994704076047, + "language_loss": 0.87598801, + "learning_rate": 3.950597744479717e-06, + "loss": 0.89795077, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0703125, + "step": 1181, + "time_per_iteration": 2.4032480716705322 + }, + { + "auxiliary_loss_clip": 0.0114354, + "auxiliary_loss_mlp": 0.01055523, + "balance_loss_clip": 1.02886796, + "balance_loss_mlp": 1.03676593, + "epoch": 0.07106568465353975, + "flos": 47920491388800.0, + "grad_norm": 2.04683450933953, + "language_loss": 0.83846635, + "learning_rate": 3.950514252741797e-06, + "loss": 0.86045694, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0703125, + "step": 1182, + "time_per_iteration": 2.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.01141069, + "auxiliary_loss_mlp": 0.01048225, + "balance_loss_clip": 1.01976967, + "balance_loss_mlp": 1.03627372, + "epoch": 0.07112580790620772, + "flos": 23439672800640.0, + "grad_norm": 3.851505484815403, + "language_loss": 0.7913717, + "learning_rate": 3.950430691395042e-06, + "loss": 0.81326461, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.046875, + "step": 1183, + "time_per_iteration": 2.4907233715057373 + }, + { + "auxiliary_loss_clip": 0.01145502, + "auxiliary_loss_mlp": 0.01055042, + "balance_loss_clip": 1.02502549, + "balance_loss_mlp": 1.03338456, + "epoch": 0.0711859311588757, + "flos": 31867531649280.0, + "grad_norm": 2.0698299749698843, + "language_loss": 0.78832853, + "learning_rate": 3.95034706044243e-06, + "loss": 0.81033391, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.125, + "step": 1184, + "time_per_iteration": 2.532350540161133 + }, + { + "auxiliary_loss_clip": 0.01141494, + "auxiliary_loss_mlp": 0.01055609, + "balance_loss_clip": 1.0272969, + "balance_loss_mlp": 1.03506601, + "epoch": 0.07124605441154366, + "flos": 19609281717120.0, + "grad_norm": 1.9766682763801302, + "language_loss": 0.76702213, + "learning_rate": 3.95026335988695e-06, + "loss": 0.78899324, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0625, + "step": 1185, + "time_per_iteration": 2.4000062942504883 + }, + { + "auxiliary_loss_clip": 0.0114118, + "auxiliary_loss_mlp": 0.01052528, + "balance_loss_clip": 1.02598047, + "balance_loss_mlp": 1.03560448, + "epoch": 0.07130617766421163, + "flos": 14683560549120.0, + "grad_norm": 2.3640395760624795, + "language_loss": 0.65478528, + "learning_rate": 3.950179589731587e-06, + "loss": 0.67672229, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0546875, + "step": 1186, + "time_per_iteration": 2.405527353286743 + }, + { + "auxiliary_loss_clip": 0.01141241, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.02125204, + "balance_loss_mlp": 1.0361383, + "epoch": 0.07136630091687961, + "flos": 26066712453120.0, + "grad_norm": 1.8983866206856574, + "language_loss": 0.69567817, + "learning_rate": 3.950095749979331e-06, + "loss": 0.717574, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.046875, + "step": 1187, + "time_per_iteration": 2.451582670211792 + }, + { + "auxiliary_loss_clip": 0.01142274, + "auxiliary_loss_mlp": 0.01049525, + "balance_loss_clip": 1.02415752, + "balance_loss_mlp": 1.03641868, + "epoch": 0.07142642416954757, + "flos": 15668285846400.0, + "grad_norm": 2.595109808513564, + "language_loss": 0.79029095, + "learning_rate": 3.950011840633174e-06, + "loss": 0.81220895, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.0625, + "step": 1188, + "time_per_iteration": 2.4079363346099854 + }, + { + "auxiliary_loss_clip": 0.01143543, + "auxiliary_loss_mlp": 0.01049544, + "balance_loss_clip": 1.02231669, + "balance_loss_mlp": 1.03669262, + "epoch": 0.07148654742221554, + "flos": 19754310971520.0, + "grad_norm": 1.9246330997835532, + "language_loss": 0.84834594, + "learning_rate": 3.9499278616961106e-06, + "loss": 0.87027681, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0703125, + "step": 1189, + "time_per_iteration": 2.4651424884796143 + }, + { + "auxiliary_loss_clip": 0.01140024, + "auxiliary_loss_mlp": 0.01049078, + "balance_loss_clip": 1.02243471, + "balance_loss_mlp": 1.03461695, + "epoch": 0.07154667067488352, + "flos": 23470850511360.0, + "grad_norm": 1.8648074508764025, + "language_loss": 0.76680577, + "learning_rate": 3.949843813171137e-06, + "loss": 0.78869677, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.0546875, + "step": 1190, + "time_per_iteration": 2.448935031890869 + }, + { + "auxiliary_loss_clip": 0.01145334, + "auxiliary_loss_mlp": 0.01063064, + "balance_loss_clip": 1.03400064, + "balance_loss_mlp": 1.03551149, + "epoch": 0.07160679392755148, + "flos": 18331949381760.0, + "grad_norm": 2.0180827544920383, + "language_loss": 0.75543731, + "learning_rate": 3.949759695061254e-06, + "loss": 0.77752125, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.09375, + "step": 1191, + "time_per_iteration": 2.420780897140503 + }, + { + "auxiliary_loss_clip": 0.01143815, + "auxiliary_loss_mlp": 0.01053508, + "balance_loss_clip": 1.02427769, + "balance_loss_mlp": 1.03578496, + "epoch": 0.07166691718021945, + "flos": 17746106901120.0, + "grad_norm": 3.7572314550321306, + "language_loss": 0.74226058, + "learning_rate": 3.949675507369463e-06, + "loss": 0.76423383, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.078125, + "step": 1192, + "time_per_iteration": 2.41892409324646 + }, + { + "auxiliary_loss_clip": 0.01139894, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.02218974, + "balance_loss_mlp": 1.03323972, + "epoch": 0.07172704043288743, + "flos": 22450932696960.0, + "grad_norm": 2.174877482137923, + "language_loss": 0.78133452, + "learning_rate": 3.949591250098768e-06, + "loss": 0.80322599, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.0703125, + "step": 1193, + "time_per_iteration": 2.4201836585998535 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.01050061, + "balance_loss_clip": 1.02161729, + "balance_loss_mlp": 1.03653884, + "epoch": 0.07178716368555539, + "flos": 23221081578240.0, + "grad_norm": 2.173482196864183, + "language_loss": 0.85534096, + "learning_rate": 3.949506923252175e-06, + "loss": 0.87729174, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0859375, + "step": 1194, + "time_per_iteration": 2.473048448562622 + }, + { + "auxiliary_loss_clip": 0.01141008, + "auxiliary_loss_mlp": 0.01053987, + "balance_loss_clip": 1.02797532, + "balance_loss_mlp": 1.03503466, + "epoch": 0.07184728693822336, + "flos": 25149788749440.0, + "grad_norm": 2.39331448397616, + "language_loss": 0.81294763, + "learning_rate": 3.9494225268326965e-06, + "loss": 0.83489752, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.0625, + "step": 1195, + "time_per_iteration": 2.423985719680786 + }, + { + "auxiliary_loss_clip": 0.01142658, + "auxiliary_loss_mlp": 0.01044567, + "balance_loss_clip": 1.01861489, + "balance_loss_mlp": 1.03682089, + "epoch": 0.07190741019089132, + "flos": 22710127697280.0, + "grad_norm": 1.9294649589775585, + "language_loss": 0.7980628, + "learning_rate": 3.949338060843342e-06, + "loss": 0.81993502, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.0625, + "step": 1196, + "time_per_iteration": 2.4577596187591553 + }, + { + "auxiliary_loss_clip": 0.01137897, + "auxiliary_loss_mlp": 0.01062488, + "balance_loss_clip": 1.03280461, + "balance_loss_mlp": 1.03426504, + "epoch": 0.0719675334435593, + "flos": 29348548254720.0, + "grad_norm": 3.0521239106417553, + "language_loss": 0.70851308, + "learning_rate": 3.949253525287126e-06, + "loss": 0.73051691, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.0390625, + "step": 1197, + "time_per_iteration": 2.4613070487976074 + }, + { + "auxiliary_loss_clip": 0.01138148, + "auxiliary_loss_mlp": 0.01048628, + "balance_loss_clip": 1.02221167, + "balance_loss_mlp": 1.03309786, + "epoch": 0.07202765669622727, + "flos": 17638818693120.0, + "grad_norm": 4.029424760912505, + "language_loss": 0.85489368, + "learning_rate": 3.9491689201670655e-06, + "loss": 0.8767615, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.046875, + "step": 1198, + "time_per_iteration": 2.4109957218170166 + }, + { + "auxiliary_loss_clip": 0.01142444, + "auxiliary_loss_mlp": 0.01053808, + "balance_loss_clip": 1.02474451, + "balance_loss_mlp": 1.03578997, + "epoch": 0.07208777994889523, + "flos": 21432969918720.0, + "grad_norm": 2.2387513912187056, + "language_loss": 0.83341557, + "learning_rate": 3.94908424548618e-06, + "loss": 0.85537809, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.0703125, + "step": 1199, + "time_per_iteration": 3.895958185195923 + }, + { + "auxiliary_loss_clip": 0.01146576, + "auxiliary_loss_mlp": 0.01055465, + "balance_loss_clip": 1.02858305, + "balance_loss_mlp": 1.03858709, + "epoch": 0.07214790320156321, + "flos": 26939715799680.0, + "grad_norm": 2.230486717462451, + "language_loss": 0.75693011, + "learning_rate": 3.9489995012474924e-06, + "loss": 0.77895045, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.078125, + "step": 1200, + "time_per_iteration": 2.5096075534820557 + }, + { + "auxiliary_loss_clip": 0.0114182, + "auxiliary_loss_mlp": 0.01056701, + "balance_loss_clip": 1.02809119, + "balance_loss_mlp": 1.03691626, + "epoch": 0.07220802645423118, + "flos": 23878775370240.0, + "grad_norm": 2.228077108250191, + "language_loss": 0.8275224, + "learning_rate": 3.948914687454027e-06, + "loss": 0.84950757, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.046875, + "step": 1201, + "time_per_iteration": 2.4473869800567627 + }, + { + "auxiliary_loss_clip": 0.01143448, + "auxiliary_loss_mlp": 0.01054304, + "balance_loss_clip": 1.02470398, + "balance_loss_mlp": 1.03431582, + "epoch": 0.07226814970689914, + "flos": 19242658863360.0, + "grad_norm": 2.3020796644932813, + "language_loss": 0.68767619, + "learning_rate": 3.948829804108807e-06, + "loss": 0.70965374, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.09375, + "step": 1202, + "time_per_iteration": 5.258546590805054 + }, + { + "auxiliary_loss_clip": 0.01142037, + "auxiliary_loss_mlp": 0.01052944, + "balance_loss_clip": 1.02426255, + "balance_loss_mlp": 1.03484988, + "epoch": 0.07232827295956712, + "flos": 19171017020160.0, + "grad_norm": 2.6378065975950515, + "language_loss": 0.87662745, + "learning_rate": 3.948744851214865e-06, + "loss": 0.89857721, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.078125, + "step": 1203, + "time_per_iteration": 3.90230393409729 + }, + { + "auxiliary_loss_clip": 0.01145265, + "auxiliary_loss_mlp": 0.01051896, + "balance_loss_clip": 1.02352452, + "balance_loss_mlp": 1.03499961, + "epoch": 0.07238839621223508, + "flos": 17638783781760.0, + "grad_norm": 1.9076009635896547, + "language_loss": 0.78297997, + "learning_rate": 3.948659828775233e-06, + "loss": 0.80495155, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.109375, + "step": 1204, + "time_per_iteration": 2.4286630153656006 + }, + { + "auxiliary_loss_clip": 0.01141302, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_clip": 1.02534008, + "balance_loss_mlp": 1.0341996, + "epoch": 0.07244851946490305, + "flos": 28291168684800.0, + "grad_norm": 1.6565434766687437, + "language_loss": 0.82000256, + "learning_rate": 3.9485747367929436e-06, + "loss": 0.8419354, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0703125, + "step": 1205, + "time_per_iteration": 2.4822306632995605 + }, + { + "auxiliary_loss_clip": 0.01144075, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_clip": 1.02466643, + "balance_loss_mlp": 1.03717446, + "epoch": 0.07250864271757101, + "flos": 22563736899840.0, + "grad_norm": 2.0059740658311545, + "language_loss": 0.7660293, + "learning_rate": 3.948489575271035e-06, + "loss": 0.78801811, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0625, + "step": 1206, + "time_per_iteration": 2.473125457763672 + }, + { + "auxiliary_loss_clip": 0.01143495, + "auxiliary_loss_mlp": 0.01050002, + "balance_loss_clip": 1.02232158, + "balance_loss_mlp": 1.03645301, + "epoch": 0.072568765970239, + "flos": 21761328055680.0, + "grad_norm": 2.467119437823108, + "language_loss": 0.77418441, + "learning_rate": 3.948404344212544e-06, + "loss": 0.79611939, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0703125, + "step": 1207, + "time_per_iteration": 2.404226779937744 + }, + { + "auxiliary_loss_clip": 0.01140906, + "auxiliary_loss_mlp": 0.01048589, + "balance_loss_clip": 1.02285171, + "balance_loss_mlp": 1.03607512, + "epoch": 0.07262888922290696, + "flos": 25518541196160.0, + "grad_norm": 2.4181522111205536, + "language_loss": 0.79696399, + "learning_rate": 3.948319043620516e-06, + "loss": 0.81885892, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.046875, + "step": 1208, + "time_per_iteration": 2.4818239212036133 + }, + { + "auxiliary_loss_clip": 0.01141341, + "auxiliary_loss_mlp": 0.01045132, + "balance_loss_clip": 1.01904941, + "balance_loss_mlp": 1.03661633, + "epoch": 0.07268901247557492, + "flos": 21245626229760.0, + "grad_norm": 2.396122437366724, + "language_loss": 0.78514445, + "learning_rate": 3.948233673497991e-06, + "loss": 0.80700922, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.046875, + "step": 1209, + "time_per_iteration": 2.4050676822662354 + }, + { + "auxiliary_loss_clip": 0.0114268, + "auxiliary_loss_mlp": 0.01055349, + "balance_loss_clip": 1.0285393, + "balance_loss_mlp": 1.03766227, + "epoch": 0.0727491357282429, + "flos": 25478251620480.0, + "grad_norm": 2.593857403459947, + "language_loss": 0.8194046, + "learning_rate": 3.948148233848018e-06, + "loss": 0.84138495, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.0546875, + "step": 1210, + "time_per_iteration": 2.4727799892425537 + }, + { + "auxiliary_loss_clip": 0.01140298, + "auxiliary_loss_mlp": 0.01054591, + "balance_loss_clip": 1.02724493, + "balance_loss_mlp": 1.03657985, + "epoch": 0.07280925898091087, + "flos": 24461021980800.0, + "grad_norm": 1.7600465661830798, + "language_loss": 0.84463573, + "learning_rate": 3.948062724673646e-06, + "loss": 0.8665846, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0390625, + "step": 1211, + "time_per_iteration": 2.4353771209716797 + }, + { + "auxiliary_loss_clip": 0.01137977, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_clip": 1.01647985, + "balance_loss_mlp": 1.03354049, + "epoch": 0.07286938223357883, + "flos": 18287435531520.0, + "grad_norm": 2.3416196011709323, + "language_loss": 0.90241849, + "learning_rate": 3.947977145977927e-06, + "loss": 0.92422009, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.046875, + "step": 1212, + "time_per_iteration": 2.371009111404419 + }, + { + "auxiliary_loss_clip": 0.0114141, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.01823235, + "balance_loss_mlp": 1.03664112, + "epoch": 0.07292950548624681, + "flos": 21213750291840.0, + "grad_norm": 2.0401905048381983, + "language_loss": 0.72653985, + "learning_rate": 3.947891497763914e-06, + "loss": 0.7484237, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.046875, + "step": 1213, + "time_per_iteration": 2.4208662509918213 + }, + { + "auxiliary_loss_clip": 0.01143135, + "auxiliary_loss_mlp": 0.01044519, + "balance_loss_clip": 1.01767325, + "balance_loss_mlp": 1.03473377, + "epoch": 0.07298962873891478, + "flos": 24640929550080.0, + "grad_norm": 1.7914246045079538, + "language_loss": 0.84198576, + "learning_rate": 3.947805780034664e-06, + "loss": 0.86386228, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.0859375, + "step": 1214, + "time_per_iteration": 2.448817253112793 + }, + { + "auxiliary_loss_clip": 0.01146305, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.021662, + "balance_loss_mlp": 1.03676653, + "epoch": 0.07304975199158274, + "flos": 27051542484480.0, + "grad_norm": 2.7801557012053837, + "language_loss": 0.84115255, + "learning_rate": 3.947719992793236e-06, + "loss": 0.86310577, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.09375, + "step": 1215, + "time_per_iteration": 2.4775185585021973 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01048689, + "balance_loss_clip": 1.0214498, + "balance_loss_mlp": 1.03517699, + "epoch": 0.07310987524425071, + "flos": 33548075809920.0, + "grad_norm": 2.0020276426992583, + "language_loss": 0.80888009, + "learning_rate": 3.9476341360426924e-06, + "loss": 0.83077991, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0625, + "step": 1216, + "time_per_iteration": 2.6004252433776855 + }, + { + "auxiliary_loss_clip": 0.01145917, + "auxiliary_loss_mlp": 0.01044281, + "balance_loss_clip": 1.01780462, + "balance_loss_mlp": 1.0389626, + "epoch": 0.07316999849691869, + "flos": 28109690104320.0, + "grad_norm": 2.2619077943459716, + "language_loss": 0.76420432, + "learning_rate": 3.9475482097860955e-06, + "loss": 0.78610629, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0703125, + "step": 1217, + "time_per_iteration": 2.4730021953582764 + }, + { + "auxiliary_loss_clip": 0.0113805, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_clip": 1.02534699, + "balance_loss_mlp": 1.03639913, + "epoch": 0.07323012174958665, + "flos": 14391721560960.0, + "grad_norm": 2.01205891769929, + "language_loss": 0.86498004, + "learning_rate": 3.947462214026512e-06, + "loss": 0.88687515, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.015625, + "step": 1218, + "time_per_iteration": 2.4093971252441406 + }, + { + "auxiliary_loss_clip": 0.01141823, + "auxiliary_loss_mlp": 0.01057249, + "balance_loss_clip": 1.03045118, + "balance_loss_mlp": 1.03431439, + "epoch": 0.07329024500225462, + "flos": 21615356194560.0, + "grad_norm": 1.733751255105255, + "language_loss": 0.86781025, + "learning_rate": 3.947376148767013e-06, + "loss": 0.88980097, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.078125, + "step": 1219, + "time_per_iteration": 2.4079973697662354 + }, + { + "auxiliary_loss_clip": 0.01139537, + "auxiliary_loss_mlp": 0.0105074, + "balance_loss_clip": 1.02428699, + "balance_loss_mlp": 1.03407681, + "epoch": 0.0733503682549226, + "flos": 13223318267520.0, + "grad_norm": 2.5594197446101523, + "language_loss": 0.84308958, + "learning_rate": 3.947290014010668e-06, + "loss": 0.86499238, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.0546875, + "step": 1220, + "time_per_iteration": 2.3945207595825195 + }, + { + "auxiliary_loss_clip": 0.01140314, + "auxiliary_loss_mlp": 0.01054931, + "balance_loss_clip": 1.02772701, + "balance_loss_mlp": 1.034724, + "epoch": 0.07341049150759056, + "flos": 20885915825280.0, + "grad_norm": 4.876721254970153, + "language_loss": 0.76878929, + "learning_rate": 3.9472038097605516e-06, + "loss": 0.7907418, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0546875, + "step": 1221, + "time_per_iteration": 2.4245474338531494 + }, + { + "auxiliary_loss_clip": 0.01143765, + "auxiliary_loss_mlp": 0.01053598, + "balance_loss_clip": 1.02473783, + "balance_loss_mlp": 1.03804398, + "epoch": 0.07347061476025853, + "flos": 15412721627520.0, + "grad_norm": 2.9206585572953103, + "language_loss": 0.91950142, + "learning_rate": 3.94711753601974e-06, + "loss": 0.94147503, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0546875, + "step": 1222, + "time_per_iteration": 2.3849406242370605 + }, + { + "auxiliary_loss_clip": 0.01148281, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.0222857, + "balance_loss_mlp": 1.03966355, + "epoch": 0.0735307380129265, + "flos": 11108070368640.0, + "grad_norm": 2.3887842126224474, + "language_loss": 0.90904081, + "learning_rate": 3.947031192791312e-06, + "loss": 0.93101841, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0859375, + "step": 1223, + "time_per_iteration": 2.439429759979248 + }, + { + "auxiliary_loss_clip": 0.01142685, + "auxiliary_loss_mlp": 0.01054166, + "balance_loss_clip": 1.02712965, + "balance_loss_mlp": 1.03838503, + "epoch": 0.07359086126559447, + "flos": 23731267409280.0, + "grad_norm": 2.124319472946957, + "language_loss": 0.81972909, + "learning_rate": 3.9469447800783485e-06, + "loss": 0.84169757, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.046875, + "step": 1224, + "time_per_iteration": 2.4450085163116455 + }, + { + "auxiliary_loss_clip": 0.01140637, + "auxiliary_loss_mlp": 0.01059567, + "balance_loss_clip": 1.03000343, + "balance_loss_mlp": 1.03389513, + "epoch": 0.07365098451826244, + "flos": 20992296337920.0, + "grad_norm": 2.4175849250634416, + "language_loss": 0.83184677, + "learning_rate": 3.946858297883935e-06, + "loss": 0.85384881, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.0625, + "step": 1225, + "time_per_iteration": 2.4287068843841553 + }, + { + "auxiliary_loss_clip": 0.01141, + "auxiliary_loss_mlp": 0.01049981, + "balance_loss_clip": 1.02364779, + "balance_loss_mlp": 1.03529191, + "epoch": 0.0737111077709304, + "flos": 19932682440960.0, + "grad_norm": 2.011532145546205, + "language_loss": 0.90203059, + "learning_rate": 3.946771746211156e-06, + "loss": 0.9239403, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.0546875, + "step": 1226, + "time_per_iteration": 2.3896918296813965 + }, + { + "auxiliary_loss_clip": 0.01149355, + "auxiliary_loss_mlp": 0.01046833, + "balance_loss_clip": 1.01710188, + "balance_loss_mlp": 1.03556871, + "epoch": 0.07377123102359838, + "flos": 16580601250560.0, + "grad_norm": 2.5947243267518085, + "language_loss": 0.76146984, + "learning_rate": 3.946685125063101e-06, + "loss": 0.78343177, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.140625, + "step": 1227, + "time_per_iteration": 2.448315143585205 + }, + { + "auxiliary_loss_clip": 0.01141519, + "auxiliary_loss_mlp": 0.01054329, + "balance_loss_clip": 1.02749527, + "balance_loss_mlp": 1.03901958, + "epoch": 0.07383135427626634, + "flos": 28327338720000.0, + "grad_norm": 1.6501807133980002, + "language_loss": 0.85381699, + "learning_rate": 3.9465984344428615e-06, + "loss": 0.87577546, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.0234375, + "step": 1228, + "time_per_iteration": 2.4647083282470703 + }, + { + "auxiliary_loss_clip": 0.01144267, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_clip": 1.01946509, + "balance_loss_mlp": 1.03646779, + "epoch": 0.07389147752893431, + "flos": 20046149959680.0, + "grad_norm": 2.6306386628314633, + "language_loss": 0.79686767, + "learning_rate": 3.946511674353531e-06, + "loss": 0.81878078, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.078125, + "step": 1229, + "time_per_iteration": 2.4147233963012695 + }, + { + "auxiliary_loss_clip": 0.01143158, + "auxiliary_loss_mlp": 0.01053877, + "balance_loss_clip": 1.0235498, + "balance_loss_mlp": 1.03548288, + "epoch": 0.07395160078160229, + "flos": 18113148691200.0, + "grad_norm": 2.6013210374683204, + "language_loss": 0.78361106, + "learning_rate": 3.9464248447982065e-06, + "loss": 0.80558145, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.078125, + "step": 1230, + "time_per_iteration": 2.377610445022583 + }, + { + "auxiliary_loss_clip": 0.01139386, + "auxiliary_loss_mlp": 0.01046705, + "balance_loss_clip": 1.01846445, + "balance_loss_mlp": 1.03478765, + "epoch": 0.07401172403427025, + "flos": 23585784307200.0, + "grad_norm": 2.1179927963574534, + "language_loss": 0.81063914, + "learning_rate": 3.946337945779986e-06, + "loss": 0.83249998, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.046875, + "step": 1231, + "time_per_iteration": 2.456991672515869 + }, + { + "auxiliary_loss_clip": 0.01143164, + "auxiliary_loss_mlp": 0.01050602, + "balance_loss_clip": 1.0215987, + "balance_loss_mlp": 1.03441632, + "epoch": 0.07407184728693822, + "flos": 26358691086720.0, + "grad_norm": 2.1789172637567487, + "language_loss": 0.94616294, + "learning_rate": 3.94625097730197e-06, + "loss": 0.96810061, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0859375, + "step": 1232, + "time_per_iteration": 2.4660682678222656 + }, + { + "auxiliary_loss_clip": 0.01139915, + "auxiliary_loss_mlp": 0.01049601, + "balance_loss_clip": 1.02361298, + "balance_loss_mlp": 1.03494775, + "epoch": 0.0741319705396062, + "flos": 22199348373120.0, + "grad_norm": 1.8981820065754686, + "language_loss": 0.81123012, + "learning_rate": 3.946163939367264e-06, + "loss": 0.83312529, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.046875, + "step": 1233, + "time_per_iteration": 2.4436287879943848 + }, + { + "auxiliary_loss_clip": 0.01146831, + "auxiliary_loss_mlp": 0.01059435, + "balance_loss_clip": 1.02751112, + "balance_loss_mlp": 1.03701162, + "epoch": 0.07419209379227416, + "flos": 39198978161280.0, + "grad_norm": 2.275438771641701, + "language_loss": 0.70302069, + "learning_rate": 3.9460768319789724e-06, + "loss": 0.72508335, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.1015625, + "step": 1234, + "time_per_iteration": 2.5459272861480713 + }, + { + "auxiliary_loss_clip": 0.01144109, + "auxiliary_loss_mlp": 0.01052466, + "balance_loss_clip": 1.024261, + "balance_loss_mlp": 1.03747475, + "epoch": 0.07425221704494213, + "flos": 22780617465600.0, + "grad_norm": 1.9535843715857266, + "language_loss": 0.77411473, + "learning_rate": 3.945989655140205e-06, + "loss": 0.79608047, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0625, + "step": 1235, + "time_per_iteration": 2.465358018875122 + }, + { + "auxiliary_loss_clip": 0.01142052, + "auxiliary_loss_mlp": 0.01047285, + "balance_loss_clip": 1.0195688, + "balance_loss_mlp": 1.03596783, + "epoch": 0.0743123402976101, + "flos": 22271897911680.0, + "grad_norm": 2.1740463529522676, + "language_loss": 0.8237235, + "learning_rate": 3.945902408854073e-06, + "loss": 0.84561688, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0625, + "step": 1236, + "time_per_iteration": 2.433403730392456 + }, + { + "auxiliary_loss_clip": 0.01142526, + "auxiliary_loss_mlp": 0.01054264, + "balance_loss_clip": 1.02632093, + "balance_loss_mlp": 1.03503644, + "epoch": 0.07437246355027807, + "flos": 29313739762560.0, + "grad_norm": 2.189070570878333, + "language_loss": 0.7565378, + "learning_rate": 3.945815093123688e-06, + "loss": 0.77850574, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.078125, + "step": 1237, + "time_per_iteration": 2.515289545059204 + }, + { + "auxiliary_loss_clip": 0.01141188, + "auxiliary_loss_mlp": 0.01051467, + "balance_loss_clip": 1.02457392, + "balance_loss_mlp": 1.03395224, + "epoch": 0.07443258680294604, + "flos": 31943293032960.0, + "grad_norm": 1.6945120127682058, + "language_loss": 0.77806079, + "learning_rate": 3.945727707952168e-06, + "loss": 0.79998732, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.0703125, + "step": 1238, + "time_per_iteration": 2.539829730987549 + }, + { + "auxiliary_loss_clip": 0.01146616, + "auxiliary_loss_mlp": 0.01052864, + "balance_loss_clip": 1.02389622, + "balance_loss_mlp": 1.03531945, + "epoch": 0.074492710055614, + "flos": 22674167130240.0, + "grad_norm": 1.9737472469547397, + "language_loss": 0.86791956, + "learning_rate": 3.945640253342632e-06, + "loss": 0.88991439, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.1171875, + "step": 1239, + "time_per_iteration": 3.8863275051116943 + }, + { + "auxiliary_loss_clip": 0.01143857, + "auxiliary_loss_mlp": 0.01046158, + "balance_loss_clip": 1.01618838, + "balance_loss_mlp": 1.03571773, + "epoch": 0.07455283330828198, + "flos": 21283925857920.0, + "grad_norm": 1.87822766520297, + "language_loss": 0.88759482, + "learning_rate": 3.9455527292981996e-06, + "loss": 0.909495, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.078125, + "step": 1240, + "time_per_iteration": 2.4443845748901367 + }, + { + "auxiliary_loss_clip": 0.01144719, + "auxiliary_loss_mlp": 0.01053837, + "balance_loss_clip": 1.02510726, + "balance_loss_mlp": 1.0371182, + "epoch": 0.07461295656094995, + "flos": 24387285456000.0, + "grad_norm": 2.065054906840633, + "language_loss": 0.89345175, + "learning_rate": 3.945465135821995e-06, + "loss": 0.91543734, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.078125, + "step": 1241, + "time_per_iteration": 3.873291254043579 + }, + { + "auxiliary_loss_clip": 0.01042863, + "auxiliary_loss_mlp": 0.01006544, + "balance_loss_clip": 1.00244331, + "balance_loss_mlp": 1.0090704, + "epoch": 0.07467307981361791, + "flos": 62106608753280.0, + "grad_norm": 0.889362179100009, + "language_loss": 0.63114607, + "learning_rate": 3.9453774729171435e-06, + "loss": 0.65164018, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.33789062, + "step": 1242, + "time_per_iteration": 6.085329055786133 + }, + { + "auxiliary_loss_clip": 0.01153597, + "auxiliary_loss_mlp": 0.01054763, + "balance_loss_clip": 1.0232439, + "balance_loss_mlp": 1.03737187, + "epoch": 0.07473320306628589, + "flos": 24861999479040.0, + "grad_norm": 2.764946515618584, + "language_loss": 0.62563753, + "learning_rate": 3.945289740586775e-06, + "loss": 0.64772117, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.1640625, + "step": 1243, + "time_per_iteration": 2.450406074523926 + }, + { + "auxiliary_loss_clip": 0.0114227, + "auxiliary_loss_mlp": 0.01044545, + "balance_loss_clip": 1.01800931, + "balance_loss_mlp": 1.03594351, + "epoch": 0.07479332631895386, + "flos": 24896354123520.0, + "grad_norm": 1.845577187094996, + "language_loss": 0.76297748, + "learning_rate": 3.945201938834018e-06, + "loss": 0.78484559, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0625, + "step": 1244, + "time_per_iteration": 2.448031187057495 + }, + { + "auxiliary_loss_clip": 0.01147805, + "auxiliary_loss_mlp": 0.01054192, + "balance_loss_clip": 1.02586758, + "balance_loss_mlp": 1.03727245, + "epoch": 0.07485344957162182, + "flos": 17814467076480.0, + "grad_norm": 3.423545890676594, + "language_loss": 0.69239521, + "learning_rate": 3.945114067662009e-06, + "loss": 0.71441513, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.109375, + "step": 1245, + "time_per_iteration": 2.3707425594329834 + }, + { + "auxiliary_loss_clip": 0.01144149, + "auxiliary_loss_mlp": 0.01052193, + "balance_loss_clip": 1.02332044, + "balance_loss_mlp": 1.03658879, + "epoch": 0.0749135728242898, + "flos": 25009018680960.0, + "grad_norm": 1.7770744627600272, + "language_loss": 0.88667941, + "learning_rate": 3.9450261270738815e-06, + "loss": 0.90864277, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.078125, + "step": 1246, + "time_per_iteration": 2.4962313175201416 + }, + { + "auxiliary_loss_clip": 0.01152627, + "auxiliary_loss_mlp": 0.01055953, + "balance_loss_clip": 1.0230515, + "balance_loss_mlp": 1.03740907, + "epoch": 0.07497369607695777, + "flos": 17821100234880.0, + "grad_norm": 2.4445910492133485, + "language_loss": 0.88317931, + "learning_rate": 3.944938117072776e-06, + "loss": 0.90526509, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.15625, + "step": 1247, + "time_per_iteration": 2.4028384685516357 + }, + { + "auxiliary_loss_clip": 0.01141357, + "auxiliary_loss_mlp": 0.01053357, + "balance_loss_clip": 1.02529478, + "balance_loss_mlp": 1.03495264, + "epoch": 0.07503381932962573, + "flos": 15120219323520.0, + "grad_norm": 2.405659657463368, + "language_loss": 0.64709055, + "learning_rate": 3.944850037661831e-06, + "loss": 0.66903764, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0625, + "step": 1248, + "time_per_iteration": 2.442790985107422 + }, + { + "auxiliary_loss_clip": 0.01141608, + "auxiliary_loss_mlp": 0.01052677, + "balance_loss_clip": 1.02704656, + "balance_loss_mlp": 1.03831017, + "epoch": 0.0750939425822937, + "flos": 12816091635840.0, + "grad_norm": 2.2582700393608524, + "language_loss": 0.74438941, + "learning_rate": 3.944761888844191e-06, + "loss": 0.76633227, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.03125, + "step": 1249, + "time_per_iteration": 2.3979647159576416 + }, + { + "auxiliary_loss_clip": 0.01147036, + "auxiliary_loss_mlp": 0.01063925, + "balance_loss_clip": 1.03414643, + "balance_loss_mlp": 1.0371995, + "epoch": 0.07515406583496168, + "flos": 24205702141440.0, + "grad_norm": 3.3231514265364357, + "language_loss": 0.8245669, + "learning_rate": 3.944673670623001e-06, + "loss": 0.84667647, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.09375, + "step": 1250, + "time_per_iteration": 2.469184398651123 + }, + { + "auxiliary_loss_clip": 0.01144078, + "auxiliary_loss_mlp": 0.01053375, + "balance_loss_clip": 1.02534926, + "balance_loss_mlp": 1.03903031, + "epoch": 0.07521418908762964, + "flos": 26686944489600.0, + "grad_norm": 6.605391922579396, + "language_loss": 0.669029, + "learning_rate": 3.944585383001411e-06, + "loss": 0.69100344, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.0546875, + "step": 1251, + "time_per_iteration": 2.5036938190460205 + }, + { + "auxiliary_loss_clip": 0.01039191, + "auxiliary_loss_mlp": 0.0100688, + "balance_loss_clip": 1.00211179, + "balance_loss_mlp": 1.00628459, + "epoch": 0.0752743123402976, + "flos": 59091788096640.0, + "grad_norm": 0.8883458310414257, + "language_loss": 0.70415509, + "learning_rate": 3.944497025982571e-06, + "loss": 0.72461569, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.04760742, + "router_z_loss_mlp": 0.33007812, + "step": 1252, + "time_per_iteration": 3.095370054244995 + }, + { + "auxiliary_loss_clip": 0.01142421, + "auxiliary_loss_mlp": 0.01061465, + "balance_loss_clip": 1.03253305, + "balance_loss_mlp": 1.03514695, + "epoch": 0.07533443559296558, + "flos": 23475912658560.0, + "grad_norm": 2.1062586387472946, + "language_loss": 0.79992402, + "learning_rate": 3.944408599569633e-06, + "loss": 0.82196289, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0703125, + "step": 1253, + "time_per_iteration": 2.4121005535125732 + }, + { + "auxiliary_loss_clip": 0.01146671, + "auxiliary_loss_mlp": 0.01057295, + "balance_loss_clip": 1.0272783, + "balance_loss_mlp": 1.03715527, + "epoch": 0.07539455884563355, + "flos": 20878270237440.0, + "grad_norm": 2.9367962325269223, + "language_loss": 0.9338783, + "learning_rate": 3.9443201037657545e-06, + "loss": 0.95591795, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.09375, + "step": 1254, + "time_per_iteration": 2.419344663619995 + }, + { + "auxiliary_loss_clip": 0.01139612, + "auxiliary_loss_mlp": 0.01047563, + "balance_loss_clip": 1.01966774, + "balance_loss_mlp": 1.03530228, + "epoch": 0.07545468209830151, + "flos": 27671669786880.0, + "grad_norm": 4.104633237380502, + "language_loss": 0.87933367, + "learning_rate": 3.944231538574092e-06, + "loss": 0.90120542, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.046875, + "step": 1255, + "time_per_iteration": 2.4580881595611572 + }, + { + "auxiliary_loss_clip": 0.01140922, + "auxiliary_loss_mlp": 0.01051102, + "balance_loss_clip": 1.02168155, + "balance_loss_mlp": 1.03676486, + "epoch": 0.0755148053509695, + "flos": 14136122430720.0, + "grad_norm": 1.7522842390054543, + "language_loss": 0.79969382, + "learning_rate": 3.9441429039978086e-06, + "loss": 0.82161403, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.046875, + "step": 1256, + "time_per_iteration": 2.4150822162628174 + }, + { + "auxiliary_loss_clip": 0.01040318, + "auxiliary_loss_mlp": 0.01004824, + "balance_loss_clip": 1.00017476, + "balance_loss_mlp": 1.00681496, + "epoch": 0.07557492860363746, + "flos": 58232506780800.0, + "grad_norm": 0.7715466436615287, + "language_loss": 0.58031034, + "learning_rate": 3.944054200040065e-06, + "loss": 0.60076171, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.04638672, + "router_z_loss_mlp": 0.3359375, + "step": 1257, + "time_per_iteration": 3.169145107269287 + }, + { + "auxiliary_loss_clip": 0.01143633, + "auxiliary_loss_mlp": 0.01056884, + "balance_loss_clip": 1.02752304, + "balance_loss_mlp": 1.03853524, + "epoch": 0.07563505185630542, + "flos": 24643233699840.0, + "grad_norm": 2.6403445214651766, + "language_loss": 0.89664084, + "learning_rate": 3.943965426704027e-06, + "loss": 0.91864598, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.0546875, + "step": 1258, + "time_per_iteration": 2.4530155658721924 + }, + { + "auxiliary_loss_clip": 0.01142097, + "auxiliary_loss_mlp": 0.01055937, + "balance_loss_clip": 1.02931738, + "balance_loss_mlp": 1.03846896, + "epoch": 0.07569517510897339, + "flos": 15522104517120.0, + "grad_norm": 2.0561855104483153, + "language_loss": 0.80861282, + "learning_rate": 3.943876583992864e-06, + "loss": 0.83059323, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.03125, + "step": 1259, + "time_per_iteration": 2.41066837310791 + }, + { + "auxiliary_loss_clip": 0.01141372, + "auxiliary_loss_mlp": 0.01054204, + "balance_loss_clip": 1.02615404, + "balance_loss_mlp": 1.03635323, + "epoch": 0.07575529836164137, + "flos": 22927462110720.0, + "grad_norm": 1.7786873652159558, + "language_loss": 0.75696343, + "learning_rate": 3.943787671909746e-06, + "loss": 0.7789191, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.046875, + "step": 1260, + "time_per_iteration": 2.4653754234313965 + }, + { + "auxiliary_loss_clip": 0.01142378, + "auxiliary_loss_mlp": 0.0105818, + "balance_loss_clip": 1.0282104, + "balance_loss_mlp": 1.03585958, + "epoch": 0.07581542161430933, + "flos": 19499410068480.0, + "grad_norm": 2.2183576565645375, + "language_loss": 0.84589267, + "learning_rate": 3.943698690457846e-06, + "loss": 0.86789823, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0625, + "step": 1261, + "time_per_iteration": 2.437059164047241 + }, + { + "auxiliary_loss_clip": 0.0114615, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03254223, + "balance_loss_mlp": 1.03722024, + "epoch": 0.0758755448669773, + "flos": 24972290064000.0, + "grad_norm": 1.8577573636488671, + "language_loss": 0.83029902, + "learning_rate": 3.943609639640339e-06, + "loss": 0.85236776, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0859375, + "step": 1262, + "time_per_iteration": 2.51898455619812 + }, + { + "auxiliary_loss_clip": 0.01139744, + "auxiliary_loss_mlp": 0.01054691, + "balance_loss_clip": 1.02629542, + "balance_loss_mlp": 1.03495586, + "epoch": 0.07593566811964528, + "flos": 22746856314240.0, + "grad_norm": 3.162190663494559, + "language_loss": 0.82544661, + "learning_rate": 3.943520519460405e-06, + "loss": 0.84739101, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.046875, + "step": 1263, + "time_per_iteration": 2.421804189682007 + }, + { + "auxiliary_loss_clip": 0.01144962, + "auxiliary_loss_mlp": 0.01046337, + "balance_loss_clip": 1.01865637, + "balance_loss_mlp": 1.03497577, + "epoch": 0.07599579137231324, + "flos": 23111279752320.0, + "grad_norm": 2.6172494042949146, + "language_loss": 0.76007628, + "learning_rate": 3.943431329921221e-06, + "loss": 0.78198922, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.1015625, + "step": 1264, + "time_per_iteration": 2.436509847640991 + }, + { + "auxiliary_loss_clip": 0.01143242, + "auxiliary_loss_mlp": 0.0105781, + "balance_loss_clip": 1.02933121, + "balance_loss_mlp": 1.03629994, + "epoch": 0.07605591462498121, + "flos": 14501174273280.0, + "grad_norm": 2.5946179933620526, + "language_loss": 0.8096326, + "learning_rate": 3.943342071025974e-06, + "loss": 0.8316431, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.0703125, + "step": 1265, + "time_per_iteration": 2.3973848819732666 + }, + { + "auxiliary_loss_clip": 0.01144294, + "auxiliary_loss_mlp": 0.01049988, + "balance_loss_clip": 1.02011418, + "balance_loss_mlp": 1.03644371, + "epoch": 0.07611603787764919, + "flos": 23513060211840.0, + "grad_norm": 3.287075522816002, + "language_loss": 0.65693021, + "learning_rate": 3.9432527427778455e-06, + "loss": 0.678873, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.078125, + "step": 1266, + "time_per_iteration": 2.455113172531128 + }, + { + "auxiliary_loss_clip": 0.0114309, + "auxiliary_loss_mlp": 0.01054833, + "balance_loss_clip": 1.02599609, + "balance_loss_mlp": 1.03630745, + "epoch": 0.07617616113031715, + "flos": 21506112950400.0, + "grad_norm": 2.377862188966611, + "language_loss": 0.79258627, + "learning_rate": 3.943163345180026e-06, + "loss": 0.81456548, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0625, + "step": 1267, + "time_per_iteration": 2.5339574813842773 + }, + { + "auxiliary_loss_clip": 0.0114187, + "auxiliary_loss_mlp": 0.01043139, + "balance_loss_clip": 1.01756895, + "balance_loss_mlp": 1.03486967, + "epoch": 0.07623628438298512, + "flos": 14572327357440.0, + "grad_norm": 2.5288995682024065, + "language_loss": 0.72980249, + "learning_rate": 3.9430738782357054e-06, + "loss": 0.75165266, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.0703125, + "step": 1268, + "time_per_iteration": 2.394282579421997 + }, + { + "auxiliary_loss_clip": 0.01143821, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.02023184, + "balance_loss_mlp": 1.03516436, + "epoch": 0.07629640763565308, + "flos": 14719521116160.0, + "grad_norm": 2.747619032322646, + "language_loss": 0.82369566, + "learning_rate": 3.9429843419480755e-06, + "loss": 0.84561896, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0859375, + "step": 1269, + "time_per_iteration": 2.429852247238159 + }, + { + "auxiliary_loss_clip": 0.01141087, + "auxiliary_loss_mlp": 0.01050495, + "balance_loss_clip": 1.02332711, + "balance_loss_mlp": 1.03607571, + "epoch": 0.07635653088832106, + "flos": 14902047037440.0, + "grad_norm": 2.3775327902069257, + "language_loss": 0.88504201, + "learning_rate": 3.942894736320334e-06, + "loss": 0.90695786, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.046875, + "step": 1270, + "time_per_iteration": 2.413921356201172 + }, + { + "auxiliary_loss_clip": 0.01145719, + "auxiliary_loss_mlp": 0.01053373, + "balance_loss_clip": 1.02552581, + "balance_loss_mlp": 1.03685999, + "epoch": 0.07641665414098903, + "flos": 26650355518080.0, + "grad_norm": 2.3193044847198054, + "language_loss": 0.71426392, + "learning_rate": 3.942805061355676e-06, + "loss": 0.73625481, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0859375, + "step": 1271, + "time_per_iteration": 2.488715410232544 + }, + { + "auxiliary_loss_clip": 0.01140245, + "auxiliary_loss_mlp": 0.01051859, + "balance_loss_clip": 1.02447629, + "balance_loss_mlp": 1.03839946, + "epoch": 0.07647677739365699, + "flos": 25191614424960.0, + "grad_norm": 1.6024138693327201, + "language_loss": 0.82551324, + "learning_rate": 3.9427153170573026e-06, + "loss": 0.8474344, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.015625, + "step": 1272, + "time_per_iteration": 2.4781711101531982 + }, + { + "auxiliary_loss_clip": 0.01141554, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_clip": 1.02572775, + "balance_loss_mlp": 1.0339067, + "epoch": 0.07653690064632497, + "flos": 20557103840640.0, + "grad_norm": 4.691640888023605, + "language_loss": 0.7996034, + "learning_rate": 3.9426255034284174e-06, + "loss": 0.82155412, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.078125, + "step": 1273, + "time_per_iteration": 2.424798011779785 + }, + { + "auxiliary_loss_clip": 0.01141954, + "auxiliary_loss_mlp": 0.01055519, + "balance_loss_clip": 1.0280174, + "balance_loss_mlp": 1.03463483, + "epoch": 0.07659702389899294, + "flos": 22268336952960.0, + "grad_norm": 2.39038723505383, + "language_loss": 0.81201237, + "learning_rate": 3.942535620472224e-06, + "loss": 0.83398712, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.078125, + "step": 1274, + "time_per_iteration": 2.4857890605926514 + }, + { + "auxiliary_loss_clip": 0.01144799, + "auxiliary_loss_mlp": 0.01058775, + "balance_loss_clip": 1.03149986, + "balance_loss_mlp": 1.03649175, + "epoch": 0.0766571471516609, + "flos": 32634713064960.0, + "grad_norm": 2.102679445046312, + "language_loss": 0.73003268, + "learning_rate": 3.942445668191932e-06, + "loss": 0.7520684, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.078125, + "step": 1275, + "time_per_iteration": 2.533172845840454 + }, + { + "auxiliary_loss_clip": 0.01145421, + "auxiliary_loss_mlp": 0.01054721, + "balance_loss_clip": 1.0252409, + "balance_loss_mlp": 1.03737283, + "epoch": 0.07671727040432888, + "flos": 15266505386880.0, + "grad_norm": 2.224435838407383, + "language_loss": 0.79420996, + "learning_rate": 3.94235564659075e-06, + "loss": 0.81621134, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.078125, + "step": 1276, + "time_per_iteration": 2.540464162826538 + }, + { + "auxiliary_loss_clip": 0.0114624, + "auxiliary_loss_mlp": 0.01051885, + "balance_loss_clip": 1.0239898, + "balance_loss_mlp": 1.03762674, + "epoch": 0.07677739365699685, + "flos": 28182833136000.0, + "grad_norm": 2.2290197674949424, + "language_loss": 0.59222054, + "learning_rate": 3.942265555671892e-06, + "loss": 0.61420172, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.0859375, + "step": 1277, + "time_per_iteration": 2.5006768703460693 + }, + { + "auxiliary_loss_clip": 0.01145807, + "auxiliary_loss_mlp": 0.01057558, + "balance_loss_clip": 1.02946019, + "balance_loss_mlp": 1.03459835, + "epoch": 0.07683751690966481, + "flos": 18295150942080.0, + "grad_norm": 3.6354157239331477, + "language_loss": 0.75029022, + "learning_rate": 3.942175395438572e-06, + "loss": 0.77232379, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.109375, + "step": 1278, + "time_per_iteration": 3.87268328666687 + }, + { + "auxiliary_loss_clip": 0.01136778, + "auxiliary_loss_mlp": 0.01051673, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.03386188, + "epoch": 0.07689764016233278, + "flos": 21980024012160.0, + "grad_norm": 2.747090015732431, + "language_loss": 0.88341421, + "learning_rate": 3.942085165894009e-06, + "loss": 0.90529871, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.03125, + "step": 1279, + "time_per_iteration": 2.445218086242676 + }, + { + "auxiliary_loss_clip": 0.01139971, + "auxiliary_loss_mlp": 0.01048246, + "balance_loss_clip": 1.020661, + "balance_loss_mlp": 1.03736174, + "epoch": 0.07695776341500075, + "flos": 22234924915200.0, + "grad_norm": 2.417511747291879, + "language_loss": 0.82531738, + "learning_rate": 3.9419948670414206e-06, + "loss": 0.84719956, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0234375, + "step": 1280, + "time_per_iteration": 2.472209930419922 + }, + { + "auxiliary_loss_clip": 0.01140022, + "auxiliary_loss_mlp": 0.01056315, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.0359323, + "epoch": 0.07701788666766872, + "flos": 16142825312640.0, + "grad_norm": 3.092359493923539, + "language_loss": 0.75768244, + "learning_rate": 3.941904498884032e-06, + "loss": 0.7796458, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0390625, + "step": 1281, + "time_per_iteration": 4.002227067947388 + }, + { + "auxiliary_loss_clip": 0.01144663, + "auxiliary_loss_mlp": 0.0105056, + "balance_loss_clip": 1.02142501, + "balance_loss_mlp": 1.03378582, + "epoch": 0.07707800992033668, + "flos": 19462053047040.0, + "grad_norm": 3.3006494765725862, + "language_loss": 0.74827677, + "learning_rate": 3.941814061425067e-06, + "loss": 0.77022898, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.109375, + "step": 1282, + "time_per_iteration": 3.923555374145508 + }, + { + "auxiliary_loss_clip": 0.01143046, + "auxiliary_loss_mlp": 0.01054118, + "balance_loss_clip": 1.02739167, + "balance_loss_mlp": 1.03525567, + "epoch": 0.07713813317300466, + "flos": 18989259148800.0, + "grad_norm": 2.6287145402917154, + "language_loss": 0.83850062, + "learning_rate": 3.941723554667752e-06, + "loss": 0.86047232, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.078125, + "step": 1283, + "time_per_iteration": 2.483353853225708 + }, + { + "auxiliary_loss_clip": 0.01142422, + "auxiliary_loss_mlp": 0.01056764, + "balance_loss_clip": 1.02675915, + "balance_loss_mlp": 1.03554845, + "epoch": 0.07719825642567263, + "flos": 18112974134400.0, + "grad_norm": 4.00045806440098, + "language_loss": 0.74790585, + "learning_rate": 3.941632978615318e-06, + "loss": 0.7698977, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0703125, + "step": 1284, + "time_per_iteration": 2.4336962699890137 + }, + { + "auxiliary_loss_clip": 0.01139409, + "auxiliary_loss_mlp": 0.01058026, + "balance_loss_clip": 1.03098965, + "balance_loss_mlp": 1.03460002, + "epoch": 0.0772583796783406, + "flos": 42192780312960.0, + "grad_norm": 1.9606313210845743, + "language_loss": 0.76300985, + "learning_rate": 3.941542333270999e-06, + "loss": 0.78498423, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.046875, + "step": 1285, + "time_per_iteration": 2.6642353534698486 + }, + { + "auxiliary_loss_clip": 0.01146061, + "auxiliary_loss_mlp": 0.01060958, + "balance_loss_clip": 1.03332567, + "balance_loss_mlp": 1.0383873, + "epoch": 0.07731850293100857, + "flos": 24752546766720.0, + "grad_norm": 2.016961446663883, + "language_loss": 0.83767694, + "learning_rate": 3.9414516186380275e-06, + "loss": 0.85974705, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.078125, + "step": 1286, + "time_per_iteration": 2.4510388374328613 + }, + { + "auxiliary_loss_clip": 0.01143516, + "auxiliary_loss_mlp": 0.01049856, + "balance_loss_clip": 1.02274823, + "balance_loss_mlp": 1.03486085, + "epoch": 0.07737862618367654, + "flos": 17564942522880.0, + "grad_norm": 2.2536591918310656, + "language_loss": 0.770509, + "learning_rate": 3.941360834719641e-06, + "loss": 0.79244268, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0859375, + "step": 1287, + "time_per_iteration": 2.44064998626709 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_clip": 1.0231142, + "balance_loss_mlp": 1.03492832, + "epoch": 0.0774387494363445, + "flos": 25626038872320.0, + "grad_norm": 2.0322938983618326, + "language_loss": 0.84395438, + "learning_rate": 3.941269981519081e-06, + "loss": 0.86583531, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.03125, + "step": 1288, + "time_per_iteration": 2.4835731983184814 + }, + { + "auxiliary_loss_clip": 0.01138414, + "auxiliary_loss_mlp": 0.01050989, + "balance_loss_clip": 1.02489436, + "balance_loss_mlp": 1.03338683, + "epoch": 0.07749887268901248, + "flos": 12239046817920.0, + "grad_norm": 2.157235414053665, + "language_loss": 0.85084462, + "learning_rate": 3.941179059039589e-06, + "loss": 0.87273872, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.046875, + "step": 1289, + "time_per_iteration": 2.520219564437866 + }, + { + "auxiliary_loss_clip": 0.01140713, + "auxiliary_loss_mlp": 0.01047597, + "balance_loss_clip": 1.01990509, + "balance_loss_mlp": 1.03482342, + "epoch": 0.07755899594168045, + "flos": 25080590701440.0, + "grad_norm": 1.9354442156770693, + "language_loss": 0.85018635, + "learning_rate": 3.941088067284409e-06, + "loss": 0.87206948, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0625, + "step": 1290, + "time_per_iteration": 2.470550060272217 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01061076, + "balance_loss_clip": 1.03229904, + "balance_loss_mlp": 1.03412962, + "epoch": 0.07761911919434841, + "flos": 14245540231680.0, + "grad_norm": 2.307083349569191, + "language_loss": 0.90523207, + "learning_rate": 3.9409970062567895e-06, + "loss": 0.92720503, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0234375, + "step": 1291, + "time_per_iteration": 2.440746784210205 + }, + { + "auxiliary_loss_clip": 0.01041045, + "auxiliary_loss_mlp": 0.01016423, + "balance_loss_clip": 1.01241791, + "balance_loss_mlp": 1.00740957, + "epoch": 0.07767924244701638, + "flos": 67233463597440.0, + "grad_norm": 0.8793135900668437, + "language_loss": 0.65063083, + "learning_rate": 3.94090587595998e-06, + "loss": 0.67120552, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.3359375, + "step": 1292, + "time_per_iteration": 3.168759822845459 + }, + { + "auxiliary_loss_clip": 0.0114126, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.01861262, + "balance_loss_mlp": 1.03376663, + "epoch": 0.07773936569968436, + "flos": 28549316344320.0, + "grad_norm": 2.045054047714249, + "language_loss": 0.87551838, + "learning_rate": 3.940814676397232e-06, + "loss": 0.89739639, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.0703125, + "step": 1293, + "time_per_iteration": 2.5071475505828857 + }, + { + "auxiliary_loss_clip": 0.01144298, + "auxiliary_loss_mlp": 0.01053189, + "balance_loss_clip": 1.02447128, + "balance_loss_mlp": 1.03891051, + "epoch": 0.07779948895235232, + "flos": 27489039131520.0, + "grad_norm": 2.1716084836194733, + "language_loss": 0.84672004, + "learning_rate": 3.940723407571801e-06, + "loss": 0.8686949, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.0546875, + "step": 1294, + "time_per_iteration": 2.5061745643615723 + }, + { + "auxiliary_loss_clip": 0.01141525, + "auxiliary_loss_mlp": 0.01048977, + "balance_loss_clip": 1.02082014, + "balance_loss_mlp": 1.03697085, + "epoch": 0.07785961220502029, + "flos": 18222322112640.0, + "grad_norm": 2.381994821509741, + "language_loss": 0.79361206, + "learning_rate": 3.9406320694869425e-06, + "loss": 0.81551707, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.046875, + "step": 1295, + "time_per_iteration": 2.407604694366455 + }, + { + "auxiliary_loss_clip": 0.01140814, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.02001262, + "balance_loss_mlp": 1.03543675, + "epoch": 0.07791973545768827, + "flos": 24607063664640.0, + "grad_norm": 2.319744815127164, + "language_loss": 0.87795794, + "learning_rate": 3.940540662145918e-06, + "loss": 0.89984035, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0546875, + "step": 1296, + "time_per_iteration": 2.4614198207855225 + }, + { + "auxiliary_loss_clip": 0.01141737, + "auxiliary_loss_mlp": 0.01049657, + "balance_loss_clip": 1.02112985, + "balance_loss_mlp": 1.03465438, + "epoch": 0.07797985871035623, + "flos": 14281221507840.0, + "grad_norm": 3.0816995918719856, + "language_loss": 0.96446133, + "learning_rate": 3.940449185551989e-06, + "loss": 0.98637521, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.0703125, + "step": 1297, + "time_per_iteration": 2.4358835220336914 + }, + { + "auxiliary_loss_clip": 0.01140842, + "auxiliary_loss_mlp": 0.01050633, + "balance_loss_clip": 1.0230006, + "balance_loss_mlp": 1.03406048, + "epoch": 0.0780399819630242, + "flos": 26609367715200.0, + "grad_norm": 2.114068414625776, + "language_loss": 0.7612232, + "learning_rate": 3.94035763970842e-06, + "loss": 0.78313792, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0703125, + "step": 1298, + "time_per_iteration": 2.452118396759033 + }, + { + "auxiliary_loss_clip": 0.0114226, + "auxiliary_loss_mlp": 0.01051076, + "balance_loss_clip": 1.02457619, + "balance_loss_mlp": 1.03616405, + "epoch": 0.07810010521569218, + "flos": 21833458657920.0, + "grad_norm": 1.7562543677043454, + "language_loss": 0.80491579, + "learning_rate": 3.940266024618478e-06, + "loss": 0.82684916, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0625, + "step": 1299, + "time_per_iteration": 2.448579788208008 + }, + { + "auxiliary_loss_clip": 0.01138035, + "auxiliary_loss_mlp": 0.01050805, + "balance_loss_clip": 1.0229218, + "balance_loss_mlp": 1.03488827, + "epoch": 0.07816022846836014, + "flos": 25080101942400.0, + "grad_norm": 2.063325759243455, + "language_loss": 0.85981327, + "learning_rate": 3.940174340285432e-06, + "loss": 0.88170165, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.03125, + "step": 1300, + "time_per_iteration": 2.4218318462371826 + }, + { + "auxiliary_loss_clip": 0.0114285, + "auxiliary_loss_mlp": 0.01046773, + "balance_loss_clip": 1.01909256, + "balance_loss_mlp": 1.03690553, + "epoch": 0.0782203517210281, + "flos": 40915901825280.0, + "grad_norm": 2.2657372489361336, + "language_loss": 0.71694589, + "learning_rate": 3.940082586712555e-06, + "loss": 0.73884213, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0625, + "step": 1301, + "time_per_iteration": 2.6418845653533936 + }, + { + "auxiliary_loss_clip": 0.01147725, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.03863072, + "epoch": 0.07828047497369607, + "flos": 41170418703360.0, + "grad_norm": 1.5101327011812837, + "language_loss": 0.7031014, + "learning_rate": 3.939990763903122e-06, + "loss": 0.72516143, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.0859375, + "step": 1302, + "time_per_iteration": 2.741868734359741 + }, + { + "auxiliary_loss_clip": 0.01142976, + "auxiliary_loss_mlp": 0.01052096, + "balance_loss_clip": 1.02437949, + "balance_loss_mlp": 1.03674889, + "epoch": 0.07834059822636405, + "flos": 23507160192000.0, + "grad_norm": 1.988705315306982, + "language_loss": 0.82179976, + "learning_rate": 3.939898871860407e-06, + "loss": 0.84375048, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0625, + "step": 1303, + "time_per_iteration": 2.4571609497070312 + }, + { + "auxiliary_loss_clip": 0.01142584, + "auxiliary_loss_mlp": 0.01059815, + "balance_loss_clip": 1.03139567, + "balance_loss_mlp": 1.03558779, + "epoch": 0.07840072147903202, + "flos": 20192854959360.0, + "grad_norm": 2.581953731053822, + "language_loss": 0.74705011, + "learning_rate": 3.939806910587693e-06, + "loss": 0.76907408, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.0703125, + "step": 1304, + "time_per_iteration": 2.433095932006836 + }, + { + "auxiliary_loss_clip": 0.01146743, + "auxiliary_loss_mlp": 0.01056951, + "balance_loss_clip": 1.02690983, + "balance_loss_mlp": 1.04023051, + "epoch": 0.07846084473169998, + "flos": 21359757064320.0, + "grad_norm": 1.8012610446750759, + "language_loss": 0.76330793, + "learning_rate": 3.9397148800882595e-06, + "loss": 0.78534484, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0625, + "step": 1305, + "time_per_iteration": 2.4251163005828857 + }, + { + "auxiliary_loss_clip": 0.01145606, + "auxiliary_loss_mlp": 0.01058712, + "balance_loss_clip": 1.02882612, + "balance_loss_mlp": 1.03599501, + "epoch": 0.07852096798436796, + "flos": 25409786711040.0, + "grad_norm": 1.709212760868719, + "language_loss": 0.84957409, + "learning_rate": 3.939622780365391e-06, + "loss": 0.87161732, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.09375, + "step": 1306, + "time_per_iteration": 2.4500534534454346 + }, + { + "auxiliary_loss_clip": 0.01141706, + "auxiliary_loss_mlp": 0.01042407, + "balance_loss_clip": 1.01584721, + "balance_loss_mlp": 1.03744817, + "epoch": 0.07858109123703592, + "flos": 24570335047680.0, + "grad_norm": 2.6426056403295197, + "language_loss": 0.9069171, + "learning_rate": 3.939530611422375e-06, + "loss": 0.92875826, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0390625, + "step": 1307, + "time_per_iteration": 2.4576053619384766 + }, + { + "auxiliary_loss_clip": 0.01142213, + "auxiliary_loss_mlp": 0.01051842, + "balance_loss_clip": 1.02225399, + "balance_loss_mlp": 1.03556371, + "epoch": 0.07864121448970389, + "flos": 20697978643200.0, + "grad_norm": 1.8719671173611063, + "language_loss": 0.8353464, + "learning_rate": 3.939438373262501e-06, + "loss": 0.85728693, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.0625, + "step": 1308, + "time_per_iteration": 2.3935189247131348 + }, + { + "auxiliary_loss_clip": 0.01139905, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.02133918, + "balance_loss_mlp": 1.03627121, + "epoch": 0.07870133774237187, + "flos": 22965412625280.0, + "grad_norm": 1.4422409899536226, + "language_loss": 0.77097666, + "learning_rate": 3.93934606588906e-06, + "loss": 0.79285604, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.0390625, + "step": 1309, + "time_per_iteration": 2.4520504474639893 + }, + { + "auxiliary_loss_clip": 0.01148205, + "auxiliary_loss_mlp": 0.01060707, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.03713357, + "epoch": 0.07876146099503983, + "flos": 18841855921920.0, + "grad_norm": 2.109083590132941, + "language_loss": 0.80204201, + "learning_rate": 3.939253689305346e-06, + "loss": 0.82413113, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.109375, + "step": 1310, + "time_per_iteration": 2.374415397644043 + }, + { + "auxiliary_loss_clip": 0.01138736, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_clip": 1.02394342, + "balance_loss_mlp": 1.03848791, + "epoch": 0.0788215842477078, + "flos": 23804654820480.0, + "grad_norm": 1.6889419547689029, + "language_loss": 0.72608209, + "learning_rate": 3.939161243514657e-06, + "loss": 0.74796939, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.0078125, + "step": 1311, + "time_per_iteration": 2.4493556022644043 + }, + { + "auxiliary_loss_clip": 0.01143532, + "auxiliary_loss_mlp": 0.01063003, + "balance_loss_clip": 1.03531039, + "balance_loss_mlp": 1.03952634, + "epoch": 0.07888170750037576, + "flos": 21578837045760.0, + "grad_norm": 3.1876067717240892, + "language_loss": 0.85806346, + "learning_rate": 3.939068728520291e-06, + "loss": 0.88012886, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0390625, + "step": 1312, + "time_per_iteration": 2.4188222885131836 + }, + { + "auxiliary_loss_clip": 0.01141501, + "auxiliary_loss_mlp": 0.01058017, + "balance_loss_clip": 1.03069377, + "balance_loss_mlp": 1.03968775, + "epoch": 0.07894183075304374, + "flos": 19863833506560.0, + "grad_norm": 2.420747931174189, + "language_loss": 0.81749922, + "learning_rate": 3.938976144325549e-06, + "loss": 0.83949447, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.015625, + "step": 1313, + "time_per_iteration": 2.436534881591797 + }, + { + "auxiliary_loss_clip": 0.01149076, + "auxiliary_loss_mlp": 0.01059925, + "balance_loss_clip": 1.02850103, + "balance_loss_mlp": 1.03694129, + "epoch": 0.07900195400571171, + "flos": 16142546021760.0, + "grad_norm": 2.417584199048771, + "language_loss": 0.72915339, + "learning_rate": 3.9388834909337375e-06, + "loss": 0.75124347, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.1171875, + "step": 1314, + "time_per_iteration": 2.3856356143951416 + }, + { + "auxiliary_loss_clip": 0.01141154, + "auxiliary_loss_mlp": 0.01053615, + "balance_loss_clip": 1.02693546, + "balance_loss_mlp": 1.03433251, + "epoch": 0.07906207725837967, + "flos": 23729347284480.0, + "grad_norm": 1.614323519360908, + "language_loss": 0.79576534, + "learning_rate": 3.938790768348161e-06, + "loss": 0.81771302, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.0703125, + "step": 1315, + "time_per_iteration": 2.4572107791900635 + }, + { + "auxiliary_loss_clip": 0.01140939, + "auxiliary_loss_mlp": 0.01057704, + "balance_loss_clip": 1.02779472, + "balance_loss_mlp": 1.03500342, + "epoch": 0.07912220051104765, + "flos": 24314770828800.0, + "grad_norm": 1.9746038132442256, + "language_loss": 0.73879462, + "learning_rate": 3.938697976572129e-06, + "loss": 0.76078105, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.0625, + "step": 1316, + "time_per_iteration": 2.4434773921966553 + }, + { + "auxiliary_loss_clip": 0.01147625, + "auxiliary_loss_mlp": 0.01056223, + "balance_loss_clip": 1.02733898, + "balance_loss_mlp": 1.03697991, + "epoch": 0.07918232376371562, + "flos": 18879038386560.0, + "grad_norm": 3.399225985719132, + "language_loss": 0.65351379, + "learning_rate": 3.938605115608954e-06, + "loss": 0.67555225, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.1015625, + "step": 1317, + "time_per_iteration": 2.584772825241089 + }, + { + "auxiliary_loss_clip": 0.01148, + "auxiliary_loss_mlp": 0.01062528, + "balance_loss_clip": 1.03260612, + "balance_loss_mlp": 1.03737366, + "epoch": 0.07924244701638358, + "flos": 27375187587840.0, + "grad_norm": 2.590912805255077, + "language_loss": 0.7312218, + "learning_rate": 3.938512185461948e-06, + "loss": 0.75332707, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.109375, + "step": 1318, + "time_per_iteration": 3.980745315551758 + }, + { + "auxiliary_loss_clip": 0.01143958, + "auxiliary_loss_mlp": 0.0105347, + "balance_loss_clip": 1.02559924, + "balance_loss_mlp": 1.03717411, + "epoch": 0.07930257026905156, + "flos": 25119134709120.0, + "grad_norm": 1.67985434599967, + "language_loss": 0.90111381, + "learning_rate": 3.938419186134429e-06, + "loss": 0.92308807, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0625, + "step": 1319, + "time_per_iteration": 2.4653007984161377 + }, + { + "auxiliary_loss_clip": 0.01142388, + "auxiliary_loss_mlp": 0.01056527, + "balance_loss_clip": 1.02764225, + "balance_loss_mlp": 1.03432322, + "epoch": 0.07936269352171953, + "flos": 21833423746560.0, + "grad_norm": 1.8351091050344135, + "language_loss": 0.79586965, + "learning_rate": 3.9383261176297155e-06, + "loss": 0.81785882, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.078125, + "step": 1320, + "time_per_iteration": 3.920444965362549 + }, + { + "auxiliary_loss_clip": 0.01143597, + "auxiliary_loss_mlp": 0.01056051, + "balance_loss_clip": 1.02728558, + "balance_loss_mlp": 1.03809261, + "epoch": 0.07942281677438749, + "flos": 16939124668800.0, + "grad_norm": 2.9078392525385057, + "language_loss": 0.69522524, + "learning_rate": 3.938232979951129e-06, + "loss": 0.71722168, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.0546875, + "step": 1321, + "time_per_iteration": 5.2817018032073975 + }, + { + "auxiliary_loss_clip": 0.01140461, + "auxiliary_loss_mlp": 0.01058528, + "balance_loss_clip": 1.03015637, + "balance_loss_mlp": 1.03602624, + "epoch": 0.07948294002705546, + "flos": 18986012392320.0, + "grad_norm": 2.1799705885269205, + "language_loss": 0.84114683, + "learning_rate": 3.938139773101993e-06, + "loss": 0.86313665, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.046875, + "step": 1322, + "time_per_iteration": 2.416301965713501 + }, + { + "auxiliary_loss_clip": 0.01139827, + "auxiliary_loss_mlp": 0.01054368, + "balance_loss_clip": 1.0266279, + "balance_loss_mlp": 1.03306556, + "epoch": 0.07954306327972344, + "flos": 21652364102400.0, + "grad_norm": 2.3530515704260577, + "language_loss": 0.90426469, + "learning_rate": 3.938046497085634e-06, + "loss": 0.92620659, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0703125, + "step": 1323, + "time_per_iteration": 2.456897735595703 + }, + { + "auxiliary_loss_clip": 0.01137344, + "auxiliary_loss_mlp": 0.01053257, + "balance_loss_clip": 1.02537346, + "balance_loss_mlp": 1.0350616, + "epoch": 0.0796031865323914, + "flos": 23219196364800.0, + "grad_norm": 1.726783845318455, + "language_loss": 0.82554126, + "learning_rate": 3.937953151905381e-06, + "loss": 0.84744722, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0234375, + "step": 1324, + "time_per_iteration": 2.426100730895996 + }, + { + "auxiliary_loss_clip": 0.01141442, + "auxiliary_loss_mlp": 0.01052664, + "balance_loss_clip": 1.02341056, + "balance_loss_mlp": 1.03469324, + "epoch": 0.07966330978505937, + "flos": 23293421648640.0, + "grad_norm": 4.397938593012299, + "language_loss": 0.79089087, + "learning_rate": 3.937859737564564e-06, + "loss": 0.81283194, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.0703125, + "step": 1325, + "time_per_iteration": 2.5013134479522705 + }, + { + "auxiliary_loss_clip": 0.01143631, + "auxiliary_loss_mlp": 0.01060608, + "balance_loss_clip": 1.03113937, + "balance_loss_mlp": 1.03763437, + "epoch": 0.07972343303772735, + "flos": 18362952535680.0, + "grad_norm": 2.3257959431325914, + "language_loss": 0.88397908, + "learning_rate": 3.937766254066519e-06, + "loss": 0.9060216, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.0625, + "step": 1326, + "time_per_iteration": 2.4037437438964844 + }, + { + "auxiliary_loss_clip": 0.01137361, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.01801825, + "balance_loss_mlp": 1.03456819, + "epoch": 0.07978355629039531, + "flos": 21761432789760.0, + "grad_norm": 2.0451963149136407, + "language_loss": 0.83130109, + "learning_rate": 3.937672701414581e-06, + "loss": 0.85314745, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.03125, + "step": 1327, + "time_per_iteration": 2.4422802925109863 + }, + { + "auxiliary_loss_clip": 0.01141986, + "auxiliary_loss_mlp": 0.01053909, + "balance_loss_clip": 1.02343893, + "balance_loss_mlp": 1.03526497, + "epoch": 0.07984367954306328, + "flos": 18550331136000.0, + "grad_norm": 2.109895088853339, + "language_loss": 0.78819835, + "learning_rate": 3.937579079612087e-06, + "loss": 0.81015736, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.0625, + "step": 1328, + "time_per_iteration": 2.3991034030914307 + }, + { + "auxiliary_loss_clip": 0.01143765, + "auxiliary_loss_mlp": 0.01053721, + "balance_loss_clip": 1.02444315, + "balance_loss_mlp": 1.03603315, + "epoch": 0.07990380279573125, + "flos": 16903268835840.0, + "grad_norm": 2.470183114375481, + "language_loss": 0.7324903, + "learning_rate": 3.9374853886623805e-06, + "loss": 0.75446516, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.078125, + "step": 1329, + "time_per_iteration": 2.444092035293579 + }, + { + "auxiliary_loss_clip": 0.01138578, + "auxiliary_loss_mlp": 0.0104502, + "balance_loss_clip": 1.01756668, + "balance_loss_mlp": 1.03349066, + "epoch": 0.07996392604839922, + "flos": 24097192035840.0, + "grad_norm": 1.7260779122665668, + "language_loss": 0.8116973, + "learning_rate": 3.937391628568805e-06, + "loss": 0.83353329, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.046875, + "step": 1330, + "time_per_iteration": 2.4590070247650146 + }, + { + "auxiliary_loss_clip": 0.01139686, + "auxiliary_loss_mlp": 0.01050342, + "balance_loss_clip": 1.02336502, + "balance_loss_mlp": 1.03483081, + "epoch": 0.08002404930106718, + "flos": 14277974751360.0, + "grad_norm": 5.953708099505188, + "language_loss": 0.88954514, + "learning_rate": 3.937297799334706e-06, + "loss": 0.9114455, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.046875, + "step": 1331, + "time_per_iteration": 2.418520212173462 + }, + { + "auxiliary_loss_clip": 0.01143133, + "auxiliary_loss_mlp": 0.01050024, + "balance_loss_clip": 1.02038908, + "balance_loss_mlp": 1.03411341, + "epoch": 0.08008417255373516, + "flos": 40404633742080.0, + "grad_norm": 1.9768105749615845, + "language_loss": 0.73450077, + "learning_rate": 3.937203900963431e-06, + "loss": 0.75643235, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.09375, + "step": 1332, + "time_per_iteration": 2.569986343383789 + }, + { + "auxiliary_loss_clip": 0.01139288, + "auxiliary_loss_mlp": 0.01047808, + "balance_loss_clip": 1.01986504, + "balance_loss_mlp": 1.03435731, + "epoch": 0.08014429580640313, + "flos": 18477921242880.0, + "grad_norm": 1.9158417669999368, + "language_loss": 0.82308197, + "learning_rate": 3.9371099334583315e-06, + "loss": 0.84495294, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.046875, + "step": 1333, + "time_per_iteration": 2.416245460510254 + }, + { + "auxiliary_loss_clip": 0.0114136, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_clip": 1.0222342, + "balance_loss_mlp": 1.03392458, + "epoch": 0.0802044190590711, + "flos": 22052398993920.0, + "grad_norm": 2.22853006873138, + "language_loss": 0.74815822, + "learning_rate": 3.937015896822762e-06, + "loss": 0.7700671, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.078125, + "step": 1334, + "time_per_iteration": 2.415252685546875 + }, + { + "auxiliary_loss_clip": 0.01139333, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02207494, + "balance_loss_mlp": 1.03632402, + "epoch": 0.08026454231173906, + "flos": 24570963452160.0, + "grad_norm": 1.8146781566640344, + "language_loss": 0.80229247, + "learning_rate": 3.936921791060078e-06, + "loss": 0.82418305, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.03125, + "step": 1335, + "time_per_iteration": 2.476264476776123 + }, + { + "auxiliary_loss_clip": 0.01042149, + "auxiliary_loss_mlp": 0.01018921, + "balance_loss_clip": 1.01470125, + "balance_loss_mlp": 1.00973916, + "epoch": 0.08032466556440704, + "flos": 52579195545600.0, + "grad_norm": 0.7418359836964853, + "language_loss": 0.5600881, + "learning_rate": 3.936827616173636e-06, + "loss": 0.58069885, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.04223633, + "router_z_loss_mlp": 0.32421875, + "step": 1336, + "time_per_iteration": 3.1065118312835693 + }, + { + "auxiliary_loss_clip": 0.01139751, + "auxiliary_loss_mlp": 0.01056251, + "balance_loss_clip": 1.02970338, + "balance_loss_mlp": 1.0362134, + "epoch": 0.080384788817075, + "flos": 23841453260160.0, + "grad_norm": 2.0979713849682615, + "language_loss": 0.67448568, + "learning_rate": 3.9367333721668006e-06, + "loss": 0.6964457, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.03125, + "step": 1337, + "time_per_iteration": 2.450666904449463 + }, + { + "auxiliary_loss_clip": 0.01140381, + "auxiliary_loss_mlp": 0.01054173, + "balance_loss_clip": 1.02661204, + "balance_loss_mlp": 1.03676331, + "epoch": 0.08044491206974297, + "flos": 25299565948800.0, + "grad_norm": 2.2552804727475815, + "language_loss": 0.86439645, + "learning_rate": 3.936639059042932e-06, + "loss": 0.88634193, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0390625, + "step": 1338, + "time_per_iteration": 2.505598783493042 + }, + { + "auxiliary_loss_clip": 0.01138751, + "auxiliary_loss_mlp": 0.01067749, + "balance_loss_clip": 1.03838754, + "balance_loss_mlp": 1.03342259, + "epoch": 0.08050503532241095, + "flos": 22375625160960.0, + "grad_norm": 4.359292809857966, + "language_loss": 0.87319863, + "learning_rate": 3.936544676805397e-06, + "loss": 0.89526367, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.0546875, + "step": 1339, + "time_per_iteration": 2.425258159637451 + }, + { + "auxiliary_loss_clip": 0.01133709, + "auxiliary_loss_mlp": 0.01047137, + "balance_loss_clip": 1.02039886, + "balance_loss_mlp": 1.03435993, + "epoch": 0.08056515857507891, + "flos": 18368433619200.0, + "grad_norm": 2.1293160007814826, + "language_loss": 0.89519572, + "learning_rate": 3.936450225457564e-06, + "loss": 0.91700423, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.9921875, + "step": 1340, + "time_per_iteration": 2.40386700630188 + }, + { + "auxiliary_loss_clip": 0.01036387, + "auxiliary_loss_mlp": 0.01003174, + "balance_loss_clip": 0.99928826, + "balance_loss_mlp": 1.00538206, + "epoch": 0.08062528182774688, + "flos": 51345329719680.0, + "grad_norm": 0.8708482508804375, + "language_loss": 0.64813459, + "learning_rate": 3.936355705002804e-06, + "loss": 0.66853023, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.03881836, + "router_z_loss_mlp": 0.31054688, + "step": 1341, + "time_per_iteration": 3.0302042961120605 + }, + { + "auxiliary_loss_clip": 0.01145959, + "auxiliary_loss_mlp": 0.01054501, + "balance_loss_clip": 1.02598643, + "balance_loss_mlp": 1.03534555, + "epoch": 0.08068540508041486, + "flos": 17598843319680.0, + "grad_norm": 2.07789975291421, + "language_loss": 0.89729524, + "learning_rate": 3.936261115444489e-06, + "loss": 0.91929984, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.109375, + "step": 1342, + "time_per_iteration": 2.4231953620910645 + }, + { + "auxiliary_loss_clip": 0.01147974, + "auxiliary_loss_mlp": 0.01057106, + "balance_loss_clip": 1.02868652, + "balance_loss_mlp": 1.03958821, + "epoch": 0.08074552833308282, + "flos": 10560422782080.0, + "grad_norm": 2.5990374723313217, + "language_loss": 0.76440805, + "learning_rate": 3.936166456785997e-06, + "loss": 0.78645885, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.078125, + "step": 1343, + "time_per_iteration": 2.42657470703125 + }, + { + "auxiliary_loss_clip": 0.0103502, + "auxiliary_loss_mlp": 0.01009421, + "balance_loss_clip": 1.00532043, + "balance_loss_mlp": 1.00449371, + "epoch": 0.08080565158575079, + "flos": 60837026739840.0, + "grad_norm": 0.8016447465790755, + "language_loss": 0.57401437, + "learning_rate": 3.936071729030702e-06, + "loss": 0.59445882, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.3046875, + "step": 1344, + "time_per_iteration": 3.0285229682922363 + }, + { + "auxiliary_loss_clip": 0.01142824, + "auxiliary_loss_mlp": 0.01056169, + "balance_loss_clip": 1.02847719, + "balance_loss_mlp": 1.03656745, + "epoch": 0.08086577483841875, + "flos": 18331390800000.0, + "grad_norm": 3.3861312365131355, + "language_loss": 0.86296439, + "learning_rate": 3.935976932181989e-06, + "loss": 0.88495433, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.0625, + "step": 1345, + "time_per_iteration": 2.429169178009033 + }, + { + "auxiliary_loss_clip": 0.0113923, + "auxiliary_loss_mlp": 0.01056697, + "balance_loss_clip": 1.03047085, + "balance_loss_mlp": 1.03582883, + "epoch": 0.08092589809108673, + "flos": 21542527365120.0, + "grad_norm": 1.8247206662094533, + "language_loss": 0.87417907, + "learning_rate": 3.935882066243239e-06, + "loss": 0.89613831, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.03125, + "step": 1346, + "time_per_iteration": 2.438117027282715 + }, + { + "auxiliary_loss_clip": 0.01140205, + "auxiliary_loss_mlp": 0.01051821, + "balance_loss_clip": 1.02616668, + "balance_loss_mlp": 1.03634501, + "epoch": 0.0809860213437547, + "flos": 22126903568640.0, + "grad_norm": 1.9378568526291882, + "language_loss": 0.92655408, + "learning_rate": 3.935787131217838e-06, + "loss": 0.94847435, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.0390625, + "step": 1347, + "time_per_iteration": 2.4197306632995605 + }, + { + "auxiliary_loss_clip": 0.01136978, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_clip": 1.01831245, + "balance_loss_mlp": 1.03443682, + "epoch": 0.08104614459642266, + "flos": 21724424881920.0, + "grad_norm": 2.006159913199672, + "language_loss": 0.89071, + "learning_rate": 3.9356921271091734e-06, + "loss": 0.91255176, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0234375, + "step": 1348, + "time_per_iteration": 2.4586288928985596 + }, + { + "auxiliary_loss_clip": 0.01137366, + "auxiliary_loss_mlp": 0.01047615, + "balance_loss_clip": 1.02150774, + "balance_loss_mlp": 1.0380547, + "epoch": 0.08110626784909064, + "flos": 23950731415680.0, + "grad_norm": 1.9698497285486793, + "language_loss": 0.76631665, + "learning_rate": 3.935597053920635e-06, + "loss": 0.7881664, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9921875, + "step": 1349, + "time_per_iteration": 2.4439492225646973 + }, + { + "auxiliary_loss_clip": 0.01140006, + "auxiliary_loss_mlp": 0.0105312, + "balance_loss_clip": 1.02532005, + "balance_loss_mlp": 1.03573346, + "epoch": 0.0811663911017586, + "flos": 19024696045440.0, + "grad_norm": 2.4233555394337256, + "language_loss": 0.88450396, + "learning_rate": 3.935501911655618e-06, + "loss": 0.90643525, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0390625, + "step": 1350, + "time_per_iteration": 2.412860631942749 + }, + { + "auxiliary_loss_clip": 0.01135942, + "auxiliary_loss_mlp": 0.01058571, + "balance_loss_clip": 1.03073609, + "balance_loss_mlp": 1.03386545, + "epoch": 0.08122651435442657, + "flos": 15340381557120.0, + "grad_norm": 2.1531664901380094, + "language_loss": 0.8194319, + "learning_rate": 3.935406700317516e-06, + "loss": 0.84137702, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0234375, + "step": 1351, + "time_per_iteration": 2.4073026180267334 + }, + { + "auxiliary_loss_clip": 0.011395, + "auxiliary_loss_mlp": 0.01051922, + "balance_loss_clip": 1.02209556, + "balance_loss_mlp": 1.03398967, + "epoch": 0.08128663760709455, + "flos": 23220453173760.0, + "grad_norm": 2.489233034819534, + "language_loss": 0.75422478, + "learning_rate": 3.935311419909728e-06, + "loss": 0.77613902, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.0546875, + "step": 1352, + "time_per_iteration": 2.5176920890808105 + }, + { + "auxiliary_loss_clip": 0.01141523, + "auxiliary_loss_mlp": 0.01056237, + "balance_loss_clip": 1.02710199, + "balance_loss_mlp": 1.03585172, + "epoch": 0.08134676085976252, + "flos": 22964539841280.0, + "grad_norm": 1.82520977696158, + "language_loss": 0.83126086, + "learning_rate": 3.935216070435652e-06, + "loss": 0.85323852, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.0546875, + "step": 1353, + "time_per_iteration": 2.468277931213379 + }, + { + "auxiliary_loss_clip": 0.01033342, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.00339806, + "balance_loss_mlp": 1.00308514, + "epoch": 0.08140688411243048, + "flos": 64319369679360.0, + "grad_norm": 0.8483378491422867, + "language_loss": 0.59735012, + "learning_rate": 3.935120651898694e-06, + "loss": 0.61775875, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.04125977, + "router_z_loss_mlp": 0.30273438, + "step": 1354, + "time_per_iteration": 3.0853233337402344 + }, + { + "auxiliary_loss_clip": 0.01136525, + "auxiliary_loss_mlp": 0.01052364, + "balance_loss_clip": 1.02514863, + "balance_loss_mlp": 1.03431463, + "epoch": 0.08146700736509845, + "flos": 22490768424960.0, + "grad_norm": 1.8495339968426354, + "language_loss": 0.82956147, + "learning_rate": 3.935025164302257e-06, + "loss": 0.85145044, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0234375, + "step": 1355, + "time_per_iteration": 2.464796781539917 + }, + { + "auxiliary_loss_clip": 0.01139687, + "auxiliary_loss_mlp": 0.01059226, + "balance_loss_clip": 1.0298171, + "balance_loss_mlp": 1.03361726, + "epoch": 0.08152713061776642, + "flos": 20446813255680.0, + "grad_norm": 1.9276991028738168, + "language_loss": 0.7095387, + "learning_rate": 3.934929607649749e-06, + "loss": 0.73152781, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.0625, + "step": 1356, + "time_per_iteration": 2.4853670597076416 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_clip": 1.02342236, + "balance_loss_mlp": 1.03437948, + "epoch": 0.08158725387043439, + "flos": 23549090601600.0, + "grad_norm": 1.8537448997099917, + "language_loss": 0.70516974, + "learning_rate": 3.934833981944582e-06, + "loss": 0.72707808, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.0546875, + "step": 1357, + "time_per_iteration": 2.4754977226257324 + }, + { + "auxiliary_loss_clip": 0.0114063, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.02531266, + "balance_loss_mlp": 1.03652501, + "epoch": 0.08164737712310235, + "flos": 22016263870080.0, + "grad_norm": 2.1873732431405237, + "language_loss": 0.84406656, + "learning_rate": 3.934738287190166e-06, + "loss": 0.86600959, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0390625, + "step": 1358, + "time_per_iteration": 3.949376106262207 + }, + { + "auxiliary_loss_clip": 0.01141411, + "auxiliary_loss_mlp": 0.01053594, + "balance_loss_clip": 1.02345777, + "balance_loss_mlp": 1.03640282, + "epoch": 0.08170750037577033, + "flos": 23366704325760.0, + "grad_norm": 2.0468282200358843, + "language_loss": 1.0262934, + "learning_rate": 3.934642523389917e-06, + "loss": 1.04824352, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0546875, + "step": 1359, + "time_per_iteration": 3.8985743522644043 + }, + { + "auxiliary_loss_clip": 0.01136888, + "auxiliary_loss_mlp": 0.01049909, + "balance_loss_clip": 1.02070248, + "balance_loss_mlp": 1.03373373, + "epoch": 0.0817676236284383, + "flos": 28396850970240.0, + "grad_norm": 2.0307171832386377, + "language_loss": 0.83083647, + "learning_rate": 3.934546690547253e-06, + "loss": 0.85270447, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.03125, + "step": 1360, + "time_per_iteration": 2.471463680267334 + }, + { + "auxiliary_loss_clip": 0.01140642, + "auxiliary_loss_mlp": 0.01051129, + "balance_loss_clip": 1.02199435, + "balance_loss_mlp": 1.03499973, + "epoch": 0.08182774688110626, + "flos": 19207885282560.0, + "grad_norm": 2.2149181348842157, + "language_loss": 0.72330105, + "learning_rate": 3.934450788665594e-06, + "loss": 0.74521875, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.0546875, + "step": 1361, + "time_per_iteration": 5.282749176025391 + }, + { + "auxiliary_loss_clip": 0.01132788, + "auxiliary_loss_mlp": 0.01056152, + "balance_loss_clip": 1.02720809, + "balance_loss_mlp": 1.03063273, + "epoch": 0.08188787013377424, + "flos": 22782991438080.0, + "grad_norm": 2.91924500043117, + "language_loss": 0.76753962, + "learning_rate": 3.934354817748363e-06, + "loss": 0.78942901, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0234375, + "step": 1362, + "time_per_iteration": 2.4537808895111084 + }, + { + "auxiliary_loss_clip": 0.01139277, + "auxiliary_loss_mlp": 0.01047336, + "balance_loss_clip": 1.01981115, + "balance_loss_mlp": 1.03720856, + "epoch": 0.08194799338644221, + "flos": 16467273377280.0, + "grad_norm": 2.5230938950134862, + "language_loss": 0.7296077, + "learning_rate": 3.934258777798984e-06, + "loss": 0.75147378, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.0234375, + "step": 1363, + "time_per_iteration": 2.3879504203796387 + }, + { + "auxiliary_loss_clip": 0.01139257, + "auxiliary_loss_mlp": 0.01046154, + "balance_loss_clip": 1.01670909, + "balance_loss_mlp": 1.03688741, + "epoch": 0.08200811663911017, + "flos": 23912536521600.0, + "grad_norm": 2.0120773302425747, + "language_loss": 0.77598512, + "learning_rate": 3.934162668820884e-06, + "loss": 0.79783922, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.0234375, + "step": 1364, + "time_per_iteration": 2.460463762283325 + }, + { + "auxiliary_loss_clip": 0.01139736, + "auxiliary_loss_mlp": 0.01051396, + "balance_loss_clip": 1.02369142, + "balance_loss_mlp": 1.03548634, + "epoch": 0.08206823989177814, + "flos": 17895534986880.0, + "grad_norm": 11.193197151022844, + "language_loss": 0.81889302, + "learning_rate": 3.934066490817495e-06, + "loss": 0.84080428, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.046875, + "step": 1365, + "time_per_iteration": 2.552370548248291 + }, + { + "auxiliary_loss_clip": 0.01138806, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.01888299, + "balance_loss_mlp": 1.03748155, + "epoch": 0.08212836314444612, + "flos": 22087172574720.0, + "grad_norm": 2.109783001922283, + "language_loss": 0.82045788, + "learning_rate": 3.9339702437922465e-06, + "loss": 0.84230983, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.015625, + "step": 1366, + "time_per_iteration": 2.615504264831543 + }, + { + "auxiliary_loss_clip": 0.01138705, + "auxiliary_loss_mlp": 0.010455, + "balance_loss_clip": 1.02046609, + "balance_loss_mlp": 1.03385198, + "epoch": 0.08218848639711408, + "flos": 17596678815360.0, + "grad_norm": 1.770323620790888, + "language_loss": 0.81591201, + "learning_rate": 3.933873927748575e-06, + "loss": 0.83775401, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.046875, + "step": 1367, + "time_per_iteration": 2.467405080795288 + }, + { + "auxiliary_loss_clip": 0.01145546, + "auxiliary_loss_mlp": 0.01059798, + "balance_loss_clip": 1.03116357, + "balance_loss_mlp": 1.03625274, + "epoch": 0.08224860964978205, + "flos": 17856886245120.0, + "grad_norm": 2.036747963087789, + "language_loss": 0.82997632, + "learning_rate": 3.933777542689918e-06, + "loss": 0.85202968, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.09375, + "step": 1368, + "time_per_iteration": 2.5678632259368896 + }, + { + "auxiliary_loss_clip": 0.01133686, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_clip": 1.02059031, + "balance_loss_mlp": 1.03601241, + "epoch": 0.08230873290245003, + "flos": 25226388005760.0, + "grad_norm": 1.7779988400973337, + "language_loss": 0.81281292, + "learning_rate": 3.933681088619715e-06, + "loss": 0.83462441, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.9765625, + "step": 1369, + "time_per_iteration": 2.535598039627075 + }, + { + "auxiliary_loss_clip": 0.01136147, + "auxiliary_loss_mlp": 0.01048409, + "balance_loss_clip": 1.02279139, + "balance_loss_mlp": 1.03621101, + "epoch": 0.08236885615511799, + "flos": 31758567696000.0, + "grad_norm": 2.113249725053309, + "language_loss": 0.74624491, + "learning_rate": 3.933584565541407e-06, + "loss": 0.76809049, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.0, + "step": 1370, + "time_per_iteration": 2.606149196624756 + }, + { + "auxiliary_loss_clip": 0.01141181, + "auxiliary_loss_mlp": 0.01055651, + "balance_loss_clip": 1.02685022, + "balance_loss_mlp": 1.03535104, + "epoch": 0.08242897940778596, + "flos": 23184702074880.0, + "grad_norm": 1.5566014351480937, + "language_loss": 0.74512672, + "learning_rate": 3.9334879734584405e-06, + "loss": 0.76709503, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0625, + "step": 1371, + "time_per_iteration": 2.5078554153442383 + }, + { + "auxiliary_loss_clip": 0.01137903, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_clip": 1.02305984, + "balance_loss_mlp": 1.03381038, + "epoch": 0.08248910266045394, + "flos": 34490172470400.0, + "grad_norm": 2.0681939965691374, + "language_loss": 0.71125972, + "learning_rate": 3.933391312374262e-06, + "loss": 0.73316634, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.046875, + "step": 1372, + "time_per_iteration": 2.66995906829834 + }, + { + "auxiliary_loss_clip": 0.01141069, + "auxiliary_loss_mlp": 0.01055284, + "balance_loss_clip": 1.02529144, + "balance_loss_mlp": 1.03453314, + "epoch": 0.0825492259131219, + "flos": 13435590533760.0, + "grad_norm": 3.663715416242882, + "language_loss": 0.87991744, + "learning_rate": 3.93329458229232e-06, + "loss": 0.90188098, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0625, + "step": 1373, + "time_per_iteration": 2.432610034942627 + }, + { + "auxiliary_loss_clip": 0.01136937, + "auxiliary_loss_mlp": 0.01053465, + "balance_loss_clip": 1.02511716, + "balance_loss_mlp": 1.03353488, + "epoch": 0.08260934916578987, + "flos": 25811252968320.0, + "grad_norm": 1.8320037522503072, + "language_loss": 0.82148111, + "learning_rate": 3.933197783216068e-06, + "loss": 0.8433851, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.03125, + "step": 1374, + "time_per_iteration": 2.4978749752044678 + }, + { + "auxiliary_loss_clip": 0.01037488, + "auxiliary_loss_mlp": 0.01014386, + "balance_loss_clip": 1.00916481, + "balance_loss_mlp": 1.00536513, + "epoch": 0.08266947241845783, + "flos": 63456909563520.0, + "grad_norm": 0.8173504943743701, + "language_loss": 0.60599476, + "learning_rate": 3.93310091514896e-06, + "loss": 0.62651354, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.3203125, + "step": 1375, + "time_per_iteration": 3.042445182800293 + }, + { + "auxiliary_loss_clip": 0.01035876, + "auxiliary_loss_mlp": 0.01008424, + "balance_loss_clip": 1.00348854, + "balance_loss_mlp": 1.00414395, + "epoch": 0.08272959567112581, + "flos": 69990346062720.0, + "grad_norm": 0.9075145685559574, + "language_loss": 0.62212205, + "learning_rate": 3.933003978094452e-06, + "loss": 0.64256501, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.04931641, + "router_z_loss_mlp": 0.31640625, + "step": 1376, + "time_per_iteration": 3.0153872966766357 + }, + { + "auxiliary_loss_clip": 0.01142638, + "auxiliary_loss_mlp": 0.01054963, + "balance_loss_clip": 1.02731788, + "balance_loss_mlp": 1.03678632, + "epoch": 0.08278971892379378, + "flos": 20412144408960.0, + "grad_norm": 1.6758916856217034, + "language_loss": 0.82464159, + "learning_rate": 3.9329069720560045e-06, + "loss": 0.84661758, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0625, + "step": 1377, + "time_per_iteration": 2.5013442039489746 + }, + { + "auxiliary_loss_clip": 0.01137221, + "auxiliary_loss_mlp": 0.01052512, + "balance_loss_clip": 1.02425957, + "balance_loss_mlp": 1.03591537, + "epoch": 0.08284984217646174, + "flos": 26249028906240.0, + "grad_norm": 1.8833560528177287, + "language_loss": 0.84713018, + "learning_rate": 3.932809897037079e-06, + "loss": 0.8690275, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.015625, + "step": 1378, + "time_per_iteration": 2.4867005348205566 + }, + { + "auxiliary_loss_clip": 0.01138837, + "auxiliary_loss_mlp": 0.01053253, + "balance_loss_clip": 1.02495241, + "balance_loss_mlp": 1.03472888, + "epoch": 0.08290996542912972, + "flos": 27193569361920.0, + "grad_norm": 2.1981360833435644, + "language_loss": 0.87588495, + "learning_rate": 3.932712753041141e-06, + "loss": 0.89780581, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0390625, + "step": 1379, + "time_per_iteration": 2.477638006210327 + }, + { + "auxiliary_loss_clip": 0.01137617, + "auxiliary_loss_mlp": 0.01054187, + "balance_loss_clip": 1.02743649, + "balance_loss_mlp": 1.03673005, + "epoch": 0.08297008868179769, + "flos": 38616661728000.0, + "grad_norm": 2.1679296386762332, + "language_loss": 0.7849893, + "learning_rate": 3.932615540071656e-06, + "loss": 0.80690736, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.015625, + "step": 1380, + "time_per_iteration": 2.553020477294922 + }, + { + "auxiliary_loss_clip": 0.01137195, + "auxiliary_loss_mlp": 0.01059167, + "balance_loss_clip": 1.03142667, + "balance_loss_mlp": 1.03819525, + "epoch": 0.08303021193446565, + "flos": 19973705155200.0, + "grad_norm": 2.502140765456767, + "language_loss": 0.85779071, + "learning_rate": 3.932518258132094e-06, + "loss": 0.8797543, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.98828125, + "step": 1381, + "time_per_iteration": 2.4220502376556396 + }, + { + "auxiliary_loss_clip": 0.01146824, + "auxiliary_loss_mlp": 0.01056244, + "balance_loss_clip": 1.02714479, + "balance_loss_mlp": 1.03916287, + "epoch": 0.08309033518713363, + "flos": 13661792432640.0, + "grad_norm": 2.8855093131695493, + "language_loss": 0.88018179, + "learning_rate": 3.932420907225926e-06, + "loss": 0.9022125, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.078125, + "step": 1382, + "time_per_iteration": 2.3828964233398438 + }, + { + "auxiliary_loss_clip": 0.01138958, + "auxiliary_loss_mlp": 0.01054701, + "balance_loss_clip": 1.02839184, + "balance_loss_mlp": 1.03570044, + "epoch": 0.0831504584398016, + "flos": 17967560855040.0, + "grad_norm": 2.632102141344648, + "language_loss": 0.77463621, + "learning_rate": 3.932323487356626e-06, + "loss": 0.7965728, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.03125, + "step": 1383, + "time_per_iteration": 2.403245210647583 + }, + { + "auxiliary_loss_clip": 0.01142298, + "auxiliary_loss_mlp": 0.01054448, + "balance_loss_clip": 1.02700639, + "balance_loss_mlp": 1.03616834, + "epoch": 0.08321058169246956, + "flos": 22600290960000.0, + "grad_norm": 6.694357317480596, + "language_loss": 0.82948864, + "learning_rate": 3.932225998527672e-06, + "loss": 0.85145605, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.0625, + "step": 1384, + "time_per_iteration": 2.4460690021514893 + }, + { + "auxiliary_loss_clip": 0.01147629, + "auxiliary_loss_mlp": 0.01055719, + "balance_loss_clip": 1.02738309, + "balance_loss_mlp": 1.03878617, + "epoch": 0.08327070494513754, + "flos": 22849501311360.0, + "grad_norm": 2.7198915303661058, + "language_loss": 0.85049307, + "learning_rate": 3.932128440742542e-06, + "loss": 0.87252659, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0859375, + "step": 1385, + "time_per_iteration": 2.4074478149414062 + }, + { + "auxiliary_loss_clip": 0.01144683, + "auxiliary_loss_mlp": 0.01051475, + "balance_loss_clip": 1.02263844, + "balance_loss_mlp": 1.03906059, + "epoch": 0.0833308281978055, + "flos": 22781909185920.0, + "grad_norm": 1.7272321262773471, + "language_loss": 0.68542445, + "learning_rate": 3.932030814004719e-06, + "loss": 0.70738602, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0546875, + "step": 1386, + "time_per_iteration": 2.489827871322632 + }, + { + "auxiliary_loss_clip": 0.01138937, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.02410579, + "balance_loss_mlp": 1.03381312, + "epoch": 0.08339095145047347, + "flos": 20811585807360.0, + "grad_norm": 1.6662190524934888, + "language_loss": 0.81894517, + "learning_rate": 3.9319331183176844e-06, + "loss": 0.84084338, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.046875, + "step": 1387, + "time_per_iteration": 2.442065715789795 + }, + { + "auxiliary_loss_clip": 0.0114151, + "auxiliary_loss_mlp": 0.01063807, + "balance_loss_clip": 1.03293192, + "balance_loss_mlp": 1.03462207, + "epoch": 0.08345107470314143, + "flos": 18514335657600.0, + "grad_norm": 1.9618631684366505, + "language_loss": 0.77150124, + "learning_rate": 3.931835353684927e-06, + "loss": 0.79355443, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.0703125, + "step": 1388, + "time_per_iteration": 2.442744731903076 + }, + { + "auxiliary_loss_clip": 0.01136028, + "auxiliary_loss_mlp": 0.01059897, + "balance_loss_clip": 1.03164482, + "balance_loss_mlp": 1.03585863, + "epoch": 0.08351119795580941, + "flos": 18806558670720.0, + "grad_norm": 1.9977385433797352, + "language_loss": 0.78928244, + "learning_rate": 3.931737520109935e-06, + "loss": 0.81124169, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0, + "step": 1389, + "time_per_iteration": 2.403454542160034 + }, + { + "auxiliary_loss_clip": 0.01140771, + "auxiliary_loss_mlp": 0.01045638, + "balance_loss_clip": 1.01771963, + "balance_loss_mlp": 1.03638148, + "epoch": 0.08357132120847738, + "flos": 18440843512320.0, + "grad_norm": 2.5662322532793325, + "language_loss": 0.87396991, + "learning_rate": 3.931639617596201e-06, + "loss": 0.89583397, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.046875, + "step": 1390, + "time_per_iteration": 2.444549798965454 + }, + { + "auxiliary_loss_clip": 0.01136063, + "auxiliary_loss_mlp": 0.01063131, + "balance_loss_clip": 1.03559399, + "balance_loss_mlp": 1.03334212, + "epoch": 0.08363144446114534, + "flos": 25921124616960.0, + "grad_norm": 2.3827728135287236, + "language_loss": 0.86620837, + "learning_rate": 3.931541646147217e-06, + "loss": 0.88820034, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.03125, + "step": 1391, + "time_per_iteration": 2.4772896766662598 + }, + { + "auxiliary_loss_clip": 0.01147064, + "auxiliary_loss_mlp": 0.01066716, + "balance_loss_clip": 1.03882098, + "balance_loss_mlp": 1.03849733, + "epoch": 0.08369156771381332, + "flos": 18040319861760.0, + "grad_norm": 2.5776007911349925, + "language_loss": 0.87413985, + "learning_rate": 3.93144360576648e-06, + "loss": 0.89627767, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.0859375, + "step": 1392, + "time_per_iteration": 2.43986439704895 + }, + { + "auxiliary_loss_clip": 0.0113886, + "auxiliary_loss_mlp": 0.01055026, + "balance_loss_clip": 1.02673769, + "balance_loss_mlp": 1.03480148, + "epoch": 0.08375169096648129, + "flos": 22673992573440.0, + "grad_norm": 2.5201146235582197, + "language_loss": 0.79845703, + "learning_rate": 3.931345496457489e-06, + "loss": 0.82039583, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.0390625, + "step": 1393, + "time_per_iteration": 2.4255776405334473 + }, + { + "auxiliary_loss_clip": 0.01137911, + "auxiliary_loss_mlp": 0.01047337, + "balance_loss_clip": 1.02095628, + "balance_loss_mlp": 1.03682518, + "epoch": 0.08381181421914925, + "flos": 26102044615680.0, + "grad_norm": 3.8426116391483442, + "language_loss": 0.84546328, + "learning_rate": 3.931247318223746e-06, + "loss": 0.86731571, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.0078125, + "step": 1394, + "time_per_iteration": 2.4559414386749268 + }, + { + "auxiliary_loss_clip": 0.01141432, + "auxiliary_loss_mlp": 0.0104905, + "balance_loss_clip": 1.0208931, + "balance_loss_mlp": 1.0367496, + "epoch": 0.08387193747181723, + "flos": 20628780595200.0, + "grad_norm": 2.1271812036602222, + "language_loss": 0.82844597, + "learning_rate": 3.931149071068753e-06, + "loss": 0.85035086, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.046875, + "step": 1395, + "time_per_iteration": 2.403179407119751 + }, + { + "auxiliary_loss_clip": 0.01141043, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.02043593, + "balance_loss_mlp": 1.03645396, + "epoch": 0.0839320607244852, + "flos": 13442363337600.0, + "grad_norm": 2.805372604291138, + "language_loss": 0.82337093, + "learning_rate": 3.931050754996018e-06, + "loss": 0.84528613, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.046875, + "step": 1396, + "time_per_iteration": 2.3986716270446777 + }, + { + "auxiliary_loss_clip": 0.01139023, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.0220654, + "balance_loss_mlp": 1.03866041, + "epoch": 0.08399218397715316, + "flos": 23476122126720.0, + "grad_norm": 1.9699496155025322, + "language_loss": 0.76609969, + "learning_rate": 3.930952370009048e-06, + "loss": 0.78800583, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.0, + "step": 1397, + "time_per_iteration": 3.893310308456421 + }, + { + "auxiliary_loss_clip": 0.01136244, + "auxiliary_loss_mlp": 0.01046049, + "balance_loss_clip": 1.01685452, + "balance_loss_mlp": 1.03420091, + "epoch": 0.08405230722982113, + "flos": 25919553605760.0, + "grad_norm": 2.245557912887348, + "language_loss": 0.7817446, + "learning_rate": 3.930853916111355e-06, + "loss": 0.80356753, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.0234375, + "step": 1398, + "time_per_iteration": 3.903414726257324 + }, + { + "auxiliary_loss_clip": 0.01131837, + "auxiliary_loss_mlp": 0.01047987, + "balance_loss_clip": 1.0217495, + "balance_loss_mlp": 1.03283024, + "epoch": 0.0841124304824891, + "flos": 17966478602880.0, + "grad_norm": 2.612612922286341, + "language_loss": 0.95172715, + "learning_rate": 3.930755393306453e-06, + "loss": 0.9735254, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.98828125, + "step": 1399, + "time_per_iteration": 2.4221866130828857 + }, + { + "auxiliary_loss_clip": 0.01139385, + "auxiliary_loss_mlp": 0.01057325, + "balance_loss_clip": 1.02733231, + "balance_loss_mlp": 1.03419042, + "epoch": 0.08417255373515707, + "flos": 25628482667520.0, + "grad_norm": 1.9415604543083347, + "language_loss": 0.81517625, + "learning_rate": 3.930656801597857e-06, + "loss": 0.83714336, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0546875, + "step": 1400, + "time_per_iteration": 5.18965220451355 + }, + { + "auxiliary_loss_clip": 0.01136377, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.0243212, + "balance_loss_mlp": 1.0334307, + "epoch": 0.08423267698782504, + "flos": 26248540147200.0, + "grad_norm": 3.0763122953915043, + "language_loss": 0.86442995, + "learning_rate": 3.930558140989087e-06, + "loss": 0.88632476, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.03125, + "step": 1401, + "time_per_iteration": 2.4589450359344482 + }, + { + "auxiliary_loss_clip": 0.01140493, + "auxiliary_loss_mlp": 0.01053125, + "balance_loss_clip": 1.02307224, + "balance_loss_mlp": 1.03402793, + "epoch": 0.08429280024049302, + "flos": 20118699498240.0, + "grad_norm": 2.3133765135270075, + "language_loss": 0.87033337, + "learning_rate": 3.930459411483662e-06, + "loss": 0.89226949, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.0625, + "step": 1402, + "time_per_iteration": 2.4530887603759766 + }, + { + "auxiliary_loss_clip": 0.01134145, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.02120042, + "balance_loss_mlp": 1.03132677, + "epoch": 0.08435292349316098, + "flos": 42922849086720.0, + "grad_norm": 2.048879929905967, + "language_loss": 0.8895582, + "learning_rate": 3.930360613085106e-06, + "loss": 0.91137803, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.03125, + "step": 1403, + "time_per_iteration": 2.6017262935638428 + }, + { + "auxiliary_loss_clip": 0.01139954, + "auxiliary_loss_mlp": 0.01052537, + "balance_loss_clip": 1.02278185, + "balance_loss_mlp": 1.0341984, + "epoch": 0.08441304674582895, + "flos": 22856169381120.0, + "grad_norm": 2.3078835344609447, + "language_loss": 0.80272245, + "learning_rate": 3.930261745796945e-06, + "loss": 0.82464731, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.0625, + "step": 1404, + "time_per_iteration": 2.4601595401763916 + }, + { + "auxiliary_loss_clip": 0.01142983, + "auxiliary_loss_mlp": 0.01063266, + "balance_loss_clip": 1.03136539, + "balance_loss_mlp": 1.03684366, + "epoch": 0.08447316999849692, + "flos": 18696512465280.0, + "grad_norm": 1.9814480155180556, + "language_loss": 0.83600795, + "learning_rate": 3.930162809622709e-06, + "loss": 0.85807049, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.0625, + "step": 1405, + "time_per_iteration": 2.4039623737335205 + }, + { + "auxiliary_loss_clip": 0.01137342, + "auxiliary_loss_mlp": 0.0105122, + "balance_loss_clip": 1.022228, + "balance_loss_mlp": 1.0332588, + "epoch": 0.08453329325116489, + "flos": 25482790097280.0, + "grad_norm": 1.6255358588896107, + "language_loss": 0.80443799, + "learning_rate": 3.930063804565927e-06, + "loss": 0.82632363, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0390625, + "step": 1406, + "time_per_iteration": 2.510904550552368 + }, + { + "auxiliary_loss_clip": 0.01140187, + "auxiliary_loss_mlp": 0.01056026, + "balance_loss_clip": 1.0283339, + "balance_loss_mlp": 1.03640819, + "epoch": 0.08459341650383286, + "flos": 20919083483520.0, + "grad_norm": 1.957000793352056, + "language_loss": 0.79425609, + "learning_rate": 3.929964730630132e-06, + "loss": 0.81621814, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0390625, + "step": 1407, + "time_per_iteration": 2.423840045928955 + }, + { + "auxiliary_loss_clip": 0.01135099, + "auxiliary_loss_mlp": 0.01050036, + "balance_loss_clip": 1.02234411, + "balance_loss_mlp": 1.03416073, + "epoch": 0.08465353975650082, + "flos": 13042223712000.0, + "grad_norm": 2.3275697224793697, + "language_loss": 0.91585648, + "learning_rate": 3.92986558781886e-06, + "loss": 0.9377079, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0078125, + "step": 1408, + "time_per_iteration": 2.4315226078033447 + }, + { + "auxiliary_loss_clip": 0.01043673, + "auxiliary_loss_mlp": 0.01023297, + "balance_loss_clip": 1.01826644, + "balance_loss_mlp": 1.01066768, + "epoch": 0.0847136630091688, + "flos": 60874174293120.0, + "grad_norm": 0.8792341838331387, + "language_loss": 0.61765254, + "learning_rate": 3.92976637613565e-06, + "loss": 0.63832223, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.05029297, + "router_z_loss_mlp": 0.33007812, + "step": 1409, + "time_per_iteration": 3.1208980083465576 + }, + { + "auxiliary_loss_clip": 0.01131074, + "auxiliary_loss_mlp": 0.01055388, + "balance_loss_clip": 1.02793384, + "balance_loss_mlp": 1.03469133, + "epoch": 0.08477378626183676, + "flos": 22045661101440.0, + "grad_norm": 1.6652926113525195, + "language_loss": 0.86648887, + "learning_rate": 3.9296670955840415e-06, + "loss": 0.88835347, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.96484375, + "step": 1410, + "time_per_iteration": 2.4426138401031494 + }, + { + "auxiliary_loss_clip": 0.01136838, + "auxiliary_loss_mlp": 0.01047834, + "balance_loss_clip": 1.01812696, + "balance_loss_mlp": 1.03348505, + "epoch": 0.08483390951450473, + "flos": 16689146267520.0, + "grad_norm": 2.071857028368419, + "language_loss": 0.74074405, + "learning_rate": 3.929567746167578e-06, + "loss": 0.76259077, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.03125, + "step": 1411, + "time_per_iteration": 2.4424655437469482 + }, + { + "auxiliary_loss_clip": 0.01036848, + "auxiliary_loss_mlp": 0.01007644, + "balance_loss_clip": 1.00278056, + "balance_loss_mlp": 1.00472724, + "epoch": 0.08489403276717271, + "flos": 51581341710720.0, + "grad_norm": 0.9068689782583981, + "language_loss": 0.56724936, + "learning_rate": 3.929468327889805e-06, + "loss": 0.58769429, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.04858398, + "router_z_loss_mlp": 0.3203125, + "step": 1412, + "time_per_iteration": 3.0075435638427734 + }, + { + "auxiliary_loss_clip": 0.01133906, + "auxiliary_loss_mlp": 0.01054027, + "balance_loss_clip": 1.02596474, + "balance_loss_mlp": 1.03316736, + "epoch": 0.08495415601984067, + "flos": 17091380574720.0, + "grad_norm": 2.4888670092824627, + "language_loss": 0.88898432, + "learning_rate": 3.9293688407542715e-06, + "loss": 0.91086364, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0078125, + "step": 1413, + "time_per_iteration": 2.515660524368286 + }, + { + "auxiliary_loss_clip": 0.01138069, + "auxiliary_loss_mlp": 0.01049608, + "balance_loss_clip": 1.02165365, + "balance_loss_mlp": 1.03670883, + "epoch": 0.08501427927250864, + "flos": 23147310142080.0, + "grad_norm": 1.928976151337458, + "language_loss": 0.88079464, + "learning_rate": 3.929269284764526e-06, + "loss": 0.9026714, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.015625, + "step": 1414, + "time_per_iteration": 2.4387221336364746 + }, + { + "auxiliary_loss_clip": 0.01138837, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.02861762, + "balance_loss_mlp": 1.03549552, + "epoch": 0.08507440252517662, + "flos": 19062437091840.0, + "grad_norm": 1.8104022752795743, + "language_loss": 0.77125359, + "learning_rate": 3.929169659924123e-06, + "loss": 0.79319859, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.03125, + "step": 1415, + "time_per_iteration": 2.4585282802581787 + }, + { + "auxiliary_loss_clip": 0.01136626, + "auxiliary_loss_mlp": 0.01053604, + "balance_loss_clip": 1.02770007, + "balance_loss_mlp": 1.03540778, + "epoch": 0.08513452577784458, + "flos": 60180137775360.0, + "grad_norm": 1.7518766502615744, + "language_loss": 0.70400184, + "learning_rate": 3.929069966236617e-06, + "loss": 0.72590417, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.015625, + "step": 1416, + "time_per_iteration": 2.7807602882385254 + }, + { + "auxiliary_loss_clip": 0.01142408, + "auxiliary_loss_mlp": 0.01058259, + "balance_loss_clip": 1.02874279, + "balance_loss_mlp": 1.03745627, + "epoch": 0.08519464903051255, + "flos": 27307246348800.0, + "grad_norm": 2.068740206450198, + "language_loss": 0.74673724, + "learning_rate": 3.928970203705565e-06, + "loss": 0.76874387, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.046875, + "step": 1417, + "time_per_iteration": 2.503241539001465 + }, + { + "auxiliary_loss_clip": 0.01135854, + "auxiliary_loss_mlp": 0.01048919, + "balance_loss_clip": 1.02104759, + "balance_loss_mlp": 1.03411698, + "epoch": 0.08525477228318051, + "flos": 20265404497920.0, + "grad_norm": 2.8020629614021364, + "language_loss": 0.82518953, + "learning_rate": 3.92887037233453e-06, + "loss": 0.84703726, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.015625, + "step": 1418, + "time_per_iteration": 2.4216010570526123 + }, + { + "auxiliary_loss_clip": 0.01036904, + "auxiliary_loss_mlp": 0.01013136, + "balance_loss_clip": 1.00758064, + "balance_loss_mlp": 1.00450683, + "epoch": 0.08531489553584849, + "flos": 67611923268480.0, + "grad_norm": 0.892372631416078, + "language_loss": 0.56662297, + "learning_rate": 3.928770472127073e-06, + "loss": 0.58712339, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.32421875, + "step": 1419, + "time_per_iteration": 3.0361785888671875 + }, + { + "auxiliary_loss_clip": 0.01135059, + "auxiliary_loss_mlp": 0.010583, + "balance_loss_clip": 1.03101301, + "balance_loss_mlp": 1.03346896, + "epoch": 0.08537501878851646, + "flos": 27525732837120.0, + "grad_norm": 2.225891915285972, + "language_loss": 0.69978249, + "learning_rate": 3.928670503086758e-06, + "loss": 0.72171611, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.015625, + "step": 1420, + "time_per_iteration": 2.5303280353546143 + }, + { + "auxiliary_loss_clip": 0.011341, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_clip": 1.01250148, + "balance_loss_mlp": 1.0326556, + "epoch": 0.08543514204118442, + "flos": 22783131083520.0, + "grad_norm": 1.5346284285593206, + "language_loss": 0.88313144, + "learning_rate": 3.9285704652171545e-06, + "loss": 0.90488505, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.015625, + "step": 1421, + "time_per_iteration": 2.4855990409851074 + }, + { + "auxiliary_loss_clip": 0.01035216, + "auxiliary_loss_mlp": 0.01009874, + "balance_loss_clip": 1.00470078, + "balance_loss_mlp": 1.00329804, + "epoch": 0.0854952652938524, + "flos": 60987362520960.0, + "grad_norm": 0.8049145610308904, + "language_loss": 0.63468266, + "learning_rate": 3.9284703585218324e-06, + "loss": 0.65513355, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.05175781, + "router_z_loss_mlp": 0.3203125, + "step": 1422, + "time_per_iteration": 3.057037115097046 + }, + { + "auxiliary_loss_clip": 0.01132589, + "auxiliary_loss_mlp": 0.01055404, + "balance_loss_clip": 1.0286057, + "balance_loss_mlp": 1.03614104, + "epoch": 0.08555538854652037, + "flos": 28036791452160.0, + "grad_norm": 3.313427635387682, + "language_loss": 0.83097607, + "learning_rate": 3.928370183004363e-06, + "loss": 0.85285604, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.96484375, + "step": 1423, + "time_per_iteration": 2.525609016418457 + }, + { + "auxiliary_loss_clip": 0.01138477, + "auxiliary_loss_mlp": 0.01063673, + "balance_loss_clip": 1.03676784, + "balance_loss_mlp": 1.03663898, + "epoch": 0.08561551179918833, + "flos": 23508277355520.0, + "grad_norm": 1.6560125375036239, + "language_loss": 0.75101602, + "learning_rate": 3.9282699386683236e-06, + "loss": 0.77303749, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.015625, + "step": 1424, + "time_per_iteration": 2.5153520107269287 + }, + { + "auxiliary_loss_clip": 0.01137275, + "auxiliary_loss_mlp": 0.01058406, + "balance_loss_clip": 1.03109503, + "balance_loss_mlp": 1.0375545, + "epoch": 0.08567563505185631, + "flos": 17926084293120.0, + "grad_norm": 1.8755866914873893, + "language_loss": 0.76020384, + "learning_rate": 3.928169625517289e-06, + "loss": 0.78216064, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.99609375, + "step": 1425, + "time_per_iteration": 2.46828293800354 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01046632, + "balance_loss_clip": 1.02071571, + "balance_loss_mlp": 1.03487504, + "epoch": 0.08573575830452428, + "flos": 19718490049920.0, + "grad_norm": 2.9193753758221637, + "language_loss": 0.93008298, + "learning_rate": 3.9280692435548405e-06, + "loss": 0.95189023, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.9921875, + "step": 1426, + "time_per_iteration": 2.4237146377563477 + }, + { + "auxiliary_loss_clip": 0.01141535, + "auxiliary_loss_mlp": 0.01061609, + "balance_loss_clip": 1.03209257, + "balance_loss_mlp": 1.03872645, + "epoch": 0.08579588155719224, + "flos": 17930587858560.0, + "grad_norm": 2.0509856314306787, + "language_loss": 0.75465858, + "learning_rate": 3.927968792784561e-06, + "loss": 0.77669007, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.03125, + "step": 1427, + "time_per_iteration": 2.5031518936157227 + }, + { + "auxiliary_loss_clip": 0.0113477, + "auxiliary_loss_mlp": 0.01049683, + "balance_loss_clip": 1.02349269, + "balance_loss_mlp": 1.03487051, + "epoch": 0.08585600480986022, + "flos": 16032429993600.0, + "grad_norm": 2.3071386430294982, + "language_loss": 0.82328194, + "learning_rate": 3.927868273210033e-06, + "loss": 0.84512639, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.0, + "step": 1428, + "time_per_iteration": 2.5303401947021484 + }, + { + "auxiliary_loss_clip": 0.01142845, + "auxiliary_loss_mlp": 0.0106289, + "balance_loss_clip": 1.03399324, + "balance_loss_mlp": 1.03674901, + "epoch": 0.08591612806252819, + "flos": 28656185616000.0, + "grad_norm": 2.24419618106378, + "language_loss": 0.79911095, + "learning_rate": 3.927767684834847e-06, + "loss": 0.8211683, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0625, + "step": 1429, + "time_per_iteration": 2.5428974628448486 + }, + { + "auxiliary_loss_clip": 0.01141666, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.02868533, + "balance_loss_mlp": 1.03754735, + "epoch": 0.08597625131519615, + "flos": 20958081338880.0, + "grad_norm": 2.76991814960215, + "language_loss": 0.88487703, + "learning_rate": 3.9276670276625894e-06, + "loss": 0.90686119, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0390625, + "step": 1430, + "time_per_iteration": 2.4304347038269043 + }, + { + "auxiliary_loss_clip": 0.01137127, + "auxiliary_loss_mlp": 0.01052006, + "balance_loss_clip": 1.02481472, + "balance_loss_mlp": 1.03735805, + "epoch": 0.08603637456786412, + "flos": 23255296577280.0, + "grad_norm": 1.6513236082284355, + "language_loss": 0.81535912, + "learning_rate": 3.927566301696856e-06, + "loss": 0.83725047, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0, + "step": 1431, + "time_per_iteration": 2.4605906009674072 + }, + { + "auxiliary_loss_clip": 0.01136667, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_clip": 1.02739429, + "balance_loss_mlp": 1.03375912, + "epoch": 0.0860964978205321, + "flos": 28692914232960.0, + "grad_norm": 1.9114593628809293, + "language_loss": 0.77429157, + "learning_rate": 3.927465506941238e-06, + "loss": 0.79620206, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.03125, + "step": 1432, + "time_per_iteration": 2.466153860092163 + }, + { + "auxiliary_loss_clip": 0.01136558, + "auxiliary_loss_mlp": 0.01058598, + "balance_loss_clip": 1.02967751, + "balance_loss_mlp": 1.03431463, + "epoch": 0.08615662107320006, + "flos": 19317372906240.0, + "grad_norm": 2.704543968863709, + "language_loss": 0.72969025, + "learning_rate": 3.927364643399335e-06, + "loss": 0.75164181, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0234375, + "step": 1433, + "time_per_iteration": 2.426859140396118 + }, + { + "auxiliary_loss_clip": 0.0114178, + "auxiliary_loss_mlp": 0.01059604, + "balance_loss_clip": 1.02967048, + "balance_loss_mlp": 1.03753674, + "epoch": 0.08621674432586802, + "flos": 15850776856320.0, + "grad_norm": 2.5010258393633356, + "language_loss": 0.85818481, + "learning_rate": 3.927263711074745e-06, + "loss": 0.8801986, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.0390625, + "step": 1434, + "time_per_iteration": 2.3832969665527344 + }, + { + "auxiliary_loss_clip": 0.0113731, + "auxiliary_loss_mlp": 0.01055979, + "balance_loss_clip": 1.02808404, + "balance_loss_mlp": 1.03510058, + "epoch": 0.086276867578536, + "flos": 14099777838720.0, + "grad_norm": 2.47929626062069, + "language_loss": 0.78560674, + "learning_rate": 3.927162709971072e-06, + "loss": 0.8075397, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.0234375, + "step": 1435, + "time_per_iteration": 2.4245572090148926 + }, + { + "auxiliary_loss_clip": 0.01036822, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.03898871, + "balance_loss_mlp": 1.00600958, + "epoch": 0.08633699083120397, + "flos": 70181250710400.0, + "grad_norm": 0.923907410816164, + "language_loss": 0.57990175, + "learning_rate": 3.927061640091918e-06, + "loss": 0.60069823, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.03833008, + "router_z_loss_mlp": 0.30859375, + "step": 1436, + "time_per_iteration": 4.62123966217041 + }, + { + "auxiliary_loss_clip": 0.01136609, + "auxiliary_loss_mlp": 0.01054357, + "balance_loss_clip": 1.02512634, + "balance_loss_mlp": 1.0350647, + "epoch": 0.08639711408387193, + "flos": 30297592275840.0, + "grad_norm": 2.785802479640344, + "language_loss": 0.68792832, + "learning_rate": 3.926960501440891e-06, + "loss": 0.70983791, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.015625, + "step": 1437, + "time_per_iteration": 2.506444215774536 + }, + { + "auxiliary_loss_clip": 0.01136806, + "auxiliary_loss_mlp": 0.0104706, + "balance_loss_clip": 1.01931965, + "balance_loss_mlp": 1.0348177, + "epoch": 0.08645723733653991, + "flos": 20296791676800.0, + "grad_norm": 2.169010760070846, + "language_loss": 0.72614551, + "learning_rate": 3.9268592940216014e-06, + "loss": 0.74798417, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.015625, + "step": 1438, + "time_per_iteration": 3.8132054805755615 + }, + { + "auxiliary_loss_clip": 0.01132794, + "auxiliary_loss_mlp": 0.0104806, + "balance_loss_clip": 1.01983142, + "balance_loss_mlp": 1.03471184, + "epoch": 0.08651736058920788, + "flos": 32889195031680.0, + "grad_norm": 1.600592663775302, + "language_loss": 0.64091539, + "learning_rate": 3.9267580178376596e-06, + "loss": 0.6627239, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.98046875, + "step": 1439, + "time_per_iteration": 2.560499906539917 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01049448, + "balance_loss_clip": 1.02102852, + "balance_loss_mlp": 1.03593493, + "epoch": 0.08657748384187584, + "flos": 22636286438400.0, + "grad_norm": 2.5636767582097706, + "language_loss": 0.87194371, + "learning_rate": 3.92665667289268e-06, + "loss": 0.89381528, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.015625, + "step": 1440, + "time_per_iteration": 5.246348857879639 + }, + { + "auxiliary_loss_clip": 0.01142193, + "auxiliary_loss_mlp": 0.01058538, + "balance_loss_clip": 1.02762711, + "balance_loss_mlp": 1.03654242, + "epoch": 0.08663760709454381, + "flos": 23657286504960.0, + "grad_norm": 3.2939192965722217, + "language_loss": 0.8352201, + "learning_rate": 3.92655525919028e-06, + "loss": 0.85722744, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.0546875, + "step": 1441, + "time_per_iteration": 2.458915948867798 + }, + { + "auxiliary_loss_clip": 0.01033196, + "auxiliary_loss_mlp": 0.01004597, + "balance_loss_clip": 1.00054348, + "balance_loss_mlp": 1.00218558, + "epoch": 0.08669773034721179, + "flos": 62683688482560.0, + "grad_norm": 0.8420296727267951, + "language_loss": 0.60429287, + "learning_rate": 3.926453776734078e-06, + "loss": 0.62467074, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.04052734, + "router_z_loss_mlp": 0.31054688, + "step": 1442, + "time_per_iteration": 3.161848783493042 + }, + { + "auxiliary_loss_clip": 0.01139796, + "auxiliary_loss_mlp": 0.01052902, + "balance_loss_clip": 1.02550721, + "balance_loss_mlp": 1.03469789, + "epoch": 0.08675785359987975, + "flos": 20666451818880.0, + "grad_norm": 2.652059476450735, + "language_loss": 0.78552687, + "learning_rate": 3.9263522255276965e-06, + "loss": 0.80745387, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0546875, + "step": 1443, + "time_per_iteration": 2.423205852508545 + }, + { + "auxiliary_loss_clip": 0.01135757, + "auxiliary_loss_mlp": 0.01049801, + "balance_loss_clip": 1.02301478, + "balance_loss_mlp": 1.03341126, + "epoch": 0.08681797685254772, + "flos": 26939960179200.0, + "grad_norm": 1.604042055689223, + "language_loss": 0.82368612, + "learning_rate": 3.9262506055747596e-06, + "loss": 0.84554166, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.0234375, + "step": 1444, + "time_per_iteration": 2.4947876930236816 + }, + { + "auxiliary_loss_clip": 0.01139058, + "auxiliary_loss_mlp": 0.01054381, + "balance_loss_clip": 1.02591348, + "balance_loss_mlp": 1.03583121, + "epoch": 0.0868781001052157, + "flos": 17711856990720.0, + "grad_norm": 2.8698712932591914, + "language_loss": 0.87018931, + "learning_rate": 3.926148916878893e-06, + "loss": 0.8921237, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.03125, + "step": 1445, + "time_per_iteration": 2.438837766647339 + }, + { + "auxiliary_loss_clip": 0.01140454, + "auxiliary_loss_mlp": 0.01055732, + "balance_loss_clip": 1.02794433, + "balance_loss_mlp": 1.0385077, + "epoch": 0.08693822335788366, + "flos": 19895639621760.0, + "grad_norm": 1.8482243248422658, + "language_loss": 0.81103694, + "learning_rate": 3.926047159443727e-06, + "loss": 0.83299881, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0234375, + "step": 1446, + "time_per_iteration": 2.46681547164917 + }, + { + "auxiliary_loss_clip": 0.01033918, + "auxiliary_loss_mlp": 0.01006089, + "balance_loss_clip": 1.00213134, + "balance_loss_mlp": 1.00254512, + "epoch": 0.08699834661055163, + "flos": 67020878995200.0, + "grad_norm": 0.7253279505632818, + "language_loss": 0.54759985, + "learning_rate": 3.925945333272891e-06, + "loss": 0.56799996, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.31445312, + "step": 1447, + "time_per_iteration": 3.1271657943725586 + }, + { + "auxiliary_loss_clip": 0.01134434, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_clip": 1.02214408, + "balance_loss_mlp": 1.03559732, + "epoch": 0.0870584698632196, + "flos": 13479650536320.0, + "grad_norm": 2.368430375578426, + "language_loss": 0.84644473, + "learning_rate": 3.925843438370021e-06, + "loss": 0.86829698, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.984375, + "step": 1448, + "time_per_iteration": 2.5329737663269043 + }, + { + "auxiliary_loss_clip": 0.01140375, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_clip": 1.02382088, + "balance_loss_mlp": 1.03608632, + "epoch": 0.08711859311588757, + "flos": 16106096695680.0, + "grad_norm": 2.6427047070415206, + "language_loss": 0.80531889, + "learning_rate": 3.925741474738752e-06, + "loss": 0.82724291, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.046875, + "step": 1449, + "time_per_iteration": 2.3928134441375732 + }, + { + "auxiliary_loss_clip": 0.01135613, + "auxiliary_loss_mlp": 0.01043545, + "balance_loss_clip": 1.01903582, + "balance_loss_mlp": 1.03557396, + "epoch": 0.08717871636855554, + "flos": 38470829512320.0, + "grad_norm": 1.5675969670229246, + "language_loss": 0.71181607, + "learning_rate": 3.925639442382724e-06, + "loss": 0.73360765, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.0, + "step": 1450, + "time_per_iteration": 2.5841941833496094 + }, + { + "auxiliary_loss_clip": 0.01136972, + "auxiliary_loss_mlp": 0.01054337, + "balance_loss_clip": 1.0267638, + "balance_loss_mlp": 1.03637421, + "epoch": 0.0872388396212235, + "flos": 17599681192320.0, + "grad_norm": 1.771054780384107, + "language_loss": 0.83204961, + "learning_rate": 3.925537341305578e-06, + "loss": 0.85396278, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.0078125, + "step": 1451, + "time_per_iteration": 2.3957407474517822 + }, + { + "auxiliary_loss_clip": 0.01133863, + "auxiliary_loss_mlp": 0.0105886, + "balance_loss_clip": 1.03336096, + "balance_loss_mlp": 1.03573465, + "epoch": 0.08729896287389148, + "flos": 25258368677760.0, + "grad_norm": 2.1923603807858347, + "language_loss": 0.74339652, + "learning_rate": 3.925435171510957e-06, + "loss": 0.76532376, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.984375, + "step": 1452, + "time_per_iteration": 2.4761288166046143 + }, + { + "auxiliary_loss_clip": 0.0113897, + "auxiliary_loss_mlp": 0.0105538, + "balance_loss_clip": 1.02777123, + "balance_loss_mlp": 1.03575897, + "epoch": 0.08735908612655945, + "flos": 15631557229440.0, + "grad_norm": 3.009200128085401, + "language_loss": 0.79649633, + "learning_rate": 3.925332933002507e-06, + "loss": 0.81843984, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.03125, + "step": 1453, + "time_per_iteration": 2.4006617069244385 + }, + { + "auxiliary_loss_clip": 0.01135753, + "auxiliary_loss_mlp": 0.01051944, + "balance_loss_clip": 1.02537227, + "balance_loss_mlp": 1.03667951, + "epoch": 0.08741920937922741, + "flos": 20338617352320.0, + "grad_norm": 1.875711402079848, + "language_loss": 0.70716834, + "learning_rate": 3.925230625783877e-06, + "loss": 0.72904533, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.9921875, + "step": 1454, + "time_per_iteration": 2.452676773071289 + }, + { + "auxiliary_loss_clip": 0.01034214, + "auxiliary_loss_mlp": 0.0101123, + "balance_loss_clip": 1.00710583, + "balance_loss_mlp": 1.00364447, + "epoch": 0.08747933263189539, + "flos": 62816252515200.0, + "grad_norm": 0.7824902349415341, + "language_loss": 0.58511788, + "learning_rate": 3.925128249858719e-06, + "loss": 0.60557228, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.04125977, + "router_z_loss_mlp": 0.3046875, + "step": 1455, + "time_per_iteration": 3.0468578338623047 + }, + { + "auxiliary_loss_clip": 0.01134979, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_clip": 1.02142525, + "balance_loss_mlp": 1.03400826, + "epoch": 0.08753945588456336, + "flos": 33034503576960.0, + "grad_norm": 1.5610199804777385, + "language_loss": 0.77557188, + "learning_rate": 3.925025805230685e-06, + "loss": 0.79740798, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0078125, + "step": 1456, + "time_per_iteration": 2.5630991458892822 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01054823, + "balance_loss_clip": 1.02566481, + "balance_loss_mlp": 1.03343987, + "epoch": 0.08759957913723132, + "flos": 35545911206400.0, + "grad_norm": 2.3625478373839406, + "language_loss": 0.71963835, + "learning_rate": 3.924923291903433e-06, + "loss": 0.74151307, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.9921875, + "step": 1457, + "time_per_iteration": 2.551595449447632 + }, + { + "auxiliary_loss_clip": 0.01130536, + "auxiliary_loss_mlp": 0.01044576, + "balance_loss_clip": 1.01885056, + "balance_loss_mlp": 1.03258061, + "epoch": 0.0876597023898993, + "flos": 23910092726400.0, + "grad_norm": 1.5815599312414572, + "language_loss": 0.86436832, + "learning_rate": 3.924820709880619e-06, + "loss": 0.88611948, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.98046875, + "step": 1458, + "time_per_iteration": 2.4821531772613525 + }, + { + "auxiliary_loss_clip": 0.01140857, + "auxiliary_loss_mlp": 0.0104819, + "balance_loss_clip": 1.02134418, + "balance_loss_mlp": 1.03808141, + "epoch": 0.08771982564256726, + "flos": 18113043957120.0, + "grad_norm": 1.6349072283959376, + "language_loss": 0.8053205, + "learning_rate": 3.924718059165906e-06, + "loss": 0.82721102, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.03125, + "step": 1459, + "time_per_iteration": 2.4148335456848145 + }, + { + "auxiliary_loss_clip": 0.01137273, + "auxiliary_loss_mlp": 0.01055503, + "balance_loss_clip": 1.02746463, + "balance_loss_mlp": 1.03450203, + "epoch": 0.08777994889523523, + "flos": 17711054029440.0, + "grad_norm": 2.099146642925664, + "language_loss": 0.84267873, + "learning_rate": 3.924615339762956e-06, + "loss": 0.8646065, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0234375, + "step": 1460, + "time_per_iteration": 2.470891237258911 + }, + { + "auxiliary_loss_clip": 0.01130985, + "auxiliary_loss_mlp": 0.01051176, + "balance_loss_clip": 1.02502179, + "balance_loss_mlp": 1.03299022, + "epoch": 0.0878400721479032, + "flos": 12819198746880.0, + "grad_norm": 2.6593803727230347, + "language_loss": 0.81124723, + "learning_rate": 3.924512551675435e-06, + "loss": 0.83306885, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.98046875, + "step": 1461, + "time_per_iteration": 2.421342372894287 + }, + { + "auxiliary_loss_clip": 0.01138365, + "auxiliary_loss_mlp": 0.01052623, + "balance_loss_clip": 1.02715981, + "balance_loss_mlp": 1.03707671, + "epoch": 0.08790019540057117, + "flos": 26391579454080.0, + "grad_norm": 1.7269677394716834, + "language_loss": 0.76201111, + "learning_rate": 3.924409694907011e-06, + "loss": 0.783921, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.015625, + "step": 1462, + "time_per_iteration": 2.6441681385040283 + }, + { + "auxiliary_loss_clip": 0.01139179, + "auxiliary_loss_mlp": 0.01055463, + "balance_loss_clip": 1.02630436, + "balance_loss_mlp": 1.03623247, + "epoch": 0.08796031865323914, + "flos": 19133066505600.0, + "grad_norm": 1.7974264920681688, + "language_loss": 0.74233687, + "learning_rate": 3.924306769461356e-06, + "loss": 0.7642833, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.03125, + "step": 1463, + "time_per_iteration": 2.507075309753418 + }, + { + "auxiliary_loss_clip": 0.01137699, + "auxiliary_loss_mlp": 0.01051707, + "balance_loss_clip": 1.02271533, + "balance_loss_mlp": 1.03376389, + "epoch": 0.0880204419059071, + "flos": 26063186405760.0, + "grad_norm": 1.8813185484463697, + "language_loss": 0.83247638, + "learning_rate": 3.924203775342142e-06, + "loss": 0.85437036, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0390625, + "step": 1464, + "time_per_iteration": 2.582576274871826 + }, + { + "auxiliary_loss_clip": 0.01135477, + "auxiliary_loss_mlp": 0.01055543, + "balance_loss_clip": 1.02949548, + "balance_loss_mlp": 1.03474152, + "epoch": 0.08808056515857508, + "flos": 22376881969920.0, + "grad_norm": 1.893011339821771, + "language_loss": 0.78369987, + "learning_rate": 3.924100712553046e-06, + "loss": 0.80561006, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.0078125, + "step": 1465, + "time_per_iteration": 2.4433510303497314 + }, + { + "auxiliary_loss_clip": 0.01138777, + "auxiliary_loss_mlp": 0.01050684, + "balance_loss_clip": 1.02324176, + "balance_loss_mlp": 1.03619003, + "epoch": 0.08814068841124305, + "flos": 23184178404480.0, + "grad_norm": 2.4569350502347165, + "language_loss": 0.84995323, + "learning_rate": 3.923997581097744e-06, + "loss": 0.87184787, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0234375, + "step": 1466, + "time_per_iteration": 2.514909267425537 + }, + { + "auxiliary_loss_clip": 0.01137144, + "auxiliary_loss_mlp": 0.01049688, + "balance_loss_clip": 1.02200794, + "balance_loss_mlp": 1.03487492, + "epoch": 0.08820081166391101, + "flos": 25154117758080.0, + "grad_norm": 2.1428941119918825, + "language_loss": 0.84030366, + "learning_rate": 3.923894380979917e-06, + "loss": 0.86217201, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.015625, + "step": 1467, + "time_per_iteration": 2.4532175064086914 + }, + { + "auxiliary_loss_clip": 0.01136589, + "auxiliary_loss_mlp": 0.01050094, + "balance_loss_clip": 1.02235413, + "balance_loss_mlp": 1.03442502, + "epoch": 0.08826093491657899, + "flos": 22230735552000.0, + "grad_norm": 1.8526771256998313, + "language_loss": 0.75296938, + "learning_rate": 3.9237911122032485e-06, + "loss": 0.7748363, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0234375, + "step": 1468, + "time_per_iteration": 2.4733340740203857 + }, + { + "auxiliary_loss_clip": 0.01134615, + "auxiliary_loss_mlp": 0.01045144, + "balance_loss_clip": 1.0202769, + "balance_loss_mlp": 1.03503835, + "epoch": 0.08832105816924696, + "flos": 22125751493760.0, + "grad_norm": 5.546335350124982, + "language_loss": 0.8053264, + "learning_rate": 3.923687774771424e-06, + "loss": 0.827124, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.99609375, + "step": 1469, + "time_per_iteration": 2.4505934715270996 + }, + { + "auxiliary_loss_clip": 0.01139914, + "auxiliary_loss_mlp": 0.01053373, + "balance_loss_clip": 1.02658677, + "balance_loss_mlp": 1.03651309, + "epoch": 0.08838118142191492, + "flos": 17565536016000.0, + "grad_norm": 1.989153980440257, + "language_loss": 0.77890998, + "learning_rate": 3.923584368688132e-06, + "loss": 0.80084276, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.03125, + "step": 1470, + "time_per_iteration": 2.4670047760009766 + }, + { + "auxiliary_loss_clip": 0.01132672, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_clip": 1.0239327, + "balance_loss_mlp": 1.03389835, + "epoch": 0.0884413046745829, + "flos": 20776148910720.0, + "grad_norm": 1.8836256182417797, + "language_loss": 0.83851361, + "learning_rate": 3.923480893957061e-06, + "loss": 0.86034775, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.98828125, + "step": 1471, + "time_per_iteration": 2.4457244873046875 + }, + { + "auxiliary_loss_clip": 0.01130809, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_clip": 1.02320361, + "balance_loss_mlp": 1.0354538, + "epoch": 0.08850142792725087, + "flos": 22124424862080.0, + "grad_norm": 5.684584918680441, + "language_loss": 0.83179504, + "learning_rate": 3.923377350581905e-06, + "loss": 0.85356736, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.953125, + "step": 1472, + "time_per_iteration": 2.4930155277252197 + }, + { + "auxiliary_loss_clip": 0.01135368, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.01808798, + "balance_loss_mlp": 1.03654361, + "epoch": 0.08856155117991883, + "flos": 22417660304640.0, + "grad_norm": 2.271622402276832, + "language_loss": 0.82474113, + "learning_rate": 3.923273738566359e-06, + "loss": 0.84653151, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.98828125, + "step": 1473, + "time_per_iteration": 2.4435055255889893 + }, + { + "auxiliary_loss_clip": 0.01136878, + "auxiliary_loss_mlp": 0.01047921, + "balance_loss_clip": 1.02318478, + "balance_loss_mlp": 1.03624725, + "epoch": 0.0886216744325868, + "flos": 29935647544320.0, + "grad_norm": 1.5623027892790873, + "language_loss": 0.78689879, + "learning_rate": 3.92317005791412e-06, + "loss": 0.80874676, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.0078125, + "step": 1474, + "time_per_iteration": 2.5359132289886475 + }, + { + "auxiliary_loss_clip": 0.01132754, + "auxiliary_loss_mlp": 0.01048824, + "balance_loss_clip": 1.02194262, + "balance_loss_mlp": 1.03589225, + "epoch": 0.08868179768525478, + "flos": 23981839303680.0, + "grad_norm": 1.6698744687384766, + "language_loss": 0.79016858, + "learning_rate": 3.923066308628889e-06, + "loss": 0.81198436, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.96875, + "step": 1475, + "time_per_iteration": 2.441384792327881 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01045926, + "balance_loss_clip": 1.02074862, + "balance_loss_mlp": 1.03375602, + "epoch": 0.08874192093792274, + "flos": 43175934599040.0, + "grad_norm": 1.5978715824027918, + "language_loss": 0.73998678, + "learning_rate": 3.922962490714368e-06, + "loss": 0.76177835, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.99609375, + "step": 1476, + "time_per_iteration": 4.0761120319366455 + }, + { + "auxiliary_loss_clip": 0.01136907, + "auxiliary_loss_mlp": 0.01049839, + "balance_loss_clip": 1.02361321, + "balance_loss_mlp": 1.03561664, + "epoch": 0.0888020441905907, + "flos": 32851104871680.0, + "grad_norm": 1.8367264592435533, + "language_loss": 0.74373507, + "learning_rate": 3.922858604174262e-06, + "loss": 0.76560253, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.015625, + "step": 1477, + "time_per_iteration": 2.597036123275757 + }, + { + "auxiliary_loss_clip": 0.0113435, + "auxiliary_loss_mlp": 0.01055158, + "balance_loss_clip": 1.02894378, + "balance_loss_mlp": 1.03508937, + "epoch": 0.08886216744325869, + "flos": 23148217837440.0, + "grad_norm": 1.8871903181689216, + "language_loss": 0.86721641, + "learning_rate": 3.922754649012279e-06, + "loss": 0.88911152, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9921875, + "step": 1478, + "time_per_iteration": 3.840590476989746 + }, + { + "auxiliary_loss_clip": 0.0113549, + "auxiliary_loss_mlp": 0.01053253, + "balance_loss_clip": 1.02746797, + "balance_loss_mlp": 1.03530025, + "epoch": 0.08892229069592665, + "flos": 23330464467840.0, + "grad_norm": 3.261643036609131, + "language_loss": 0.77389818, + "learning_rate": 3.922650625232128e-06, + "loss": 0.79578561, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.0, + "step": 1479, + "time_per_iteration": 4.07110071182251 + }, + { + "auxiliary_loss_clip": 0.01130919, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_clip": 1.01843274, + "balance_loss_mlp": 1.03348565, + "epoch": 0.08898241394859462, + "flos": 26212579580160.0, + "grad_norm": 2.419935582481106, + "language_loss": 0.78363329, + "learning_rate": 3.922546532837522e-06, + "loss": 0.80537885, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.9765625, + "step": 1480, + "time_per_iteration": 4.018751382827759 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.02467179, + "balance_loss_mlp": 1.03337443, + "epoch": 0.0890425372012626, + "flos": 23549474626560.0, + "grad_norm": 2.081024641177727, + "language_loss": 0.66308194, + "learning_rate": 3.9224423718321756e-06, + "loss": 0.68494904, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0078125, + "step": 1481, + "time_per_iteration": 2.4800703525543213 + }, + { + "auxiliary_loss_clip": 0.01134249, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.02288699, + "balance_loss_mlp": 1.03551579, + "epoch": 0.08910266045393056, + "flos": 23001687394560.0, + "grad_norm": 1.864432448296234, + "language_loss": 0.9653616, + "learning_rate": 3.922338142219806e-06, + "loss": 0.98718101, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.98828125, + "step": 1482, + "time_per_iteration": -0.18274378776550293 + }, + { + "auxiliary_loss_clip": 0.01137095, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.02611768, + "balance_loss_mlp": 1.03577983, + "epoch": 0.08916278370659853, + "flos": 31935298331520.0, + "grad_norm": 1.9234200730524673, + "language_loss": 0.7877143, + "learning_rate": 3.922233844004133e-06, + "loss": 0.8096112, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.015625, + "step": 1483, + "time_per_iteration": 2.5001602172851562 + }, + { + "auxiliary_loss_clip": 0.01132379, + "auxiliary_loss_mlp": 0.01056567, + "balance_loss_clip": 1.03047252, + "balance_loss_mlp": 1.03430367, + "epoch": 0.08922290695926649, + "flos": 17529435803520.0, + "grad_norm": 2.3630293380418683, + "language_loss": 0.85483754, + "learning_rate": 3.922129477188879e-06, + "loss": 0.87672698, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.98046875, + "step": 1484, + "time_per_iteration": 2.4240922927856445 + }, + { + "auxiliary_loss_clip": 0.01142654, + "auxiliary_loss_mlp": 0.01049875, + "balance_loss_clip": 1.02124047, + "balance_loss_mlp": 1.03831923, + "epoch": 0.08928303021193447, + "flos": 32123689361280.0, + "grad_norm": 1.6040066629630427, + "language_loss": 0.80224192, + "learning_rate": 3.922025041777768e-06, + "loss": 0.82416725, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.0390625, + "step": 1485, + "time_per_iteration": 2.524641275405884 + }, + { + "auxiliary_loss_clip": 0.01133156, + "auxiliary_loss_mlp": 0.01051077, + "balance_loss_clip": 1.02557778, + "balance_loss_mlp": 1.03244472, + "epoch": 0.08934315346460243, + "flos": 22124180482560.0, + "grad_norm": 2.0853469502904693, + "language_loss": 0.82999718, + "learning_rate": 3.921920537774528e-06, + "loss": 0.85183954, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.0078125, + "step": 1486, + "time_per_iteration": 2.4714694023132324 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01056082, + "balance_loss_clip": 1.02856851, + "balance_loss_mlp": 1.03458941, + "epoch": 0.0894032767172704, + "flos": 22564470038400.0, + "grad_norm": 1.6977251970071152, + "language_loss": 0.76376575, + "learning_rate": 3.921815965182887e-06, + "loss": 0.78565204, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.9765625, + "step": 1487, + "time_per_iteration": 2.472322940826416 + }, + { + "auxiliary_loss_clip": 0.01135312, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.02255261, + "balance_loss_mlp": 1.03422713, + "epoch": 0.08946339996993838, + "flos": 20192366200320.0, + "grad_norm": 2.0317647278322477, + "language_loss": 0.82573104, + "learning_rate": 3.921711324006578e-06, + "loss": 0.8475889, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.0078125, + "step": 1488, + "time_per_iteration": 2.4572408199310303 + }, + { + "auxiliary_loss_clip": 0.01132356, + "auxiliary_loss_mlp": 0.01048558, + "balance_loss_clip": 1.02412033, + "balance_loss_mlp": 1.0337038, + "epoch": 0.08952352322260634, + "flos": 48358372060800.0, + "grad_norm": 2.8944739791810865, + "language_loss": 0.72003675, + "learning_rate": 3.921606614249335e-06, + "loss": 0.74184585, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.98828125, + "step": 1489, + "time_per_iteration": 2.759321451187134 + }, + { + "auxiliary_loss_clip": 0.01130591, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.02280331, + "balance_loss_mlp": 1.03231263, + "epoch": 0.08958364647527431, + "flos": 31791805176960.0, + "grad_norm": 1.7753833348466836, + "language_loss": 0.89858687, + "learning_rate": 3.921501835914894e-06, + "loss": 0.92038399, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.984375, + "step": 1490, + "time_per_iteration": 2.541839122772217 + }, + { + "auxiliary_loss_clip": 0.01140402, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.02856421, + "balance_loss_mlp": 1.03494847, + "epoch": 0.08964376972794229, + "flos": 23367053439360.0, + "grad_norm": 2.3138583614972386, + "language_loss": 0.73459613, + "learning_rate": 3.921396989006997e-06, + "loss": 0.75656223, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0546875, + "step": 1491, + "time_per_iteration": 2.482553482055664 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_clip": 1.01953959, + "balance_loss_mlp": 1.03456628, + "epoch": 0.08970389298061025, + "flos": 23293666028160.0, + "grad_norm": 1.9212727717432074, + "language_loss": 0.76900983, + "learning_rate": 3.9212920735293824e-06, + "loss": 0.79078269, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.984375, + "step": 1492, + "time_per_iteration": 2.4586904048919678 + }, + { + "auxiliary_loss_clip": 0.01132489, + "auxiliary_loss_mlp": 0.01048568, + "balance_loss_clip": 1.02209187, + "balance_loss_mlp": 1.03553343, + "epoch": 0.08976401623327822, + "flos": 33760417898880.0, + "grad_norm": 2.0921066681155245, + "language_loss": 0.70533705, + "learning_rate": 3.921187089485796e-06, + "loss": 0.72714764, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.96875, + "step": 1493, + "time_per_iteration": 2.540811777114868 + }, + { + "auxiliary_loss_clip": 0.01132349, + "auxiliary_loss_mlp": 0.01048068, + "balance_loss_clip": 1.02154422, + "balance_loss_mlp": 1.03320432, + "epoch": 0.08982413948594618, + "flos": 23910302194560.0, + "grad_norm": 1.8346011499961192, + "language_loss": 0.86851084, + "learning_rate": 3.921082036879985e-06, + "loss": 0.89031506, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.9921875, + "step": 1494, + "time_per_iteration": 2.462740421295166 + }, + { + "auxiliary_loss_clip": 0.01133166, + "auxiliary_loss_mlp": 0.01054281, + "balance_loss_clip": 1.02751827, + "balance_loss_mlp": 1.03466368, + "epoch": 0.08988426273861416, + "flos": 16836584405760.0, + "grad_norm": 1.7800069294718281, + "language_loss": 0.83029783, + "learning_rate": 3.9209769157156976e-06, + "loss": 0.85217232, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.984375, + "step": 1495, + "time_per_iteration": 2.403963804244995 + }, + { + "auxiliary_loss_clip": 0.0113773, + "auxiliary_loss_mlp": 0.01058836, + "balance_loss_clip": 1.03222859, + "balance_loss_mlp": 1.03652573, + "epoch": 0.08994438599128213, + "flos": 14792489591040.0, + "grad_norm": 1.8861377359210703, + "language_loss": 0.69612455, + "learning_rate": 3.920871725996685e-06, + "loss": 0.7180903, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.015625, + "step": 1496, + "time_per_iteration": 2.3890879154205322 + }, + { + "auxiliary_loss_clip": 0.01132691, + "auxiliary_loss_mlp": 0.01049185, + "balance_loss_clip": 1.024616, + "balance_loss_mlp": 1.0346992, + "epoch": 0.09000450924395009, + "flos": 17383359208320.0, + "grad_norm": 1.665119356571217, + "language_loss": 0.79898089, + "learning_rate": 3.920766467726702e-06, + "loss": 0.82079965, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.9765625, + "step": 1497, + "time_per_iteration": 2.4362621307373047 + }, + { + "auxiliary_loss_clip": 0.01135802, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.02345371, + "balance_loss_mlp": 1.03321958, + "epoch": 0.09006463249661807, + "flos": 24279159375360.0, + "grad_norm": 2.704740676644494, + "language_loss": 0.8292343, + "learning_rate": 3.920661140909505e-06, + "loss": 0.85108304, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.0234375, + "step": 1498, + "time_per_iteration": 2.4597747325897217 + }, + { + "auxiliary_loss_clip": 0.01136483, + "auxiliary_loss_mlp": 0.01056237, + "balance_loss_clip": 1.0302614, + "balance_loss_mlp": 1.03528881, + "epoch": 0.09012475574928604, + "flos": 13661094205440.0, + "grad_norm": 4.45428008519945, + "language_loss": 0.78773302, + "learning_rate": 3.920555745548851e-06, + "loss": 0.8096602, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.015625, + "step": 1499, + "time_per_iteration": 2.419222593307495 + }, + { + "auxiliary_loss_clip": 0.01130284, + "auxiliary_loss_mlp": 0.01055989, + "balance_loss_clip": 1.03046608, + "balance_loss_mlp": 1.03425586, + "epoch": 0.090184879001954, + "flos": 23326728952320.0, + "grad_norm": 1.7092410090117585, + "language_loss": 0.93098229, + "learning_rate": 3.920450281648503e-06, + "loss": 0.95284498, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9609375, + "step": 1500, + "time_per_iteration": 2.435533285140991 + }, + { + "auxiliary_loss_clip": 0.01133888, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.02153802, + "balance_loss_mlp": 1.03357148, + "epoch": 0.09024500225462198, + "flos": 23001582660480.0, + "grad_norm": 2.196175652597993, + "language_loss": 0.74589396, + "learning_rate": 3.920344749212226e-06, + "loss": 0.76770335, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.0, + "step": 1501, + "time_per_iteration": 2.46028733253479 + }, + { + "auxiliary_loss_clip": 0.01044632, + "auxiliary_loss_mlp": 0.01025841, + "balance_loss_clip": 1.02193058, + "balance_loss_mlp": 1.00976682, + "epoch": 0.09030512550728995, + "flos": 62185966007040.0, + "grad_norm": 0.7316882177862591, + "language_loss": 0.58222729, + "learning_rate": 3.920239148243783e-06, + "loss": 0.60293198, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.0390625, + "router_z_loss_mlp": 0.34765625, + "step": 1502, + "time_per_iteration": 3.0684406757354736 + }, + { + "auxiliary_loss_clip": 0.01127273, + "auxiliary_loss_mlp": 0.01045086, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.03078341, + "epoch": 0.09036524875995791, + "flos": 38799152737920.0, + "grad_norm": 2.273966761436493, + "language_loss": 0.69753504, + "learning_rate": 3.920133478746944e-06, + "loss": 0.71925861, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.96484375, + "step": 1503, + "time_per_iteration": 2.6171352863311768 + }, + { + "auxiliary_loss_clip": 0.01133858, + "auxiliary_loss_mlp": 0.01047987, + "balance_loss_clip": 1.02242839, + "balance_loss_mlp": 1.03417444, + "epoch": 0.09042537201262588, + "flos": 21688987985280.0, + "grad_norm": 2.231932536970297, + "language_loss": 0.85978246, + "learning_rate": 3.920027740725481e-06, + "loss": 0.88160092, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.0, + "step": 1504, + "time_per_iteration": 2.4569787979125977 + }, + { + "auxiliary_loss_clip": 0.01140245, + "auxiliary_loss_mlp": 0.01054718, + "balance_loss_clip": 1.0257622, + "balance_loss_mlp": 1.03655267, + "epoch": 0.09048549526529386, + "flos": 22266102625920.0, + "grad_norm": 2.087997650270069, + "language_loss": 0.72479331, + "learning_rate": 3.919921934183167e-06, + "loss": 0.7467429, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0390625, + "step": 1505, + "time_per_iteration": 2.4890501499176025 + }, + { + "auxiliary_loss_clip": 0.01132126, + "auxiliary_loss_mlp": 0.01046802, + "balance_loss_clip": 1.02039695, + "balance_loss_mlp": 1.03433979, + "epoch": 0.09054561851796182, + "flos": 14610068403840.0, + "grad_norm": 2.0171736463787093, + "language_loss": 0.80757898, + "learning_rate": 3.919816059123778e-06, + "loss": 0.82936823, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9765625, + "step": 1506, + "time_per_iteration": 2.4429192543029785 + }, + { + "auxiliary_loss_clip": 0.01132694, + "auxiliary_loss_mlp": 0.01044174, + "balance_loss_clip": 1.0197835, + "balance_loss_mlp": 1.03535104, + "epoch": 0.09060574177062979, + "flos": 27634941169920.0, + "grad_norm": 1.9727846762699803, + "language_loss": 0.75965023, + "learning_rate": 3.919710115551092e-06, + "loss": 0.78141892, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.97265625, + "step": 1507, + "time_per_iteration": 2.5400660037994385 + }, + { + "auxiliary_loss_clip": 0.01042598, + "auxiliary_loss_mlp": 0.01005991, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00916338, + "epoch": 0.09066586502329776, + "flos": 66082657495680.0, + "grad_norm": 0.7293385187982612, + "language_loss": 0.57651293, + "learning_rate": 3.91960410346889e-06, + "loss": 0.59699887, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.33398438, + "step": 1508, + "time_per_iteration": 2.967487335205078 + }, + { + "auxiliary_loss_clip": 0.0113586, + "auxiliary_loss_mlp": 0.0105616, + "balance_loss_clip": 1.02919483, + "balance_loss_mlp": 1.03609443, + "epoch": 0.09072598827596573, + "flos": 18915452801280.0, + "grad_norm": 2.2190963044476137, + "language_loss": 0.85160971, + "learning_rate": 3.919498022880955e-06, + "loss": 0.87352985, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.99609375, + "step": 1509, + "time_per_iteration": 2.5288822650909424 + }, + { + "auxiliary_loss_clip": 0.01141463, + "auxiliary_loss_mlp": 0.01054211, + "balance_loss_clip": 1.02641118, + "balance_loss_mlp": 1.03611588, + "epoch": 0.0907861115286337, + "flos": 24820732385280.0, + "grad_norm": 2.3260571756947472, + "language_loss": 0.84302211, + "learning_rate": 3.9193918737910735e-06, + "loss": 0.86497879, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.046875, + "step": 1510, + "time_per_iteration": 2.466231346130371 + }, + { + "auxiliary_loss_clip": 0.01134825, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.02267158, + "balance_loss_mlp": 1.03401542, + "epoch": 0.09084623478130167, + "flos": 21651770609280.0, + "grad_norm": 1.912535516932508, + "language_loss": 0.85478687, + "learning_rate": 3.919285656203033e-06, + "loss": 0.87663877, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0078125, + "step": 1511, + "time_per_iteration": 2.4451420307159424 + }, + { + "auxiliary_loss_clip": 0.01132187, + "auxiliary_loss_mlp": 0.01048022, + "balance_loss_clip": 1.02150965, + "balance_loss_mlp": 1.03599679, + "epoch": 0.09090635803396964, + "flos": 27637943546880.0, + "grad_norm": 1.7413602600900544, + "language_loss": 0.85064685, + "learning_rate": 3.919179370120624e-06, + "loss": 0.87244892, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.96484375, + "step": 1512, + "time_per_iteration": 2.4990103244781494 + }, + { + "auxiliary_loss_clip": 0.0112873, + "auxiliary_loss_mlp": 0.0104476, + "balance_loss_clip": 1.02056026, + "balance_loss_mlp": 1.03212404, + "epoch": 0.0909664812866376, + "flos": 17668355569920.0, + "grad_norm": 2.479651232075728, + "language_loss": 0.86426342, + "learning_rate": 3.919073015547641e-06, + "loss": 0.88599831, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.96484375, + "step": 1513, + "time_per_iteration": 2.4611003398895264 + }, + { + "auxiliary_loss_clip": 0.01134348, + "auxiliary_loss_mlp": 0.01049628, + "balance_loss_clip": 1.02377188, + "balance_loss_mlp": 1.03542376, + "epoch": 0.09102660453930557, + "flos": 23950312479360.0, + "grad_norm": 1.8659367863772227, + "language_loss": 0.86158764, + "learning_rate": 3.918966592487878e-06, + "loss": 0.88342738, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.98828125, + "step": 1514, + "time_per_iteration": 2.4910011291503906 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_clip": 1.03387547, + "balance_loss_mlp": 1.03549898, + "epoch": 0.09108672779197355, + "flos": 25811741727360.0, + "grad_norm": 1.844292454397013, + "language_loss": 0.90314281, + "learning_rate": 3.918860100945134e-06, + "loss": 0.92506593, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.984375, + "step": 1515, + "time_per_iteration": 3.9674715995788574 + }, + { + "auxiliary_loss_clip": 0.01134262, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.02043211, + "balance_loss_mlp": 1.03435397, + "epoch": 0.09114685104464151, + "flos": 29638292561280.0, + "grad_norm": 2.112813599939862, + "language_loss": 0.85246992, + "learning_rate": 3.9187535409232076e-06, + "loss": 0.87427747, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.0, + "step": 1516, + "time_per_iteration": 2.516740322113037 + }, + { + "auxiliary_loss_clip": 0.01138227, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.02510583, + "balance_loss_mlp": 1.03607202, + "epoch": 0.09120697429730948, + "flos": 33728227758720.0, + "grad_norm": 1.4460689545829237, + "language_loss": 0.80797648, + "learning_rate": 3.918646912425904e-06, + "loss": 0.82986748, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.0234375, + "step": 1517, + "time_per_iteration": 3.999890089035034 + }, + { + "auxiliary_loss_clip": 0.01140348, + "auxiliary_loss_mlp": 0.01059854, + "balance_loss_clip": 1.03282976, + "balance_loss_mlp": 1.03763103, + "epoch": 0.09126709754997746, + "flos": 18400519025280.0, + "grad_norm": 1.570144501988006, + "language_loss": 0.77740484, + "learning_rate": 3.918540215457027e-06, + "loss": 0.79940683, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.03125, + "step": 1518, + "time_per_iteration": 2.447437047958374 + }, + { + "auxiliary_loss_clip": 0.01133414, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.02459717, + "balance_loss_mlp": 1.03456867, + "epoch": 0.09132722080264542, + "flos": 22090838267520.0, + "grad_norm": 1.6758766032308245, + "language_loss": 0.86130202, + "learning_rate": 3.918433450020386e-06, + "loss": 0.88315415, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.98828125, + "step": 1519, + "time_per_iteration": 3.8800432682037354 + }, + { + "auxiliary_loss_clip": 0.01136126, + "auxiliary_loss_mlp": 0.01051936, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03529108, + "epoch": 0.09138734405531339, + "flos": 21032062243200.0, + "grad_norm": 2.3850002057706474, + "language_loss": 0.72785783, + "learning_rate": 3.9183266161197885e-06, + "loss": 0.74973845, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.0078125, + "step": 1520, + "time_per_iteration": 3.880931854248047 + }, + { + "auxiliary_loss_clip": 0.01135703, + "auxiliary_loss_mlp": 0.01054564, + "balance_loss_clip": 1.02647829, + "balance_loss_mlp": 1.03522754, + "epoch": 0.09144746730798137, + "flos": 20082913488000.0, + "grad_norm": 2.5358907338691727, + "language_loss": 0.85057628, + "learning_rate": 3.91821971375905e-06, + "loss": 0.87247896, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.0, + "step": 1521, + "time_per_iteration": 2.469526767730713 + }, + { + "auxiliary_loss_clip": 0.01136681, + "auxiliary_loss_mlp": 0.0105437, + "balance_loss_clip": 1.02826309, + "balance_loss_mlp": 1.03459895, + "epoch": 0.09150759056064933, + "flos": 22777265975040.0, + "grad_norm": 2.7776145419894873, + "language_loss": 0.83937508, + "learning_rate": 3.918112742941983e-06, + "loss": 0.86128557, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.015625, + "step": 1522, + "time_per_iteration": 2.5015242099761963 + }, + { + "auxiliary_loss_clip": 0.01129337, + "auxiliary_loss_mlp": 0.01053417, + "balance_loss_clip": 1.0273335, + "balance_loss_mlp": 1.03404737, + "epoch": 0.0915677138133173, + "flos": 27562950213120.0, + "grad_norm": 1.9851311064106862, + "language_loss": 0.81124741, + "learning_rate": 3.9180057036724066e-06, + "loss": 0.83307493, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.953125, + "step": 1523, + "time_per_iteration": 2.5210797786712646 + }, + { + "auxiliary_loss_clip": 0.01135162, + "auxiliary_loss_mlp": 0.01050573, + "balance_loss_clip": 1.02509856, + "balance_loss_mlp": 1.03610897, + "epoch": 0.09162783706598528, + "flos": 17673836653440.0, + "grad_norm": 2.434152104453912, + "language_loss": 0.74915415, + "learning_rate": 3.9178985959541406e-06, + "loss": 0.77101147, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9921875, + "step": 1524, + "time_per_iteration": 2.440109968185425 + }, + { + "auxiliary_loss_clip": 0.01134508, + "auxiliary_loss_mlp": 0.01055211, + "balance_loss_clip": 1.02829385, + "balance_loss_mlp": 1.03354788, + "epoch": 0.09168796031865324, + "flos": 18477223015680.0, + "grad_norm": 2.5147764071717886, + "language_loss": 0.86025923, + "learning_rate": 3.917791419791006e-06, + "loss": 0.88215643, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.0078125, + "step": 1525, + "time_per_iteration": 2.432311773300171 + }, + { + "auxiliary_loss_clip": 0.01133316, + "auxiliary_loss_mlp": 0.0104995, + "balance_loss_clip": 1.02396262, + "balance_loss_mlp": 1.03515005, + "epoch": 0.0917480835713212, + "flos": 29386324212480.0, + "grad_norm": 2.096285881342677, + "language_loss": 0.7531842, + "learning_rate": 3.91768417518683e-06, + "loss": 0.77501684, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.98046875, + "step": 1526, + "time_per_iteration": 2.508889675140381 + }, + { + "auxiliary_loss_clip": 0.01133753, + "auxiliary_loss_mlp": 0.01048868, + "balance_loss_clip": 1.02340519, + "balance_loss_mlp": 1.0355401, + "epoch": 0.09180820682398917, + "flos": 19828222053120.0, + "grad_norm": 2.155732744211786, + "language_loss": 0.77275509, + "learning_rate": 3.917576862145438e-06, + "loss": 0.79458129, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.984375, + "step": 1527, + "time_per_iteration": 2.504575490951538 + }, + { + "auxiliary_loss_clip": 0.01134489, + "auxiliary_loss_mlp": 0.01052293, + "balance_loss_clip": 1.02495801, + "balance_loss_mlp": 1.03549433, + "epoch": 0.09186833007665715, + "flos": 23840720121600.0, + "grad_norm": 2.472114783236302, + "language_loss": 0.78673851, + "learning_rate": 3.91746948067066e-06, + "loss": 0.80860639, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.9921875, + "step": 1528, + "time_per_iteration": 2.497410297393799 + }, + { + "auxiliary_loss_clip": 0.01133446, + "auxiliary_loss_mlp": 0.01044216, + "balance_loss_clip": 1.01814508, + "balance_loss_mlp": 1.03460348, + "epoch": 0.09192845332932512, + "flos": 12931898215680.0, + "grad_norm": 2.7292163851837303, + "language_loss": 0.77312195, + "learning_rate": 3.91736203076633e-06, + "loss": 0.79489857, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.98828125, + "step": 1529, + "time_per_iteration": 2.433762550354004 + }, + { + "auxiliary_loss_clip": 0.01133114, + "auxiliary_loss_mlp": 0.01049329, + "balance_loss_clip": 1.02217329, + "balance_loss_mlp": 1.03224063, + "epoch": 0.09198857658199308, + "flos": 24567123202560.0, + "grad_norm": 1.8967014901884687, + "language_loss": 0.8285197, + "learning_rate": 3.9172545124362795e-06, + "loss": 0.85034418, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0078125, + "step": 1530, + "time_per_iteration": 2.4446945190429688 + }, + { + "auxiliary_loss_clip": 0.0113179, + "auxiliary_loss_mlp": 0.01055151, + "balance_loss_clip": 1.02943802, + "balance_loss_mlp": 1.03457022, + "epoch": 0.09204869983466106, + "flos": 20265893256960.0, + "grad_norm": 2.6123554775526823, + "language_loss": 0.83155543, + "learning_rate": 3.9171469256843484e-06, + "loss": 0.85342479, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.96875, + "step": 1531, + "time_per_iteration": 2.4508118629455566 + }, + { + "auxiliary_loss_clip": 0.01131607, + "auxiliary_loss_mlp": 0.01050721, + "balance_loss_clip": 1.02392328, + "balance_loss_mlp": 1.03305411, + "epoch": 0.09210882308732903, + "flos": 20884624104960.0, + "grad_norm": 3.281697007764632, + "language_loss": 0.81480652, + "learning_rate": 3.917039270514375e-06, + "loss": 0.83662981, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.984375, + "step": 1532, + "time_per_iteration": 2.4293484687805176 + }, + { + "auxiliary_loss_clip": 0.01135024, + "auxiliary_loss_mlp": 0.01053959, + "balance_loss_clip": 1.02642202, + "balance_loss_mlp": 1.0354228, + "epoch": 0.09216894633999699, + "flos": 30955006776960.0, + "grad_norm": 2.533561225734148, + "language_loss": 0.83641398, + "learning_rate": 3.9169315469302e-06, + "loss": 0.85830384, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.99609375, + "step": 1533, + "time_per_iteration": 2.507021188735962 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01046285, + "balance_loss_clip": 1.02035689, + "balance_loss_mlp": 1.03520799, + "epoch": 0.09222906959266497, + "flos": 13150733817600.0, + "grad_norm": 2.012719717570496, + "language_loss": 0.90133536, + "learning_rate": 3.91682375493567e-06, + "loss": 0.92313659, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.984375, + "step": 1534, + "time_per_iteration": 2.387143611907959 + }, + { + "auxiliary_loss_clip": 0.01133891, + "auxiliary_loss_mlp": 0.01064751, + "balance_loss_clip": 1.03456783, + "balance_loss_mlp": 1.03374577, + "epoch": 0.09228919284533293, + "flos": 25993290130560.0, + "grad_norm": 1.9768533857068011, + "language_loss": 0.75789332, + "learning_rate": 3.916715894534631e-06, + "loss": 0.77987975, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.0, + "step": 1535, + "time_per_iteration": 2.490509271621704 + }, + { + "auxiliary_loss_clip": 0.01127005, + "auxiliary_loss_mlp": 0.01048225, + "balance_loss_clip": 1.02288067, + "balance_loss_mlp": 1.03255856, + "epoch": 0.0923493160980009, + "flos": 18659818759680.0, + "grad_norm": 1.6612240374319367, + "language_loss": 0.81129748, + "learning_rate": 3.916607965730932e-06, + "loss": 0.83304977, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9453125, + "step": 1536, + "time_per_iteration": 2.3843331336975098 + }, + { + "auxiliary_loss_clip": 0.0112877, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.02415609, + "balance_loss_mlp": 1.03268063, + "epoch": 0.09240943935066886, + "flos": 21139559919360.0, + "grad_norm": 3.3094259332098237, + "language_loss": 0.89304686, + "learning_rate": 3.9164999685284245e-06, + "loss": 0.91482526, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.9609375, + "step": 1537, + "time_per_iteration": 2.4380135536193848 + }, + { + "auxiliary_loss_clip": 0.01130553, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.02496839, + "balance_loss_mlp": 1.03329217, + "epoch": 0.09246956260333684, + "flos": 20591458485120.0, + "grad_norm": 2.218947463308206, + "language_loss": 0.81051397, + "learning_rate": 3.916391902930963e-06, + "loss": 0.83233953, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.97265625, + "step": 1538, + "time_per_iteration": 2.4207215309143066 + }, + { + "auxiliary_loss_clip": 0.01047669, + "auxiliary_loss_mlp": 0.01006016, + "balance_loss_clip": 1.00153339, + "balance_loss_mlp": 1.01349711, + "epoch": 0.09252968585600481, + "flos": 67555153664640.0, + "grad_norm": 0.7330743763034323, + "language_loss": 0.57387245, + "learning_rate": 3.916283768942404e-06, + "loss": 0.59440935, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.34179688, + "step": 1539, + "time_per_iteration": 3.207620620727539 + }, + { + "auxiliary_loss_clip": 0.01134897, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.02051282, + "balance_loss_mlp": 1.03614211, + "epoch": 0.09258980910867277, + "flos": 17382905360640.0, + "grad_norm": 3.029941687293074, + "language_loss": 0.67660999, + "learning_rate": 3.916175566566607e-06, + "loss": 0.69842243, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.98828125, + "step": 1540, + "time_per_iteration": 2.4448249340057373 + }, + { + "auxiliary_loss_clip": 0.01133389, + "auxiliary_loss_mlp": 0.01046343, + "balance_loss_clip": 1.01944923, + "balance_loss_mlp": 1.03585327, + "epoch": 0.09264993236134075, + "flos": 19864880847360.0, + "grad_norm": 1.941268230130487, + "language_loss": 0.83593309, + "learning_rate": 3.916067295807433e-06, + "loss": 0.85773039, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.9765625, + "step": 1541, + "time_per_iteration": 2.517001152038574 + }, + { + "auxiliary_loss_clip": 0.01044401, + "auxiliary_loss_mlp": 0.01004169, + "balance_loss_clip": 1.00006866, + "balance_loss_mlp": 1.0115366, + "epoch": 0.09271005561400872, + "flos": 62281558909440.0, + "grad_norm": 0.8820698294636038, + "language_loss": 0.61850953, + "learning_rate": 3.915958956668745e-06, + "loss": 0.63899529, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.328125, + "step": 1542, + "time_per_iteration": 3.1013567447662354 + }, + { + "auxiliary_loss_clip": 0.01131936, + "auxiliary_loss_mlp": 0.01053385, + "balance_loss_clip": 1.02901816, + "balance_loss_mlp": 1.0335089, + "epoch": 0.09277017886667668, + "flos": 23328788722560.0, + "grad_norm": 1.826825102411416, + "language_loss": 0.82780075, + "learning_rate": 3.915850549154412e-06, + "loss": 0.84965402, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.984375, + "step": 1543, + "time_per_iteration": 2.4943008422851562 + }, + { + "auxiliary_loss_clip": 0.0113051, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_clip": 1.02413249, + "balance_loss_mlp": 1.03472304, + "epoch": 0.09283030211934466, + "flos": 54743183435520.0, + "grad_norm": 1.8056748607287116, + "language_loss": 0.72402155, + "learning_rate": 3.9157420732682995e-06, + "loss": 0.74583352, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.95703125, + "step": 1544, + "time_per_iteration": 2.719080924987793 + }, + { + "auxiliary_loss_clip": 0.0113308, + "auxiliary_loss_mlp": 0.01047957, + "balance_loss_clip": 1.02156389, + "balance_loss_mlp": 1.03499806, + "epoch": 0.09289042537201263, + "flos": 30333517931520.0, + "grad_norm": 2.5526843283161744, + "language_loss": 0.77444106, + "learning_rate": 3.91563352901428e-06, + "loss": 0.79625142, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.98046875, + "step": 1545, + "time_per_iteration": 2.5514965057373047 + }, + { + "auxiliary_loss_clip": 0.01129955, + "auxiliary_loss_mlp": 0.01049175, + "balance_loss_clip": 1.02300811, + "balance_loss_mlp": 1.03411222, + "epoch": 0.09295054862468059, + "flos": 17745932344320.0, + "grad_norm": 2.695182505976073, + "language_loss": 0.74121594, + "learning_rate": 3.915524916396229e-06, + "loss": 0.76300728, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.95703125, + "step": 1546, + "time_per_iteration": 2.397153854370117 + }, + { + "auxiliary_loss_clip": 0.01133425, + "auxiliary_loss_mlp": 0.01045961, + "balance_loss_clip": 1.02051032, + "balance_loss_mlp": 1.03395939, + "epoch": 0.09301067187734856, + "flos": 23656937391360.0, + "grad_norm": 1.798481983913879, + "language_loss": 0.8445034, + "learning_rate": 3.91541623541802e-06, + "loss": 0.86629736, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9921875, + "step": 1547, + "time_per_iteration": 2.485987901687622 + }, + { + "auxiliary_loss_clip": 0.01131087, + "auxiliary_loss_mlp": 0.01051158, + "balance_loss_clip": 1.02568305, + "balance_loss_mlp": 1.03294826, + "epoch": 0.09307079513001654, + "flos": 27526465975680.0, + "grad_norm": 2.1358068082235433, + "language_loss": 0.67515683, + "learning_rate": 3.9153074860835326e-06, + "loss": 0.69697928, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.98046875, + "step": 1548, + "time_per_iteration": 2.512816905975342 + }, + { + "auxiliary_loss_clip": 0.01134184, + "auxiliary_loss_mlp": 0.01056214, + "balance_loss_clip": 1.02990484, + "balance_loss_mlp": 1.0346415, + "epoch": 0.0931309183826845, + "flos": 20626406622720.0, + "grad_norm": 1.908903974905939, + "language_loss": 0.83415234, + "learning_rate": 3.915198668396649e-06, + "loss": 0.85605627, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9921875, + "step": 1549, + "time_per_iteration": 2.4752960205078125 + }, + { + "auxiliary_loss_clip": 0.011351, + "auxiliary_loss_mlp": 0.01042165, + "balance_loss_clip": 1.01690459, + "balance_loss_mlp": 1.03577685, + "epoch": 0.09319104163535247, + "flos": 29019701358720.0, + "grad_norm": 1.6348414684861816, + "language_loss": 0.75787747, + "learning_rate": 3.91508978236125e-06, + "loss": 0.77965015, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.9921875, + "step": 1550, + "time_per_iteration": 2.4757087230682373 + }, + { + "auxiliary_loss_clip": 0.0113549, + "auxiliary_loss_mlp": 0.01054175, + "balance_loss_clip": 1.02480149, + "balance_loss_mlp": 1.03360713, + "epoch": 0.09325116488802045, + "flos": 25300368910080.0, + "grad_norm": 2.818707974293908, + "language_loss": 0.82972282, + "learning_rate": 3.914980827981223e-06, + "loss": 0.85161948, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.015625, + "step": 1551, + "time_per_iteration": 2.5096378326416016 + }, + { + "auxiliary_loss_clip": 0.01036712, + "auxiliary_loss_mlp": 0.01005773, + "balance_loss_clip": 1.00164795, + "balance_loss_mlp": 1.00625217, + "epoch": 0.09331128814068841, + "flos": 61532880514560.0, + "grad_norm": 0.7464859419302465, + "language_loss": 0.61793554, + "learning_rate": 3.914871805260456e-06, + "loss": 0.63836038, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.04125977, + "router_z_loss_mlp": 0.3046875, + "step": 1552, + "time_per_iteration": 3.1823689937591553 + }, + { + "auxiliary_loss_clip": 0.01035259, + "auxiliary_loss_mlp": 0.01005669, + "balance_loss_clip": 1.00163996, + "balance_loss_mlp": 1.0053786, + "epoch": 0.09337141139335638, + "flos": 53290515052800.0, + "grad_norm": 0.8366902008839252, + "language_loss": 0.59049493, + "learning_rate": 3.91476271420284e-06, + "loss": 0.61090428, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.29882812, + "step": 1553, + "time_per_iteration": 3.234530210494995 + }, + { + "auxiliary_loss_clip": 0.01133075, + "auxiliary_loss_mlp": 0.01050295, + "balance_loss_clip": 1.02307987, + "balance_loss_mlp": 1.03343916, + "epoch": 0.09343153464602436, + "flos": 23475738101760.0, + "grad_norm": 1.8482203056914184, + "language_loss": 0.87292784, + "learning_rate": 3.914653554812269e-06, + "loss": 0.89476156, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.99609375, + "step": 1554, + "time_per_iteration": 2.4784629344940186 + }, + { + "auxiliary_loss_clip": 0.01130778, + "auxiliary_loss_mlp": 0.01048842, + "balance_loss_clip": 1.02203178, + "balance_loss_mlp": 1.03553355, + "epoch": 0.09349165789869232, + "flos": 19352495600640.0, + "grad_norm": 1.8549894743094775, + "language_loss": 0.81752455, + "learning_rate": 3.914544327092637e-06, + "loss": 0.83932072, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.953125, + "step": 1555, + "time_per_iteration": 3.996166944503784 + }, + { + "auxiliary_loss_clip": 0.01034732, + "auxiliary_loss_mlp": 0.01003965, + "balance_loss_clip": 1.00012672, + "balance_loss_mlp": 1.00511003, + "epoch": 0.09355178115136029, + "flos": 67499572913280.0, + "grad_norm": 0.8684699997120257, + "language_loss": 0.5964554, + "learning_rate": 3.914435031047844e-06, + "loss": 0.61684233, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.03833008, + "router_z_loss_mlp": 0.296875, + "step": 1556, + "time_per_iteration": 3.087555408477783 + }, + { + "auxiliary_loss_clip": 0.01132066, + "auxiliary_loss_mlp": 0.01051652, + "balance_loss_clip": 1.02517581, + "balance_loss_mlp": 1.03376365, + "epoch": 0.09361190440402825, + "flos": 37340132353920.0, + "grad_norm": 1.8014126832529527, + "language_loss": 0.8437897, + "learning_rate": 3.9143256666817875e-06, + "loss": 0.86562681, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.98046875, + "step": 1557, + "time_per_iteration": 3.954483985900879 + }, + { + "auxiliary_loss_clip": 0.01131742, + "auxiliary_loss_mlp": 0.01051749, + "balance_loss_clip": 1.02405715, + "balance_loss_mlp": 1.03336477, + "epoch": 0.09367202765669623, + "flos": 24898553539200.0, + "grad_norm": 1.7844932698726639, + "language_loss": 0.77857816, + "learning_rate": 3.914216233998373e-06, + "loss": 0.80041307, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.984375, + "step": 1558, + "time_per_iteration": 3.984492540359497 + }, + { + "auxiliary_loss_clip": 0.01135984, + "auxiliary_loss_mlp": 0.01048342, + "balance_loss_clip": 1.02124536, + "balance_loss_mlp": 1.03523898, + "epoch": 0.0937321509093642, + "flos": 15704665349760.0, + "grad_norm": 1.858447994209009, + "language_loss": 0.79866064, + "learning_rate": 3.914106733001505e-06, + "loss": 0.82050389, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.0078125, + "step": 1559, + "time_per_iteration": 3.8980712890625 + }, + { + "auxiliary_loss_clip": 0.01127696, + "auxiliary_loss_mlp": 0.01048208, + "balance_loss_clip": 1.02249503, + "balance_loss_mlp": 1.03341794, + "epoch": 0.09379227416203216, + "flos": 20482704000000.0, + "grad_norm": 3.145335706741672, + "language_loss": 0.76307881, + "learning_rate": 3.9139971636950914e-06, + "loss": 0.78483784, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.94140625, + "step": 1560, + "time_per_iteration": 2.4804866313934326 + }, + { + "auxiliary_loss_clip": 0.01137378, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.02276349, + "balance_loss_mlp": 1.03392553, + "epoch": 0.09385239741470014, + "flos": 24351359800320.0, + "grad_norm": 1.684229005364842, + "language_loss": 0.80698353, + "learning_rate": 3.913887526083042e-06, + "loss": 0.82884896, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.0390625, + "step": 1561, + "time_per_iteration": 2.5235869884490967 + }, + { + "auxiliary_loss_clip": 0.01130155, + "auxiliary_loss_mlp": 0.01042075, + "balance_loss_clip": 1.01642132, + "balance_loss_mlp": 1.03278327, + "epoch": 0.0939125206673681, + "flos": 33290102707200.0, + "grad_norm": 6.11597418955832, + "language_loss": 0.61490536, + "learning_rate": 3.91377782016927e-06, + "loss": 0.63662767, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.97265625, + "step": 1562, + "time_per_iteration": 2.5785610675811768 + }, + { + "auxiliary_loss_clip": 0.01132565, + "auxiliary_loss_mlp": 0.01051863, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.03629041, + "epoch": 0.09397264392003607, + "flos": 19243915672320.0, + "grad_norm": 9.651361957153787, + "language_loss": 0.84796524, + "learning_rate": 3.9136680459576905e-06, + "loss": 0.86980951, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.96484375, + "step": 1563, + "time_per_iteration": 2.445735216140747 + }, + { + "auxiliary_loss_clip": 0.01126767, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_clip": 1.01910937, + "balance_loss_mlp": 1.03224373, + "epoch": 0.09403276717270405, + "flos": 19316919058560.0, + "grad_norm": 1.689177225733662, + "language_loss": 0.75749022, + "learning_rate": 3.913558203452221e-06, + "loss": 0.77918339, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.9453125, + "step": 1564, + "time_per_iteration": 2.600019931793213 + }, + { + "auxiliary_loss_clip": 0.01131358, + "auxiliary_loss_mlp": 0.01046324, + "balance_loss_clip": 1.02163541, + "balance_loss_mlp": 1.03432405, + "epoch": 0.09409289042537201, + "flos": 23582432816640.0, + "grad_norm": 2.075383688901369, + "language_loss": 0.80019706, + "learning_rate": 3.913448292656782e-06, + "loss": 0.82197386, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.96875, + "step": 1565, + "time_per_iteration": 2.5562853813171387 + }, + { + "auxiliary_loss_clip": 0.01128414, + "auxiliary_loss_mlp": 0.01053288, + "balance_loss_clip": 1.0280869, + "balance_loss_mlp": 1.03111577, + "epoch": 0.09415301367803998, + "flos": 20077572049920.0, + "grad_norm": 1.9569232200602484, + "language_loss": 0.75231785, + "learning_rate": 3.913338313575295e-06, + "loss": 0.77413487, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.97265625, + "step": 1566, + "time_per_iteration": 2.5157737731933594 + }, + { + "auxiliary_loss_clip": 0.01128828, + "auxiliary_loss_mlp": 0.01056516, + "balance_loss_clip": 1.03018296, + "balance_loss_mlp": 1.03255665, + "epoch": 0.09421313693070796, + "flos": 21061215095040.0, + "grad_norm": 1.8935387915162705, + "language_loss": 0.77399063, + "learning_rate": 3.913228266211685e-06, + "loss": 0.79584408, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9609375, + "step": 1567, + "time_per_iteration": 2.4394407272338867 + }, + { + "auxiliary_loss_clip": 0.01132407, + "auxiliary_loss_mlp": 0.01048987, + "balance_loss_clip": 1.02391696, + "balance_loss_mlp": 1.03506601, + "epoch": 0.09427326018337592, + "flos": 24315015208320.0, + "grad_norm": 1.8373050423611277, + "language_loss": 0.82380879, + "learning_rate": 3.91311815056988e-06, + "loss": 0.84562278, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.97265625, + "step": 1568, + "time_per_iteration": 2.4735612869262695 + }, + { + "auxiliary_loss_clip": 0.01133743, + "auxiliary_loss_mlp": 0.01047137, + "balance_loss_clip": 1.02000451, + "balance_loss_mlp": 1.035182, + "epoch": 0.09433338343604389, + "flos": 20262925791360.0, + "grad_norm": 3.0445244276773686, + "language_loss": 0.76563734, + "learning_rate": 3.9130079666538094e-06, + "loss": 0.78744614, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.984375, + "step": 1569, + "time_per_iteration": 2.4458184242248535 + }, + { + "auxiliary_loss_clip": 0.01128653, + "auxiliary_loss_mlp": 0.01054956, + "balance_loss_clip": 1.02970767, + "balance_loss_mlp": 1.03367496, + "epoch": 0.09439350668871185, + "flos": 12742355111040.0, + "grad_norm": 2.085834786434566, + "language_loss": 0.85499036, + "learning_rate": 3.912897714467405e-06, + "loss": 0.87682647, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.94921875, + "step": 1570, + "time_per_iteration": 2.400621175765991 + }, + { + "auxiliary_loss_clip": 0.01132324, + "auxiliary_loss_mlp": 0.01047512, + "balance_loss_clip": 1.02268112, + "balance_loss_mlp": 1.03605151, + "epoch": 0.09445362994137983, + "flos": 25960960344960.0, + "grad_norm": 1.7147238482541927, + "language_loss": 0.76369232, + "learning_rate": 3.912787394014602e-06, + "loss": 0.78549063, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.9609375, + "step": 1571, + "time_per_iteration": 2.503005266189575 + }, + { + "auxiliary_loss_clip": 0.01125115, + "auxiliary_loss_mlp": 0.01049007, + "balance_loss_clip": 1.02462876, + "balance_loss_mlp": 1.03338134, + "epoch": 0.0945137531940478, + "flos": 19714440332160.0, + "grad_norm": 1.5826613962874685, + "language_loss": 0.79275006, + "learning_rate": 3.912677005299337e-06, + "loss": 0.81449127, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.91796875, + "step": 1572, + "time_per_iteration": 2.4463391304016113 + }, + { + "auxiliary_loss_clip": 0.01128702, + "auxiliary_loss_mlp": 0.0105154, + "balance_loss_clip": 1.02736449, + "balance_loss_mlp": 1.03395796, + "epoch": 0.09457387644671576, + "flos": 23616089233920.0, + "grad_norm": 1.9490236600921087, + "language_loss": 0.87449968, + "learning_rate": 3.912566548325549e-06, + "loss": 0.8963021, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.94921875, + "step": 1573, + "time_per_iteration": 2.4512505531311035 + }, + { + "auxiliary_loss_clip": 0.01130399, + "auxiliary_loss_mlp": 0.0105662, + "balance_loss_clip": 1.0299294, + "balance_loss_mlp": 1.0338819, + "epoch": 0.09463399969938374, + "flos": 26906059382400.0, + "grad_norm": 3.9008503413191, + "language_loss": 0.81712437, + "learning_rate": 3.912456023097182e-06, + "loss": 0.83899456, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.96484375, + "step": 1574, + "time_per_iteration": 2.4951813220977783 + }, + { + "auxiliary_loss_clip": 0.01129031, + "auxiliary_loss_mlp": 0.01044704, + "balance_loss_clip": 1.01958644, + "balance_loss_mlp": 1.03409958, + "epoch": 0.0946941229520517, + "flos": 23658438579840.0, + "grad_norm": 1.8567349415175596, + "language_loss": 0.81094515, + "learning_rate": 3.912345429618178e-06, + "loss": 0.83268249, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.94921875, + "step": 1575, + "time_per_iteration": 2.443317174911499 + }, + { + "auxiliary_loss_clip": 0.01126348, + "auxiliary_loss_mlp": 0.01052319, + "balance_loss_clip": 1.02739215, + "balance_loss_mlp": 1.03227639, + "epoch": 0.09475424620471967, + "flos": 24132908223360.0, + "grad_norm": 2.4286094261598494, + "language_loss": 0.86847895, + "learning_rate": 3.912234767892486e-06, + "loss": 0.89026564, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.94140625, + "step": 1576, + "time_per_iteration": 2.446354866027832 + }, + { + "auxiliary_loss_clip": 0.01039027, + "auxiliary_loss_mlp": 0.01006219, + "balance_loss_clip": 1.00252378, + "balance_loss_mlp": 1.00889707, + "epoch": 0.09481436945738765, + "flos": 68422815573120.0, + "grad_norm": 0.9876111566020145, + "language_loss": 0.65870196, + "learning_rate": 3.912124037924053e-06, + "loss": 0.67915446, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.03686523, + "router_z_loss_mlp": 0.30078125, + "step": 1577, + "time_per_iteration": 3.0272223949432373 + }, + { + "auxiliary_loss_clip": 0.01129314, + "auxiliary_loss_mlp": 0.01042959, + "balance_loss_clip": 1.01878285, + "balance_loss_mlp": 1.03307056, + "epoch": 0.09487449271005562, + "flos": 16653150789120.0, + "grad_norm": 2.0004661920780817, + "language_loss": 0.79035044, + "learning_rate": 3.912013239716831e-06, + "loss": 0.81207317, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.9609375, + "step": 1578, + "time_per_iteration": 2.4237899780273438 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01051388, + "balance_loss_clip": 1.02628279, + "balance_loss_mlp": 1.03219068, + "epoch": 0.09493461596272358, + "flos": 24274655809920.0, + "grad_norm": 1.757623102725029, + "language_loss": 0.78247732, + "learning_rate": 3.911902373274776e-06, + "loss": 0.80426395, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.953125, + "step": 1579, + "time_per_iteration": 2.4414737224578857 + }, + { + "auxiliary_loss_clip": 0.011302, + "auxiliary_loss_mlp": 0.01053094, + "balance_loss_clip": 1.02664101, + "balance_loss_mlp": 1.03369892, + "epoch": 0.09499473921539155, + "flos": 21869139934080.0, + "grad_norm": 2.1555160267596505, + "language_loss": 0.72842288, + "learning_rate": 3.911791438601842e-06, + "loss": 0.75025582, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.96484375, + "step": 1580, + "time_per_iteration": 2.4454803466796875 + }, + { + "auxiliary_loss_clip": 0.01125442, + "auxiliary_loss_mlp": 0.01051209, + "balance_loss_clip": 1.02759409, + "balance_loss_mlp": 1.03187084, + "epoch": 0.09505486246805953, + "flos": 33545736748800.0, + "grad_norm": 1.9535485397853518, + "language_loss": 0.77732539, + "learning_rate": 3.91168043570199e-06, + "loss": 0.79909194, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.9375, + "step": 1581, + "time_per_iteration": 2.5470573902130127 + }, + { + "auxiliary_loss_clip": 0.01127665, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_clip": 1.02551746, + "balance_loss_mlp": 1.0330832, + "epoch": 0.09511498572072749, + "flos": 21214273962240.0, + "grad_norm": 1.921177984257379, + "language_loss": 0.87140906, + "learning_rate": 3.911569364579181e-06, + "loss": 0.89318383, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.9453125, + "step": 1582, + "time_per_iteration": 2.4393773078918457 + }, + { + "auxiliary_loss_clip": 0.0112801, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.01530528, + "balance_loss_mlp": 1.03308201, + "epoch": 0.09517510897339546, + "flos": 14610382606080.0, + "grad_norm": 1.9526238988739044, + "language_loss": 0.66777384, + "learning_rate": 3.9114582252373786e-06, + "loss": 0.68947685, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.94921875, + "step": 1583, + "time_per_iteration": 2.408489465713501 + }, + { + "auxiliary_loss_clip": 0.01130276, + "auxiliary_loss_mlp": 0.01044433, + "balance_loss_clip": 1.01768279, + "balance_loss_mlp": 1.03359652, + "epoch": 0.09523523222606343, + "flos": 27816140459520.0, + "grad_norm": 2.123446702097312, + "language_loss": 0.70384932, + "learning_rate": 3.911347017680548e-06, + "loss": 0.72559643, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.96484375, + "step": 1584, + "time_per_iteration": 2.4857635498046875 + }, + { + "auxiliary_loss_clip": 0.01128904, + "auxiliary_loss_mlp": 0.01043143, + "balance_loss_clip": 1.01943183, + "balance_loss_mlp": 1.03342712, + "epoch": 0.0952953554787314, + "flos": 20705170383360.0, + "grad_norm": 1.4986264749081961, + "language_loss": 0.81038153, + "learning_rate": 3.911235741912659e-06, + "loss": 0.832102, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.95703125, + "step": 1585, + "time_per_iteration": 2.4599831104278564 + }, + { + "auxiliary_loss_clip": 0.01131759, + "auxiliary_loss_mlp": 0.01052134, + "balance_loss_clip": 1.02458525, + "balance_loss_mlp": 1.0344888, + "epoch": 0.09535547873139937, + "flos": 24786552297600.0, + "grad_norm": 1.7153409836079414, + "language_loss": 0.71711075, + "learning_rate": 3.911124397937683e-06, + "loss": 0.73894966, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.97265625, + "step": 1586, + "time_per_iteration": 2.5065290927886963 + }, + { + "auxiliary_loss_clip": 0.0103869, + "auxiliary_loss_mlp": 0.01004867, + "balance_loss_clip": 1.00143409, + "balance_loss_mlp": 1.00877166, + "epoch": 0.09541560198406734, + "flos": 71909208230400.0, + "grad_norm": 0.8043839504118597, + "language_loss": 0.5548532, + "learning_rate": 3.911012985759594e-06, + "loss": 0.57528877, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.03442383, + "router_z_loss_mlp": 0.29882812, + "step": 1587, + "time_per_iteration": 2.8876850605010986 + }, + { + "auxiliary_loss_clip": 0.01131229, + "auxiliary_loss_mlp": 0.01053932, + "balance_loss_clip": 1.02865958, + "balance_loss_mlp": 1.03474832, + "epoch": 0.09547572523673531, + "flos": 28981436641920.0, + "grad_norm": 1.6962794613973096, + "language_loss": 0.80978215, + "learning_rate": 3.910901505382367e-06, + "loss": 0.83163375, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.96484375, + "step": 1588, + "time_per_iteration": 2.5049076080322266 + }, + { + "auxiliary_loss_clip": 0.01128484, + "auxiliary_loss_mlp": 0.01048779, + "balance_loss_clip": 1.02406645, + "balance_loss_mlp": 1.03440237, + "epoch": 0.09553584848940327, + "flos": 24132768577920.0, + "grad_norm": 1.5583499472319169, + "language_loss": 0.81693327, + "learning_rate": 3.910789956809981e-06, + "loss": 0.8387059, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.94140625, + "step": 1589, + "time_per_iteration": 2.444581985473633 + }, + { + "auxiliary_loss_clip": 0.01129849, + "auxiliary_loss_mlp": 0.01049648, + "balance_loss_clip": 1.02476883, + "balance_loss_mlp": 1.0345633, + "epoch": 0.09559597174207124, + "flos": 42849706055040.0, + "grad_norm": 1.4846773363066526, + "language_loss": 0.64840114, + "learning_rate": 3.910678340046415e-06, + "loss": 0.67019612, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.953125, + "step": 1590, + "time_per_iteration": 2.6288366317749023 + }, + { + "auxiliary_loss_clip": 0.01127174, + "auxiliary_loss_mlp": 0.01048047, + "balance_loss_clip": 1.02297759, + "balance_loss_mlp": 1.03421664, + "epoch": 0.09565609499473922, + "flos": 32669486645760.0, + "grad_norm": 1.9044171830956573, + "language_loss": 0.83177459, + "learning_rate": 3.910566655095655e-06, + "loss": 0.85352671, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.9296875, + "step": 1591, + "time_per_iteration": 2.530949592590332 + }, + { + "auxiliary_loss_clip": 0.01131455, + "auxiliary_loss_mlp": 0.01058947, + "balance_loss_clip": 1.03254199, + "balance_loss_mlp": 1.03324676, + "epoch": 0.09571621824740718, + "flos": 18477432483840.0, + "grad_norm": 2.665017419106436, + "language_loss": 0.74207127, + "learning_rate": 3.9104549019616855e-06, + "loss": 0.76397526, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.984375, + "step": 1592, + "time_per_iteration": 2.3857264518737793 + }, + { + "auxiliary_loss_clip": 0.01126811, + "auxiliary_loss_mlp": 0.0104743, + "balance_loss_clip": 1.02271771, + "balance_loss_mlp": 1.03202522, + "epoch": 0.09577634150007515, + "flos": 29386219478400.0, + "grad_norm": 1.841806432839428, + "language_loss": 0.74010116, + "learning_rate": 3.910343080648495e-06, + "loss": 0.76184356, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.94921875, + "step": 1593, + "time_per_iteration": 2.4896321296691895 + }, + { + "auxiliary_loss_clip": 0.01128345, + "auxiliary_loss_mlp": 0.01050887, + "balance_loss_clip": 1.02528095, + "balance_loss_mlp": 1.03316426, + "epoch": 0.09583646475274313, + "flos": 22746716668800.0, + "grad_norm": 1.7383322301339936, + "language_loss": 0.69956505, + "learning_rate": 3.910231191160074e-06, + "loss": 0.72135735, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.953125, + "step": 1594, + "time_per_iteration": 3.940382480621338 + }, + { + "auxiliary_loss_clip": 0.01129306, + "auxiliary_loss_mlp": 0.01044947, + "balance_loss_clip": 1.01942444, + "balance_loss_mlp": 1.03309989, + "epoch": 0.0958965880054111, + "flos": 23217346062720.0, + "grad_norm": 2.301854053189111, + "language_loss": 0.8258779, + "learning_rate": 3.910119233500415e-06, + "loss": 0.84762043, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.96484375, + "step": 1595, + "time_per_iteration": 2.4429819583892822 + }, + { + "auxiliary_loss_clip": 0.01131211, + "auxiliary_loss_mlp": 0.01047621, + "balance_loss_clip": 1.02132368, + "balance_loss_mlp": 1.03454077, + "epoch": 0.09595671125807906, + "flos": 21323377560960.0, + "grad_norm": 2.15645836466016, + "language_loss": 0.84589171, + "learning_rate": 3.910007207673514e-06, + "loss": 0.86767995, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.96484375, + "step": 1596, + "time_per_iteration": 3.870670795440674 + }, + { + "auxiliary_loss_clip": 0.01132189, + "auxiliary_loss_mlp": 0.0104588, + "balance_loss_clip": 1.01948714, + "balance_loss_mlp": 1.03406048, + "epoch": 0.09601683451074704, + "flos": 39601910695680.0, + "grad_norm": 1.8493887818937222, + "language_loss": 0.69076598, + "learning_rate": 3.909895113683369e-06, + "loss": 0.71254671, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.98046875, + "step": 1597, + "time_per_iteration": 2.581110715866089 + }, + { + "auxiliary_loss_clip": 0.01130091, + "auxiliary_loss_mlp": 0.01040875, + "balance_loss_clip": 1.01574564, + "balance_loss_mlp": 1.03267622, + "epoch": 0.096076957763415, + "flos": 23731581611520.0, + "grad_norm": 2.043402237761072, + "language_loss": 0.74736744, + "learning_rate": 3.9097829515339805e-06, + "loss": 0.76907706, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.97265625, + "step": 1598, + "time_per_iteration": 3.935260057449341 + }, + { + "auxiliary_loss_clip": 0.01134141, + "auxiliary_loss_mlp": 0.01051248, + "balance_loss_clip": 1.02383006, + "balance_loss_mlp": 1.0345974, + "epoch": 0.09613708101608297, + "flos": 34676678286720.0, + "grad_norm": 1.5883270883652745, + "language_loss": 0.69103479, + "learning_rate": 3.909670721229351e-06, + "loss": 0.71288872, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.9921875, + "step": 1599, + "time_per_iteration": 2.53983998298645 + }, + { + "auxiliary_loss_clip": 0.01129399, + "auxiliary_loss_mlp": 0.01047688, + "balance_loss_clip": 1.02234411, + "balance_loss_mlp": 1.03334928, + "epoch": 0.09619720426875093, + "flos": 20739001357440.0, + "grad_norm": 2.1501053197670674, + "language_loss": 0.84326446, + "learning_rate": 3.909558422773485e-06, + "loss": 0.8650353, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9609375, + "step": 1600, + "time_per_iteration": 2.480672597885132 + }, + { + "auxiliary_loss_clip": 0.01130237, + "auxiliary_loss_mlp": 0.01051023, + "balance_loss_clip": 1.0260725, + "balance_loss_mlp": 1.03436995, + "epoch": 0.09625732752141891, + "flos": 13041874598400.0, + "grad_norm": 3.115910981380097, + "language_loss": 0.803262, + "learning_rate": 3.909446056170392e-06, + "loss": 0.82507461, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.95703125, + "step": 1601, + "time_per_iteration": 2.407433032989502 + }, + { + "auxiliary_loss_clip": 0.01133421, + "auxiliary_loss_mlp": 0.01050606, + "balance_loss_clip": 1.02371204, + "balance_loss_mlp": 1.03521669, + "epoch": 0.09631745077408688, + "flos": 22272526316160.0, + "grad_norm": 2.789109957056121, + "language_loss": 0.82325655, + "learning_rate": 3.9093336214240805e-06, + "loss": 0.84509683, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.984375, + "step": 1602, + "time_per_iteration": 2.4383020401000977 + }, + { + "auxiliary_loss_clip": 0.01128537, + "auxiliary_loss_mlp": 0.01051186, + "balance_loss_clip": 1.02412581, + "balance_loss_mlp": 1.0341289, + "epoch": 0.09637757402675484, + "flos": 24753105348480.0, + "grad_norm": 2.012458938720864, + "language_loss": 0.62555087, + "learning_rate": 3.9092211185385625e-06, + "loss": 0.64734805, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.9453125, + "step": 1603, + "time_per_iteration": 2.452920913696289 + }, + { + "auxiliary_loss_clip": 0.01132477, + "auxiliary_loss_mlp": 0.01048033, + "balance_loss_clip": 1.01939869, + "balance_loss_mlp": 1.03522003, + "epoch": 0.09643769727942282, + "flos": 22524739044480.0, + "grad_norm": 5.2125661320797105, + "language_loss": 0.71050173, + "learning_rate": 3.909108547517855e-06, + "loss": 0.73230684, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.97265625, + "step": 1604, + "time_per_iteration": 2.5001332759857178 + }, + { + "auxiliary_loss_clip": 0.01129197, + "auxiliary_loss_mlp": 0.01049554, + "balance_loss_clip": 1.02432883, + "balance_loss_mlp": 1.0339098, + "epoch": 0.09649782053209079, + "flos": 30919674614400.0, + "grad_norm": 2.1476828999693787, + "language_loss": 0.79755807, + "learning_rate": 3.908995908365974e-06, + "loss": 0.81934559, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.953125, + "step": 1605, + "time_per_iteration": 2.516160249710083 + }, + { + "auxiliary_loss_clip": 0.01129773, + "auxiliary_loss_mlp": 0.01046575, + "balance_loss_clip": 1.02044487, + "balance_loss_mlp": 1.03133607, + "epoch": 0.09655794378475875, + "flos": 25336469122560.0, + "grad_norm": 2.1327881144518552, + "language_loss": 0.74646139, + "learning_rate": 3.908883201086939e-06, + "loss": 0.76822495, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.984375, + "step": 1606, + "time_per_iteration": 2.4860920906066895 + }, + { + "auxiliary_loss_clip": 0.01131643, + "auxiliary_loss_mlp": 0.01043237, + "balance_loss_clip": 1.01773846, + "balance_loss_mlp": 1.03393614, + "epoch": 0.09661806703742673, + "flos": 22344971120640.0, + "grad_norm": 1.7854779361931754, + "language_loss": 0.75499034, + "learning_rate": 3.908770425684774e-06, + "loss": 0.77673924, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9765625, + "step": 1607, + "time_per_iteration": 2.439432382583618 + }, + { + "auxiliary_loss_clip": 0.01129381, + "auxiliary_loss_mlp": 0.0104156, + "balance_loss_clip": 1.01736069, + "balance_loss_mlp": 1.03235519, + "epoch": 0.0966781902900947, + "flos": 17456606974080.0, + "grad_norm": 1.966699091936902, + "language_loss": 0.86513656, + "learning_rate": 3.908657582163501e-06, + "loss": 0.88684595, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.97265625, + "step": 1608, + "time_per_iteration": 2.420522451400757 + }, + { + "auxiliary_loss_clip": 0.01134142, + "auxiliary_loss_mlp": 0.01055626, + "balance_loss_clip": 1.02823162, + "balance_loss_mlp": 1.0341233, + "epoch": 0.09673831354276266, + "flos": 36902496061440.0, + "grad_norm": 2.6149778361642504, + "language_loss": 0.71525943, + "learning_rate": 3.90854467052715e-06, + "loss": 0.73715711, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.0, + "step": 1609, + "time_per_iteration": 2.5289125442504883 + }, + { + "auxiliary_loss_clip": 0.01130136, + "auxiliary_loss_mlp": 0.01046521, + "balance_loss_clip": 1.02195179, + "balance_loss_mlp": 1.03249955, + "epoch": 0.09679843679543064, + "flos": 20700422438400.0, + "grad_norm": 2.195539894108231, + "language_loss": 0.84416944, + "learning_rate": 3.908431690779748e-06, + "loss": 0.86593604, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.9765625, + "step": 1610, + "time_per_iteration": 2.4299871921539307 + }, + { + "auxiliary_loss_clip": 0.01129956, + "auxiliary_loss_mlp": 0.0104887, + "balance_loss_clip": 1.02248859, + "balance_loss_mlp": 1.03369439, + "epoch": 0.0968585600480986, + "flos": 23513269680000.0, + "grad_norm": 2.117691948263346, + "language_loss": 0.6709789, + "learning_rate": 3.9083186429253284e-06, + "loss": 0.69276714, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9609375, + "step": 1611, + "time_per_iteration": 2.4413843154907227 + }, + { + "auxiliary_loss_clip": 0.01129039, + "auxiliary_loss_mlp": 0.01048062, + "balance_loss_clip": 1.02288496, + "balance_loss_mlp": 1.03424144, + "epoch": 0.09691868330076657, + "flos": 20120026129920.0, + "grad_norm": 1.79228238953497, + "language_loss": 0.8106401, + "learning_rate": 3.908205526967925e-06, + "loss": 0.83241117, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.9453125, + "step": 1612, + "time_per_iteration": 2.458664894104004 + }, + { + "auxiliary_loss_clip": 0.01130977, + "auxiliary_loss_mlp": 0.01048114, + "balance_loss_clip": 1.02069569, + "balance_loss_mlp": 1.03401744, + "epoch": 0.09697880655343454, + "flos": 16543767899520.0, + "grad_norm": 2.123354943874359, + "language_loss": 0.8074218, + "learning_rate": 3.9080923429115755e-06, + "loss": 0.82921273, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.96875, + "step": 1613, + "time_per_iteration": 2.4841091632843018 + }, + { + "auxiliary_loss_clip": 0.01131463, + "auxiliary_loss_mlp": 0.01046134, + "balance_loss_clip": 1.01866841, + "balance_loss_mlp": 1.03485274, + "epoch": 0.09703892980610251, + "flos": 26102987222400.0, + "grad_norm": 2.1194127715365556, + "language_loss": 0.84466386, + "learning_rate": 3.907979090760318e-06, + "loss": 0.86643982, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.96484375, + "step": 1614, + "time_per_iteration": 2.508568286895752 + }, + { + "auxiliary_loss_clip": 0.0104157, + "auxiliary_loss_mlp": 0.01004943, + "balance_loss_clip": 1.00150931, + "balance_loss_mlp": 1.01060295, + "epoch": 0.09709905305877048, + "flos": 60440273516160.0, + "grad_norm": 0.7053662651999016, + "language_loss": 0.54595566, + "learning_rate": 3.907865770518194e-06, + "loss": 0.56642085, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.03442383, + "router_z_loss_mlp": 0.30859375, + "step": 1615, + "time_per_iteration": 3.030308723449707 + }, + { + "auxiliary_loss_clip": 0.01039482, + "auxiliary_loss_mlp": 0.01002298, + "balance_loss_clip": 0.99905533, + "balance_loss_mlp": 1.00889277, + "epoch": 0.09715917631143844, + "flos": 57636503228160.0, + "grad_norm": 0.8212475208661101, + "language_loss": 0.58202291, + "learning_rate": 3.9077523821892495e-06, + "loss": 0.60244071, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.3046875, + "step": 1616, + "time_per_iteration": 3.2261805534362793 + }, + { + "auxiliary_loss_clip": 0.01131525, + "auxiliary_loss_mlp": 0.01057625, + "balance_loss_clip": 1.03033817, + "balance_loss_mlp": 1.03429449, + "epoch": 0.09721929956410642, + "flos": 20557173663360.0, + "grad_norm": 1.883023529993233, + "language_loss": 0.68717158, + "learning_rate": 3.907638925777529e-06, + "loss": 0.70906311, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.97265625, + "step": 1617, + "time_per_iteration": 2.468594789505005 + }, + { + "auxiliary_loss_clip": 0.01130611, + "auxiliary_loss_mlp": 0.01056749, + "balance_loss_clip": 1.02909219, + "balance_loss_mlp": 1.0327003, + "epoch": 0.09727942281677439, + "flos": 27343137093120.0, + "grad_norm": 1.8482913576706792, + "language_loss": 0.80511546, + "learning_rate": 3.907525401287082e-06, + "loss": 0.82698905, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.98046875, + "step": 1618, + "time_per_iteration": 2.519648313522339 + }, + { + "auxiliary_loss_clip": 0.01125056, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.02254701, + "balance_loss_mlp": 1.03280425, + "epoch": 0.09733954606944235, + "flos": 24898867741440.0, + "grad_norm": 1.656348963972433, + "language_loss": 0.93125695, + "learning_rate": 3.907411808721961e-06, + "loss": 0.95296764, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.921875, + "step": 1619, + "time_per_iteration": 2.4830868244171143 + }, + { + "auxiliary_loss_clip": 0.01126306, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.02572119, + "balance_loss_mlp": 1.03497815, + "epoch": 0.09739966932211033, + "flos": 31502584540800.0, + "grad_norm": 2.14999931966844, + "language_loss": 0.88552165, + "learning_rate": 3.907298148086219e-06, + "loss": 0.9072988, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.91015625, + "step": 1620, + "time_per_iteration": 2.555567741394043 + }, + { + "auxiliary_loss_clip": 0.01129719, + "auxiliary_loss_mlp": 0.01056381, + "balance_loss_clip": 1.02976167, + "balance_loss_mlp": 1.033777, + "epoch": 0.0974597925747783, + "flos": 23877623295360.0, + "grad_norm": 1.937261380343017, + "language_loss": 0.77111161, + "learning_rate": 3.907184419383912e-06, + "loss": 0.79297262, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.9609375, + "step": 1621, + "time_per_iteration": 2.5151984691619873 + }, + { + "auxiliary_loss_clip": 0.01128858, + "auxiliary_loss_mlp": 0.01066381, + "balance_loss_clip": 1.04022598, + "balance_loss_mlp": 1.0327791, + "epoch": 0.09751991582744626, + "flos": 17018621568000.0, + "grad_norm": 1.9740811110808778, + "language_loss": 0.77910846, + "learning_rate": 3.907070622619099e-06, + "loss": 0.80106086, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9609375, + "step": 1622, + "time_per_iteration": 2.4174482822418213 + }, + { + "auxiliary_loss_clip": 0.01130166, + "auxiliary_loss_mlp": 0.01057601, + "balance_loss_clip": 1.02941966, + "balance_loss_mlp": 1.03194284, + "epoch": 0.09758003908011423, + "flos": 28401564003840.0, + "grad_norm": 4.7039111582580535, + "language_loss": 0.85681069, + "learning_rate": 3.906956757795841e-06, + "loss": 0.87868834, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.984375, + "step": 1623, + "time_per_iteration": 2.5555121898651123 + }, + { + "auxiliary_loss_clip": 0.01127703, + "auxiliary_loss_mlp": 0.01053942, + "balance_loss_clip": 1.02783489, + "balance_loss_mlp": 1.03434443, + "epoch": 0.09764016233278221, + "flos": 18143488529280.0, + "grad_norm": 2.193997572133753, + "language_loss": 0.79843217, + "learning_rate": 3.906842824918201e-06, + "loss": 0.8202486, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.93359375, + "step": 1624, + "time_per_iteration": 2.418097496032715 + }, + { + "auxiliary_loss_clip": 0.01127799, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.02371407, + "balance_loss_mlp": 1.03221011, + "epoch": 0.09770028558545017, + "flos": 15265004198400.0, + "grad_norm": 2.217296861129646, + "language_loss": 0.8578465, + "learning_rate": 3.906728823990246e-06, + "loss": 0.87961197, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.95703125, + "step": 1625, + "time_per_iteration": 2.4446890354156494 + }, + { + "auxiliary_loss_clip": 0.01130691, + "auxiliary_loss_mlp": 0.01056346, + "balance_loss_clip": 1.03190827, + "balance_loss_mlp": 1.03437328, + "epoch": 0.09776040883811814, + "flos": 23471444004480.0, + "grad_norm": 2.178532157941631, + "language_loss": 0.85360849, + "learning_rate": 3.906614755016044e-06, + "loss": 0.87547886, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.96484375, + "step": 1626, + "time_per_iteration": 2.485790967941284 + }, + { + "auxiliary_loss_clip": 0.01134409, + "auxiliary_loss_mlp": 0.01051877, + "balance_loss_clip": 1.02445865, + "balance_loss_mlp": 1.03664041, + "epoch": 0.09782053209078612, + "flos": 24498309179520.0, + "grad_norm": 2.56867210436732, + "language_loss": 0.83493525, + "learning_rate": 3.9065006179996655e-06, + "loss": 0.85679817, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.9765625, + "step": 1627, + "time_per_iteration": 2.505154609680176 + }, + { + "auxiliary_loss_clip": 0.01125757, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.02329516, + "balance_loss_mlp": 1.03237844, + "epoch": 0.09788065534345408, + "flos": 21579081425280.0, + "grad_norm": 2.4486454381811202, + "language_loss": 0.8416543, + "learning_rate": 3.906386412945184e-06, + "loss": 0.8634057, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9296875, + "step": 1628, + "time_per_iteration": 2.4483213424682617 + }, + { + "auxiliary_loss_clip": 0.01126192, + "auxiliary_loss_mlp": 0.01044646, + "balance_loss_clip": 1.01932597, + "balance_loss_mlp": 1.03139532, + "epoch": 0.09794077859612205, + "flos": 23841313614720.0, + "grad_norm": 1.6683207470752828, + "language_loss": 0.75619781, + "learning_rate": 3.906272139856676e-06, + "loss": 0.77790618, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9453125, + "step": 1629, + "time_per_iteration": 2.4707629680633545 + }, + { + "auxiliary_loss_clip": 0.011295, + "auxiliary_loss_mlp": 0.01048848, + "balance_loss_clip": 1.02209711, + "balance_loss_mlp": 1.03475523, + "epoch": 0.09800090184879003, + "flos": 23658752782080.0, + "grad_norm": 1.8837128082629686, + "language_loss": 0.78327549, + "learning_rate": 3.906157798738218e-06, + "loss": 0.80505896, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.9453125, + "step": 1630, + "time_per_iteration": 2.4542155265808105 + }, + { + "auxiliary_loss_clip": 0.01128359, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_clip": 1.02196896, + "balance_loss_mlp": 1.03394079, + "epoch": 0.09806102510145799, + "flos": 17054826514560.0, + "grad_norm": 2.2129601684385456, + "language_loss": 0.86369681, + "learning_rate": 3.906043389593892e-06, + "loss": 0.8854807, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.9453125, + "step": 1631, + "time_per_iteration": 2.3962056636810303 + }, + { + "auxiliary_loss_clip": 0.01126007, + "auxiliary_loss_mlp": 0.01048049, + "balance_loss_clip": 1.0227648, + "balance_loss_mlp": 1.03338003, + "epoch": 0.09812114835412596, + "flos": 23877344004480.0, + "grad_norm": 2.0547621584516267, + "language_loss": 0.83182806, + "learning_rate": 3.9059289124277804e-06, + "loss": 0.85356867, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.92578125, + "step": 1632, + "time_per_iteration": 2.4445884227752686 + }, + { + "auxiliary_loss_clip": 0.01129925, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.02919996, + "balance_loss_mlp": 1.03536582, + "epoch": 0.09818127160679392, + "flos": 20594425950720.0, + "grad_norm": 1.9895039626173088, + "language_loss": 0.78635532, + "learning_rate": 3.9058143672439684e-06, + "loss": 0.80819941, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9453125, + "step": 1633, + "time_per_iteration": 2.442203998565674 + }, + { + "auxiliary_loss_clip": 0.01124605, + "auxiliary_loss_mlp": 0.01049381, + "balance_loss_clip": 1.02381027, + "balance_loss_mlp": 1.03329194, + "epoch": 0.0982413948594619, + "flos": 15486423240960.0, + "grad_norm": 2.3402569957392636, + "language_loss": 0.73614502, + "learning_rate": 3.905699754046544e-06, + "loss": 0.75788486, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.9140625, + "step": 1634, + "time_per_iteration": 3.9503259658813477 + }, + { + "auxiliary_loss_clip": 0.01130281, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_clip": 1.02904904, + "balance_loss_mlp": 1.03145909, + "epoch": 0.09830151811212987, + "flos": 24206784393600.0, + "grad_norm": 2.5834969716202623, + "language_loss": 0.72645545, + "learning_rate": 3.905585072839597e-06, + "loss": 0.74831951, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.98828125, + "step": 1635, + "time_per_iteration": 2.446293592453003 + }, + { + "auxiliary_loss_clip": 0.01132306, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.0192616, + "balance_loss_mlp": 1.03499234, + "epoch": 0.09836164136479783, + "flos": 20593553166720.0, + "grad_norm": 2.563211383861435, + "language_loss": 0.78225213, + "learning_rate": 3.905470323627221e-06, + "loss": 0.80404186, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.97265625, + "step": 1636, + "time_per_iteration": 3.8364906311035156 + }, + { + "auxiliary_loss_clip": 0.01128111, + "auxiliary_loss_mlp": 0.01056342, + "balance_loss_clip": 1.03126025, + "balance_loss_mlp": 1.03416944, + "epoch": 0.09842176461746581, + "flos": 19933241022720.0, + "grad_norm": 1.9148254897281238, + "language_loss": 0.69535017, + "learning_rate": 3.9053555064135106e-06, + "loss": 0.71719474, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.9375, + "step": 1637, + "time_per_iteration": 2.4151923656463623 + }, + { + "auxiliary_loss_clip": 0.01127765, + "auxiliary_loss_mlp": 0.01053649, + "balance_loss_clip": 1.02756572, + "balance_loss_mlp": 1.03222847, + "epoch": 0.09848188787013377, + "flos": 21213610646400.0, + "grad_norm": 2.1697528708543414, + "language_loss": 0.7735889, + "learning_rate": 3.905240621202563e-06, + "loss": 0.79540306, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.953125, + "step": 1638, + "time_per_iteration": 3.8829591274261475 + }, + { + "auxiliary_loss_clip": 0.01125303, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.01531875, + "balance_loss_mlp": 1.03256583, + "epoch": 0.09854201112280174, + "flos": 30152912135040.0, + "grad_norm": 1.5071899996243445, + "language_loss": 0.72347581, + "learning_rate": 3.905125667998478e-06, + "loss": 0.74514115, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.92578125, + "step": 1639, + "time_per_iteration": 2.500878095626831 + }, + { + "auxiliary_loss_clip": 0.01127181, + "auxiliary_loss_mlp": 0.01039899, + "balance_loss_clip": 1.01400685, + "balance_loss_mlp": 1.03245223, + "epoch": 0.09860213437546972, + "flos": 21794740093440.0, + "grad_norm": 1.704259235748373, + "language_loss": 0.88319802, + "learning_rate": 3.90501064680536e-06, + "loss": 0.90486884, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.9453125, + "step": 1640, + "time_per_iteration": 2.4600088596343994 + }, + { + "auxiliary_loss_clip": 0.0113121, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_clip": 1.01701999, + "balance_loss_mlp": 1.03482461, + "epoch": 0.09866225762813768, + "flos": 21834471087360.0, + "grad_norm": 2.5186735761485917, + "language_loss": 0.80387259, + "learning_rate": 3.904895557627311e-06, + "loss": 0.82561255, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.96484375, + "step": 1641, + "time_per_iteration": 2.4450650215148926 + }, + { + "auxiliary_loss_clip": 0.01128525, + "auxiliary_loss_mlp": 0.01048598, + "balance_loss_clip": 1.02246714, + "balance_loss_mlp": 1.03371489, + "epoch": 0.09872238088080565, + "flos": 17598982965120.0, + "grad_norm": 2.5171415456479145, + "language_loss": 0.86056006, + "learning_rate": 3.90478040046844e-06, + "loss": 0.88233137, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9453125, + "step": 1642, + "time_per_iteration": 2.3926727771759033 + }, + { + "auxiliary_loss_clip": 0.01130981, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_clip": 1.02081847, + "balance_loss_mlp": 1.03485167, + "epoch": 0.09878250413347361, + "flos": 27634906258560.0, + "grad_norm": 1.5784985955077508, + "language_loss": 0.80769372, + "learning_rate": 3.9046651753328565e-06, + "loss": 0.82946241, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.9609375, + "step": 1643, + "time_per_iteration": 2.501354694366455 + }, + { + "auxiliary_loss_clip": 0.01126589, + "auxiliary_loss_mlp": 0.01054205, + "balance_loss_clip": 1.0288372, + "balance_loss_mlp": 1.03275657, + "epoch": 0.0988426273861416, + "flos": 16543802810880.0, + "grad_norm": 1.9392785792078961, + "language_loss": 0.82399493, + "learning_rate": 3.904549882224672e-06, + "loss": 0.8458029, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9375, + "step": 1644, + "time_per_iteration": 2.427469491958618 + }, + { + "auxiliary_loss_clip": 0.01124519, + "auxiliary_loss_mlp": 0.01045704, + "balance_loss_clip": 1.02088428, + "balance_loss_mlp": 1.03299594, + "epoch": 0.09890275063880956, + "flos": 21214204139520.0, + "grad_norm": 1.8836345415938323, + "language_loss": 0.68441319, + "learning_rate": 3.904434521148001e-06, + "loss": 0.70611537, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.91796875, + "step": 1645, + "time_per_iteration": 2.4562041759490967 + }, + { + "auxiliary_loss_clip": 0.01037503, + "auxiliary_loss_mlp": 0.01022829, + "balance_loss_clip": 1.01913321, + "balance_loss_mlp": 1.01097751, + "epoch": 0.09896287389147752, + "flos": 59376225876480.0, + "grad_norm": 0.857572721094008, + "language_loss": 0.60793686, + "learning_rate": 3.904319092106961e-06, + "loss": 0.62854016, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.03686523, + "router_z_loss_mlp": 0.265625, + "step": 1646, + "time_per_iteration": 3.085836172103882 + }, + { + "auxiliary_loss_clip": 0.01124927, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.02424479, + "balance_loss_mlp": 1.03245282, + "epoch": 0.0990229971441455, + "flos": 29641399672320.0, + "grad_norm": 2.1236808177994075, + "language_loss": 0.73563886, + "learning_rate": 3.904203595105671e-06, + "loss": 0.75739485, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.92578125, + "step": 1647, + "time_per_iteration": 2.485992670059204 + }, + { + "auxiliary_loss_clip": 0.01126761, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02456927, + "balance_loss_mlp": 1.03380871, + "epoch": 0.09908312039681347, + "flos": 21833807771520.0, + "grad_norm": 2.0031847019009117, + "language_loss": 0.84025264, + "learning_rate": 3.904088030148253e-06, + "loss": 0.86201417, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.9296875, + "step": 1648, + "time_per_iteration": 2.454493522644043 + }, + { + "auxiliary_loss_clip": 0.01034882, + "auxiliary_loss_mlp": 0.01007785, + "balance_loss_clip": 1.00447071, + "balance_loss_mlp": 1.00925016, + "epoch": 0.09914324364948143, + "flos": 57560951312640.0, + "grad_norm": 0.7264387503758062, + "language_loss": 0.5566957, + "learning_rate": 3.90397239723883e-06, + "loss": 0.57712233, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.2578125, + "step": 1649, + "time_per_iteration": 3.019339084625244 + }, + { + "auxiliary_loss_clip": 0.01120967, + "auxiliary_loss_mlp": 0.01050197, + "balance_loss_clip": 1.0244596, + "balance_loss_mlp": 1.03003716, + "epoch": 0.09920336690214941, + "flos": 34122711744000.0, + "grad_norm": 2.1078890210501404, + "language_loss": 0.89719647, + "learning_rate": 3.903856696381531e-06, + "loss": 0.91890812, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.91015625, + "step": 1650, + "time_per_iteration": 2.5014076232910156 + }, + { + "auxiliary_loss_clip": 0.01032729, + "auxiliary_loss_mlp": 0.01006617, + "balance_loss_clip": 1.0029211, + "balance_loss_mlp": 1.00755525, + "epoch": 0.09926349015481738, + "flos": 71212514716800.0, + "grad_norm": 0.7942200012221744, + "language_loss": 0.63744354, + "learning_rate": 3.903740927580484e-06, + "loss": 0.65783697, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.03686523, + "router_z_loss_mlp": 0.25195312, + "step": 1651, + "time_per_iteration": 3.259958505630493 + }, + { + "auxiliary_loss_clip": 0.01123818, + "auxiliary_loss_mlp": 0.01055441, + "balance_loss_clip": 1.03074133, + "balance_loss_mlp": 1.03246665, + "epoch": 0.09932361340748534, + "flos": 23147589432960.0, + "grad_norm": 2.3108050741700272, + "language_loss": 0.8803277, + "learning_rate": 3.90362509083982e-06, + "loss": 0.90212035, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.9140625, + "step": 1652, + "time_per_iteration": 2.527146816253662 + }, + { + "auxiliary_loss_clip": 0.01128132, + "auxiliary_loss_mlp": 0.01048056, + "balance_loss_clip": 1.02420235, + "balance_loss_mlp": 1.03502429, + "epoch": 0.09938373666015332, + "flos": 19827628560000.0, + "grad_norm": 2.0249951043498418, + "language_loss": 0.82159197, + "learning_rate": 3.903509186163673e-06, + "loss": 0.84335381, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.9296875, + "step": 1653, + "time_per_iteration": 2.4585115909576416 + }, + { + "auxiliary_loss_clip": 0.01127948, + "auxiliary_loss_mlp": 0.01055208, + "balance_loss_clip": 1.02905321, + "balance_loss_mlp": 1.0347662, + "epoch": 0.09944385991282129, + "flos": 22089581458560.0, + "grad_norm": 2.025344607574988, + "language_loss": 0.79414368, + "learning_rate": 3.903393213556179e-06, + "loss": 0.81597531, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9296875, + "step": 1654, + "time_per_iteration": 2.5175716876983643 + }, + { + "auxiliary_loss_clip": 0.01127728, + "auxiliary_loss_mlp": 0.01055725, + "balance_loss_clip": 1.03126287, + "balance_loss_mlp": 1.03671968, + "epoch": 0.09950398316548925, + "flos": 19827838028160.0, + "grad_norm": 1.631934643293413, + "language_loss": 0.81203735, + "learning_rate": 3.903277173021479e-06, + "loss": 0.83387184, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.91015625, + "step": 1655, + "time_per_iteration": 2.5070693492889404 + }, + { + "auxiliary_loss_clip": 0.01122712, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.02300107, + "balance_loss_mlp": 1.03238094, + "epoch": 0.09956410641815722, + "flos": 25002699724800.0, + "grad_norm": 1.8733174755885336, + "language_loss": 0.80317938, + "learning_rate": 3.903161064563712e-06, + "loss": 0.8248744, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.90234375, + "step": 1656, + "time_per_iteration": 2.480952739715576 + }, + { + "auxiliary_loss_clip": 0.01129665, + "auxiliary_loss_mlp": 0.01052113, + "balance_loss_clip": 1.02768731, + "balance_loss_mlp": 1.03797841, + "epoch": 0.0996242296708252, + "flos": 19316709590400.0, + "grad_norm": 1.677361575413214, + "language_loss": 0.88713956, + "learning_rate": 3.9030448881870206e-06, + "loss": 0.90895736, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.91796875, + "step": 1657, + "time_per_iteration": 2.445462465286255 + }, + { + "auxiliary_loss_clip": 0.01131792, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.01911855, + "balance_loss_mlp": 1.03498387, + "epoch": 0.09968435292349316, + "flos": 21870536388480.0, + "grad_norm": 2.4750170046506597, + "language_loss": 0.84711289, + "learning_rate": 3.902928643895554e-06, + "loss": 0.86888373, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.96875, + "step": 1658, + "time_per_iteration": 2.4258053302764893 + }, + { + "auxiliary_loss_clip": 0.01031565, + "auxiliary_loss_mlp": 0.01005096, + "balance_loss_clip": 1.00178158, + "balance_loss_mlp": 1.00624537, + "epoch": 0.09974447617616113, + "flos": 65381636839680.0, + "grad_norm": 0.9018418713282724, + "language_loss": 0.60856706, + "learning_rate": 3.9028123316934575e-06, + "loss": 0.62893367, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.25390625, + "step": 1659, + "time_per_iteration": 3.0654025077819824 + }, + { + "auxiliary_loss_clip": 0.01127454, + "auxiliary_loss_mlp": 0.01047572, + "balance_loss_clip": 1.02435017, + "balance_loss_mlp": 1.03469169, + "epoch": 0.0998045994288291, + "flos": 23658682959360.0, + "grad_norm": 4.848363324112766, + "language_loss": 0.85086519, + "learning_rate": 3.902695951584885e-06, + "loss": 0.87261546, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.92578125, + "step": 1660, + "time_per_iteration": 2.453413963317871 + }, + { + "auxiliary_loss_clip": 0.0112804, + "auxiliary_loss_mlp": 0.01046614, + "balance_loss_clip": 1.02045989, + "balance_loss_mlp": 1.03721237, + "epoch": 0.09986472268149707, + "flos": 19608688224000.0, + "grad_norm": 1.9418971738798911, + "language_loss": 0.80042166, + "learning_rate": 3.902579503573987e-06, + "loss": 0.82216817, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.90625, + "step": 1661, + "time_per_iteration": 2.4215495586395264 + }, + { + "auxiliary_loss_clip": 0.01128891, + "auxiliary_loss_mlp": 0.01046257, + "balance_loss_clip": 1.01982832, + "balance_loss_mlp": 1.03200734, + "epoch": 0.09992484593416504, + "flos": 26212125732480.0, + "grad_norm": 1.8201023566804326, + "language_loss": 0.83474773, + "learning_rate": 3.902462987664922e-06, + "loss": 0.85649925, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.96875, + "step": 1662, + "time_per_iteration": 2.4538283348083496 + }, + { + "auxiliary_loss_clip": 0.01127811, + "auxiliary_loss_mlp": 0.01049377, + "balance_loss_clip": 1.02310371, + "balance_loss_mlp": 1.03647661, + "epoch": 0.09998496918683301, + "flos": 17492672275200.0, + "grad_norm": 2.0821206111460366, + "language_loss": 0.88856053, + "learning_rate": 3.902346403861846e-06, + "loss": 0.91033244, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9140625, + "step": 1663, + "time_per_iteration": 2.412142753601074 + }, + { + "auxiliary_loss_clip": 0.01128487, + "auxiliary_loss_mlp": 0.01050009, + "balance_loss_clip": 1.02400947, + "balance_loss_mlp": 1.03475738, + "epoch": 0.10004509243950098, + "flos": 22783794399360.0, + "grad_norm": 1.6838586642707083, + "language_loss": 0.70417583, + "learning_rate": 3.9022297521689196e-06, + "loss": 0.72596073, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9375, + "step": 1664, + "time_per_iteration": 2.4572854042053223 + }, + { + "auxiliary_loss_clip": 0.0112979, + "auxiliary_loss_mlp": 0.01051836, + "balance_loss_clip": 1.02657557, + "balance_loss_mlp": 1.03681624, + "epoch": 0.10010521569216894, + "flos": 16252452581760.0, + "grad_norm": 2.3885815761458833, + "language_loss": 0.78945351, + "learning_rate": 3.902113032590307e-06, + "loss": 0.8112697, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.9296875, + "step": 1665, + "time_per_iteration": 2.4018146991729736 + }, + { + "auxiliary_loss_clip": 0.01133934, + "auxiliary_loss_mlp": 0.0106273, + "balance_loss_clip": 1.03639627, + "balance_loss_mlp": 1.03929853, + "epoch": 0.10016533894483691, + "flos": 23401512817920.0, + "grad_norm": 1.7453432919123004, + "language_loss": 0.70129985, + "learning_rate": 3.901996245130174e-06, + "loss": 0.72326648, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9453125, + "step": 1666, + "time_per_iteration": 2.441276788711548 + }, + { + "auxiliary_loss_clip": 0.01128306, + "auxiliary_loss_mlp": 0.0106182, + "balance_loss_clip": 1.03455698, + "balance_loss_mlp": 1.03533638, + "epoch": 0.10022546219750489, + "flos": 19353158916480.0, + "grad_norm": 2.1816995475096856, + "language_loss": 0.78218007, + "learning_rate": 3.901879389792686e-06, + "loss": 0.80408126, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.9296875, + "step": 1667, + "time_per_iteration": 2.3924455642700195 + }, + { + "auxiliary_loss_clip": 0.01128903, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_clip": 1.02883434, + "balance_loss_mlp": 1.03445554, + "epoch": 0.10028558545017285, + "flos": 27084640320000.0, + "grad_norm": 2.5698885473644046, + "language_loss": 0.77251256, + "learning_rate": 3.9017624665820155e-06, + "loss": 0.79435515, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.9453125, + "step": 1668, + "time_per_iteration": 2.4461889266967773 + }, + { + "auxiliary_loss_clip": 0.01126996, + "auxiliary_loss_mlp": 0.01049487, + "balance_loss_clip": 1.02247405, + "balance_loss_mlp": 1.03288484, + "epoch": 0.10034570870284082, + "flos": 25845991637760.0, + "grad_norm": 2.261072014975546, + "language_loss": 0.78254324, + "learning_rate": 3.901645475502334e-06, + "loss": 0.80430806, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.94140625, + "step": 1669, + "time_per_iteration": 2.4449715614318848 + }, + { + "auxiliary_loss_clip": 0.01131728, + "auxiliary_loss_mlp": 0.01064008, + "balance_loss_clip": 1.03719783, + "balance_loss_mlp": 1.0359478, + "epoch": 0.1004058319555088, + "flos": 26248400501760.0, + "grad_norm": 3.2733356873499346, + "language_loss": 0.85289216, + "learning_rate": 3.901528416557817e-06, + "loss": 0.87484956, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.9609375, + "step": 1670, + "time_per_iteration": 2.451265335083008 + }, + { + "auxiliary_loss_clip": 0.01121068, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.02132273, + "balance_loss_mlp": 1.03208125, + "epoch": 0.10046595520817676, + "flos": 25373302473600.0, + "grad_norm": 1.5981438708977425, + "language_loss": 0.77034068, + "learning_rate": 3.901411289752643e-06, + "loss": 0.79199851, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.890625, + "step": 1671, + "time_per_iteration": 2.4486119747161865 + }, + { + "auxiliary_loss_clip": 0.01029094, + "auxiliary_loss_mlp": 0.01005826, + "balance_loss_clip": 1.00208318, + "balance_loss_mlp": 1.00429285, + "epoch": 0.10052607846084473, + "flos": 67458934224000.0, + "grad_norm": 0.7714351547673884, + "language_loss": 0.60759377, + "learning_rate": 3.901294095090991e-06, + "loss": 0.62794292, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.03735352, + "router_z_loss_mlp": 0.24804688, + "step": 1672, + "time_per_iteration": 3.0407564640045166 + }, + { + "auxiliary_loss_clip": 0.01131427, + "auxiliary_loss_mlp": 0.01055055, + "balance_loss_clip": 1.02826881, + "balance_loss_mlp": 1.03504014, + "epoch": 0.10058620171351271, + "flos": 21359442862080.0, + "grad_norm": 2.0496448377842214, + "language_loss": 0.76836884, + "learning_rate": 3.901176832577043e-06, + "loss": 0.79023367, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.96484375, + "step": 1673, + "time_per_iteration": 3.8078036308288574 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.01638508, + "balance_loss_mlp": 1.03168154, + "epoch": 0.10064632496618067, + "flos": 16799192472960.0, + "grad_norm": 2.1322106564210506, + "language_loss": 0.73229301, + "learning_rate": 3.901059502214984e-06, + "loss": 0.75396329, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.94140625, + "step": 1674, + "time_per_iteration": 2.386910915374756 + }, + { + "auxiliary_loss_clip": 0.0112982, + "auxiliary_loss_mlp": 0.01044697, + "balance_loss_clip": 1.0194962, + "balance_loss_mlp": 1.03423774, + "epoch": 0.10070644821884864, + "flos": 23623280974080.0, + "grad_norm": 2.1319658939344626, + "language_loss": 0.79347897, + "learning_rate": 3.900942104009003e-06, + "loss": 0.81522405, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.953125, + "step": 1675, + "time_per_iteration": 3.822953939437866 + }, + { + "auxiliary_loss_clip": 0.01125259, + "auxiliary_loss_mlp": 0.01056213, + "balance_loss_clip": 1.03017747, + "balance_loss_mlp": 1.03400826, + "epoch": 0.1007665714715166, + "flos": 24461406005760.0, + "grad_norm": 2.608486034898942, + "language_loss": 0.81366646, + "learning_rate": 3.900824637963287e-06, + "loss": 0.83548117, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.9140625, + "step": 1676, + "time_per_iteration": 2.427243232727051 + }, + { + "auxiliary_loss_clip": 0.01130224, + "auxiliary_loss_mlp": 0.01053137, + "balance_loss_clip": 1.02775776, + "balance_loss_mlp": 1.03403139, + "epoch": 0.10082669472418458, + "flos": 16798214954880.0, + "grad_norm": 2.002097366993846, + "language_loss": 0.8618263, + "learning_rate": 3.9007071040820285e-06, + "loss": 0.88365984, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.96484375, + "step": 1677, + "time_per_iteration": 3.9178919792175293 + }, + { + "auxiliary_loss_clip": 0.01127388, + "auxiliary_loss_mlp": 0.01050704, + "balance_loss_clip": 1.02513361, + "balance_loss_mlp": 1.03335094, + "epoch": 0.10088681797685255, + "flos": 13552653922560.0, + "grad_norm": 2.024853065057127, + "language_loss": 0.85943526, + "learning_rate": 3.900589502369423e-06, + "loss": 0.88121617, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.94140625, + "step": 1678, + "time_per_iteration": 2.389723062515259 + }, + { + "auxiliary_loss_clip": 0.0113194, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_clip": 1.01992822, + "balance_loss_mlp": 1.03805208, + "epoch": 0.10094694122952051, + "flos": 25264513077120.0, + "grad_norm": 2.1347749920294357, + "language_loss": 0.89958286, + "learning_rate": 3.9004718328296676e-06, + "loss": 0.92136478, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.94140625, + "step": 1679, + "time_per_iteration": 2.444307565689087 + }, + { + "auxiliary_loss_clip": 0.01029359, + "auxiliary_loss_mlp": 0.01012468, + "balance_loss_clip": 1.00867677, + "balance_loss_mlp": 1.0046916, + "epoch": 0.10100706448218849, + "flos": 69850762980480.0, + "grad_norm": 0.7774210750054823, + "language_loss": 0.52998149, + "learning_rate": 3.900354095466962e-06, + "loss": 0.55039978, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.0378418, + "router_z_loss_mlp": 0.24707031, + "step": 1680, + "time_per_iteration": 3.087531328201294 + }, + { + "auxiliary_loss_clip": 0.01125771, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.01921439, + "balance_loss_mlp": 1.03489673, + "epoch": 0.10106718773485646, + "flos": 20006244408960.0, + "grad_norm": 1.7655943226966269, + "language_loss": 0.76840341, + "learning_rate": 3.900236290285506e-06, + "loss": 0.79010677, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.91015625, + "step": 1681, + "time_per_iteration": 2.3962461948394775 + }, + { + "auxiliary_loss_clip": 0.01132826, + "auxiliary_loss_mlp": 0.01058513, + "balance_loss_clip": 1.02968824, + "balance_loss_mlp": 1.03300309, + "epoch": 0.10112731098752442, + "flos": 13478987220480.0, + "grad_norm": 2.3429988295473376, + "language_loss": 0.93676221, + "learning_rate": 3.900118417289504e-06, + "loss": 0.95867562, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.0, + "step": 1682, + "time_per_iteration": 2.3901689052581787 + }, + { + "auxiliary_loss_clip": 0.01129013, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.01974475, + "balance_loss_mlp": 1.03455329, + "epoch": 0.1011874342401924, + "flos": 18514894239360.0, + "grad_norm": 2.5183537344737164, + "language_loss": 0.87868714, + "learning_rate": 3.900000476483164e-06, + "loss": 0.9004246, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.9453125, + "step": 1683, + "time_per_iteration": 2.381648302078247 + }, + { + "auxiliary_loss_clip": 0.01128587, + "auxiliary_loss_mlp": 0.01050638, + "balance_loss_clip": 1.02581882, + "balance_loss_mlp": 1.03547812, + "epoch": 0.10124755749286037, + "flos": 20701853804160.0, + "grad_norm": 1.7995433586265865, + "language_loss": 0.7452631, + "learning_rate": 3.8998824678706946e-06, + "loss": 0.76705539, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.93359375, + "step": 1684, + "time_per_iteration": 2.424262523651123 + }, + { + "auxiliary_loss_clip": 0.01026459, + "auxiliary_loss_mlp": 0.01004289, + "balance_loss_clip": 1.00049782, + "balance_loss_mlp": 1.00221205, + "epoch": 0.10130768074552833, + "flos": 56106015557760.0, + "grad_norm": 0.7860492151257247, + "language_loss": 0.61080587, + "learning_rate": 3.899764391456306e-06, + "loss": 0.63111335, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.0378418, + "router_z_loss_mlp": 0.2421875, + "step": 1685, + "time_per_iteration": 3.1197712421417236 + }, + { + "auxiliary_loss_clip": 0.01127716, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.02488136, + "balance_loss_mlp": 1.03425086, + "epoch": 0.1013678039981963, + "flos": 33400916962560.0, + "grad_norm": 2.7186454758616514, + "language_loss": 0.61819071, + "learning_rate": 3.8996462472442145e-06, + "loss": 0.63997382, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.9375, + "step": 1686, + "time_per_iteration": 2.557370185852051 + }, + { + "auxiliary_loss_clip": 0.01129063, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.02246845, + "balance_loss_mlp": 1.03644657, + "epoch": 0.10142792725086427, + "flos": 31903980975360.0, + "grad_norm": 2.880887399024693, + "language_loss": 0.81339729, + "learning_rate": 3.8995280352386344e-06, + "loss": 0.83518136, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.92578125, + "step": 1687, + "time_per_iteration": 2.5703492164611816 + }, + { + "auxiliary_loss_clip": 0.01131885, + "auxiliary_loss_mlp": 0.01052296, + "balance_loss_clip": 1.02561688, + "balance_loss_mlp": 1.03477442, + "epoch": 0.10148805050353224, + "flos": 28474637212800.0, + "grad_norm": 1.9758616894600414, + "language_loss": 0.71980017, + "learning_rate": 3.899409755443785e-06, + "loss": 0.741642, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.97265625, + "step": 1688, + "time_per_iteration": 2.483548164367676 + }, + { + "auxiliary_loss_clip": 0.01126728, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_clip": 1.02780175, + "balance_loss_mlp": 1.03525734, + "epoch": 0.1015481737562002, + "flos": 25147903536000.0, + "grad_norm": 2.4480015311261627, + "language_loss": 0.86638576, + "learning_rate": 3.899291407863887e-06, + "loss": 0.88818043, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.9140625, + "step": 1689, + "time_per_iteration": 2.476289749145508 + }, + { + "auxiliary_loss_clip": 0.01123046, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.01890743, + "balance_loss_mlp": 1.03043246, + "epoch": 0.10160829700886818, + "flos": 30881479720320.0, + "grad_norm": 1.7647076627727838, + "language_loss": 0.88198733, + "learning_rate": 3.899172992503165e-06, + "loss": 0.90365267, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.92578125, + "step": 1690, + "time_per_iteration": 2.602295160293579 + }, + { + "auxiliary_loss_clip": 0.01126595, + "auxiliary_loss_mlp": 0.01044899, + "balance_loss_clip": 1.02053297, + "balance_loss_mlp": 1.03325152, + "epoch": 0.10166842026153615, + "flos": 20410992334080.0, + "grad_norm": 2.4958265577871694, + "language_loss": 0.83553779, + "learning_rate": 3.899054509365843e-06, + "loss": 0.85725272, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.93359375, + "step": 1691, + "time_per_iteration": 2.4795753955841064 + }, + { + "auxiliary_loss_clip": 0.01127851, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.02351224, + "balance_loss_mlp": 1.03416157, + "epoch": 0.10172854351420411, + "flos": 33475491360000.0, + "grad_norm": 1.5062507315521056, + "language_loss": 0.6655491, + "learning_rate": 3.89893595845615e-06, + "loss": 0.68731701, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9375, + "step": 1692, + "time_per_iteration": 2.5656216144561768 + }, + { + "auxiliary_loss_clip": 0.01126495, + "auxiliary_loss_mlp": 0.01053501, + "balance_loss_clip": 1.02723956, + "balance_loss_mlp": 1.03327119, + "epoch": 0.1017886667668721, + "flos": 23549195335680.0, + "grad_norm": 1.6614169439877764, + "language_loss": 0.75763559, + "learning_rate": 3.898817339778319e-06, + "loss": 0.77943558, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9296875, + "step": 1693, + "time_per_iteration": 2.5104470252990723 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.01048846, + "balance_loss_clip": 1.02250028, + "balance_loss_mlp": 1.03276646, + "epoch": 0.10184879001954006, + "flos": 23294922837120.0, + "grad_norm": 1.6836904940668604, + "language_loss": 0.8728255, + "learning_rate": 3.898698653336581e-06, + "loss": 0.89458358, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9453125, + "step": 1694, + "time_per_iteration": 2.4375650882720947 + }, + { + "auxiliary_loss_clip": 0.01130871, + "auxiliary_loss_mlp": 0.01048342, + "balance_loss_clip": 1.02084053, + "balance_loss_mlp": 1.0317018, + "epoch": 0.10190891327220802, + "flos": 18332123938560.0, + "grad_norm": 2.3117364563831915, + "language_loss": 0.7957328, + "learning_rate": 3.8985798991351715e-06, + "loss": 0.81752491, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.9921875, + "step": 1695, + "time_per_iteration": 2.3535845279693604 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01053714, + "balance_loss_clip": 1.02674866, + "balance_loss_mlp": 1.03284919, + "epoch": 0.10196903652487599, + "flos": 26464268638080.0, + "grad_norm": 1.771862970932022, + "language_loss": 0.86338663, + "learning_rate": 3.898461077178329e-06, + "loss": 0.88521904, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.96875, + "step": 1696, + "time_per_iteration": 2.464308261871338 + }, + { + "auxiliary_loss_clip": 0.01124274, + "auxiliary_loss_mlp": 0.0105109, + "balance_loss_clip": 1.02643776, + "balance_loss_mlp": 1.0333662, + "epoch": 0.10202915977754397, + "flos": 21868511529600.0, + "grad_norm": 1.8515751852928584, + "language_loss": 0.82061327, + "learning_rate": 3.898342187470296e-06, + "loss": 0.84236693, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.91015625, + "step": 1697, + "time_per_iteration": 2.4317948818206787 + }, + { + "auxiliary_loss_clip": 0.01128753, + "auxiliary_loss_mlp": 0.01047324, + "balance_loss_clip": 1.02047801, + "balance_loss_mlp": 1.03356361, + "epoch": 0.10208928303021193, + "flos": 22308661440000.0, + "grad_norm": 2.003597479785428, + "language_loss": 0.80216718, + "learning_rate": 3.898223230015311e-06, + "loss": 0.82392788, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.94921875, + "step": 1698, + "time_per_iteration": 2.4211795330047607 + }, + { + "auxiliary_loss_clip": 0.01126562, + "auxiliary_loss_mlp": 0.01038661, + "balance_loss_clip": 1.01368654, + "balance_loss_mlp": 1.03277946, + "epoch": 0.1021494062828799, + "flos": 22124529596160.0, + "grad_norm": 3.2384389278339683, + "language_loss": 0.75553715, + "learning_rate": 3.8981042048176235e-06, + "loss": 0.77718937, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.9375, + "step": 1699, + "time_per_iteration": 2.40696120262146 + }, + { + "auxiliary_loss_clip": 0.01126844, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.01847851, + "balance_loss_mlp": 1.03395414, + "epoch": 0.10220952953554788, + "flos": 19645696131840.0, + "grad_norm": 1.660742949698858, + "language_loss": 0.79711759, + "learning_rate": 3.897985111881478e-06, + "loss": 0.81882286, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.9296875, + "step": 1700, + "time_per_iteration": 2.3866331577301025 + }, + { + "auxiliary_loss_clip": 0.01125573, + "auxiliary_loss_mlp": 0.01046093, + "balance_loss_clip": 1.02139282, + "balance_loss_mlp": 1.03189898, + "epoch": 0.10226965278821584, + "flos": 29786044901760.0, + "grad_norm": 1.7820552114416843, + "language_loss": 0.77036595, + "learning_rate": 3.897865951211127e-06, + "loss": 0.79208261, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.9375, + "step": 1701, + "time_per_iteration": 2.4395692348480225 + }, + { + "auxiliary_loss_clip": 0.01129793, + "auxiliary_loss_mlp": 0.01043111, + "balance_loss_clip": 1.01714671, + "balance_loss_mlp": 1.03423548, + "epoch": 0.10232977604088381, + "flos": 27015581917440.0, + "grad_norm": 2.8523336534744077, + "language_loss": 0.78233707, + "learning_rate": 3.897746722810822e-06, + "loss": 0.80406612, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.953125, + "step": 1702, + "time_per_iteration": 2.436391592025757 + }, + { + "auxiliary_loss_clip": 0.01124749, + "auxiliary_loss_mlp": 0.01042127, + "balance_loss_clip": 1.01767766, + "balance_loss_mlp": 1.03302264, + "epoch": 0.10238989929355179, + "flos": 20776463112960.0, + "grad_norm": 2.0057943972936663, + "language_loss": 0.94855535, + "learning_rate": 3.897627426684818e-06, + "loss": 0.97022408, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.91796875, + "step": 1703, + "time_per_iteration": 2.3969039916992188 + }, + { + "auxiliary_loss_clip": 0.01033287, + "auxiliary_loss_mlp": 0.01006063, + "balance_loss_clip": 1.00258231, + "balance_loss_mlp": 1.00845075, + "epoch": 0.10245002254621975, + "flos": 57695297690880.0, + "grad_norm": 0.8679415430569597, + "language_loss": 0.55032927, + "learning_rate": 3.897508062837372e-06, + "loss": 0.57072276, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.03491211, + "router_z_loss_mlp": 0.24804688, + "step": 1704, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.01126333, + "auxiliary_loss_mlp": 0.01041418, + "balance_loss_clip": 1.01575208, + "balance_loss_mlp": 1.03269148, + "epoch": 0.10251014579888772, + "flos": 16799192472960.0, + "grad_norm": 2.1628112329074147, + "language_loss": 0.83624583, + "learning_rate": 3.897388631272745e-06, + "loss": 0.85792339, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.9375, + "step": 1705, + "time_per_iteration": 2.3643715381622314 + }, + { + "auxiliary_loss_clip": 0.01028383, + "auxiliary_loss_mlp": 0.01004697, + "balance_loss_clip": 1.0013113, + "balance_loss_mlp": 1.00360084, + "epoch": 0.1025702690515557, + "flos": 68562328832640.0, + "grad_norm": 0.7604867521608939, + "language_loss": 0.60402644, + "learning_rate": 3.8972691319951975e-06, + "loss": 0.62435722, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.24804688, + "step": 1706, + "time_per_iteration": 3.0974771976470947 + }, + { + "auxiliary_loss_clip": 0.01129113, + "auxiliary_loss_mlp": 0.01045736, + "balance_loss_clip": 1.02063107, + "balance_loss_mlp": 1.03540444, + "epoch": 0.10263039230422366, + "flos": 14865737356800.0, + "grad_norm": 2.324386550135008, + "language_loss": 0.86008024, + "learning_rate": 3.897149565008996e-06, + "loss": 0.88182867, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.9375, + "step": 1707, + "time_per_iteration": 2.3840553760528564 + }, + { + "auxiliary_loss_clip": 0.01126693, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_clip": 1.01841986, + "balance_loss_mlp": 1.03309667, + "epoch": 0.10269051555689163, + "flos": 25336434211200.0, + "grad_norm": 1.4926963578695893, + "language_loss": 0.7271347, + "learning_rate": 3.897029930318406e-06, + "loss": 0.74883235, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.9375, + "step": 1708, + "time_per_iteration": 2.418433904647827 + }, + { + "auxiliary_loss_clip": 0.01124308, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_clip": 1.02458286, + "balance_loss_mlp": 1.03183699, + "epoch": 0.10275063880955959, + "flos": 21067778430720.0, + "grad_norm": 1.7060560352128098, + "language_loss": 0.91177273, + "learning_rate": 3.8969102279276974e-06, + "loss": 0.93351424, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.92578125, + "step": 1709, + "time_per_iteration": 2.3767495155334473 + }, + { + "auxiliary_loss_clip": 0.01125528, + "auxiliary_loss_mlp": 0.01043814, + "balance_loss_clip": 1.01924491, + "balance_loss_mlp": 1.03394461, + "epoch": 0.10281076206222757, + "flos": 30365638248960.0, + "grad_norm": 2.6524481322250177, + "language_loss": 0.88083231, + "learning_rate": 3.896790457841142e-06, + "loss": 0.90252578, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.9140625, + "step": 1710, + "time_per_iteration": 2.436161994934082 + }, + { + "auxiliary_loss_clip": 0.01120898, + "auxiliary_loss_mlp": 0.01043799, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.03276277, + "epoch": 0.10287088531489554, + "flos": 22417241368320.0, + "grad_norm": 2.085302421561381, + "language_loss": 0.79199672, + "learning_rate": 3.896670620063015e-06, + "loss": 0.81364369, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.8828125, + "step": 1711, + "time_per_iteration": 2.4026315212249756 + }, + { + "auxiliary_loss_clip": 0.01127818, + "auxiliary_loss_mlp": 0.01048941, + "balance_loss_clip": 1.0223453, + "balance_loss_mlp": 1.03480744, + "epoch": 0.1029310085675635, + "flos": 25114910434560.0, + "grad_norm": 2.9395482296819866, + "language_loss": 0.73571283, + "learning_rate": 3.896550714597592e-06, + "loss": 0.75748044, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.9296875, + "step": 1712, + "time_per_iteration": 2.428074598312378 + }, + { + "auxiliary_loss_clip": 0.01124127, + "auxiliary_loss_mlp": 0.01050446, + "balance_loss_clip": 1.02728355, + "balance_loss_mlp": 1.03561556, + "epoch": 0.10299113182023148, + "flos": 19864601556480.0, + "grad_norm": 1.8027700892132426, + "language_loss": 0.86771899, + "learning_rate": 3.896430741449153e-06, + "loss": 0.88946474, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.88671875, + "step": 1713, + "time_per_iteration": 3.7723255157470703 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.02115691, + "balance_loss_mlp": 1.03272271, + "epoch": 0.10305125507289944, + "flos": 20446603787520.0, + "grad_norm": 1.5695577095444464, + "language_loss": 0.72571588, + "learning_rate": 3.8963107006219785e-06, + "loss": 0.74740291, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.921875, + "step": 1714, + "time_per_iteration": 2.414334535598755 + }, + { + "auxiliary_loss_clip": 0.01130338, + "auxiliary_loss_mlp": 0.01050869, + "balance_loss_clip": 1.02520323, + "balance_loss_mlp": 1.03358746, + "epoch": 0.10311137832556741, + "flos": 26249552576640.0, + "grad_norm": 2.3658421272882806, + "language_loss": 0.90832257, + "learning_rate": 3.896190592120353e-06, + "loss": 0.93013465, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.96875, + "step": 1715, + "time_per_iteration": 3.8215456008911133 + }, + { + "auxiliary_loss_clip": 0.01126425, + "auxiliary_loss_mlp": 0.01044727, + "balance_loss_clip": 1.01991987, + "balance_loss_mlp": 1.03373456, + "epoch": 0.10317150157823539, + "flos": 35297468904960.0, + "grad_norm": 2.4404738181742807, + "language_loss": 0.75811809, + "learning_rate": 3.896070415948563e-06, + "loss": 0.77982962, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.92578125, + "step": 1716, + "time_per_iteration": 2.520768165588379 + }, + { + "auxiliary_loss_clip": 0.01131289, + "auxiliary_loss_mlp": 0.01053478, + "balance_loss_clip": 1.02689457, + "balance_loss_mlp": 1.03468513, + "epoch": 0.10323162483090335, + "flos": 25738738341120.0, + "grad_norm": 1.8637532906378036, + "language_loss": 0.8557725, + "learning_rate": 3.895950172110897e-06, + "loss": 0.87762022, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.96484375, + "step": 1717, + "time_per_iteration": 3.9256229400634766 + }, + { + "auxiliary_loss_clip": 0.01126121, + "auxiliary_loss_mlp": 0.01052604, + "balance_loss_clip": 1.02903628, + "balance_loss_mlp": 1.03377187, + "epoch": 0.10329174808357132, + "flos": 16288936819200.0, + "grad_norm": 1.8295567488097717, + "language_loss": 0.8306402, + "learning_rate": 3.895829860611646e-06, + "loss": 0.85242748, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.921875, + "step": 1718, + "time_per_iteration": 2.3728268146514893 + }, + { + "auxiliary_loss_clip": 0.01126617, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.02005076, + "balance_loss_mlp": 1.0349555, + "epoch": 0.10335187133623928, + "flos": 36685615495680.0, + "grad_norm": 1.9096384483571365, + "language_loss": 0.72850704, + "learning_rate": 3.895709481455105e-06, + "loss": 0.75021404, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.91796875, + "step": 1719, + "time_per_iteration": 2.532106399536133 + }, + { + "auxiliary_loss_clip": 0.0112546, + "auxiliary_loss_mlp": 0.01044799, + "balance_loss_clip": 1.02068257, + "balance_loss_mlp": 1.03439403, + "epoch": 0.10341199458890726, + "flos": 14974771132800.0, + "grad_norm": 2.126221701693877, + "language_loss": 0.92706668, + "learning_rate": 3.895589034645568e-06, + "loss": 0.94876933, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.91015625, + "step": 1720, + "time_per_iteration": 2.364673137664795 + }, + { + "auxiliary_loss_clip": 0.01125416, + "auxiliary_loss_mlp": 0.01043045, + "balance_loss_clip": 1.01660395, + "balance_loss_mlp": 1.03304362, + "epoch": 0.10347211784157523, + "flos": 21030561054720.0, + "grad_norm": 2.0938448238925855, + "language_loss": 0.79727536, + "learning_rate": 3.8954685201873344e-06, + "loss": 0.81895995, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.921875, + "step": 1721, + "time_per_iteration": 2.389920949935913 + }, + { + "auxiliary_loss_clip": 0.01129995, + "auxiliary_loss_mlp": 0.01047324, + "balance_loss_clip": 1.02155101, + "balance_loss_mlp": 1.03391886, + "epoch": 0.1035322410942432, + "flos": 19791074499840.0, + "grad_norm": 3.5107157029297373, + "language_loss": 0.80865979, + "learning_rate": 3.895347938084706e-06, + "loss": 0.83043295, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.9609375, + "step": 1722, + "time_per_iteration": 2.397848129272461 + }, + { + "auxiliary_loss_clip": 0.01026926, + "auxiliary_loss_mlp": 0.01003916, + "balance_loss_clip": 0.99988711, + "balance_loss_mlp": 1.00182128, + "epoch": 0.10359236434691117, + "flos": 52696014554880.0, + "grad_norm": 0.9208959392453442, + "language_loss": 0.6713531, + "learning_rate": 3.895227288341984e-06, + "loss": 0.6916616, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.25195312, + "step": 1723, + "time_per_iteration": 2.7822790145874023 + }, + { + "auxiliary_loss_clip": 0.01127009, + "auxiliary_loss_mlp": 0.01054408, + "balance_loss_clip": 1.02935004, + "balance_loss_mlp": 1.03282261, + "epoch": 0.10365248759957914, + "flos": 18404429097600.0, + "grad_norm": 3.3654607027691053, + "language_loss": 0.77949142, + "learning_rate": 3.8951065709634755e-06, + "loss": 0.80130565, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.94140625, + "step": 1724, + "time_per_iteration": 2.4069485664367676 + }, + { + "auxiliary_loss_clip": 0.0113112, + "auxiliary_loss_mlp": 0.0105769, + "balance_loss_clip": 1.03269172, + "balance_loss_mlp": 1.03416538, + "epoch": 0.1037126108522471, + "flos": 47551878587520.0, + "grad_norm": 1.7015623873393253, + "language_loss": 0.74837613, + "learning_rate": 3.8949857859534884e-06, + "loss": 0.77026427, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.97265625, + "step": 1725, + "time_per_iteration": 2.601824998855591 + }, + { + "auxiliary_loss_clip": 0.01125892, + "auxiliary_loss_mlp": 0.01055733, + "balance_loss_clip": 1.03295231, + "balance_loss_mlp": 1.034518, + "epoch": 0.10377273410491508, + "flos": 22815670337280.0, + "grad_norm": 1.9342342234349115, + "language_loss": 0.74688578, + "learning_rate": 3.894864933316333e-06, + "loss": 0.76870197, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.9140625, + "step": 1726, + "time_per_iteration": 2.403597116470337 + }, + { + "auxiliary_loss_clip": 0.0112702, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.02503598, + "balance_loss_mlp": 1.03321671, + "epoch": 0.10383285735758305, + "flos": 26137551335040.0, + "grad_norm": 1.9402703290147678, + "language_loss": 0.72989267, + "learning_rate": 3.894744013056322e-06, + "loss": 0.75167036, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.9375, + "step": 1727, + "time_per_iteration": 2.416836738586426 + }, + { + "auxiliary_loss_clip": 0.01125883, + "auxiliary_loss_mlp": 0.01053341, + "balance_loss_clip": 1.02731776, + "balance_loss_mlp": 1.03287256, + "epoch": 0.10389298061025101, + "flos": 17090856904320.0, + "grad_norm": 2.1202233462362714, + "language_loss": 0.8460077, + "learning_rate": 3.894623025177772e-06, + "loss": 0.8678, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.9296875, + "step": 1728, + "time_per_iteration": 2.358876943588257 + }, + { + "auxiliary_loss_clip": 0.01125422, + "auxiliary_loss_mlp": 0.01048718, + "balance_loss_clip": 1.02230167, + "balance_loss_mlp": 1.03374028, + "epoch": 0.10395310386291898, + "flos": 20775485594880.0, + "grad_norm": 2.289753052034552, + "language_loss": 0.70360857, + "learning_rate": 3.894501969684999e-06, + "loss": 0.7253499, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.91796875, + "step": 1729, + "time_per_iteration": 2.4003336429595947 + }, + { + "auxiliary_loss_clip": 0.01123786, + "auxiliary_loss_mlp": 0.01048196, + "balance_loss_clip": 1.02238703, + "balance_loss_mlp": 1.032902, + "epoch": 0.10401322711558696, + "flos": 12819792240000.0, + "grad_norm": 2.4446926699856104, + "language_loss": 0.81571615, + "learning_rate": 3.894380846582324e-06, + "loss": 0.83743596, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.91015625, + "step": 1730, + "time_per_iteration": 2.360623598098755 + }, + { + "auxiliary_loss_clip": 0.01121029, + "auxiliary_loss_mlp": 0.0103754, + "balance_loss_clip": 1.01527131, + "balance_loss_mlp": 1.03012872, + "epoch": 0.10407335036825492, + "flos": 23183584911360.0, + "grad_norm": 1.7366924144958773, + "language_loss": 0.75999582, + "learning_rate": 3.89425965587407e-06, + "loss": 0.78158152, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.91015625, + "step": 1731, + "time_per_iteration": 2.3825974464416504 + }, + { + "auxiliary_loss_clip": 0.01120133, + "auxiliary_loss_mlp": 0.01043516, + "balance_loss_clip": 1.02133155, + "balance_loss_mlp": 1.03189421, + "epoch": 0.10413347362092289, + "flos": 26102987222400.0, + "grad_norm": 2.4071442074963687, + "language_loss": 0.82738227, + "learning_rate": 3.894138397564562e-06, + "loss": 0.84901875, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.8828125, + "step": 1732, + "time_per_iteration": 2.423374891281128 + }, + { + "auxiliary_loss_clip": 0.01122988, + "auxiliary_loss_mlp": 0.01041163, + "balance_loss_clip": 1.01872814, + "balance_loss_mlp": 1.03322685, + "epoch": 0.10419359687359087, + "flos": 12640233784320.0, + "grad_norm": 2.216770909856606, + "language_loss": 0.83156657, + "learning_rate": 3.894017071658125e-06, + "loss": 0.85320818, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.8984375, + "step": 1733, + "time_per_iteration": 2.3525197505950928 + }, + { + "auxiliary_loss_clip": 0.01125276, + "auxiliary_loss_mlp": 0.01040831, + "balance_loss_clip": 1.01758552, + "balance_loss_mlp": 1.03219247, + "epoch": 0.10425372012625883, + "flos": 12124427224320.0, + "grad_norm": 2.1742975654993333, + "language_loss": 0.76333314, + "learning_rate": 3.893895678159092e-06, + "loss": 0.78499418, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.9296875, + "step": 1734, + "time_per_iteration": 2.3762547969818115 + }, + { + "auxiliary_loss_clip": 0.01120024, + "auxiliary_loss_mlp": 0.01038903, + "balance_loss_clip": 1.01643229, + "balance_loss_mlp": 1.03039026, + "epoch": 0.1043138433789268, + "flos": 25336399299840.0, + "grad_norm": 1.7597136442224786, + "language_loss": 0.75126266, + "learning_rate": 3.8937742170717935e-06, + "loss": 0.77285194, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.8984375, + "step": 1735, + "time_per_iteration": 2.4201161861419678 + }, + { + "auxiliary_loss_clip": 0.01123744, + "auxiliary_loss_mlp": 0.01043558, + "balance_loss_clip": 1.01808333, + "balance_loss_mlp": 1.0322299, + "epoch": 0.10437396663159478, + "flos": 29165917599360.0, + "grad_norm": 1.638933424974742, + "language_loss": 0.7859149, + "learning_rate": 3.893652688400565e-06, + "loss": 0.80758798, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9140625, + "step": 1736, + "time_per_iteration": 2.5261738300323486 + }, + { + "auxiliary_loss_clip": 0.01122254, + "auxiliary_loss_mlp": 0.01055381, + "balance_loss_clip": 1.03001356, + "balance_loss_mlp": 1.03328729, + "epoch": 0.10443408988426274, + "flos": 25079822651520.0, + "grad_norm": 1.8328233590421816, + "language_loss": 0.70703518, + "learning_rate": 3.893531092149743e-06, + "loss": 0.72881156, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.890625, + "step": 1737, + "time_per_iteration": 2.435969352722168 + }, + { + "auxiliary_loss_clip": 0.01126007, + "auxiliary_loss_mlp": 0.01052773, + "balance_loss_clip": 1.02487803, + "balance_loss_mlp": 1.03007066, + "epoch": 0.1044942131369307, + "flos": 26758481598720.0, + "grad_norm": 1.781210849877685, + "language_loss": 0.81362653, + "learning_rate": 3.893409428323666e-06, + "loss": 0.83541435, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.9609375, + "step": 1738, + "time_per_iteration": 2.429069757461548 + }, + { + "auxiliary_loss_clip": 0.01124425, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.02822733, + "balance_loss_mlp": 1.03121376, + "epoch": 0.10455433638959867, + "flos": 18441576650880.0, + "grad_norm": 1.8175202610349077, + "language_loss": 0.74855512, + "learning_rate": 3.8932876969266785e-06, + "loss": 0.7703377, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.93359375, + "step": 1739, + "time_per_iteration": 2.4002676010131836 + }, + { + "auxiliary_loss_clip": 0.01121714, + "auxiliary_loss_mlp": 0.01050747, + "balance_loss_clip": 1.02702391, + "balance_loss_mlp": 1.03128552, + "epoch": 0.10461445964226665, + "flos": 23217939555840.0, + "grad_norm": 3.3126482199985987, + "language_loss": 0.77350897, + "learning_rate": 3.893165897963123e-06, + "loss": 0.79523361, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.90625, + "step": 1740, + "time_per_iteration": 2.390667676925659 + }, + { + "auxiliary_loss_clip": 0.01124322, + "auxiliary_loss_mlp": 0.01046332, + "balance_loss_clip": 1.02300251, + "balance_loss_mlp": 1.03353405, + "epoch": 0.10467458289493461, + "flos": 24344307705600.0, + "grad_norm": 2.0689127159114054, + "language_loss": 0.8588016, + "learning_rate": 3.893044031437346e-06, + "loss": 0.88050812, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.90625, + "step": 1741, + "time_per_iteration": 2.3992786407470703 + }, + { + "auxiliary_loss_clip": 0.01124591, + "auxiliary_loss_mlp": 0.0104551, + "balance_loss_clip": 1.01955867, + "balance_loss_mlp": 1.03237677, + "epoch": 0.10473470614760258, + "flos": 21286893323520.0, + "grad_norm": 2.532080922859773, + "language_loss": 0.75275385, + "learning_rate": 3.892922097353697e-06, + "loss": 0.77445483, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.921875, + "step": 1742, + "time_per_iteration": 2.376932382583618 + }, + { + "auxiliary_loss_clip": 0.01124536, + "auxiliary_loss_mlp": 0.01048128, + "balance_loss_clip": 1.02493, + "balance_loss_mlp": 1.03433001, + "epoch": 0.10479482940027056, + "flos": 21686195076480.0, + "grad_norm": 1.9615184924185378, + "language_loss": 0.86979914, + "learning_rate": 3.8928000957165275e-06, + "loss": 0.89152575, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.90234375, + "step": 1743, + "time_per_iteration": 2.388972520828247 + }, + { + "auxiliary_loss_clip": 0.01121899, + "auxiliary_loss_mlp": 0.01047988, + "balance_loss_clip": 1.02152383, + "balance_loss_mlp": 1.0318445, + "epoch": 0.10485495265293852, + "flos": 21572797380480.0, + "grad_norm": 1.980346536726369, + "language_loss": 0.75399542, + "learning_rate": 3.8926780265301915e-06, + "loss": 0.77569425, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.90234375, + "step": 1744, + "time_per_iteration": 2.3914849758148193 + }, + { + "auxiliary_loss_clip": 0.01124598, + "auxiliary_loss_mlp": 0.01050524, + "balance_loss_clip": 1.02765965, + "balance_loss_mlp": 1.03332758, + "epoch": 0.10491507590560649, + "flos": 37960399301760.0, + "grad_norm": 1.8428502923857146, + "language_loss": 0.78735441, + "learning_rate": 3.8925558897990445e-06, + "loss": 0.80910563, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.9140625, + "step": 1745, + "time_per_iteration": 2.5292773246765137 + }, + { + "auxiliary_loss_clip": 0.01122432, + "auxiliary_loss_mlp": 0.01057055, + "balance_loss_clip": 1.03259289, + "balance_loss_mlp": 1.0313642, + "epoch": 0.10497519915827447, + "flos": 26395070590080.0, + "grad_norm": 2.7675134126447194, + "language_loss": 0.82449567, + "learning_rate": 3.892433685527447e-06, + "loss": 0.84629059, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.91015625, + "step": 1746, + "time_per_iteration": 2.4199109077453613 + }, + { + "auxiliary_loss_clip": 0.0112421, + "auxiliary_loss_mlp": 0.0105103, + "balance_loss_clip": 1.02669919, + "balance_loss_mlp": 1.03329909, + "epoch": 0.10503532241094243, + "flos": 40660581985920.0, + "grad_norm": 1.6092919705029667, + "language_loss": 0.69958377, + "learning_rate": 3.892311413719759e-06, + "loss": 0.72133613, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.90625, + "step": 1747, + "time_per_iteration": 2.571589708328247 + }, + { + "auxiliary_loss_clip": 0.01128556, + "auxiliary_loss_mlp": 0.01052369, + "balance_loss_clip": 1.02750218, + "balance_loss_mlp": 1.03342628, + "epoch": 0.1050954456636104, + "flos": 29788104672000.0, + "grad_norm": 2.2964050379853744, + "language_loss": 0.84260982, + "learning_rate": 3.892189074380345e-06, + "loss": 0.86441904, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.953125, + "step": 1748, + "time_per_iteration": 2.4443163871765137 + }, + { + "auxiliary_loss_clip": 0.01117316, + "auxiliary_loss_mlp": 0.01042685, + "balance_loss_clip": 1.01779461, + "balance_loss_mlp": 1.02924657, + "epoch": 0.10515556891627838, + "flos": 23947694127360.0, + "grad_norm": 2.0563322181054393, + "language_loss": 0.71392345, + "learning_rate": 3.892066667513569e-06, + "loss": 0.73552346, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.8828125, + "step": 1749, + "time_per_iteration": 2.41198468208313 + }, + { + "auxiliary_loss_clip": 0.01118943, + "auxiliary_loss_mlp": 0.01046165, + "balance_loss_clip": 1.02150106, + "balance_loss_mlp": 1.03067636, + "epoch": 0.10521569216894634, + "flos": 18258631793280.0, + "grad_norm": 2.09474354965328, + "language_loss": 0.81900769, + "learning_rate": 3.891944193123801e-06, + "loss": 0.84065878, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.8828125, + "step": 1750, + "time_per_iteration": 2.3500618934631348 + }, + { + "auxiliary_loss_clip": 0.01127944, + "auxiliary_loss_mlp": 0.01055871, + "balance_loss_clip": 1.03163528, + "balance_loss_mlp": 1.03505528, + "epoch": 0.10527581542161431, + "flos": 15630056040960.0, + "grad_norm": 2.155919689446535, + "language_loss": 0.92280161, + "learning_rate": 3.891821651215411e-06, + "loss": 0.9446398, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.9296875, + "step": 1751, + "time_per_iteration": 2.38798189163208 + }, + { + "auxiliary_loss_clip": 0.01121446, + "auxiliary_loss_mlp": 0.01042405, + "balance_loss_clip": 1.01945722, + "balance_loss_mlp": 1.03214347, + "epoch": 0.10533593867428227, + "flos": 18295569878400.0, + "grad_norm": 3.1524830188228834, + "language_loss": 0.78899848, + "learning_rate": 3.8916990417927735e-06, + "loss": 0.810637, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.89453125, + "step": 1752, + "time_per_iteration": 2.3802599906921387 + }, + { + "auxiliary_loss_clip": 0.01121154, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.0206542, + "balance_loss_mlp": 1.03210664, + "epoch": 0.10539606192695025, + "flos": 29021935685760.0, + "grad_norm": 1.8613766788519057, + "language_loss": 0.75671118, + "learning_rate": 3.891576364860262e-06, + "loss": 0.77837402, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.890625, + "step": 1753, + "time_per_iteration": 3.81923508644104 + }, + { + "auxiliary_loss_clip": 0.01124279, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.02843332, + "balance_loss_mlp": 1.03089023, + "epoch": 0.10545618517961822, + "flos": 19968433539840.0, + "grad_norm": 1.8995117140353865, + "language_loss": 0.83522022, + "learning_rate": 3.891453620422258e-06, + "loss": 0.85699737, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.93359375, + "step": 1754, + "time_per_iteration": 3.7905337810516357 + }, + { + "auxiliary_loss_clip": 0.01128729, + "auxiliary_loss_mlp": 0.01050098, + "balance_loss_clip": 1.0233947, + "balance_loss_mlp": 1.03546464, + "epoch": 0.10551630843228618, + "flos": 16142511110400.0, + "grad_norm": 2.7992506175283154, + "language_loss": 0.6898886, + "learning_rate": 3.891330808483137e-06, + "loss": 0.71167684, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.93359375, + "step": 1755, + "time_per_iteration": 2.345918655395508 + }, + { + "auxiliary_loss_clip": 0.01127338, + "auxiliary_loss_mlp": 0.01054693, + "balance_loss_clip": 1.0284667, + "balance_loss_mlp": 1.03414655, + "epoch": 0.10557643168495416, + "flos": 23439009484800.0, + "grad_norm": 2.095319590645789, + "language_loss": 0.76325703, + "learning_rate": 3.891207929047286e-06, + "loss": 0.78507727, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9296875, + "step": 1756, + "time_per_iteration": 2.4101569652557373 + }, + { + "auxiliary_loss_clip": 0.01124114, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.02310705, + "balance_loss_mlp": 1.03179097, + "epoch": 0.10563655493762213, + "flos": 21797951938560.0, + "grad_norm": 1.7351198219289925, + "language_loss": 0.79872441, + "learning_rate": 3.8910849821190884e-06, + "loss": 0.82043695, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.921875, + "step": 1757, + "time_per_iteration": 3.8162782192230225 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.01044144, + "balance_loss_clip": 1.01888347, + "balance_loss_mlp": 1.03206515, + "epoch": 0.10569667819029009, + "flos": 53798782625280.0, + "grad_norm": 1.5596724518209486, + "language_loss": 0.78979349, + "learning_rate": 3.890961967702933e-06, + "loss": 0.81147099, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.9140625, + "step": 1758, + "time_per_iteration": 2.692870616912842 + }, + { + "auxiliary_loss_clip": 0.01127386, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.01633477, + "balance_loss_mlp": 1.03447723, + "epoch": 0.10575680144295807, + "flos": 22924529556480.0, + "grad_norm": 1.7044128071529396, + "language_loss": 0.91619074, + "learning_rate": 3.890838885803208e-06, + "loss": 0.93787271, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.9296875, + "step": 1759, + "time_per_iteration": 2.395379066467285 + }, + { + "auxiliary_loss_clip": 0.01126309, + "auxiliary_loss_mlp": 0.01055059, + "balance_loss_clip": 1.02841604, + "balance_loss_mlp": 1.0312767, + "epoch": 0.10581692469562604, + "flos": 14135808228480.0, + "grad_norm": 1.9812907161966353, + "language_loss": 0.77218324, + "learning_rate": 3.890715736424307e-06, + "loss": 0.79399687, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.953125, + "step": 1760, + "time_per_iteration": 2.3628458976745605 + }, + { + "auxiliary_loss_clip": 0.01124844, + "auxiliary_loss_mlp": 0.01051971, + "balance_loss_clip": 1.0255419, + "balance_loss_mlp": 1.03168297, + "epoch": 0.105877047948294, + "flos": 25957469208960.0, + "grad_norm": 3.0718633757371125, + "language_loss": 0.8935079, + "learning_rate": 3.890592519570626e-06, + "loss": 0.91527599, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9296875, + "step": 1761, + "time_per_iteration": 2.4184231758117676 + }, + { + "auxiliary_loss_clip": 0.01125491, + "auxiliary_loss_mlp": 0.01053337, + "balance_loss_clip": 1.02795768, + "balance_loss_mlp": 1.03289485, + "epoch": 0.10593717120096197, + "flos": 30663447079680.0, + "grad_norm": 2.260293059938474, + "language_loss": 0.7777102, + "learning_rate": 3.89046923524656e-06, + "loss": 0.79949844, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.92578125, + "step": 1762, + "time_per_iteration": 2.473712205886841 + }, + { + "auxiliary_loss_clip": 0.01029718, + "auxiliary_loss_mlp": 0.01002481, + "balance_loss_clip": 0.99895269, + "balance_loss_mlp": 1.00415659, + "epoch": 0.10599729445362994, + "flos": 66432905055360.0, + "grad_norm": 0.7567652517232661, + "language_loss": 0.60488772, + "learning_rate": 3.8903458834565105e-06, + "loss": 0.62520969, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.25585938, + "step": 1763, + "time_per_iteration": 3.18863844871521 + }, + { + "auxiliary_loss_clip": 0.0112237, + "auxiliary_loss_mlp": 0.01042503, + "balance_loss_clip": 1.01810062, + "balance_loss_mlp": 1.03152835, + "epoch": 0.10605741770629791, + "flos": 23947135545600.0, + "grad_norm": 1.7391662287955905, + "language_loss": 0.79645944, + "learning_rate": 3.890222464204879e-06, + "loss": 0.8181082, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.90625, + "step": 1764, + "time_per_iteration": 2.401545524597168 + }, + { + "auxiliary_loss_clip": 0.01123566, + "auxiliary_loss_mlp": 0.01048595, + "balance_loss_clip": 1.02396607, + "balance_loss_mlp": 1.03268123, + "epoch": 0.10611754095896588, + "flos": 19386605865600.0, + "grad_norm": 1.916558263318626, + "language_loss": 0.80186951, + "learning_rate": 3.89009897749607e-06, + "loss": 0.82359111, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.91015625, + "step": 1765, + "time_per_iteration": 2.383697748184204 + }, + { + "auxiliary_loss_clip": 0.01120346, + "auxiliary_loss_mlp": 0.01050923, + "balance_loss_clip": 1.02638936, + "balance_loss_mlp": 1.03064609, + "epoch": 0.10617766421163385, + "flos": 22236635571840.0, + "grad_norm": 1.840882934516297, + "language_loss": 0.76780617, + "learning_rate": 3.88997542333449e-06, + "loss": 0.78951889, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.8984375, + "step": 1766, + "time_per_iteration": 2.376946210861206 + }, + { + "auxiliary_loss_clip": 0.01125036, + "auxiliary_loss_mlp": 0.01054262, + "balance_loss_clip": 1.02844131, + "balance_loss_mlp": 1.03260565, + "epoch": 0.10623778746430182, + "flos": 28403100103680.0, + "grad_norm": 1.5994413753068162, + "language_loss": 0.77417314, + "learning_rate": 3.889851801724549e-06, + "loss": 0.79596615, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.92578125, + "step": 1767, + "time_per_iteration": 2.4704442024230957 + }, + { + "auxiliary_loss_clip": 0.01028991, + "auxiliary_loss_mlp": 0.0100806, + "balance_loss_clip": 1.00436473, + "balance_loss_mlp": 1.00339031, + "epoch": 0.10629791071696978, + "flos": 64231282719360.0, + "grad_norm": 0.6754453633366562, + "language_loss": 0.57893264, + "learning_rate": 3.889728112670658e-06, + "loss": 0.59930313, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.03686523, + "router_z_loss_mlp": 0.25585938, + "step": 1768, + "time_per_iteration": 3.1264593601226807 + }, + { + "auxiliary_loss_clip": 0.01125592, + "auxiliary_loss_mlp": 0.01039103, + "balance_loss_clip": 1.01608372, + "balance_loss_mlp": 1.03276598, + "epoch": 0.10635803396963776, + "flos": 22746472289280.0, + "grad_norm": 1.3944339810139335, + "language_loss": 0.86446828, + "learning_rate": 3.8896043561772325e-06, + "loss": 0.88611525, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.9296875, + "step": 1769, + "time_per_iteration": 2.437068462371826 + }, + { + "auxiliary_loss_clip": 0.0112854, + "auxiliary_loss_mlp": 0.01050713, + "balance_loss_clip": 1.02455854, + "balance_loss_mlp": 1.0353334, + "epoch": 0.10641815722230573, + "flos": 31394214080640.0, + "grad_norm": 2.81559546028732, + "language_loss": 0.61949551, + "learning_rate": 3.889480532248688e-06, + "loss": 0.64128804, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.9296875, + "step": 1770, + "time_per_iteration": 2.472952365875244 + }, + { + "auxiliary_loss_clip": 0.01029669, + "auxiliary_loss_mlp": 0.01003379, + "balance_loss_clip": 0.99985087, + "balance_loss_mlp": 1.00370085, + "epoch": 0.1064782804749737, + "flos": 58550077307520.0, + "grad_norm": 1.1358638098678222, + "language_loss": 0.57037234, + "learning_rate": 3.889356640889444e-06, + "loss": 0.59070289, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.25976562, + "step": 1771, + "time_per_iteration": 2.942660093307495 + }, + { + "auxiliary_loss_clip": 0.01125322, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.03383589, + "epoch": 0.10653840372764166, + "flos": 23986691982720.0, + "grad_norm": 1.6196900827448717, + "language_loss": 0.88175607, + "learning_rate": 3.8892326821039205e-06, + "loss": 0.9035961, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.9140625, + "step": 1772, + "time_per_iteration": 2.4082319736480713 + }, + { + "auxiliary_loss_clip": 0.0112954, + "auxiliary_loss_mlp": 0.01046109, + "balance_loss_clip": 1.01948977, + "balance_loss_mlp": 1.03332782, + "epoch": 0.10659852698030964, + "flos": 18293719576320.0, + "grad_norm": 3.5867802309173427, + "language_loss": 0.7572273, + "learning_rate": 3.889108655896542e-06, + "loss": 0.77898383, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.96484375, + "step": 1773, + "time_per_iteration": 2.360600709915161 + }, + { + "auxiliary_loss_clip": 0.01127654, + "auxiliary_loss_mlp": 0.01051195, + "balance_loss_clip": 1.02645874, + "balance_loss_mlp": 1.03547025, + "epoch": 0.1066586502329776, + "flos": 32159230992000.0, + "grad_norm": 1.8622521371175176, + "language_loss": 0.82763404, + "learning_rate": 3.888984562271736e-06, + "loss": 0.84942257, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.921875, + "step": 1774, + "time_per_iteration": 2.489536762237549 + }, + { + "auxiliary_loss_clip": 0.01129712, + "auxiliary_loss_mlp": 0.01051944, + "balance_loss_clip": 1.02571797, + "balance_loss_mlp": 1.03411245, + "epoch": 0.10671877348564557, + "flos": 17784197061120.0, + "grad_norm": 2.2850490467324875, + "language_loss": 0.76627076, + "learning_rate": 3.888860401233929e-06, + "loss": 0.78808731, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.95703125, + "step": 1775, + "time_per_iteration": 2.3686110973358154 + }, + { + "auxiliary_loss_clip": 0.01029928, + "auxiliary_loss_mlp": 0.01003173, + "balance_loss_clip": 0.99997795, + "balance_loss_mlp": 1.00443852, + "epoch": 0.10677889673831355, + "flos": 63506695029120.0, + "grad_norm": 0.8170942420256979, + "language_loss": 0.57425374, + "learning_rate": 3.8887361727875535e-06, + "loss": 0.5945847, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.03198242, + "router_z_loss_mlp": 0.25390625, + "step": 1776, + "time_per_iteration": 3.05979585647583 + }, + { + "auxiliary_loss_clip": 0.01126727, + "auxiliary_loss_mlp": 0.010466, + "balance_loss_clip": 1.02071953, + "balance_loss_mlp": 1.03543973, + "epoch": 0.10683901999098151, + "flos": 22016612983680.0, + "grad_norm": 1.5249944085465545, + "language_loss": 0.8304826, + "learning_rate": 3.888611876937043e-06, + "loss": 0.85221589, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.9140625, + "step": 1777, + "time_per_iteration": 2.4524457454681396 + }, + { + "auxiliary_loss_clip": 0.01125564, + "auxiliary_loss_mlp": 0.01047993, + "balance_loss_clip": 1.02347136, + "balance_loss_mlp": 1.03583741, + "epoch": 0.10689914324364948, + "flos": 25041872136960.0, + "grad_norm": 3.6853021162900017, + "language_loss": 0.87512541, + "learning_rate": 3.888487513686832e-06, + "loss": 0.89686102, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.8984375, + "step": 1778, + "time_per_iteration": 2.4289801120758057 + }, + { + "auxiliary_loss_clip": 0.01129984, + "auxiliary_loss_mlp": 0.01050385, + "balance_loss_clip": 1.024683, + "balance_loss_mlp": 1.0366596, + "epoch": 0.10695926649631746, + "flos": 16434210453120.0, + "grad_norm": 1.9132762909614143, + "language_loss": 0.84370452, + "learning_rate": 3.88836308304136e-06, + "loss": 0.8655082, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.93359375, + "step": 1779, + "time_per_iteration": 2.385535955429077 + }, + { + "auxiliary_loss_clip": 0.01121678, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.02161276, + "balance_loss_mlp": 1.03248048, + "epoch": 0.10701938974898542, + "flos": 16978366903680.0, + "grad_norm": 1.940612983001018, + "language_loss": 0.67382878, + "learning_rate": 3.888238585005066e-06, + "loss": 0.69549918, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.890625, + "step": 1780, + "time_per_iteration": 2.3779759407043457 + }, + { + "auxiliary_loss_clip": 0.01124395, + "auxiliary_loss_mlp": 0.01048141, + "balance_loss_clip": 1.02261877, + "balance_loss_mlp": 1.03375697, + "epoch": 0.10707951300165339, + "flos": 15887191271040.0, + "grad_norm": 2.120473397927048, + "language_loss": 0.91888499, + "learning_rate": 3.888114019582395e-06, + "loss": 0.94061041, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.90625, + "step": 1781, + "time_per_iteration": 2.366016387939453 + }, + { + "auxiliary_loss_clip": 0.01125598, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.02105403, + "balance_loss_mlp": 1.03395641, + "epoch": 0.10713963625432135, + "flos": 14246273370240.0, + "grad_norm": 1.961229098536527, + "language_loss": 0.79416013, + "learning_rate": 3.887989386777791e-06, + "loss": 0.81588423, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.91796875, + "step": 1782, + "time_per_iteration": 2.359746217727661 + }, + { + "auxiliary_loss_clip": 0.01126018, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_clip": 1.02292371, + "balance_loss_mlp": 1.03313398, + "epoch": 0.10719975950698933, + "flos": 16756040165760.0, + "grad_norm": 2.134353445152127, + "language_loss": 0.78729677, + "learning_rate": 3.887864686595703e-06, + "loss": 0.80903149, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.9296875, + "step": 1783, + "time_per_iteration": 2.3823351860046387 + }, + { + "auxiliary_loss_clip": 0.01127351, + "auxiliary_loss_mlp": 0.01051587, + "balance_loss_clip": 1.02654111, + "balance_loss_mlp": 1.03401184, + "epoch": 0.1072598827596573, + "flos": 22709534204160.0, + "grad_norm": 1.9319803999437355, + "language_loss": 0.86656928, + "learning_rate": 3.887739919040579e-06, + "loss": 0.88835871, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.93359375, + "step": 1784, + "time_per_iteration": 2.3890769481658936 + }, + { + "auxiliary_loss_clip": 0.01128957, + "auxiliary_loss_mlp": 0.01048849, + "balance_loss_clip": 1.02149022, + "balance_loss_mlp": 1.03402746, + "epoch": 0.10732000601232526, + "flos": 23257146879360.0, + "grad_norm": 2.588319712177381, + "language_loss": 1.0069952, + "learning_rate": 3.887615084116874e-06, + "loss": 1.02877331, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.94921875, + "step": 1785, + "time_per_iteration": 2.4029221534729004 + }, + { + "auxiliary_loss_clip": 0.01123074, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_clip": 1.02413082, + "balance_loss_mlp": 1.0350312, + "epoch": 0.10738012926499324, + "flos": 24205911609600.0, + "grad_norm": 1.3298221024401562, + "language_loss": 0.84858882, + "learning_rate": 3.887490181829042e-06, + "loss": 0.87030399, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.8828125, + "step": 1786, + "time_per_iteration": 2.4200692176818848 + }, + { + "auxiliary_loss_clip": 0.0112323, + "auxiliary_loss_mlp": 0.01046963, + "balance_loss_clip": 1.01944935, + "balance_loss_mlp": 1.03104043, + "epoch": 0.1074402525176612, + "flos": 20922016037760.0, + "grad_norm": 1.716800751345172, + "language_loss": 0.78385222, + "learning_rate": 3.887365212181542e-06, + "loss": 0.80555415, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.921875, + "step": 1787, + "time_per_iteration": 2.38824462890625 + }, + { + "auxiliary_loss_clip": 0.01128656, + "auxiliary_loss_mlp": 0.01050491, + "balance_loss_clip": 1.02328789, + "balance_loss_mlp": 1.03389168, + "epoch": 0.10750037577032917, + "flos": 16945967295360.0, + "grad_norm": 1.7573360887890106, + "language_loss": 0.82472336, + "learning_rate": 3.88724017517883e-06, + "loss": 0.84651482, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.9453125, + "step": 1788, + "time_per_iteration": 2.3621251583099365 + }, + { + "auxiliary_loss_clip": 0.01123994, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_clip": 1.02641988, + "balance_loss_mlp": 1.03294051, + "epoch": 0.10756049902299715, + "flos": 20265509232000.0, + "grad_norm": 1.838983977324372, + "language_loss": 0.78195995, + "learning_rate": 3.887115070825373e-06, + "loss": 0.8037073, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.91015625, + "step": 1789, + "time_per_iteration": 2.3786821365356445 + }, + { + "auxiliary_loss_clip": 0.01126292, + "auxiliary_loss_mlp": 0.01060179, + "balance_loss_clip": 1.03047156, + "balance_loss_mlp": 1.0334549, + "epoch": 0.10762062227566511, + "flos": 23585400282240.0, + "grad_norm": 2.6683776947526225, + "language_loss": 0.745278, + "learning_rate": 3.886989899125632e-06, + "loss": 0.76714271, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.9296875, + "step": 1790, + "time_per_iteration": 2.4150373935699463 + }, + { + "auxiliary_loss_clip": 0.01127052, + "auxiliary_loss_mlp": 0.01061173, + "balance_loss_clip": 1.03416061, + "balance_loss_mlp": 1.03454113, + "epoch": 0.10768074552833308, + "flos": 24309638858880.0, + "grad_norm": 2.0808194179309796, + "language_loss": 0.78436476, + "learning_rate": 3.886864660084075e-06, + "loss": 0.806247, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.92578125, + "step": 1791, + "time_per_iteration": 2.408830404281616 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.0104741, + "balance_loss_clip": 1.02353215, + "balance_loss_mlp": 1.03260255, + "epoch": 0.10774086878100106, + "flos": 25298832810240.0, + "grad_norm": 1.8902431432210107, + "language_loss": 0.70625722, + "learning_rate": 3.886739353705173e-06, + "loss": 0.72792351, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.8671875, + "step": 1792, + "time_per_iteration": 3.9579362869262695 + }, + { + "auxiliary_loss_clip": 0.01125953, + "auxiliary_loss_mlp": 0.01043045, + "balance_loss_clip": 1.0159843, + "balance_loss_mlp": 1.03193712, + "epoch": 0.10780099203366902, + "flos": 22052957575680.0, + "grad_norm": 1.8560698854611348, + "language_loss": 0.75875032, + "learning_rate": 3.886613979993396e-06, + "loss": 0.78044033, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.9375, + "step": 1793, + "time_per_iteration": 2.3957271575927734 + }, + { + "auxiliary_loss_clip": 0.01124997, + "auxiliary_loss_mlp": 0.01057005, + "balance_loss_clip": 1.0321852, + "balance_loss_mlp": 1.03507137, + "epoch": 0.10786111528633699, + "flos": 22746367555200.0, + "grad_norm": 1.5432225075588661, + "language_loss": 0.85082167, + "learning_rate": 3.886488538953219e-06, + "loss": 0.87264168, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.8984375, + "step": 1794, + "time_per_iteration": 3.90265154838562 + }, + { + "auxiliary_loss_clip": 0.01129109, + "auxiliary_loss_mlp": 0.01044994, + "balance_loss_clip": 1.01956701, + "balance_loss_mlp": 1.0332222, + "epoch": 0.10792123853900495, + "flos": 20849990169600.0, + "grad_norm": 1.9717999863669853, + "language_loss": 0.7450695, + "learning_rate": 3.8863630305891196e-06, + "loss": 0.76681054, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9609375, + "step": 1795, + "time_per_iteration": 2.3823306560516357 + }, + { + "auxiliary_loss_clip": 0.01127177, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.02110219, + "balance_loss_mlp": 1.03276324, + "epoch": 0.10798136179167293, + "flos": 17747747735040.0, + "grad_norm": 2.4639211054087333, + "language_loss": 0.81476229, + "learning_rate": 3.8862374549055755e-06, + "loss": 0.83652425, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.9453125, + "step": 1796, + "time_per_iteration": 3.7188520431518555 + }, + { + "auxiliary_loss_clip": 0.01128867, + "auxiliary_loss_mlp": 0.01058443, + "balance_loss_clip": 1.03150129, + "balance_loss_mlp": 1.03387094, + "epoch": 0.1080414850443409, + "flos": 13588789046400.0, + "grad_norm": 2.2364337055104055, + "language_loss": 0.73790944, + "learning_rate": 3.886111811907069e-06, + "loss": 0.75978255, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.94921875, + "step": 1797, + "time_per_iteration": 3.7533183097839355 + }, + { + "auxiliary_loss_clip": 0.01125669, + "auxiliary_loss_mlp": 0.01046034, + "balance_loss_clip": 1.02181053, + "balance_loss_mlp": 1.03294826, + "epoch": 0.10810160829700886, + "flos": 16252487493120.0, + "grad_norm": 2.129713269732791, + "language_loss": 0.81432426, + "learning_rate": 3.885986101598082e-06, + "loss": 0.83604133, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.9296875, + "step": 1798, + "time_per_iteration": 2.378171443939209 + }, + { + "auxiliary_loss_clip": 0.01125419, + "auxiliary_loss_mlp": 0.01047582, + "balance_loss_clip": 1.02235758, + "balance_loss_mlp": 1.0329802, + "epoch": 0.10816173154967684, + "flos": 15157122497280.0, + "grad_norm": 2.29817641696348, + "language_loss": 0.85187292, + "learning_rate": 3.885860323983104e-06, + "loss": 0.87360299, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.92578125, + "step": 1799, + "time_per_iteration": 2.3468503952026367 + }, + { + "auxiliary_loss_clip": 0.01122342, + "auxiliary_loss_mlp": 0.01054019, + "balance_loss_clip": 1.02958083, + "balance_loss_mlp": 1.03431249, + "epoch": 0.10822185480234481, + "flos": 17784371617920.0, + "grad_norm": 1.871905884018768, + "language_loss": 0.76835096, + "learning_rate": 3.885734479066622e-06, + "loss": 0.79011458, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.8828125, + "step": 1800, + "time_per_iteration": 2.383742094039917 + }, + { + "auxiliary_loss_clip": 0.01119787, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.0178442, + "balance_loss_mlp": 1.03130066, + "epoch": 0.10828197805501277, + "flos": 25555479281280.0, + "grad_norm": 1.5186469739563766, + "language_loss": 0.7293545, + "learning_rate": 3.885608566853126e-06, + "loss": 0.75096971, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.88671875, + "step": 1801, + "time_per_iteration": 2.4169530868530273 + }, + { + "auxiliary_loss_clip": 0.01129902, + "auxiliary_loss_mlp": 0.01046469, + "balance_loss_clip": 1.02205503, + "balance_loss_mlp": 1.03348994, + "epoch": 0.10834210130768075, + "flos": 28983217121280.0, + "grad_norm": 1.8508048354535698, + "language_loss": 0.65805316, + "learning_rate": 3.8854825873471115e-06, + "loss": 0.67981684, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.9609375, + "step": 1802, + "time_per_iteration": 2.4483463764190674 + }, + { + "auxiliary_loss_clip": 0.01123685, + "auxiliary_loss_mlp": 0.01052333, + "balance_loss_clip": 1.02596378, + "balance_loss_mlp": 1.03211331, + "epoch": 0.10840222456034872, + "flos": 20263239993600.0, + "grad_norm": 3.1106185134885775, + "language_loss": 0.81412292, + "learning_rate": 3.885356540553073e-06, + "loss": 0.83588308, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9140625, + "step": 1803, + "time_per_iteration": 2.37387752532959 + }, + { + "auxiliary_loss_clip": 0.01120541, + "auxiliary_loss_mlp": 0.01048911, + "balance_loss_clip": 1.02432966, + "balance_loss_mlp": 1.03114367, + "epoch": 0.10846234781301668, + "flos": 19862087938560.0, + "grad_norm": 1.572617341333409, + "language_loss": 0.91127855, + "learning_rate": 3.88523042647551e-06, + "loss": 0.93297303, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.89453125, + "step": 1804, + "time_per_iteration": 2.3934566974639893 + }, + { + "auxiliary_loss_clip": 0.01123549, + "auxiliary_loss_mlp": 0.0104879, + "balance_loss_clip": 1.02301693, + "balance_loss_mlp": 1.03132033, + "epoch": 0.10852247106568465, + "flos": 26467829596800.0, + "grad_norm": 2.1018894025612136, + "language_loss": 0.76497591, + "learning_rate": 3.885104245118921e-06, + "loss": 0.78669924, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.921875, + "step": 1805, + "time_per_iteration": 2.423750638961792 + }, + { + "auxiliary_loss_clip": 0.01120164, + "auxiliary_loss_mlp": 0.01040732, + "balance_loss_clip": 1.01696146, + "balance_loss_mlp": 1.03138614, + "epoch": 0.10858259431835263, + "flos": 30080188039680.0, + "grad_norm": 1.9556052539861366, + "language_loss": 0.86247486, + "learning_rate": 3.8849779964878125e-06, + "loss": 0.88408375, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.890625, + "step": 1806, + "time_per_iteration": 2.451107978820801 + }, + { + "auxiliary_loss_clip": 0.01122921, + "auxiliary_loss_mlp": 0.01049866, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.03105104, + "epoch": 0.10864271757102059, + "flos": 19062157800960.0, + "grad_norm": 3.0931004384736887, + "language_loss": 0.81229842, + "learning_rate": 3.884851680586687e-06, + "loss": 0.83402628, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.91796875, + "step": 1807, + "time_per_iteration": 2.3664300441741943 + }, + { + "auxiliary_loss_clip": 0.01120947, + "auxiliary_loss_mlp": 0.01043449, + "balance_loss_clip": 1.020895, + "balance_loss_mlp": 1.03195763, + "epoch": 0.10870284082368856, + "flos": 24713967847680.0, + "grad_norm": 1.8282252096160712, + "language_loss": 0.7888785, + "learning_rate": 3.884725297420053e-06, + "loss": 0.81052244, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.890625, + "step": 1808, + "time_per_iteration": 2.4256677627563477 + }, + { + "auxiliary_loss_clip": 0.01125638, + "auxiliary_loss_mlp": 0.01046616, + "balance_loss_clip": 1.02194023, + "balance_loss_mlp": 1.03399539, + "epoch": 0.10876296407635654, + "flos": 20626720824960.0, + "grad_norm": 1.7326907662304973, + "language_loss": 0.79196876, + "learning_rate": 3.884598846992422e-06, + "loss": 0.81369132, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.9140625, + "step": 1809, + "time_per_iteration": 2.39125394821167 + }, + { + "auxiliary_loss_clip": 0.01121593, + "auxiliary_loss_mlp": 0.01045871, + "balance_loss_clip": 1.0217669, + "balance_loss_mlp": 1.03222513, + "epoch": 0.1088230873290245, + "flos": 21578767223040.0, + "grad_norm": 1.9900982360126007, + "language_loss": 0.84929574, + "learning_rate": 3.884472329308306e-06, + "loss": 0.87097037, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.89453125, + "step": 1810, + "time_per_iteration": 2.3882687091827393 + }, + { + "auxiliary_loss_clip": 0.01127982, + "auxiliary_loss_mlp": 0.01052776, + "balance_loss_clip": 1.02653766, + "balance_loss_mlp": 1.03371501, + "epoch": 0.10888321058169247, + "flos": 26467829596800.0, + "grad_norm": 2.0813929656714656, + "language_loss": 0.63652569, + "learning_rate": 3.8843457443722195e-06, + "loss": 0.6583333, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.94140625, + "step": 1811, + "time_per_iteration": 2.4178736209869385 + }, + { + "auxiliary_loss_clip": 0.01122244, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.02147138, + "balance_loss_mlp": 1.03190422, + "epoch": 0.10894333383436045, + "flos": 25847423003520.0, + "grad_norm": 2.200120322991929, + "language_loss": 0.74163443, + "learning_rate": 3.884219092188681e-06, + "loss": 0.76331341, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.90625, + "step": 1812, + "time_per_iteration": 2.4389612674713135 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.02191472, + "balance_loss_mlp": 1.03145349, + "epoch": 0.10900345708702841, + "flos": 19536068862720.0, + "grad_norm": 1.67541571038205, + "language_loss": 0.72409236, + "learning_rate": 3.884092372762209e-06, + "loss": 0.74577165, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.91015625, + "step": 1813, + "time_per_iteration": 2.4204447269439697 + }, + { + "auxiliary_loss_clip": 0.01120348, + "auxiliary_loss_mlp": 0.01047142, + "balance_loss_clip": 1.02408659, + "balance_loss_mlp": 1.03367043, + "epoch": 0.10906358033969638, + "flos": 23622163810560.0, + "grad_norm": 1.8415215996899577, + "language_loss": 0.82487369, + "learning_rate": 3.883965586097327e-06, + "loss": 0.84654868, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8671875, + "step": 1814, + "time_per_iteration": 2.419451951980591 + }, + { + "auxiliary_loss_clip": 0.01123526, + "auxiliary_loss_mlp": 0.0104773, + "balance_loss_clip": 1.02416265, + "balance_loss_mlp": 1.03223526, + "epoch": 0.10912370359236434, + "flos": 21213680469120.0, + "grad_norm": 4.420405023215869, + "language_loss": 0.84061807, + "learning_rate": 3.88383873219856e-06, + "loss": 0.86233068, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.9140625, + "step": 1815, + "time_per_iteration": 2.373497247695923 + }, + { + "auxiliary_loss_clip": 0.0112164, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.01982379, + "balance_loss_mlp": 1.03311396, + "epoch": 0.10918382684503232, + "flos": 13552339720320.0, + "grad_norm": 4.895839077195346, + "language_loss": 0.71816218, + "learning_rate": 3.8837118110704345e-06, + "loss": 0.73982519, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.88671875, + "step": 1816, + "time_per_iteration": 2.3697516918182373 + }, + { + "auxiliary_loss_clip": 0.01126296, + "auxiliary_loss_mlp": 0.01053606, + "balance_loss_clip": 1.02730894, + "balance_loss_mlp": 1.03459978, + "epoch": 0.10924395009770028, + "flos": 27963089838720.0, + "grad_norm": 2.297323944015786, + "language_loss": 0.72977591, + "learning_rate": 3.88358482271748e-06, + "loss": 0.75157493, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.91796875, + "step": 1817, + "time_per_iteration": 2.427844524383545 + }, + { + "auxiliary_loss_clip": 0.01123589, + "auxiliary_loss_mlp": 0.01046912, + "balance_loss_clip": 1.02059031, + "balance_loss_mlp": 1.0316056, + "epoch": 0.10930407335036825, + "flos": 25592557011840.0, + "grad_norm": 1.665588613083356, + "language_loss": 0.67563391, + "learning_rate": 3.883457767144228e-06, + "loss": 0.69733888, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.921875, + "step": 1818, + "time_per_iteration": 2.426520586013794 + }, + { + "auxiliary_loss_clip": 0.01123798, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.02620637, + "balance_loss_mlp": 1.03213441, + "epoch": 0.10936419660303623, + "flos": 18405197147520.0, + "grad_norm": 2.4183236947991564, + "language_loss": 0.73805034, + "learning_rate": 3.883330644355212e-06, + "loss": 0.75980365, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.9140625, + "step": 1819, + "time_per_iteration": 2.3530497550964355 + }, + { + "auxiliary_loss_clip": 0.01124492, + "auxiliary_loss_mlp": 0.01050427, + "balance_loss_clip": 1.02703798, + "balance_loss_mlp": 1.03328323, + "epoch": 0.1094243198557042, + "flos": 23838974553600.0, + "grad_norm": 3.5626405489282345, + "language_loss": 0.7400474, + "learning_rate": 3.8832034543549716e-06, + "loss": 0.76179659, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.91015625, + "step": 1820, + "time_per_iteration": 2.417438507080078 + }, + { + "auxiliary_loss_clip": 0.01122324, + "auxiliary_loss_mlp": 0.01050176, + "balance_loss_clip": 1.02416396, + "balance_loss_mlp": 1.03208113, + "epoch": 0.10948444310837216, + "flos": 14643166239360.0, + "grad_norm": 2.4897420477094308, + "language_loss": 0.82555467, + "learning_rate": 3.883076197148043e-06, + "loss": 0.84727973, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.90234375, + "step": 1821, + "time_per_iteration": 2.348799467086792 + }, + { + "auxiliary_loss_clip": 0.01119461, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_clip": 1.02570057, + "balance_loss_mlp": 1.03041005, + "epoch": 0.10954456636104014, + "flos": 27817571825280.0, + "grad_norm": 2.5166504603033286, + "language_loss": 0.76038003, + "learning_rate": 3.8829488727389684e-06, + "loss": 0.78206205, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.890625, + "step": 1822, + "time_per_iteration": 2.4672350883483887 + }, + { + "auxiliary_loss_clip": 0.01120139, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.01685095, + "balance_loss_mlp": 1.03165674, + "epoch": 0.1096046896137081, + "flos": 33619508184960.0, + "grad_norm": 1.7645762476545976, + "language_loss": 0.79805642, + "learning_rate": 3.882821481132294e-06, + "loss": 0.81964904, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8828125, + "step": 1823, + "time_per_iteration": 2.4774742126464844 + }, + { + "auxiliary_loss_clip": 0.01121685, + "auxiliary_loss_mlp": 0.01041736, + "balance_loss_clip": 1.01870477, + "balance_loss_mlp": 1.03323531, + "epoch": 0.10966481286637607, + "flos": 26978783477760.0, + "grad_norm": 1.5088373435187543, + "language_loss": 0.79352868, + "learning_rate": 3.882694022332562e-06, + "loss": 0.81516284, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8828125, + "step": 1824, + "time_per_iteration": 2.421736717224121 + }, + { + "auxiliary_loss_clip": 0.01121125, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.02428925, + "balance_loss_mlp": 1.03207612, + "epoch": 0.10972493611904403, + "flos": 23035518368640.0, + "grad_norm": 1.8666654077198064, + "language_loss": 0.8807869, + "learning_rate": 3.882566496344324e-06, + "loss": 0.90248227, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.88671875, + "step": 1825, + "time_per_iteration": 2.3920836448669434 + }, + { + "auxiliary_loss_clip": 0.01116614, + "auxiliary_loss_mlp": 0.01045545, + "balance_loss_clip": 1.02299047, + "balance_loss_mlp": 1.03023314, + "epoch": 0.10978505937171201, + "flos": 38103194229120.0, + "grad_norm": 2.47998846056717, + "language_loss": 0.76288664, + "learning_rate": 3.88243890317213e-06, + "loss": 0.78450823, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.8671875, + "step": 1826, + "time_per_iteration": 2.5153608322143555 + }, + { + "auxiliary_loss_clip": 0.01041664, + "auxiliary_loss_mlp": 0.01008415, + "balance_loss_clip": 1.00543463, + "balance_loss_mlp": 1.01317477, + "epoch": 0.10984518262437998, + "flos": 59846645802240.0, + "grad_norm": 0.8523878617208052, + "language_loss": 0.54961729, + "learning_rate": 3.882311242820534e-06, + "loss": 0.57011807, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.02978516, + "router_z_loss_mlp": 0.28515625, + "step": 1827, + "time_per_iteration": 3.0348868370056152 + }, + { + "auxiliary_loss_clip": 0.01038473, + "auxiliary_loss_mlp": 0.01004865, + "balance_loss_clip": 1.00183761, + "balance_loss_mlp": 1.00998783, + "epoch": 0.10990530587704794, + "flos": 66716295494400.0, + "grad_norm": 0.7314426430528725, + "language_loss": 0.553303, + "learning_rate": 3.882183515294092e-06, + "loss": 0.57373631, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.03027344, + "router_z_loss_mlp": 0.28515625, + "step": 1828, + "time_per_iteration": 3.0552117824554443 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01047309, + "balance_loss_clip": 1.02119052, + "balance_loss_mlp": 1.03243375, + "epoch": 0.10996542912971592, + "flos": 25446026568960.0, + "grad_norm": 3.020088416281762, + "language_loss": 0.84540451, + "learning_rate": 3.882055720597362e-06, + "loss": 0.86712778, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.92578125, + "step": 1829, + "time_per_iteration": 2.4032886028289795 + }, + { + "auxiliary_loss_clip": 0.01123522, + "auxiliary_loss_mlp": 0.01051236, + "balance_loss_clip": 1.02602315, + "balance_loss_mlp": 1.03332651, + "epoch": 0.11002555238238389, + "flos": 44016503425920.0, + "grad_norm": 2.1794525035170795, + "language_loss": 0.88641047, + "learning_rate": 3.8819278587349045e-06, + "loss": 0.90815806, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.90234375, + "step": 1830, + "time_per_iteration": 2.5679574012756348 + }, + { + "auxiliary_loss_clip": 0.01034133, + "auxiliary_loss_mlp": 0.01003377, + "balance_loss_clip": 0.99980056, + "balance_loss_mlp": 1.00614595, + "epoch": 0.11008567563505185, + "flos": 54061781097600.0, + "grad_norm": 0.6857460363847238, + "language_loss": 0.55485028, + "learning_rate": 3.881799929711282e-06, + "loss": 0.57522535, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.27929688, + "step": 1831, + "time_per_iteration": 3.04540753364563 + }, + { + "auxiliary_loss_clip": 0.01129426, + "auxiliary_loss_mlp": 0.01050821, + "balance_loss_clip": 1.02491689, + "balance_loss_mlp": 1.0353601, + "epoch": 0.11014579888771983, + "flos": 24242011822080.0, + "grad_norm": 2.222061058726195, + "language_loss": 0.91241372, + "learning_rate": 3.881671933531061e-06, + "loss": 0.9342162, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.94140625, + "step": 1832, + "time_per_iteration": 3.7834312915802 + }, + { + "auxiliary_loss_clip": 0.01035121, + "auxiliary_loss_mlp": 0.01002943, + "balance_loss_clip": 0.99962914, + "balance_loss_mlp": 1.00814128, + "epoch": 0.1102059221403878, + "flos": 57740684325120.0, + "grad_norm": 0.7083011127659482, + "language_loss": 0.59934974, + "learning_rate": 3.881543870198809e-06, + "loss": 0.61973035, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.26953125, + "step": 1833, + "time_per_iteration": 4.4285361766815186 + }, + { + "auxiliary_loss_clip": 0.01122688, + "auxiliary_loss_mlp": 0.01042234, + "balance_loss_clip": 1.0180105, + "balance_loss_mlp": 1.03290153, + "epoch": 0.11026604539305576, + "flos": 16795107843840.0, + "grad_norm": 6.5647103964142275, + "language_loss": 0.80468589, + "learning_rate": 3.881415739719096e-06, + "loss": 0.82633519, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.8984375, + "step": 1834, + "time_per_iteration": 2.365412712097168 + }, + { + "auxiliary_loss_clip": 0.01127512, + "auxiliary_loss_mlp": 0.01049115, + "balance_loss_clip": 1.0236876, + "balance_loss_mlp": 1.0368191, + "epoch": 0.11032616864572373, + "flos": 23986936362240.0, + "grad_norm": 3.0585102867786986, + "language_loss": 0.90389204, + "learning_rate": 3.881287542096494e-06, + "loss": 0.92565829, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.90625, + "step": 1835, + "time_per_iteration": 2.432892084121704 + }, + { + "auxiliary_loss_clip": 0.01126549, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.02037621, + "balance_loss_mlp": 1.03564322, + "epoch": 0.1103862918983917, + "flos": 19682110546560.0, + "grad_norm": 2.1670359810007205, + "language_loss": 0.63784945, + "learning_rate": 3.881159277335581e-06, + "loss": 0.65956241, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.90625, + "step": 1836, + "time_per_iteration": 5.125218868255615 + }, + { + "auxiliary_loss_clip": 0.01123524, + "auxiliary_loss_mlp": 0.0104534, + "balance_loss_clip": 1.02220166, + "balance_loss_mlp": 1.03421807, + "epoch": 0.11044641515105967, + "flos": 32159510282880.0, + "grad_norm": 1.921355031365176, + "language_loss": 0.72566742, + "learning_rate": 3.88103094544093e-06, + "loss": 0.74735606, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.890625, + "step": 1837, + "time_per_iteration": 2.4840970039367676 + }, + { + "auxiliary_loss_clip": 0.01127698, + "auxiliary_loss_mlp": 0.01051693, + "balance_loss_clip": 1.02830374, + "balance_loss_mlp": 1.03508973, + "epoch": 0.11050653840372764, + "flos": 16688343306240.0, + "grad_norm": 2.6567733589711198, + "language_loss": 0.7852577, + "learning_rate": 3.880902546417125e-06, + "loss": 0.80705154, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.92578125, + "step": 1838, + "time_per_iteration": 2.3714590072631836 + }, + { + "auxiliary_loss_clip": 0.01126018, + "auxiliary_loss_mlp": 0.01053658, + "balance_loss_clip": 1.02975667, + "balance_loss_mlp": 1.03530455, + "epoch": 0.11056666165639562, + "flos": 21207989917440.0, + "grad_norm": 1.8341679071863268, + "language_loss": 0.71916747, + "learning_rate": 3.880774080268745e-06, + "loss": 0.74096417, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.90625, + "step": 1839, + "time_per_iteration": 2.373873472213745 + }, + { + "auxiliary_loss_clip": 0.01129867, + "auxiliary_loss_mlp": 0.0104687, + "balance_loss_clip": 1.02095413, + "balance_loss_mlp": 1.0373522, + "epoch": 0.11062678490906358, + "flos": 19164663152640.0, + "grad_norm": 2.2181662106134747, + "language_loss": 0.7848084, + "learning_rate": 3.880645547000377e-06, + "loss": 0.80657578, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.92578125, + "step": 1840, + "time_per_iteration": 2.397270679473877 + }, + { + "auxiliary_loss_clip": 0.0111785, + "auxiliary_loss_mlp": 0.0104291, + "balance_loss_clip": 1.02027237, + "balance_loss_mlp": 1.03210425, + "epoch": 0.11068690816173155, + "flos": 24894259441920.0, + "grad_norm": 1.6079820018701225, + "language_loss": 0.87717295, + "learning_rate": 3.880516946616606e-06, + "loss": 0.89878058, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.859375, + "step": 1841, + "time_per_iteration": 2.405466079711914 + }, + { + "auxiliary_loss_clip": 0.01119553, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.02077103, + "balance_loss_mlp": 1.03343081, + "epoch": 0.11074703141439952, + "flos": 16471427829120.0, + "grad_norm": 1.962118133830366, + "language_loss": 0.90375423, + "learning_rate": 3.880388279122023e-06, + "loss": 0.92539161, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.859375, + "step": 1842, + "time_per_iteration": 2.362666606903076 + }, + { + "auxiliary_loss_clip": 0.0112011, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.02385056, + "balance_loss_mlp": 1.03192472, + "epoch": 0.11080715466706749, + "flos": 19171401045120.0, + "grad_norm": 2.339226252612978, + "language_loss": 0.85794604, + "learning_rate": 3.880259544521219e-06, + "loss": 0.87961382, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8828125, + "step": 1843, + "time_per_iteration": 2.3702476024627686 + }, + { + "auxiliary_loss_clip": 0.01122537, + "auxiliary_loss_mlp": 0.01047296, + "balance_loss_clip": 1.02326322, + "balance_loss_mlp": 1.03436565, + "epoch": 0.11086727791973545, + "flos": 27703580636160.0, + "grad_norm": 1.9907211936404086, + "language_loss": 0.74612248, + "learning_rate": 3.880130742818789e-06, + "loss": 0.76782072, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8828125, + "step": 1844, + "time_per_iteration": 2.4470207691192627 + }, + { + "auxiliary_loss_clip": 0.01125056, + "auxiliary_loss_mlp": 0.01043043, + "balance_loss_clip": 1.01850975, + "balance_loss_mlp": 1.03429544, + "epoch": 0.11092740117240343, + "flos": 18513986544000.0, + "grad_norm": 2.2664377638723683, + "language_loss": 0.75605702, + "learning_rate": 3.880001874019328e-06, + "loss": 0.77773809, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.90625, + "step": 1845, + "time_per_iteration": 2.352602243423462 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01043874, + "balance_loss_clip": 1.0212239, + "balance_loss_mlp": 1.03393865, + "epoch": 0.1109875244250714, + "flos": 20521387653120.0, + "grad_norm": 1.6060166262770896, + "language_loss": 0.76185834, + "learning_rate": 3.879872938127438e-06, + "loss": 0.78351009, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.875, + "step": 1846, + "time_per_iteration": 2.3857526779174805 + }, + { + "auxiliary_loss_clip": 0.0112376, + "auxiliary_loss_mlp": 0.01043863, + "balance_loss_clip": 1.02034295, + "balance_loss_mlp": 1.03274524, + "epoch": 0.11104764767773936, + "flos": 14097787891200.0, + "grad_norm": 2.8021533328888744, + "language_loss": 0.85970891, + "learning_rate": 3.879743935147717e-06, + "loss": 0.88138521, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.91015625, + "step": 1847, + "time_per_iteration": 2.3417770862579346 + }, + { + "auxiliary_loss_clip": 0.01122761, + "auxiliary_loss_mlp": 0.01049376, + "balance_loss_clip": 1.02574825, + "balance_loss_mlp": 1.03167677, + "epoch": 0.11110777093040733, + "flos": 20593483344000.0, + "grad_norm": 2.008085509007359, + "language_loss": 0.77417588, + "learning_rate": 3.87961486508477e-06, + "loss": 0.79589725, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.91015625, + "step": 1848, + "time_per_iteration": 2.385127067565918 + }, + { + "auxiliary_loss_clip": 0.01121406, + "auxiliary_loss_mlp": 0.01042318, + "balance_loss_clip": 1.02090836, + "balance_loss_mlp": 1.03614879, + "epoch": 0.11116789418307531, + "flos": 21869035200000.0, + "grad_norm": 2.3025183471435877, + "language_loss": 0.77871823, + "learning_rate": 3.879485727943204e-06, + "loss": 0.80035543, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.8515625, + "step": 1849, + "time_per_iteration": 2.3708243370056152 + }, + { + "auxiliary_loss_clip": 0.01122495, + "auxiliary_loss_mlp": 0.01046036, + "balance_loss_clip": 1.02386284, + "balance_loss_mlp": 1.03078926, + "epoch": 0.11122801743574327, + "flos": 15522209251200.0, + "grad_norm": 3.133443799544841, + "language_loss": 0.712363, + "learning_rate": 3.879356523727627e-06, + "loss": 0.73404837, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.91796875, + "step": 1850, + "time_per_iteration": 2.3596272468566895 + }, + { + "auxiliary_loss_clip": 0.01124561, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.02158463, + "balance_loss_mlp": 1.03467679, + "epoch": 0.11128814068841124, + "flos": 14391407358720.0, + "grad_norm": 2.085739423799335, + "language_loss": 0.81019771, + "learning_rate": 3.87922725244265e-06, + "loss": 0.8319037, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.8984375, + "step": 1851, + "time_per_iteration": 2.337360382080078 + }, + { + "auxiliary_loss_clip": 0.01121203, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.0227859, + "balance_loss_mlp": 1.03309608, + "epoch": 0.11134826394107922, + "flos": 16653011143680.0, + "grad_norm": 2.4333394722702217, + "language_loss": 0.88124275, + "learning_rate": 3.879097914092886e-06, + "loss": 0.90291065, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8828125, + "step": 1852, + "time_per_iteration": 2.357152223587036 + }, + { + "auxiliary_loss_clip": 0.01122967, + "auxiliary_loss_mlp": 0.01046162, + "balance_loss_clip": 1.02071118, + "balance_loss_mlp": 1.03313994, + "epoch": 0.11140838719374718, + "flos": 16690053962880.0, + "grad_norm": 2.310298758811435, + "language_loss": 0.72288007, + "learning_rate": 3.878968508682952e-06, + "loss": 0.74457133, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.8984375, + "step": 1853, + "time_per_iteration": 2.3621346950531006 + }, + { + "auxiliary_loss_clip": 0.0103504, + "auxiliary_loss_mlp": 0.01007875, + "balance_loss_clip": 1.00453675, + "balance_loss_mlp": 1.00779462, + "epoch": 0.11146851044641515, + "flos": 60973397976960.0, + "grad_norm": 0.7850775594182413, + "language_loss": 0.53635836, + "learning_rate": 3.878839036217464e-06, + "loss": 0.55678749, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.03344727, + "router_z_loss_mlp": 0.2734375, + "step": 1854, + "time_per_iteration": 2.982922315597534 + }, + { + "auxiliary_loss_clip": 0.01130098, + "auxiliary_loss_mlp": 0.01046836, + "balance_loss_clip": 1.02103877, + "balance_loss_mlp": 1.03421712, + "epoch": 0.11152863369908313, + "flos": 22192924682880.0, + "grad_norm": 2.521424876635544, + "language_loss": 0.84896588, + "learning_rate": 3.878709496701045e-06, + "loss": 0.87073517, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.95703125, + "step": 1855, + "time_per_iteration": 2.3863637447357178 + }, + { + "auxiliary_loss_clip": 0.01120315, + "auxiliary_loss_mlp": 0.0104773, + "balance_loss_clip": 1.02462673, + "balance_loss_mlp": 1.03160822, + "epoch": 0.11158875695175109, + "flos": 19536487799040.0, + "grad_norm": 2.214380250338141, + "language_loss": 0.81937933, + "learning_rate": 3.8785798901383155e-06, + "loss": 0.84105986, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.88671875, + "step": 1856, + "time_per_iteration": 2.386831521987915 + }, + { + "auxiliary_loss_clip": 0.0112035, + "auxiliary_loss_mlp": 0.01044205, + "balance_loss_clip": 1.02087581, + "balance_loss_mlp": 1.03325868, + "epoch": 0.11164888020441906, + "flos": 25441662648960.0, + "grad_norm": 2.387040860028858, + "language_loss": 0.83130205, + "learning_rate": 3.878450216533902e-06, + "loss": 0.85294759, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.87109375, + "step": 1857, + "time_per_iteration": 2.4100728034973145 + }, + { + "auxiliary_loss_clip": 0.01122763, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.01819038, + "balance_loss_mlp": 1.0308063, + "epoch": 0.11170900345708702, + "flos": 15631836520320.0, + "grad_norm": 2.108834383109374, + "language_loss": 0.82937002, + "learning_rate": 3.878320475892433e-06, + "loss": 0.85101503, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.921875, + "step": 1858, + "time_per_iteration": 2.378499984741211 + }, + { + "auxiliary_loss_clip": 0.0112419, + "auxiliary_loss_mlp": 0.01053308, + "balance_loss_clip": 1.02928758, + "balance_loss_mlp": 1.03413117, + "epoch": 0.111769126709755, + "flos": 23038311277440.0, + "grad_norm": 2.439230848213365, + "language_loss": 0.91331965, + "learning_rate": 3.878190668218537e-06, + "loss": 0.93509459, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8984375, + "step": 1859, + "time_per_iteration": 2.3891844749450684 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01046617, + "balance_loss_clip": 1.02209592, + "balance_loss_mlp": 1.03094959, + "epoch": 0.11182924996242297, + "flos": 20849641056000.0, + "grad_norm": 2.1701555389504694, + "language_loss": 0.81527424, + "learning_rate": 3.878060793516847e-06, + "loss": 0.83695626, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.90625, + "step": 1860, + "time_per_iteration": 2.385209798812866 + }, + { + "auxiliary_loss_clip": 0.01117795, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_clip": 1.02597761, + "balance_loss_mlp": 1.03135908, + "epoch": 0.11188937321509093, + "flos": 17454407558400.0, + "grad_norm": 4.339887688424837, + "language_loss": 0.74721307, + "learning_rate": 3.8779308517919995e-06, + "loss": 0.76888084, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8671875, + "step": 1861, + "time_per_iteration": 2.3749191761016846 + }, + { + "auxiliary_loss_clip": 0.01121004, + "auxiliary_loss_mlp": 0.01039688, + "balance_loss_clip": 1.01765776, + "balance_loss_mlp": 1.03207588, + "epoch": 0.11194949646775891, + "flos": 24094818063360.0, + "grad_norm": 1.8191287058658938, + "language_loss": 0.88780928, + "learning_rate": 3.87780084304863e-06, + "loss": 0.9094162, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.890625, + "step": 1862, + "time_per_iteration": 2.390875816345215 + }, + { + "auxiliary_loss_clip": 0.01122035, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.02638614, + "balance_loss_mlp": 1.03333008, + "epoch": 0.11200961972042688, + "flos": 25152756215040.0, + "grad_norm": 2.4417652901407396, + "language_loss": 0.86999977, + "learning_rate": 3.877670767291379e-06, + "loss": 0.89171791, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.88671875, + "step": 1863, + "time_per_iteration": 2.3952221870422363 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01043867, + "balance_loss_clip": 1.01883328, + "balance_loss_mlp": 1.03344309, + "epoch": 0.11206974297309484, + "flos": 21287242437120.0, + "grad_norm": 1.7716005915019037, + "language_loss": 0.63989413, + "learning_rate": 3.877540624524888e-06, + "loss": 0.66156435, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.8984375, + "step": 1864, + "time_per_iteration": 2.387942314147949 + }, + { + "auxiliary_loss_clip": 0.01121606, + "auxiliary_loss_mlp": 0.01043734, + "balance_loss_clip": 1.02109623, + "balance_loss_mlp": 1.0336833, + "epoch": 0.11212986622576282, + "flos": 18914998953600.0, + "grad_norm": 2.8383906767286464, + "language_loss": 0.74338508, + "learning_rate": 3.877410414753802e-06, + "loss": 0.76503849, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.87890625, + "step": 1865, + "time_per_iteration": 2.3607590198516846 + }, + { + "auxiliary_loss_clip": 0.0112189, + "auxiliary_loss_mlp": 0.01046953, + "balance_loss_clip": 1.02070308, + "balance_loss_mlp": 1.03154969, + "epoch": 0.11218998947843078, + "flos": 22053655802880.0, + "grad_norm": 9.87993134600205, + "language_loss": 0.84361953, + "learning_rate": 3.877280137982767e-06, + "loss": 0.86530793, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.90234375, + "step": 1866, + "time_per_iteration": 2.3785462379455566 + }, + { + "auxiliary_loss_clip": 0.01120683, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02070165, + "balance_loss_mlp": 1.03133845, + "epoch": 0.11225011273109875, + "flos": 24570544515840.0, + "grad_norm": 1.7207529171668403, + "language_loss": 0.81263578, + "learning_rate": 3.877149794216433e-06, + "loss": 0.83429909, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.89453125, + "step": 1867, + "time_per_iteration": 2.4535868167877197 + }, + { + "auxiliary_loss_clip": 0.01124865, + "auxiliary_loss_mlp": 0.01051785, + "balance_loss_clip": 1.02877796, + "balance_loss_mlp": 1.03491139, + "epoch": 0.11231023598376672, + "flos": 28437419836800.0, + "grad_norm": 2.0254021408977803, + "language_loss": 0.86644781, + "learning_rate": 3.877019383459451e-06, + "loss": 0.88821429, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.8984375, + "step": 1868, + "time_per_iteration": 2.432866334915161 + }, + { + "auxiliary_loss_clip": 0.01122161, + "auxiliary_loss_mlp": 0.01043699, + "balance_loss_clip": 1.02059674, + "balance_loss_mlp": 1.03400004, + "epoch": 0.1123703592364347, + "flos": 14425657269120.0, + "grad_norm": 2.5856270805718995, + "language_loss": 0.68023825, + "learning_rate": 3.876888905716476e-06, + "loss": 0.70189679, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.87890625, + "step": 1869, + "time_per_iteration": 2.3502705097198486 + }, + { + "auxiliary_loss_clip": 0.01125969, + "auxiliary_loss_mlp": 0.01052933, + "balance_loss_clip": 1.02717185, + "balance_loss_mlp": 1.03294349, + "epoch": 0.11243048248910266, + "flos": 22235204206080.0, + "grad_norm": 1.536104041632161, + "language_loss": 0.77442759, + "learning_rate": 3.876758360992165e-06, + "loss": 0.79621661, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.9296875, + "step": 1870, + "time_per_iteration": 2.3861958980560303 + }, + { + "auxiliary_loss_clip": 0.01123982, + "auxiliary_loss_mlp": 0.01045221, + "balance_loss_clip": 1.02044964, + "balance_loss_mlp": 1.03114092, + "epoch": 0.11249060574177062, + "flos": 18583289326080.0, + "grad_norm": 2.2165975537900806, + "language_loss": 0.7623505, + "learning_rate": 3.8766277492911736e-06, + "loss": 0.7840426, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.92578125, + "step": 1871, + "time_per_iteration": 3.832282781600952 + }, + { + "auxiliary_loss_clip": 0.0112357, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.01706851, + "balance_loss_mlp": 1.03323805, + "epoch": 0.1125507289944386, + "flos": 22855471153920.0, + "grad_norm": 1.9322766154803015, + "language_loss": 0.81456953, + "learning_rate": 3.876497070618166e-06, + "loss": 0.83620954, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.90234375, + "step": 1872, + "time_per_iteration": 2.3943119049072266 + }, + { + "auxiliary_loss_clip": 0.01125733, + "auxiliary_loss_mlp": 0.01052084, + "balance_loss_clip": 1.02839732, + "balance_loss_mlp": 1.03431916, + "epoch": 0.11261085224710657, + "flos": 19675547210880.0, + "grad_norm": 2.3744857363701612, + "language_loss": 0.82998043, + "learning_rate": 3.876366324977806e-06, + "loss": 0.8517586, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.9140625, + "step": 1873, + "time_per_iteration": 3.711221933364868 + }, + { + "auxiliary_loss_clip": 0.01125359, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.02316654, + "balance_loss_mlp": 1.03108621, + "epoch": 0.11267097549977453, + "flos": 26062173976320.0, + "grad_norm": 1.8842552987423473, + "language_loss": 0.92325759, + "learning_rate": 3.876235512374757e-06, + "loss": 0.94500536, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.94140625, + "step": 1874, + "time_per_iteration": 2.4347848892211914 + }, + { + "auxiliary_loss_clip": 0.01119011, + "auxiliary_loss_mlp": 0.01047171, + "balance_loss_clip": 1.02326918, + "balance_loss_mlp": 1.03145373, + "epoch": 0.11273109875244251, + "flos": 21067010380800.0, + "grad_norm": 1.4604694796120459, + "language_loss": 0.7536639, + "learning_rate": 3.876104632813689e-06, + "loss": 0.77532566, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.875, + "step": 1875, + "time_per_iteration": 5.135155439376831 + }, + { + "auxiliary_loss_clip": 0.01120029, + "auxiliary_loss_mlp": 0.01045988, + "balance_loss_clip": 1.02455413, + "balance_loss_mlp": 1.03409672, + "epoch": 0.11279122200511048, + "flos": 27087782342400.0, + "grad_norm": 2.0494182757181982, + "language_loss": 0.71384954, + "learning_rate": 3.875973686299272e-06, + "loss": 0.73550969, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.859375, + "step": 1876, + "time_per_iteration": 2.4182865619659424 + }, + { + "auxiliary_loss_clip": 0.01121093, + "auxiliary_loss_mlp": 0.01044036, + "balance_loss_clip": 1.02175617, + "balance_loss_mlp": 1.03341925, + "epoch": 0.11285134525777844, + "flos": 20187024762240.0, + "grad_norm": 1.8334081916707283, + "language_loss": 0.7652418, + "learning_rate": 3.875842672836182e-06, + "loss": 0.78689313, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.875, + "step": 1877, + "time_per_iteration": 2.3859004974365234 + }, + { + "auxiliary_loss_clip": 0.0112174, + "auxiliary_loss_mlp": 0.01055278, + "balance_loss_clip": 1.02965963, + "balance_loss_mlp": 1.03242016, + "epoch": 0.11291146851044641, + "flos": 12457638040320.0, + "grad_norm": 2.5233166777136145, + "language_loss": 0.87412786, + "learning_rate": 3.87571159242909e-06, + "loss": 0.89589804, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.890625, + "step": 1878, + "time_per_iteration": 2.3492658138275146 + }, + { + "auxiliary_loss_clip": 0.01121535, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.01773417, + "balance_loss_mlp": 1.03220189, + "epoch": 0.11297159176311439, + "flos": 23841173969280.0, + "grad_norm": 2.1187437654021233, + "language_loss": 0.80941617, + "learning_rate": 3.875580445082677e-06, + "loss": 0.83105588, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.89453125, + "step": 1879, + "time_per_iteration": 2.4078209400177 + }, + { + "auxiliary_loss_clip": 0.0112127, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.02637398, + "balance_loss_mlp": 1.03222597, + "epoch": 0.11303171501578235, + "flos": 29929363499520.0, + "grad_norm": 2.0738310531410757, + "language_loss": 0.69966519, + "learning_rate": 3.875449230801622e-06, + "loss": 0.72139156, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.890625, + "step": 1880, + "time_per_iteration": 2.4354565143585205 + }, + { + "auxiliary_loss_clip": 0.01121064, + "auxiliary_loss_mlp": 0.01045815, + "balance_loss_clip": 1.0205431, + "balance_loss_mlp": 1.03177834, + "epoch": 0.11309183826845032, + "flos": 16179623752320.0, + "grad_norm": 1.6998560748807998, + "language_loss": 0.71996421, + "learning_rate": 3.875317949590609e-06, + "loss": 0.74163306, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.89453125, + "step": 1881, + "time_per_iteration": 2.3698432445526123 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.01048771, + "balance_loss_clip": 1.02403498, + "balance_loss_mlp": 1.03180218, + "epoch": 0.1131519615211183, + "flos": 12019897013760.0, + "grad_norm": 2.16034106561837, + "language_loss": 0.74119371, + "learning_rate": 3.875186601454322e-06, + "loss": 0.76290548, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.90625, + "step": 1882, + "time_per_iteration": 2.3546245098114014 + }, + { + "auxiliary_loss_clip": 0.01119348, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_clip": 1.0215472, + "balance_loss_mlp": 1.03185344, + "epoch": 0.11321208477378626, + "flos": 26248924172160.0, + "grad_norm": 2.046400534186846, + "language_loss": 0.79340416, + "learning_rate": 3.8750551863974484e-06, + "loss": 0.81506884, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.875, + "step": 1883, + "time_per_iteration": 2.4280452728271484 + }, + { + "auxiliary_loss_clip": 0.01121617, + "auxiliary_loss_mlp": 0.01044822, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03018355, + "epoch": 0.11327220802645423, + "flos": 13625517663360.0, + "grad_norm": 2.5416306163806515, + "language_loss": 0.77227646, + "learning_rate": 3.874923704424679e-06, + "loss": 0.79394084, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.9140625, + "step": 1884, + "time_per_iteration": 2.3503458499908447 + }, + { + "auxiliary_loss_clip": 0.01040015, + "auxiliary_loss_mlp": 0.01006052, + "balance_loss_clip": 1.00242758, + "balance_loss_mlp": 1.01261711, + "epoch": 0.1133323312791222, + "flos": 57188672818560.0, + "grad_norm": 0.7855083275624904, + "language_loss": 0.55201423, + "learning_rate": 3.8747921555407045e-06, + "loss": 0.57247484, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.03613281, + "router_z_loss_mlp": 0.2734375, + "step": 1885, + "time_per_iteration": 2.9273834228515625 + }, + { + "auxiliary_loss_clip": 0.01113865, + "auxiliary_loss_mlp": 0.01042374, + "balance_loss_clip": 1.01996267, + "balance_loss_mlp": 1.03008294, + "epoch": 0.11339245453179017, + "flos": 24350591750400.0, + "grad_norm": 1.9656073476571887, + "language_loss": 0.90563154, + "learning_rate": 3.874660539750222e-06, + "loss": 0.92719388, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.83984375, + "step": 1886, + "time_per_iteration": 2.406250476837158 + }, + { + "auxiliary_loss_clip": 0.01123372, + "auxiliary_loss_mlp": 0.01042115, + "balance_loss_clip": 1.01962066, + "balance_loss_mlp": 1.03481531, + "epoch": 0.11345257778445814, + "flos": 22669698476160.0, + "grad_norm": 1.9569472103942396, + "language_loss": 0.85377294, + "learning_rate": 3.874528857057926e-06, + "loss": 0.87542778, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.88671875, + "step": 1887, + "time_per_iteration": 2.4290239810943604 + }, + { + "auxiliary_loss_clip": 0.01123608, + "auxiliary_loss_mlp": 0.01048515, + "balance_loss_clip": 1.02505445, + "balance_loss_mlp": 1.03379381, + "epoch": 0.11351270103712612, + "flos": 20987408747520.0, + "grad_norm": 3.57150940087995, + "language_loss": 0.75795519, + "learning_rate": 3.874397107468516e-06, + "loss": 0.77967644, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8984375, + "step": 1888, + "time_per_iteration": 2.40659236907959 + }, + { + "auxiliary_loss_clip": 0.01125696, + "auxiliary_loss_mlp": 0.01049427, + "balance_loss_clip": 1.02421427, + "balance_loss_mlp": 1.03497028, + "epoch": 0.11357282428979408, + "flos": 37346241841920.0, + "grad_norm": 2.520445784871137, + "language_loss": 0.68051779, + "learning_rate": 3.874265290986696e-06, + "loss": 0.70226902, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.90625, + "step": 1889, + "time_per_iteration": 2.5671088695526123 + }, + { + "auxiliary_loss_clip": 0.01123114, + "auxiliary_loss_mlp": 0.01041424, + "balance_loss_clip": 1.01773691, + "balance_loss_mlp": 1.03432798, + "epoch": 0.11363294754246205, + "flos": 21756091351680.0, + "grad_norm": 2.475836393648607, + "language_loss": 0.8416034, + "learning_rate": 3.874133407617169e-06, + "loss": 0.86324883, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.88671875, + "step": 1890, + "time_per_iteration": 2.384964942932129 + }, + { + "auxiliary_loss_clip": 0.01118093, + "auxiliary_loss_mlp": 0.01043316, + "balance_loss_clip": 1.02026057, + "balance_loss_mlp": 1.03270805, + "epoch": 0.11369307079513001, + "flos": 22600535339520.0, + "grad_norm": 2.160870064589821, + "language_loss": 0.64799368, + "learning_rate": 3.874001457364642e-06, + "loss": 0.66960776, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8515625, + "step": 1891, + "time_per_iteration": 2.3964099884033203 + }, + { + "auxiliary_loss_clip": 0.01121113, + "auxiliary_loss_mlp": 0.01038546, + "balance_loss_clip": 1.01606369, + "balance_loss_mlp": 1.03340304, + "epoch": 0.11375319404779799, + "flos": 21943190661120.0, + "grad_norm": 2.5961754883451422, + "language_loss": 0.8853538, + "learning_rate": 3.873869440233822e-06, + "loss": 0.90695035, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.875, + "step": 1892, + "time_per_iteration": 2.4157590866088867 + }, + { + "auxiliary_loss_clip": 0.01123236, + "auxiliary_loss_mlp": 0.01056673, + "balance_loss_clip": 1.03193736, + "balance_loss_mlp": 1.03481007, + "epoch": 0.11381331730046595, + "flos": 26394267628800.0, + "grad_norm": 2.38988995888122, + "language_loss": 0.73289359, + "learning_rate": 3.8737373562294225e-06, + "loss": 0.75469267, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.8828125, + "step": 1893, + "time_per_iteration": 2.435520887374878 + }, + { + "auxiliary_loss_clip": 0.01119479, + "auxiliary_loss_mlp": 0.01054012, + "balance_loss_clip": 1.02993214, + "balance_loss_mlp": 1.03295314, + "epoch": 0.11387344055313392, + "flos": 23803607479680.0, + "grad_norm": 2.002772720280015, + "language_loss": 0.7954644, + "learning_rate": 3.873605205356157e-06, + "loss": 0.81719935, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.8671875, + "step": 1894, + "time_per_iteration": 2.38785982131958 + }, + { + "auxiliary_loss_clip": 0.01122086, + "auxiliary_loss_mlp": 0.01047838, + "balance_loss_clip": 1.02388895, + "balance_loss_mlp": 1.03158116, + "epoch": 0.1139335638058019, + "flos": 34521699294720.0, + "grad_norm": 5.640230676070867, + "language_loss": 0.80075616, + "learning_rate": 3.873472987618742e-06, + "loss": 0.82245541, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.90625, + "step": 1895, + "time_per_iteration": 2.4901442527770996 + }, + { + "auxiliary_loss_clip": 0.01037659, + "auxiliary_loss_mlp": 0.01010939, + "balance_loss_clip": 1.00774467, + "balance_loss_mlp": 1.01096821, + "epoch": 0.11399368705846986, + "flos": 70584148333440.0, + "grad_norm": 0.799269048333181, + "language_loss": 0.63373232, + "learning_rate": 3.873340703021894e-06, + "loss": 0.65421826, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.03198242, + "router_z_loss_mlp": 0.26757812, + "step": 1896, + "time_per_iteration": 3.1031527519226074 + }, + { + "auxiliary_loss_clip": 0.01120183, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.02465725, + "balance_loss_mlp": 1.03358936, + "epoch": 0.11405381031113783, + "flos": 21323203004160.0, + "grad_norm": 1.8792429588436772, + "language_loss": 0.84862256, + "learning_rate": 3.873208351570335e-06, + "loss": 0.87032557, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.8671875, + "step": 1897, + "time_per_iteration": 2.421992540359497 + }, + { + "auxiliary_loss_clip": 0.01120028, + "auxiliary_loss_mlp": 0.01045303, + "balance_loss_clip": 1.02270079, + "balance_loss_mlp": 1.0324626, + "epoch": 0.11411393356380581, + "flos": 19718594784000.0, + "grad_norm": 2.787889135189672, + "language_loss": 0.79151994, + "learning_rate": 3.873075933268788e-06, + "loss": 0.81317323, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.875, + "step": 1898, + "time_per_iteration": 2.4150617122650146 + }, + { + "auxiliary_loss_clip": 0.0112114, + "auxiliary_loss_mlp": 0.01050052, + "balance_loss_clip": 1.02486324, + "balance_loss_mlp": 1.0317378, + "epoch": 0.11417405681647377, + "flos": 17529470714880.0, + "grad_norm": 2.0055423221469075, + "language_loss": 0.73206705, + "learning_rate": 3.87294344812198e-06, + "loss": 0.75377893, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.89453125, + "step": 1899, + "time_per_iteration": 2.3959221839904785 + }, + { + "auxiliary_loss_clip": 0.01034788, + "auxiliary_loss_mlp": 0.01003222, + "balance_loss_clip": 0.99971706, + "balance_loss_mlp": 1.00755334, + "epoch": 0.11423418006914174, + "flos": 59671416355200.0, + "grad_norm": 0.9087447150687288, + "language_loss": 0.63396221, + "learning_rate": 3.8728108961346386e-06, + "loss": 0.65434235, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.03515625, + "router_z_loss_mlp": 0.2734375, + "step": 1900, + "time_per_iteration": 2.9975342750549316 + }, + { + "auxiliary_loss_clip": 0.01122627, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.02257764, + "balance_loss_mlp": 1.03412795, + "epoch": 0.1142943033218097, + "flos": 22962096046080.0, + "grad_norm": 1.662102926602138, + "language_loss": 0.78009129, + "learning_rate": 3.872678277311493e-06, + "loss": 0.80178547, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.88671875, + "step": 1901, + "time_per_iteration": 2.405158758163452 + }, + { + "auxiliary_loss_clip": 0.01121845, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.0184269, + "balance_loss_mlp": 1.0350672, + "epoch": 0.11435442657447768, + "flos": 18255385036800.0, + "grad_norm": 2.0287733645949926, + "language_loss": 0.83728218, + "learning_rate": 3.872545591657276e-06, + "loss": 0.85891378, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8671875, + "step": 1902, + "time_per_iteration": 2.3576552867889404 + }, + { + "auxiliary_loss_clip": 0.01117738, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.01995707, + "balance_loss_mlp": 1.0303036, + "epoch": 0.11441454982714565, + "flos": 24060044482560.0, + "grad_norm": 1.6977257217677675, + "language_loss": 0.77722776, + "learning_rate": 3.872412839176725e-06, + "loss": 0.79885, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.875, + "step": 1903, + "time_per_iteration": 2.4129691123962402 + }, + { + "auxiliary_loss_clip": 0.0112092, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.01737499, + "balance_loss_mlp": 1.03335369, + "epoch": 0.11447467307981361, + "flos": 25336538945280.0, + "grad_norm": 2.289445239864963, + "language_loss": 0.75533634, + "learning_rate": 3.872280019874576e-06, + "loss": 0.77693391, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.875, + "step": 1904, + "time_per_iteration": 2.4143829345703125 + }, + { + "auxiliary_loss_clip": 0.01118086, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.01951957, + "balance_loss_mlp": 1.03191447, + "epoch": 0.11453479633248159, + "flos": 21724983463680.0, + "grad_norm": 2.360565416980462, + "language_loss": 0.91935968, + "learning_rate": 3.872147133755568e-06, + "loss": 0.94098371, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.86328125, + "step": 1905, + "time_per_iteration": 2.3952243328094482 + }, + { + "auxiliary_loss_clip": 0.01121045, + "auxiliary_loss_mlp": 0.01047477, + "balance_loss_clip": 1.02266955, + "balance_loss_mlp": 1.03007984, + "epoch": 0.11459491958514956, + "flos": 12968871212160.0, + "grad_norm": 2.7913693138781923, + "language_loss": 0.77344108, + "learning_rate": 3.872014180824446e-06, + "loss": 0.79512632, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.91015625, + "step": 1906, + "time_per_iteration": 2.3560922145843506 + }, + { + "auxiliary_loss_clip": 0.01119823, + "auxiliary_loss_mlp": 0.01049127, + "balance_loss_clip": 1.02524948, + "balance_loss_mlp": 1.03317046, + "epoch": 0.11465504283781752, + "flos": 22710162608640.0, + "grad_norm": 11.269839278915923, + "language_loss": 0.81792992, + "learning_rate": 3.8718811610859526e-06, + "loss": 0.83961946, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.8671875, + "step": 1907, + "time_per_iteration": 2.3930797576904297 + }, + { + "auxiliary_loss_clip": 0.01120222, + "auxiliary_loss_mlp": 0.01054948, + "balance_loss_clip": 1.03223836, + "balance_loss_mlp": 1.03404033, + "epoch": 0.1147151660904855, + "flos": 23397428188800.0, + "grad_norm": 2.608949679238145, + "language_loss": 0.84991479, + "learning_rate": 3.8717480745448356e-06, + "loss": 0.87166649, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.86328125, + "step": 1908, + "time_per_iteration": 2.362637996673584 + }, + { + "auxiliary_loss_clip": 0.01034183, + "auxiliary_loss_mlp": 0.01003795, + "balance_loss_clip": 1.00045669, + "balance_loss_mlp": 1.00746334, + "epoch": 0.11477528934315347, + "flos": 63009044242560.0, + "grad_norm": 0.9166563959521401, + "language_loss": 0.60988611, + "learning_rate": 3.871614921205845e-06, + "loss": 0.63026589, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.03344727, + "router_z_loss_mlp": 0.26757812, + "step": 1909, + "time_per_iteration": 2.77168607711792 + }, + { + "auxiliary_loss_clip": 0.01121819, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_clip": 1.01943362, + "balance_loss_mlp": 1.0347178, + "epoch": 0.11483541259582143, + "flos": 16324687918080.0, + "grad_norm": 1.8870721212084607, + "language_loss": 0.78994447, + "learning_rate": 3.871481701073731e-06, + "loss": 0.81157988, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.87109375, + "step": 1910, + "time_per_iteration": 2.3642501831054688 + }, + { + "auxiliary_loss_clip": 0.01123226, + "auxiliary_loss_mlp": 0.01042623, + "balance_loss_clip": 1.02014017, + "balance_loss_mlp": 1.03540301, + "epoch": 0.1148955358484894, + "flos": 21579325804800.0, + "grad_norm": 2.1832236962668934, + "language_loss": 0.77382857, + "learning_rate": 3.8713484141532505e-06, + "loss": 0.79548711, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.87890625, + "step": 1911, + "time_per_iteration": 3.7580933570861816 + }, + { + "auxiliary_loss_clip": 0.01116111, + "auxiliary_loss_mlp": 0.01042138, + "balance_loss_clip": 1.01986945, + "balance_loss_mlp": 1.03272152, + "epoch": 0.11495565910115738, + "flos": 27672437836800.0, + "grad_norm": 1.8401151809725036, + "language_loss": 0.79115731, + "learning_rate": 3.871215060449158e-06, + "loss": 0.81273973, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8359375, + "step": 1912, + "time_per_iteration": 2.4457361698150635 + }, + { + "auxiliary_loss_clip": 0.01116913, + "auxiliary_loss_mlp": 0.01054898, + "balance_loss_clip": 1.03103209, + "balance_loss_mlp": 1.03193891, + "epoch": 0.11501578235382534, + "flos": 20631294213120.0, + "grad_norm": 1.8881752293686607, + "language_loss": 0.77768546, + "learning_rate": 3.871081639966213e-06, + "loss": 0.79940355, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.8515625, + "step": 1913, + "time_per_iteration": 3.730783224105835 + }, + { + "auxiliary_loss_clip": 0.01120003, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.01674795, + "balance_loss_mlp": 1.03115487, + "epoch": 0.1150759056064933, + "flos": 19828012584960.0, + "grad_norm": 2.030156090053584, + "language_loss": 0.7035594, + "learning_rate": 3.870948152709178e-06, + "loss": 0.72516233, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.890625, + "step": 1914, + "time_per_iteration": 2.431093215942383 + }, + { + "auxiliary_loss_clip": 0.0103309, + "auxiliary_loss_mlp": 0.01008316, + "balance_loss_clip": 1.00535917, + "balance_loss_mlp": 1.0072974, + "epoch": 0.11513602885916129, + "flos": 70041981830400.0, + "grad_norm": 0.7608967370047242, + "language_loss": 0.61050045, + "learning_rate": 3.870814598682816e-06, + "loss": 0.63091445, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.02954102, + "router_z_loss_mlp": 0.2578125, + "step": 1915, + "time_per_iteration": 5.959998846054077 + }, + { + "auxiliary_loss_clip": 0.01121487, + "auxiliary_loss_mlp": 0.01043383, + "balance_loss_clip": 1.01954126, + "balance_loss_mlp": 1.03480065, + "epoch": 0.11519615211182925, + "flos": 15740835384960.0, + "grad_norm": 6.53600990937075, + "language_loss": 0.92811406, + "learning_rate": 3.8706809778918935e-06, + "loss": 0.94976276, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.8671875, + "step": 1916, + "time_per_iteration": 2.364123821258545 + }, + { + "auxiliary_loss_clip": 0.01118829, + "auxiliary_loss_mlp": 0.01047289, + "balance_loss_clip": 1.02338719, + "balance_loss_mlp": 1.0321542, + "epoch": 0.11525627536449722, + "flos": 20666591464320.0, + "grad_norm": 1.9052463671782878, + "language_loss": 0.72640043, + "learning_rate": 3.870547290341179e-06, + "loss": 0.74806166, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.8671875, + "step": 1917, + "time_per_iteration": 2.4193167686462402 + }, + { + "auxiliary_loss_clip": 0.01118869, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.01358509, + "balance_loss_mlp": 1.03429604, + "epoch": 0.1153163986171652, + "flos": 20302237848960.0, + "grad_norm": 2.388180552467478, + "language_loss": 0.74289095, + "learning_rate": 3.870413536035442e-06, + "loss": 0.76445532, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.84375, + "step": 1918, + "time_per_iteration": 2.3907649517059326 + }, + { + "auxiliary_loss_clip": 0.01121428, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.01305485, + "balance_loss_mlp": 1.03278852, + "epoch": 0.11537652186983316, + "flos": 17638364845440.0, + "grad_norm": 2.183824722391978, + "language_loss": 0.86369371, + "learning_rate": 3.870279714979458e-06, + "loss": 0.88528538, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.88671875, + "step": 1919, + "time_per_iteration": 2.3742287158966064 + }, + { + "auxiliary_loss_clip": 0.01117779, + "auxiliary_loss_mlp": 0.01045815, + "balance_loss_clip": 1.02177036, + "balance_loss_mlp": 1.03159249, + "epoch": 0.11543664512250112, + "flos": 21068337012480.0, + "grad_norm": 3.6738136039291676, + "language_loss": 0.86615455, + "learning_rate": 3.870145827178002e-06, + "loss": 0.8877905, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.859375, + "step": 1920, + "time_per_iteration": 2.3817317485809326 + }, + { + "auxiliary_loss_clip": 0.01117626, + "auxiliary_loss_mlp": 0.01040966, + "balance_loss_clip": 1.01758945, + "balance_loss_mlp": 1.03209615, + "epoch": 0.11549676837516909, + "flos": 22746437377920.0, + "grad_norm": 2.0350494362644977, + "language_loss": 0.79077518, + "learning_rate": 3.8700118726358525e-06, + "loss": 0.81236112, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.85546875, + "step": 1921, + "time_per_iteration": 2.4029366970062256 + }, + { + "auxiliary_loss_clip": 0.01123317, + "auxiliary_loss_mlp": 0.01051315, + "balance_loss_clip": 1.02592325, + "balance_loss_mlp": 1.03322721, + "epoch": 0.11555689162783707, + "flos": 19168049554560.0, + "grad_norm": 1.9432903044582477, + "language_loss": 0.78655696, + "learning_rate": 3.869877851357789e-06, + "loss": 0.80830324, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.90234375, + "step": 1922, + "time_per_iteration": 2.364842414855957 + }, + { + "auxiliary_loss_clip": 0.01125324, + "auxiliary_loss_mlp": 0.01050104, + "balance_loss_clip": 1.02689362, + "balance_loss_mlp": 1.03513312, + "epoch": 0.11561701488050503, + "flos": 24570893629440.0, + "grad_norm": 2.1217780923341003, + "language_loss": 0.8439163, + "learning_rate": 3.869743763348595e-06, + "loss": 0.86567056, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.90234375, + "step": 1923, + "time_per_iteration": 2.4334733486175537 + }, + { + "auxiliary_loss_clip": 0.01122675, + "auxiliary_loss_mlp": 0.01044974, + "balance_loss_clip": 1.02002394, + "balance_loss_mlp": 1.03377521, + "epoch": 0.115677138133173, + "flos": 17091590042880.0, + "grad_norm": 2.242921719131463, + "language_loss": 0.80798101, + "learning_rate": 3.869609608613055e-06, + "loss": 0.82965755, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.890625, + "step": 1924, + "time_per_iteration": 2.3767452239990234 + }, + { + "auxiliary_loss_clip": 0.01033338, + "auxiliary_loss_mlp": 0.01004751, + "balance_loss_clip": 1.00162804, + "balance_loss_mlp": 1.0078938, + "epoch": 0.11573726138584098, + "flos": 62700515758080.0, + "grad_norm": 0.8289051333358758, + "language_loss": 0.61181498, + "learning_rate": 3.869475387155958e-06, + "loss": 0.63219583, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.03125, + "router_z_loss_mlp": 0.25390625, + "step": 1925, + "time_per_iteration": 3.051574230194092 + }, + { + "auxiliary_loss_clip": 0.01118964, + "auxiliary_loss_mlp": 0.01048068, + "balance_loss_clip": 1.02341545, + "balance_loss_mlp": 1.03163743, + "epoch": 0.11579738463850894, + "flos": 22600046580480.0, + "grad_norm": 1.8870413846579273, + "language_loss": 0.75285721, + "learning_rate": 3.8693410989820925e-06, + "loss": 0.77452743, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.875, + "step": 1926, + "time_per_iteration": 2.398479461669922 + }, + { + "auxiliary_loss_clip": 0.01120091, + "auxiliary_loss_mlp": 0.0104481, + "balance_loss_clip": 1.01912022, + "balance_loss_mlp": 1.03248537, + "epoch": 0.11585750789117691, + "flos": 21725053286400.0, + "grad_norm": 4.134645563731568, + "language_loss": 0.72157353, + "learning_rate": 3.869206744096252e-06, + "loss": 0.74322253, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.875, + "step": 1927, + "time_per_iteration": 2.406965494155884 + }, + { + "auxiliary_loss_clip": 0.01117803, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.01684427, + "balance_loss_mlp": 1.03162479, + "epoch": 0.11591763114384489, + "flos": 26286316104960.0, + "grad_norm": 1.534695631001158, + "language_loss": 0.86650527, + "learning_rate": 3.869072322503232e-06, + "loss": 0.88809288, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.859375, + "step": 1928, + "time_per_iteration": 2.432307243347168 + }, + { + "auxiliary_loss_clip": 0.01120868, + "auxiliary_loss_mlp": 0.01047544, + "balance_loss_clip": 1.02378583, + "balance_loss_mlp": 1.03301144, + "epoch": 0.11597775439651285, + "flos": 22999418156160.0, + "grad_norm": 1.7562016611887232, + "language_loss": 0.77448833, + "learning_rate": 3.868937834207828e-06, + "loss": 0.79617244, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.87890625, + "step": 1929, + "time_per_iteration": 2.4027504920959473 + }, + { + "auxiliary_loss_clip": 0.01117126, + "auxiliary_loss_mlp": 0.01050525, + "balance_loss_clip": 1.02811384, + "balance_loss_mlp": 1.0315845, + "epoch": 0.11603787764918082, + "flos": 31940360478720.0, + "grad_norm": 2.9383215708820356, + "language_loss": 0.76913202, + "learning_rate": 3.86880327921484e-06, + "loss": 0.79080844, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.85546875, + "step": 1930, + "time_per_iteration": 2.471935510635376 + }, + { + "auxiliary_loss_clip": 0.01119107, + "auxiliary_loss_mlp": 0.01045321, + "balance_loss_clip": 1.02261209, + "balance_loss_mlp": 1.03244591, + "epoch": 0.1160980009018488, + "flos": 22270606191360.0, + "grad_norm": 1.950474949962868, + "language_loss": 0.72070694, + "learning_rate": 3.8686686575290695e-06, + "loss": 0.74235123, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.8671875, + "step": 1931, + "time_per_iteration": 2.3999249935150146 + }, + { + "auxiliary_loss_clip": 0.01123334, + "auxiliary_loss_mlp": 0.01045584, + "balance_loss_clip": 1.0216229, + "balance_loss_mlp": 1.03538382, + "epoch": 0.11615812415451676, + "flos": 22782537590400.0, + "grad_norm": 1.6694959414934365, + "language_loss": 0.82114506, + "learning_rate": 3.868533969155322e-06, + "loss": 0.84283423, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8828125, + "step": 1932, + "time_per_iteration": 2.4132401943206787 + }, + { + "auxiliary_loss_clip": 0.01029327, + "auxiliary_loss_mlp": 0.01009468, + "balance_loss_clip": 1.00627303, + "balance_loss_mlp": 1.00398624, + "epoch": 0.11621824740718473, + "flos": 67142864885760.0, + "grad_norm": 0.77813532920461, + "language_loss": 0.61104012, + "learning_rate": 3.868399214098404e-06, + "loss": 0.631428, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.03198242, + "router_z_loss_mlp": 0.25390625, + "step": 1933, + "time_per_iteration": 2.89139986038208 + }, + { + "auxiliary_loss_clip": 0.01121111, + "auxiliary_loss_mlp": 0.01044965, + "balance_loss_clip": 1.02174306, + "balance_loss_mlp": 1.03182209, + "epoch": 0.11627837065985269, + "flos": 20374892121600.0, + "grad_norm": 5.832653992836649, + "language_loss": 0.85950387, + "learning_rate": 3.868264392363124e-06, + "loss": 0.88116461, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.89453125, + "step": 1934, + "time_per_iteration": 2.4212839603424072 + }, + { + "auxiliary_loss_clip": 0.01126041, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.02274728, + "balance_loss_mlp": 1.03563976, + "epoch": 0.11633849391252067, + "flos": 21724739084160.0, + "grad_norm": 2.285134865303062, + "language_loss": 0.65957439, + "learning_rate": 3.868129503954293e-06, + "loss": 0.68131256, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.90625, + "step": 1935, + "time_per_iteration": 2.410168409347534 + }, + { + "auxiliary_loss_clip": 0.01124111, + "auxiliary_loss_mlp": 0.01048578, + "balance_loss_clip": 1.02572513, + "balance_loss_mlp": 1.03325069, + "epoch": 0.11639861716518864, + "flos": 18804394166400.0, + "grad_norm": 2.468005328842679, + "language_loss": 0.75913846, + "learning_rate": 3.867994548876726e-06, + "loss": 0.78086537, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.90625, + "step": 1936, + "time_per_iteration": 2.3786065578460693 + }, + { + "auxiliary_loss_clip": 0.01123324, + "auxiliary_loss_mlp": 0.01044361, + "balance_loss_clip": 1.01954186, + "balance_loss_mlp": 1.03270447, + "epoch": 0.1164587404178566, + "flos": 21213924848640.0, + "grad_norm": 2.03835241920668, + "language_loss": 0.8434478, + "learning_rate": 3.867859527135238e-06, + "loss": 0.86512464, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.90625, + "step": 1937, + "time_per_iteration": 2.4055237770080566 + }, + { + "auxiliary_loss_clip": 0.011168, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.01487517, + "balance_loss_mlp": 1.0320996, + "epoch": 0.11651886367052458, + "flos": 27817397268480.0, + "grad_norm": 1.9669668728451497, + "language_loss": 0.76133978, + "learning_rate": 3.867724438734649e-06, + "loss": 0.78286839, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.84765625, + "step": 1938, + "time_per_iteration": 2.444209575653076 + }, + { + "auxiliary_loss_clip": 0.0112379, + "auxiliary_loss_mlp": 0.01045809, + "balance_loss_clip": 1.02169299, + "balance_loss_mlp": 1.03271842, + "epoch": 0.11657898692319255, + "flos": 22888568989440.0, + "grad_norm": 2.4333915561606583, + "language_loss": 0.79423189, + "learning_rate": 3.867589283679779e-06, + "loss": 0.81592792, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.91015625, + "step": 1939, + "time_per_iteration": 2.3905863761901855 + }, + { + "auxiliary_loss_clip": 0.01120133, + "auxiliary_loss_mlp": 0.01053959, + "balance_loss_clip": 1.02989054, + "balance_loss_mlp": 1.03123116, + "epoch": 0.11663911017586051, + "flos": 24314770828800.0, + "grad_norm": 2.2356411317656377, + "language_loss": 0.8636415, + "learning_rate": 3.867454061975451e-06, + "loss": 0.88538247, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.88671875, + "step": 1940, + "time_per_iteration": 2.4451136589050293 + }, + { + "auxiliary_loss_clip": 0.01118435, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_clip": 1.03233874, + "balance_loss_mlp": 1.03401971, + "epoch": 0.11669923342852849, + "flos": 42338507794560.0, + "grad_norm": 1.3780814995012212, + "language_loss": 0.75742328, + "learning_rate": 3.8673187736264914e-06, + "loss": 0.77916014, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.84375, + "step": 1941, + "time_per_iteration": 2.576957941055298 + }, + { + "auxiliary_loss_clip": 0.01117836, + "auxiliary_loss_mlp": 0.01047996, + "balance_loss_clip": 1.02486932, + "balance_loss_mlp": 1.0315516, + "epoch": 0.11675935668119646, + "flos": 14641560316800.0, + "grad_norm": 2.077467512953499, + "language_loss": 0.88486266, + "learning_rate": 3.8671834186377275e-06, + "loss": 0.90652096, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.86328125, + "step": 1942, + "time_per_iteration": 2.3597772121429443 + }, + { + "auxiliary_loss_clip": 0.01115714, + "auxiliary_loss_mlp": 0.01043805, + "balance_loss_clip": 1.02228785, + "balance_loss_mlp": 1.03200805, + "epoch": 0.11681947993386442, + "flos": 35115012806400.0, + "grad_norm": 1.6107241451107719, + "language_loss": 0.68025339, + "learning_rate": 3.867047997013991e-06, + "loss": 0.70184863, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.8359375, + "step": 1943, + "time_per_iteration": 2.5141799449920654 + }, + { + "auxiliary_loss_clip": 0.01115098, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.01480138, + "balance_loss_mlp": 1.03127599, + "epoch": 0.11687960318653239, + "flos": 38981713570560.0, + "grad_norm": 3.075908187618805, + "language_loss": 0.69172108, + "learning_rate": 3.866912508760114e-06, + "loss": 0.71324313, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.8359375, + "step": 1944, + "time_per_iteration": 2.5394175052642822 + }, + { + "auxiliary_loss_clip": 0.01116969, + "auxiliary_loss_mlp": 0.01040783, + "balance_loss_clip": 1.01936138, + "balance_loss_mlp": 1.03080392, + "epoch": 0.11693972643920036, + "flos": 25993778889600.0, + "grad_norm": 1.4261852213290416, + "language_loss": 0.82534927, + "learning_rate": 3.866776953880932e-06, + "loss": 0.84692681, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.86328125, + "step": 1945, + "time_per_iteration": 2.4165971279144287 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_clip": 1.02493882, + "balance_loss_mlp": 1.0304879, + "epoch": 0.11699984969186833, + "flos": 27270866845440.0, + "grad_norm": 2.2188461994747657, + "language_loss": 0.764691, + "learning_rate": 3.8666413323812825e-06, + "loss": 0.78631407, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.84375, + "step": 1946, + "time_per_iteration": 2.4419751167297363 + }, + { + "auxiliary_loss_clip": 0.01116491, + "auxiliary_loss_mlp": 0.01048205, + "balance_loss_clip": 1.02660465, + "balance_loss_mlp": 1.03320909, + "epoch": 0.1170599729445363, + "flos": 15266959234560.0, + "grad_norm": 1.852763228158811, + "language_loss": 0.68523192, + "learning_rate": 3.8665056442660055e-06, + "loss": 0.7068789, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.8359375, + "step": 1947, + "time_per_iteration": 2.3548645973205566 + }, + { + "auxiliary_loss_clip": 0.01123669, + "auxiliary_loss_mlp": 0.01047806, + "balance_loss_clip": 1.02370167, + "balance_loss_mlp": 1.03648901, + "epoch": 0.11712009619720427, + "flos": 17163511176960.0, + "grad_norm": 2.2191004921610955, + "language_loss": 0.84888136, + "learning_rate": 3.866369889539942e-06, + "loss": 0.87059611, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.87109375, + "step": 1948, + "time_per_iteration": 2.3758552074432373 + }, + { + "auxiliary_loss_clip": 0.01033309, + "auxiliary_loss_mlp": 0.0101984, + "balance_loss_clip": 1.01684785, + "balance_loss_mlp": 1.0073961, + "epoch": 0.11718021944987224, + "flos": 70937644515840.0, + "grad_norm": 0.8216537331260262, + "language_loss": 0.65126908, + "learning_rate": 3.86623406820794e-06, + "loss": 0.67180055, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.02990723, + "router_z_loss_mlp": 0.25976562, + "step": 1949, + "time_per_iteration": 3.048769235610962 + }, + { + "auxiliary_loss_clip": 0.01115862, + "auxiliary_loss_mlp": 0.01048048, + "balance_loss_clip": 1.02580369, + "balance_loss_mlp": 1.03134167, + "epoch": 0.1172403427025402, + "flos": 27452240691840.0, + "grad_norm": 1.6808494109986374, + "language_loss": 0.71865463, + "learning_rate": 3.8660981802748434e-06, + "loss": 0.74029374, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.84375, + "step": 1950, + "time_per_iteration": 2.432399034500122 + }, + { + "auxiliary_loss_clip": 0.01123141, + "auxiliary_loss_mlp": 0.01046762, + "balance_loss_clip": 1.02400458, + "balance_loss_mlp": 1.03402305, + "epoch": 0.11730046595520818, + "flos": 15667831998720.0, + "grad_norm": 2.7210292700723393, + "language_loss": 0.76711386, + "learning_rate": 3.865962225745504e-06, + "loss": 0.78881288, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.890625, + "step": 1951, + "time_per_iteration": 3.768670082092285 + }, + { + "auxiliary_loss_clip": 0.01120962, + "auxiliary_loss_mlp": 0.0104838, + "balance_loss_clip": 1.02534842, + "balance_loss_mlp": 1.03474116, + "epoch": 0.11736058920787615, + "flos": 25628971426560.0, + "grad_norm": 1.7319526892347994, + "language_loss": 0.7685138, + "learning_rate": 3.865826204624771e-06, + "loss": 0.79020721, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.86328125, + "step": 1952, + "time_per_iteration": 2.421802520751953 + }, + { + "auxiliary_loss_clip": 0.01118202, + "auxiliary_loss_mlp": 0.01047002, + "balance_loss_clip": 1.02407813, + "balance_loss_mlp": 1.03074563, + "epoch": 0.11742071246054411, + "flos": 21433214298240.0, + "grad_norm": 1.7038080722742601, + "language_loss": 0.71910661, + "learning_rate": 3.865690116917501e-06, + "loss": 0.74075866, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.875, + "step": 1953, + "time_per_iteration": 3.782447338104248 + }, + { + "auxiliary_loss_clip": 0.01124642, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.01981831, + "balance_loss_mlp": 1.03440595, + "epoch": 0.11748083571321208, + "flos": 15996923274240.0, + "grad_norm": 2.5555025588281386, + "language_loss": 0.79637015, + "learning_rate": 3.8655539626285505e-06, + "loss": 0.81804705, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.90234375, + "step": 1954, + "time_per_iteration": 3.778489351272583 + }, + { + "auxiliary_loss_clip": 0.01119053, + "auxiliary_loss_mlp": 0.01050862, + "balance_loss_clip": 1.02672267, + "balance_loss_mlp": 1.03155947, + "epoch": 0.11754095896588006, + "flos": 16179134993280.0, + "grad_norm": 1.9247334416091033, + "language_loss": 0.85399234, + "learning_rate": 3.865417741762777e-06, + "loss": 0.87569153, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.875, + "step": 1955, + "time_per_iteration": 3.747711420059204 + }, + { + "auxiliary_loss_clip": 0.01121077, + "auxiliary_loss_mlp": 0.01046602, + "balance_loss_clip": 1.02414286, + "balance_loss_mlp": 1.03478241, + "epoch": 0.11760108221854802, + "flos": 13260745111680.0, + "grad_norm": 2.2783008108075076, + "language_loss": 0.77872068, + "learning_rate": 3.865281454325043e-06, + "loss": 0.80039746, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.86328125, + "step": 1956, + "time_per_iteration": 2.368340492248535 + }, + { + "auxiliary_loss_clip": 0.01116919, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.02103484, + "balance_loss_mlp": 1.03262877, + "epoch": 0.11766120547121599, + "flos": 24497296750080.0, + "grad_norm": 1.8959509243281567, + "language_loss": 0.80642533, + "learning_rate": 3.865145100320212e-06, + "loss": 0.82803947, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.84375, + "step": 1957, + "time_per_iteration": 2.4359662532806396 + }, + { + "auxiliary_loss_clip": 0.01122664, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.01593804, + "balance_loss_mlp": 1.03471184, + "epoch": 0.11772132872388397, + "flos": 17783079897600.0, + "grad_norm": 3.407091579068367, + "language_loss": 0.77597332, + "learning_rate": 3.86500867975315e-06, + "loss": 0.79759777, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.87890625, + "step": 1958, + "time_per_iteration": 2.3869376182556152 + }, + { + "auxiliary_loss_clip": 0.01118996, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.01607013, + "balance_loss_mlp": 1.03330898, + "epoch": 0.11778145197655193, + "flos": 13216405818240.0, + "grad_norm": 2.2072269949487886, + "language_loss": 0.7668767, + "learning_rate": 3.864872192628725e-06, + "loss": 0.78847003, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.85546875, + "step": 1959, + "time_per_iteration": 2.3841238021850586 + }, + { + "auxiliary_loss_clip": 0.01122492, + "auxiliary_loss_mlp": 0.01039689, + "balance_loss_clip": 1.01795745, + "balance_loss_mlp": 1.03416228, + "epoch": 0.1178415752292199, + "flos": 20229164640000.0, + "grad_norm": 1.892203115995961, + "language_loss": 0.69768929, + "learning_rate": 3.864735638951809e-06, + "loss": 0.71931112, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.8828125, + "step": 1960, + "time_per_iteration": 2.4021859169006348 + }, + { + "auxiliary_loss_clip": 0.01123859, + "auxiliary_loss_mlp": 0.01043981, + "balance_loss_clip": 1.02006721, + "balance_loss_mlp": 1.03432, + "epoch": 0.11790169848188788, + "flos": 13039360980480.0, + "grad_norm": 2.654020946733496, + "language_loss": 0.81240052, + "learning_rate": 3.864599018727275e-06, + "loss": 0.83407891, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.8984375, + "step": 1961, + "time_per_iteration": 2.367949962615967 + }, + { + "auxiliary_loss_clip": 0.0111437, + "auxiliary_loss_mlp": 0.01047871, + "balance_loss_clip": 1.02488744, + "balance_loss_mlp": 1.03202939, + "epoch": 0.11796182173455584, + "flos": 22264845816960.0, + "grad_norm": 2.19753021704276, + "language_loss": 0.92440534, + "learning_rate": 3.864462331959998e-06, + "loss": 0.94602782, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.82421875, + "step": 1962, + "time_per_iteration": 2.4064676761627197 + }, + { + "auxiliary_loss_clip": 0.01122337, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_clip": 1.02627194, + "balance_loss_mlp": 1.03362584, + "epoch": 0.1180219449872238, + "flos": 10634229129600.0, + "grad_norm": 2.1702189567270125, + "language_loss": 0.871997, + "learning_rate": 3.864325578654856e-06, + "loss": 0.89371091, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.88671875, + "step": 1963, + "time_per_iteration": 2.4086060523986816 + }, + { + "auxiliary_loss_clip": 0.01118009, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.02043271, + "balance_loss_mlp": 1.03014529, + "epoch": 0.11808206823989177, + "flos": 20922469885440.0, + "grad_norm": 2.0896722423300678, + "language_loss": 0.83948267, + "learning_rate": 3.864188758816731e-06, + "loss": 0.8610937, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.87890625, + "step": 1964, + "time_per_iteration": 2.3962087631225586 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01046546, + "balance_loss_clip": 1.02208459, + "balance_loss_mlp": 1.03625286, + "epoch": 0.11814219149255975, + "flos": 20776707492480.0, + "grad_norm": 2.1223552877057097, + "language_loss": 0.82847214, + "learning_rate": 3.864051872450504e-06, + "loss": 0.85017812, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.875, + "step": 1965, + "time_per_iteration": 2.375262975692749 + }, + { + "auxiliary_loss_clip": 0.01120428, + "auxiliary_loss_mlp": 0.01043547, + "balance_loss_clip": 1.01982474, + "balance_loss_mlp": 1.03300381, + "epoch": 0.11820231474522772, + "flos": 48758162572800.0, + "grad_norm": 1.6467709144383305, + "language_loss": 0.74588215, + "learning_rate": 3.863914919561059e-06, + "loss": 0.76752186, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.875, + "step": 1966, + "time_per_iteration": 2.6221964359283447 + }, + { + "auxiliary_loss_clip": 0.01129223, + "auxiliary_loss_mlp": 0.01048573, + "balance_loss_clip": 1.02477837, + "balance_loss_mlp": 1.03779209, + "epoch": 0.11826243799789568, + "flos": 16689669937920.0, + "grad_norm": 2.8886502735577246, + "language_loss": 0.72988284, + "learning_rate": 3.863777900153287e-06, + "loss": 0.75166082, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.9140625, + "step": 1967, + "time_per_iteration": 2.3466272354125977 + }, + { + "auxiliary_loss_clip": 0.01121168, + "auxiliary_loss_mlp": 0.01043189, + "balance_loss_clip": 1.01883435, + "balance_loss_mlp": 1.03336382, + "epoch": 0.11832256125056366, + "flos": 16908924476160.0, + "grad_norm": 2.087425676331709, + "language_loss": 0.88269222, + "learning_rate": 3.863640814232076e-06, + "loss": 0.90433586, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.875, + "step": 1968, + "time_per_iteration": 2.3878138065338135 + }, + { + "auxiliary_loss_clip": 0.01120177, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.02167749, + "balance_loss_mlp": 1.03433907, + "epoch": 0.11838268450323162, + "flos": 22819301118720.0, + "grad_norm": 2.373016368325097, + "language_loss": 0.67450416, + "learning_rate": 3.863503661802317e-06, + "loss": 0.6961571, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.859375, + "step": 1969, + "time_per_iteration": 2.3945834636688232 + }, + { + "auxiliary_loss_clip": 0.01122843, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_clip": 1.02043056, + "balance_loss_mlp": 1.0355742, + "epoch": 0.11844280775589959, + "flos": 33544479939840.0, + "grad_norm": 2.5095524727065324, + "language_loss": 0.80832243, + "learning_rate": 3.863366442868906e-06, + "loss": 0.82999396, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.875, + "step": 1970, + "time_per_iteration": 2.4908838272094727 + }, + { + "auxiliary_loss_clip": 0.01033128, + "auxiliary_loss_mlp": 0.01007311, + "balance_loss_clip": 1.00461662, + "balance_loss_mlp": 1.0066843, + "epoch": 0.11850293100856757, + "flos": 66347577959040.0, + "grad_norm": 0.8002912839407599, + "language_loss": 0.66149813, + "learning_rate": 3.863229157436741e-06, + "loss": 0.68190253, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.02697754, + "router_z_loss_mlp": 0.26367188, + "step": 1971, + "time_per_iteration": 3.002826452255249 + }, + { + "auxiliary_loss_clip": 0.01120792, + "auxiliary_loss_mlp": 0.01038308, + "balance_loss_clip": 1.01590836, + "balance_loss_mlp": 1.03313684, + "epoch": 0.11856305426123553, + "flos": 24679892494080.0, + "grad_norm": 2.2289507829910598, + "language_loss": 0.7991339, + "learning_rate": 3.863091805510718e-06, + "loss": 0.82072496, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.875, + "step": 1972, + "time_per_iteration": 2.423044204711914 + }, + { + "auxiliary_loss_clip": 0.01118167, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.02244556, + "balance_loss_mlp": 1.03223205, + "epoch": 0.1186231775139035, + "flos": 24278949907200.0, + "grad_norm": 2.1791190440773556, + "language_loss": 0.72848439, + "learning_rate": 3.862954387095743e-06, + "loss": 0.75012422, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.859375, + "step": 1973, + "time_per_iteration": 2.4122936725616455 + }, + { + "auxiliary_loss_clip": 0.01117423, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.01789987, + "balance_loss_mlp": 1.03277802, + "epoch": 0.11868330076657148, + "flos": 21756475376640.0, + "grad_norm": 1.721549687915635, + "language_loss": 0.71981263, + "learning_rate": 3.862816902196717e-06, + "loss": 0.74139225, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.84375, + "step": 1974, + "time_per_iteration": 2.415842056274414 + }, + { + "auxiliary_loss_clip": 0.01120913, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_clip": 1.02083254, + "balance_loss_mlp": 1.03460026, + "epoch": 0.11874342401923944, + "flos": 17192559294720.0, + "grad_norm": 2.1320444490964077, + "language_loss": 0.78171802, + "learning_rate": 3.862679350818547e-06, + "loss": 0.80336952, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.86328125, + "step": 1975, + "time_per_iteration": 2.3707268238067627 + }, + { + "auxiliary_loss_clip": 0.01121493, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.02053881, + "balance_loss_mlp": 1.03448355, + "epoch": 0.11880354727190741, + "flos": 15228729429120.0, + "grad_norm": 5.821912162635384, + "language_loss": 0.75312293, + "learning_rate": 3.862541732966144e-06, + "loss": 0.77475655, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.8671875, + "step": 1976, + "time_per_iteration": 2.361881971359253 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.02064562, + "balance_loss_mlp": 1.03225017, + "epoch": 0.11886367052457537, + "flos": 27308433335040.0, + "grad_norm": 3.8382577631191896, + "language_loss": 0.75069487, + "learning_rate": 3.862404048644416e-06, + "loss": 0.77230787, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8515625, + "step": 1977, + "time_per_iteration": 2.4624083042144775 + }, + { + "auxiliary_loss_clip": 0.01122901, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_clip": 1.02482712, + "balance_loss_mlp": 1.03556919, + "epoch": 0.11892379377724335, + "flos": 21797218800000.0, + "grad_norm": 2.156976507215717, + "language_loss": 0.7394048, + "learning_rate": 3.862266297858279e-06, + "loss": 0.76112187, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.87109375, + "step": 1978, + "time_per_iteration": 2.385887622833252 + }, + { + "auxiliary_loss_clip": 0.011179, + "auxiliary_loss_mlp": 0.01046777, + "balance_loss_clip": 1.02485406, + "balance_loss_mlp": 1.03311896, + "epoch": 0.11898391702991132, + "flos": 13990150569600.0, + "grad_norm": 1.908963691018837, + "language_loss": 0.73343402, + "learning_rate": 3.862128480612648e-06, + "loss": 0.75508082, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.84765625, + "step": 1979, + "time_per_iteration": 2.3576278686523438 + }, + { + "auxiliary_loss_clip": 0.01120054, + "auxiliary_loss_mlp": 0.01044867, + "balance_loss_clip": 1.02189517, + "balance_loss_mlp": 1.03369176, + "epoch": 0.11904404028257928, + "flos": 32233176984960.0, + "grad_norm": 1.6278011430777886, + "language_loss": 0.9859215, + "learning_rate": 3.8619905969124415e-06, + "loss": 1.00757062, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.86328125, + "step": 1980, + "time_per_iteration": 2.4863715171813965 + }, + { + "auxiliary_loss_clip": 0.01123417, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.02595758, + "balance_loss_mlp": 1.03414297, + "epoch": 0.11910416353524726, + "flos": 23585155902720.0, + "grad_norm": 1.7044605200764433, + "language_loss": 0.8611837, + "learning_rate": 3.86185264676258e-06, + "loss": 0.88291258, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.890625, + "step": 1981, + "time_per_iteration": 2.4147064685821533 + }, + { + "auxiliary_loss_clip": 0.01123441, + "auxiliary_loss_mlp": 0.01047793, + "balance_loss_clip": 1.02433228, + "balance_loss_mlp": 1.03463411, + "epoch": 0.11916428678791523, + "flos": 25332000468480.0, + "grad_norm": 1.8984009700727715, + "language_loss": 0.85393345, + "learning_rate": 3.861714630167987e-06, + "loss": 0.87564576, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.88671875, + "step": 1982, + "time_per_iteration": 2.4262924194335938 + }, + { + "auxiliary_loss_clip": 0.01118164, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.01779306, + "balance_loss_mlp": 1.03233421, + "epoch": 0.11922441004058319, + "flos": 19787513541120.0, + "grad_norm": 2.4797697769246785, + "language_loss": 0.85202748, + "learning_rate": 3.8615765471335874e-06, + "loss": 0.87362069, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.859375, + "step": 1983, + "time_per_iteration": 2.377220392227173 + }, + { + "auxiliary_loss_clip": 0.01124226, + "auxiliary_loss_mlp": 0.01050751, + "balance_loss_clip": 1.02544284, + "balance_loss_mlp": 1.03365731, + "epoch": 0.11928453329325117, + "flos": 21535475270400.0, + "grad_norm": 3.3113596146379733, + "language_loss": 0.77033579, + "learning_rate": 3.8614383976643096e-06, + "loss": 0.79208553, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.90625, + "step": 1984, + "time_per_iteration": 2.3807835578918457 + }, + { + "auxiliary_loss_clip": 0.0111945, + "auxiliary_loss_mlp": 0.01055403, + "balance_loss_clip": 1.03197885, + "balance_loss_mlp": 1.03279757, + "epoch": 0.11934465654591914, + "flos": 20813924868480.0, + "grad_norm": 1.8278780443494753, + "language_loss": 0.83421803, + "learning_rate": 3.861300181765084e-06, + "loss": 0.85596657, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8671875, + "step": 1985, + "time_per_iteration": 2.380126714706421 + }, + { + "auxiliary_loss_clip": 0.01116393, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02002048, + "balance_loss_mlp": 1.03147042, + "epoch": 0.1194047797985871, + "flos": 19059539448960.0, + "grad_norm": 2.061105441215783, + "language_loss": 0.73861659, + "learning_rate": 3.861161899440843e-06, + "loss": 0.76020807, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.8515625, + "step": 1986, + "time_per_iteration": 2.3536202907562256 + }, + { + "auxiliary_loss_clip": 0.01121507, + "auxiliary_loss_mlp": 0.01049547, + "balance_loss_clip": 1.02557421, + "balance_loss_mlp": 1.03355265, + "epoch": 0.11946490305125507, + "flos": 27189798935040.0, + "grad_norm": 1.9365010102679958, + "language_loss": 0.720505, + "learning_rate": 3.86102355069652e-06, + "loss": 0.74221563, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.87890625, + "step": 1987, + "time_per_iteration": 2.439728021621704 + }, + { + "auxiliary_loss_clip": 0.0112178, + "auxiliary_loss_mlp": 0.0104697, + "balance_loss_clip": 1.02298498, + "balance_loss_mlp": 1.03436017, + "epoch": 0.11952502630392305, + "flos": 21139769387520.0, + "grad_norm": 2.47030658028113, + "language_loss": 0.71941423, + "learning_rate": 3.860885135537054e-06, + "loss": 0.74110174, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.875, + "step": 1988, + "time_per_iteration": 2.406344413757324 + }, + { + "auxiliary_loss_clip": 0.01119401, + "auxiliary_loss_mlp": 0.01054096, + "balance_loss_clip": 1.02750063, + "balance_loss_mlp": 1.03266072, + "epoch": 0.11958514955659101, + "flos": 22123237875840.0, + "grad_norm": 1.9140912637348366, + "language_loss": 0.80716503, + "learning_rate": 3.860746653967384e-06, + "loss": 0.82889998, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.8671875, + "step": 1989, + "time_per_iteration": 2.383209705352783 + }, + { + "auxiliary_loss_clip": 0.01124657, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_clip": 1.02223611, + "balance_loss_mlp": 1.03449106, + "epoch": 0.11964527280925898, + "flos": 17420471850240.0, + "grad_norm": 2.7618199204925213, + "language_loss": 0.75409639, + "learning_rate": 3.860608105992454e-06, + "loss": 0.77581948, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.90234375, + "step": 1990, + "time_per_iteration": 3.7754063606262207 + }, + { + "auxiliary_loss_clip": 0.01033002, + "auxiliary_loss_mlp": 0.01003007, + "balance_loss_clip": 0.99988371, + "balance_loss_mlp": 1.00634062, + "epoch": 0.11970539606192696, + "flos": 70676564302080.0, + "grad_norm": 0.846775636511847, + "language_loss": 0.55253577, + "learning_rate": 3.860469491617206e-06, + "loss": 0.57289588, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.03125, + "router_z_loss_mlp": 0.265625, + "step": 1991, + "time_per_iteration": 3.015073537826538 + }, + { + "auxiliary_loss_clip": 0.01118721, + "auxiliary_loss_mlp": 0.01043284, + "balance_loss_clip": 1.02001476, + "balance_loss_mlp": 1.03399217, + "epoch": 0.11976551931459492, + "flos": 21213959760000.0, + "grad_norm": 6.539764829323169, + "language_loss": 0.78014505, + "learning_rate": 3.8603308108465864e-06, + "loss": 0.80176508, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.84765625, + "step": 1992, + "time_per_iteration": 3.727733612060547 + }, + { + "auxiliary_loss_clip": 0.01123085, + "auxiliary_loss_mlp": 0.01045995, + "balance_loss_clip": 1.02146149, + "balance_loss_mlp": 1.03341937, + "epoch": 0.11982564256726289, + "flos": 25988262894720.0, + "grad_norm": 1.732685606188902, + "language_loss": 0.79110837, + "learning_rate": 3.8601920636855466e-06, + "loss": 0.8127991, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.8984375, + "step": 1993, + "time_per_iteration": 3.8382067680358887 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_clip": 1.02362347, + "balance_loss_mlp": 1.03113675, + "epoch": 0.11988576581993086, + "flos": 21649850484480.0, + "grad_norm": 1.8914864370530282, + "language_loss": 0.82625687, + "learning_rate": 3.860053250139036e-06, + "loss": 0.84790105, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.859375, + "step": 1994, + "time_per_iteration": 2.388251781463623 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01044532, + "balance_loss_clip": 1.02286029, + "balance_loss_mlp": 1.03431726, + "epoch": 0.11994588907259883, + "flos": 17856467308800.0, + "grad_norm": 2.0853703367348304, + "language_loss": 0.88376117, + "learning_rate": 3.859914370212011e-06, + "loss": 0.90540171, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.8515625, + "step": 1995, + "time_per_iteration": 3.7453725337982178 + }, + { + "auxiliary_loss_clip": 0.01120729, + "auxiliary_loss_mlp": 0.01053838, + "balance_loss_clip": 1.02916145, + "balance_loss_mlp": 1.03449523, + "epoch": 0.1200060123252668, + "flos": 24461580562560.0, + "grad_norm": 1.9456351794724802, + "language_loss": 0.7399205, + "learning_rate": 3.859775423909426e-06, + "loss": 0.76166618, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.86328125, + "step": 1996, + "time_per_iteration": 2.4598515033721924 + }, + { + "auxiliary_loss_clip": 0.01118924, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_clip": 1.01909614, + "balance_loss_mlp": 1.03307962, + "epoch": 0.12006613557793476, + "flos": 18731251134720.0, + "grad_norm": 2.0778181248459413, + "language_loss": 0.87980461, + "learning_rate": 3.8596364112362395e-06, + "loss": 0.90143406, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.859375, + "step": 1997, + "time_per_iteration": 2.352095127105713 + }, + { + "auxiliary_loss_clip": 0.01118619, + "auxiliary_loss_mlp": 0.01053211, + "balance_loss_clip": 1.02774751, + "balance_loss_mlp": 1.03132403, + "epoch": 0.12012625883060274, + "flos": 22266800853120.0, + "grad_norm": 2.0035983354663993, + "language_loss": 0.78402185, + "learning_rate": 3.859497332197413e-06, + "loss": 0.80574012, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.875, + "step": 1998, + "time_per_iteration": 2.411452054977417 + }, + { + "auxiliary_loss_clip": 0.01122502, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.02150226, + "balance_loss_mlp": 1.03462815, + "epoch": 0.1201863820832707, + "flos": 21757906742400.0, + "grad_norm": 1.6482861458018172, + "language_loss": 0.73282808, + "learning_rate": 3.8593581867979105e-06, + "loss": 0.75450969, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.87890625, + "step": 1999, + "time_per_iteration": 2.4018945693969727 + }, + { + "auxiliary_loss_clip": 0.01121568, + "auxiliary_loss_mlp": 0.01050488, + "balance_loss_clip": 1.02662206, + "balance_loss_mlp": 1.03275871, + "epoch": 0.12024650533593867, + "flos": 21906915891840.0, + "grad_norm": 2.312608842513132, + "language_loss": 0.74748641, + "learning_rate": 3.8592189750426965e-06, + "loss": 0.769207, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.88671875, + "step": 2000, + "time_per_iteration": 2.414414167404175 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.01045413, + "balance_loss_clip": 1.02091551, + "balance_loss_mlp": 1.0318017, + "epoch": 0.12030662858860665, + "flos": 21688150112640.0, + "grad_norm": 1.5722378616385124, + "language_loss": 0.84657854, + "learning_rate": 3.85907969693674e-06, + "loss": 0.86823463, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.8828125, + "step": 2001, + "time_per_iteration": 2.3905575275421143 + }, + { + "auxiliary_loss_clip": 0.01118935, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.01928473, + "balance_loss_mlp": 1.03210068, + "epoch": 0.12036675184127461, + "flos": 12932386974720.0, + "grad_norm": 2.150666824424472, + "language_loss": 0.74219608, + "learning_rate": 3.858940352485011e-06, + "loss": 0.76380777, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.8671875, + "step": 2002, + "time_per_iteration": 2.3468594551086426 + }, + { + "auxiliary_loss_clip": 0.01125384, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_clip": 1.02219284, + "balance_loss_mlp": 1.03465629, + "epoch": 0.12042687509394258, + "flos": 20849955258240.0, + "grad_norm": 2.2416977745271627, + "language_loss": 0.77901542, + "learning_rate": 3.8588009416924835e-06, + "loss": 0.80075049, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.90625, + "step": 2003, + "time_per_iteration": 2.3935604095458984 + }, + { + "auxiliary_loss_clip": 0.01118354, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.02026618, + "balance_loss_mlp": 1.03217614, + "epoch": 0.12048699834661056, + "flos": 23877378915840.0, + "grad_norm": 2.3116049897435924, + "language_loss": 0.72234046, + "learning_rate": 3.858661464564131e-06, + "loss": 0.74397773, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.86328125, + "step": 2004, + "time_per_iteration": 2.4032437801361084 + }, + { + "auxiliary_loss_clip": 0.01127669, + "auxiliary_loss_mlp": 0.01051112, + "balance_loss_clip": 1.02428949, + "balance_loss_mlp": 1.03524661, + "epoch": 0.12054712159927852, + "flos": 19755323400960.0, + "grad_norm": 1.6450597877325683, + "language_loss": 0.78438574, + "learning_rate": 3.858521921104932e-06, + "loss": 0.80617362, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.921875, + "step": 2005, + "time_per_iteration": 2.4171624183654785 + }, + { + "auxiliary_loss_clip": 0.01030416, + "auxiliary_loss_mlp": 0.01006522, + "balance_loss_clip": 1.00354171, + "balance_loss_mlp": 1.00357127, + "epoch": 0.12060724485194649, + "flos": 51670057075200.0, + "grad_norm": 0.9250736729463825, + "language_loss": 0.58070427, + "learning_rate": 3.858382311319866e-06, + "loss": 0.60107362, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.02978516, + "router_z_loss_mlp": 0.26953125, + "step": 2006, + "time_per_iteration": 2.8138062953948975 + }, + { + "auxiliary_loss_clip": 0.01120069, + "auxiliary_loss_mlp": 0.01043962, + "balance_loss_clip": 1.02029848, + "balance_loss_mlp": 1.03448987, + "epoch": 0.12066736810461445, + "flos": 18989398794240.0, + "grad_norm": 1.7877193335870534, + "language_loss": 0.76783776, + "learning_rate": 3.858242635213917e-06, + "loss": 0.78947806, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.85546875, + "step": 2007, + "time_per_iteration": 2.4160773754119873 + }, + { + "auxiliary_loss_clip": 0.01121991, + "auxiliary_loss_mlp": 0.0105572, + "balance_loss_clip": 1.03088856, + "balance_loss_mlp": 1.03364897, + "epoch": 0.12072749135728243, + "flos": 16471043804160.0, + "grad_norm": 3.1240115634081933, + "language_loss": 0.8271625, + "learning_rate": 3.858102892792067e-06, + "loss": 0.84893966, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.8828125, + "step": 2008, + "time_per_iteration": 2.364374876022339 + }, + { + "auxiliary_loss_clip": 0.01119849, + "auxiliary_loss_mlp": 0.01046688, + "balance_loss_clip": 1.02267885, + "balance_loss_mlp": 1.03178072, + "epoch": 0.1207876146099504, + "flos": 18076140783360.0, + "grad_norm": 2.175275464065516, + "language_loss": 0.83321232, + "learning_rate": 3.857963084059304e-06, + "loss": 0.85487771, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8828125, + "step": 2009, + "time_per_iteration": 2.3697962760925293 + }, + { + "auxiliary_loss_clip": 0.01123511, + "auxiliary_loss_mlp": 0.01055106, + "balance_loss_clip": 1.02812898, + "balance_loss_mlp": 1.03362918, + "epoch": 0.12084773786261836, + "flos": 21870501477120.0, + "grad_norm": 1.731425056093623, + "language_loss": 0.84200156, + "learning_rate": 3.857823209020619e-06, + "loss": 0.86378777, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.8984375, + "step": 2010, + "time_per_iteration": 2.382655382156372 + }, + { + "auxiliary_loss_clip": 0.01125417, + "auxiliary_loss_mlp": 0.01058759, + "balance_loss_clip": 1.03335524, + "balance_loss_mlp": 1.03640819, + "epoch": 0.12090786111528634, + "flos": 18332054115840.0, + "grad_norm": 1.7003581849257905, + "language_loss": 0.84254408, + "learning_rate": 3.857683267681002e-06, + "loss": 0.86438584, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.890625, + "step": 2011, + "time_per_iteration": 2.4131879806518555 + }, + { + "auxiliary_loss_clip": 0.01124281, + "auxiliary_loss_mlp": 0.01052195, + "balance_loss_clip": 1.02688694, + "balance_loss_mlp": 1.03430021, + "epoch": 0.1209679843679543, + "flos": 21104786338560.0, + "grad_norm": 1.8372059440576718, + "language_loss": 0.95579314, + "learning_rate": 3.857543260045448e-06, + "loss": 0.9775579, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.8984375, + "step": 2012, + "time_per_iteration": 2.3958005905151367 + }, + { + "auxiliary_loss_clip": 0.01119601, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.01671648, + "balance_loss_mlp": 1.03354287, + "epoch": 0.12102810762062227, + "flos": 29239793769600.0, + "grad_norm": 2.677766300536327, + "language_loss": 0.80141032, + "learning_rate": 3.857403186118952e-06, + "loss": 0.82302499, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.859375, + "step": 2013, + "time_per_iteration": 2.4480528831481934 + }, + { + "auxiliary_loss_clip": 0.01123554, + "auxiliary_loss_mlp": 0.01051356, + "balance_loss_clip": 1.02354503, + "balance_loss_mlp": 1.03343034, + "epoch": 0.12108823087329025, + "flos": 17929749985920.0, + "grad_norm": 2.5308944401753597, + "language_loss": 0.77227497, + "learning_rate": 3.857263045906516e-06, + "loss": 0.79402405, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.8984375, + "step": 2014, + "time_per_iteration": 2.358426809310913 + }, + { + "auxiliary_loss_clip": 0.01121764, + "auxiliary_loss_mlp": 0.010419, + "balance_loss_clip": 1.01609063, + "balance_loss_mlp": 1.03438997, + "epoch": 0.12114835412595822, + "flos": 22090733533440.0, + "grad_norm": 3.5941496769831156, + "language_loss": 0.86573106, + "learning_rate": 3.857122839413138e-06, + "loss": 0.88736767, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.875, + "step": 2015, + "time_per_iteration": 2.4904682636260986 + }, + { + "auxiliary_loss_clip": 0.01116226, + "auxiliary_loss_mlp": 0.0105065, + "balance_loss_clip": 1.02637863, + "balance_loss_mlp": 1.03096962, + "epoch": 0.12120847737862618, + "flos": 20411306536320.0, + "grad_norm": 2.4791579545711127, + "language_loss": 0.68878114, + "learning_rate": 3.856982566643824e-06, + "loss": 0.71044993, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.8515625, + "step": 2016, + "time_per_iteration": 2.3910837173461914 + }, + { + "auxiliary_loss_clip": 0.01125592, + "auxiliary_loss_mlp": 0.0104802, + "balance_loss_clip": 1.02233076, + "balance_loss_mlp": 1.03680539, + "epoch": 0.12126860063129415, + "flos": 22307963212800.0, + "grad_norm": 5.124671887293178, + "language_loss": 0.80184972, + "learning_rate": 3.856842227603578e-06, + "loss": 0.82358587, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.88671875, + "step": 2017, + "time_per_iteration": 2.395059585571289 + }, + { + "auxiliary_loss_clip": 0.01122697, + "auxiliary_loss_mlp": 0.01047158, + "balance_loss_clip": 1.01989436, + "balance_loss_mlp": 1.03370953, + "epoch": 0.12132872388396213, + "flos": 13698416315520.0, + "grad_norm": 2.248275907000715, + "language_loss": 0.68856907, + "learning_rate": 3.856701822297409e-06, + "loss": 0.71026766, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.890625, + "step": 2018, + "time_per_iteration": 2.3690297603607178 + }, + { + "auxiliary_loss_clip": 0.01126354, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_clip": 1.02452278, + "balance_loss_mlp": 1.03770387, + "epoch": 0.12138884713663009, + "flos": 26465804737920.0, + "grad_norm": 1.794465734915278, + "language_loss": 0.72320479, + "learning_rate": 3.856561350730329e-06, + "loss": 0.74495912, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.88671875, + "step": 2019, + "time_per_iteration": 2.4442644119262695 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01058645, + "balance_loss_clip": 1.03263319, + "balance_loss_mlp": 1.03215969, + "epoch": 0.12144897038929806, + "flos": 26140379155200.0, + "grad_norm": 2.9071528755660077, + "language_loss": 0.92150027, + "learning_rate": 3.856420812907349e-06, + "loss": 0.94330251, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.89453125, + "step": 2020, + "time_per_iteration": 2.4346835613250732 + }, + { + "auxiliary_loss_clip": 0.01122444, + "auxiliary_loss_mlp": 0.01047896, + "balance_loss_clip": 1.02292204, + "balance_loss_mlp": 1.03476238, + "epoch": 0.12150909364196603, + "flos": 24716376731520.0, + "grad_norm": 2.000040683910714, + "language_loss": 0.7741518, + "learning_rate": 3.856280208833486e-06, + "loss": 0.79585522, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.875, + "step": 2021, + "time_per_iteration": 2.417357921600342 + }, + { + "auxiliary_loss_clip": 0.01120585, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.02217555, + "balance_loss_mlp": 1.03421116, + "epoch": 0.121569216894634, + "flos": 25185958784640.0, + "grad_norm": 2.02603328388594, + "language_loss": 0.80683607, + "learning_rate": 3.856139538513758e-06, + "loss": 0.82850218, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.86328125, + "step": 2022, + "time_per_iteration": 2.413191080093384 + }, + { + "auxiliary_loss_clip": 0.01124483, + "auxiliary_loss_mlp": 0.01053167, + "balance_loss_clip": 1.02814507, + "balance_loss_mlp": 1.03584802, + "epoch": 0.12162934014730196, + "flos": 13443236121600.0, + "grad_norm": 1.7818911963633228, + "language_loss": 0.85147119, + "learning_rate": 3.855998801953183e-06, + "loss": 0.87324774, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.88671875, + "step": 2023, + "time_per_iteration": 2.382678985595703 + }, + { + "auxiliary_loss_clip": 0.01120644, + "auxiliary_loss_mlp": 0.01050331, + "balance_loss_clip": 1.02470064, + "balance_loss_mlp": 1.03305507, + "epoch": 0.12168946339996994, + "flos": 16945199245440.0, + "grad_norm": 2.298953223732629, + "language_loss": 0.85245049, + "learning_rate": 3.855857999156786e-06, + "loss": 0.87416029, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.875, + "step": 2024, + "time_per_iteration": 2.3559911251068115 + }, + { + "auxiliary_loss_clip": 0.01119247, + "auxiliary_loss_mlp": 0.01047792, + "balance_loss_clip": 1.02230477, + "balance_loss_mlp": 1.03056741, + "epoch": 0.12174958665263791, + "flos": 29820399546240.0, + "grad_norm": 2.6608086756553595, + "language_loss": 0.71909428, + "learning_rate": 3.85571713012959e-06, + "loss": 0.74076468, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.890625, + "step": 2025, + "time_per_iteration": 2.4525861740112305 + }, + { + "auxiliary_loss_clip": 0.01122017, + "auxiliary_loss_mlp": 0.01044699, + "balance_loss_clip": 1.02079761, + "balance_loss_mlp": 1.03384447, + "epoch": 0.12180970990530587, + "flos": 24640824816000.0, + "grad_norm": 1.9378737122767655, + "language_loss": 0.76372939, + "learning_rate": 3.855576194876624e-06, + "loss": 0.78539658, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.8828125, + "step": 2026, + "time_per_iteration": 2.443085193634033 + }, + { + "auxiliary_loss_clip": 0.01121749, + "auxiliary_loss_mlp": 0.01048304, + "balance_loss_clip": 1.02411687, + "balance_loss_mlp": 1.03315139, + "epoch": 0.12186983315797385, + "flos": 20520654514560.0, + "grad_norm": 2.396102644901252, + "language_loss": 0.88871133, + "learning_rate": 3.855435193402916e-06, + "loss": 0.91041183, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.88671875, + "step": 2027, + "time_per_iteration": 2.375792980194092 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.0104507, + "balance_loss_clip": 1.02220595, + "balance_loss_mlp": 1.03226328, + "epoch": 0.12192995641064182, + "flos": 27817117977600.0, + "grad_norm": 1.5858442001493853, + "language_loss": 0.7563501, + "learning_rate": 3.8552941257135e-06, + "loss": 0.77799356, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8671875, + "step": 2028, + "time_per_iteration": 2.433760404586792 + }, + { + "auxiliary_loss_clip": 0.0111943, + "auxiliary_loss_mlp": 0.01051931, + "balance_loss_clip": 1.02595568, + "balance_loss_mlp": 1.03203964, + "epoch": 0.12199007966330978, + "flos": 22016054401920.0, + "grad_norm": 2.2405902070927253, + "language_loss": 0.7657541, + "learning_rate": 3.855152991813408e-06, + "loss": 0.78746778, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.875, + "step": 2029, + "time_per_iteration": 3.77600359916687 + }, + { + "auxiliary_loss_clip": 0.01118043, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.02150249, + "balance_loss_mlp": 1.03121793, + "epoch": 0.12205020291597775, + "flos": 23294084964480.0, + "grad_norm": 3.673993069551172, + "language_loss": 0.7888177, + "learning_rate": 3.855011791707678e-06, + "loss": 0.81045127, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.8671875, + "step": 2030, + "time_per_iteration": 2.3976783752441406 + }, + { + "auxiliary_loss_clip": 0.01117256, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.0247736, + "balance_loss_mlp": 1.03186178, + "epoch": 0.12211032616864573, + "flos": 26030402772480.0, + "grad_norm": 2.072864758974824, + "language_loss": 0.73834264, + "learning_rate": 3.854870525401349e-06, + "loss": 0.76001072, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.8515625, + "step": 2031, + "time_per_iteration": 2.418802261352539 + }, + { + "auxiliary_loss_clip": 0.01118716, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_clip": 1.02255392, + "balance_loss_mlp": 1.03258789, + "epoch": 0.12217044942131369, + "flos": 20409944993280.0, + "grad_norm": 4.745944251459415, + "language_loss": 0.74995601, + "learning_rate": 3.8547291928994615e-06, + "loss": 0.77161002, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.859375, + "step": 2032, + "time_per_iteration": 5.171973705291748 + }, + { + "auxiliary_loss_clip": 0.01111557, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.01628351, + "balance_loss_mlp": 1.02953041, + "epoch": 0.12223057267398166, + "flos": 22856029735680.0, + "grad_norm": 1.6312332353564754, + "language_loss": 0.89163828, + "learning_rate": 3.8545877942070605e-06, + "loss": 0.91314459, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8203125, + "step": 2033, + "time_per_iteration": 2.4246609210968018 + }, + { + "auxiliary_loss_clip": 0.01124556, + "auxiliary_loss_mlp": 0.0104608, + "balance_loss_clip": 1.02266693, + "balance_loss_mlp": 1.03622246, + "epoch": 0.12229069592664964, + "flos": 20046533984640.0, + "grad_norm": 1.9397284321498525, + "language_loss": 0.65490395, + "learning_rate": 3.8544463293291914e-06, + "loss": 0.67661023, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8828125, + "step": 2034, + "time_per_iteration": 3.7740495204925537 + }, + { + "auxiliary_loss_clip": 0.011211, + "auxiliary_loss_mlp": 0.01050813, + "balance_loss_clip": 1.02624369, + "balance_loss_mlp": 1.03414273, + "epoch": 0.1223508191793176, + "flos": 22273119809280.0, + "grad_norm": 2.121930359580294, + "language_loss": 0.76366186, + "learning_rate": 3.8543047982709035e-06, + "loss": 0.78538096, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.8671875, + "step": 2035, + "time_per_iteration": 2.409691333770752 + }, + { + "auxiliary_loss_clip": 0.01122696, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.02359462, + "balance_loss_mlp": 1.03383136, + "epoch": 0.12241094243198557, + "flos": 21284973198720.0, + "grad_norm": 1.832738408115759, + "language_loss": 0.71510398, + "learning_rate": 3.854163201037247e-06, + "loss": 0.73681188, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.890625, + "step": 2036, + "time_per_iteration": 2.399597406387329 + }, + { + "auxiliary_loss_clip": 0.01120529, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.02780282, + "balance_loss_mlp": 1.03389144, + "epoch": 0.12247106568465355, + "flos": 17381473994880.0, + "grad_norm": 1.7031477508166286, + "language_loss": 0.83004296, + "learning_rate": 3.854021537633275e-06, + "loss": 0.8517704, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.8671875, + "step": 2037, + "time_per_iteration": 2.3796517848968506 + }, + { + "auxiliary_loss_clip": 0.01126316, + "auxiliary_loss_mlp": 0.01045922, + "balance_loss_clip": 1.0206852, + "balance_loss_mlp": 1.03652191, + "epoch": 0.12253118893732151, + "flos": 27044420567040.0, + "grad_norm": 3.2035945862169948, + "language_loss": 0.79517901, + "learning_rate": 3.853879808064044e-06, + "loss": 0.81690133, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.8984375, + "step": 2038, + "time_per_iteration": 2.4297780990600586 + }, + { + "auxiliary_loss_clip": 0.01033306, + "auxiliary_loss_mlp": 0.01017511, + "balance_loss_clip": 1.01360095, + "balance_loss_mlp": 1.00665402, + "epoch": 0.12259131218998948, + "flos": 53858762208000.0, + "grad_norm": 0.8245276463797878, + "language_loss": 0.58636552, + "learning_rate": 3.8537380123346105e-06, + "loss": 0.60687369, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.0390625, + "router_z_loss_mlp": 0.265625, + "step": 2039, + "time_per_iteration": 2.911100149154663 + }, + { + "auxiliary_loss_clip": 0.0112325, + "auxiliary_loss_mlp": 0.01049507, + "balance_loss_clip": 1.02356696, + "balance_loss_mlp": 1.03539979, + "epoch": 0.12265143544265744, + "flos": 17891031421440.0, + "grad_norm": 3.018760004112527, + "language_loss": 0.80326319, + "learning_rate": 3.853596150450037e-06, + "loss": 0.82499075, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.87890625, + "step": 2040, + "time_per_iteration": 2.3550355434417725 + }, + { + "auxiliary_loss_clip": 0.01116468, + "auxiliary_loss_mlp": 0.01044236, + "balance_loss_clip": 1.02149129, + "balance_loss_mlp": 1.03211033, + "epoch": 0.12271155869532542, + "flos": 21798824722560.0, + "grad_norm": 1.7984327608047004, + "language_loss": 0.82874405, + "learning_rate": 3.853454222415384e-06, + "loss": 0.8503511, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.84375, + "step": 2041, + "time_per_iteration": 2.387716054916382 + }, + { + "auxiliary_loss_clip": 0.01122322, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_clip": 1.01605964, + "balance_loss_mlp": 1.03212059, + "epoch": 0.12277168194799339, + "flos": 19827733294080.0, + "grad_norm": 1.7672051121677157, + "language_loss": 0.67215192, + "learning_rate": 3.853312228235717e-06, + "loss": 0.69380438, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.90234375, + "step": 2042, + "time_per_iteration": 2.378965139389038 + }, + { + "auxiliary_loss_clip": 0.01120555, + "auxiliary_loss_mlp": 0.01055578, + "balance_loss_clip": 1.03185534, + "balance_loss_mlp": 1.0333581, + "epoch": 0.12283180520066135, + "flos": 23219929503360.0, + "grad_norm": 1.8628373231360564, + "language_loss": 0.81608152, + "learning_rate": 3.853170167916106e-06, + "loss": 0.83784282, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.87109375, + "step": 2043, + "time_per_iteration": 2.3813624382019043 + }, + { + "auxiliary_loss_clip": 0.0111882, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02171087, + "balance_loss_mlp": 1.03120828, + "epoch": 0.12289192845332933, + "flos": 18587478689280.0, + "grad_norm": 1.907887097537687, + "language_loss": 0.80282354, + "learning_rate": 3.853028041461617e-06, + "loss": 0.82448578, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.875, + "step": 2044, + "time_per_iteration": 2.379453420639038 + }, + { + "auxiliary_loss_clip": 0.01118916, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02279735, + "balance_loss_mlp": 1.03514624, + "epoch": 0.1229520517059973, + "flos": 25768519597440.0, + "grad_norm": 1.6706588656358827, + "language_loss": 0.78307921, + "learning_rate": 3.852885848877323e-06, + "loss": 0.80473691, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8359375, + "step": 2045, + "time_per_iteration": 2.4202218055725098 + }, + { + "auxiliary_loss_clip": 0.01124099, + "auxiliary_loss_mlp": 0.0105316, + "balance_loss_clip": 1.02633786, + "balance_loss_mlp": 1.03491473, + "epoch": 0.12301217495866526, + "flos": 20886090382080.0, + "grad_norm": 2.1187738228664235, + "language_loss": 0.67233276, + "learning_rate": 3.852743590168301e-06, + "loss": 0.69410533, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.890625, + "step": 2046, + "time_per_iteration": 2.3931007385253906 + }, + { + "auxiliary_loss_clip": 0.01118281, + "auxiliary_loss_mlp": 0.01052155, + "balance_loss_clip": 1.02762175, + "balance_loss_mlp": 1.03496456, + "epoch": 0.12307229821133324, + "flos": 22377824576640.0, + "grad_norm": 2.1682406600206012, + "language_loss": 0.72872901, + "learning_rate": 3.852601265339625e-06, + "loss": 0.75043344, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.83203125, + "step": 2047, + "time_per_iteration": 2.3804879188537598 + }, + { + "auxiliary_loss_clip": 0.01118315, + "auxiliary_loss_mlp": 0.01044566, + "balance_loss_clip": 1.01955628, + "balance_loss_mlp": 1.03371596, + "epoch": 0.1231324214640012, + "flos": 23366285389440.0, + "grad_norm": 1.6043465109701915, + "language_loss": 0.76935506, + "learning_rate": 3.8524588743963755e-06, + "loss": 0.7909838, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.84375, + "step": 2048, + "time_per_iteration": 2.4538443088531494 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_clip": 1.02201271, + "balance_loss_mlp": 1.03345227, + "epoch": 0.12319254471666917, + "flos": 23766075901440.0, + "grad_norm": 1.8330515449971934, + "language_loss": 0.84730721, + "learning_rate": 3.852316417343634e-06, + "loss": 0.86896044, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8671875, + "step": 2049, + "time_per_iteration": 2.402829885482788 + }, + { + "auxiliary_loss_clip": 0.01118217, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.02539206, + "balance_loss_mlp": 1.03130078, + "epoch": 0.12325266796933713, + "flos": 23549020778880.0, + "grad_norm": 2.288036285795224, + "language_loss": 0.75656784, + "learning_rate": 3.852173894186484e-06, + "loss": 0.77825868, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.8671875, + "step": 2050, + "time_per_iteration": 2.4450416564941406 + }, + { + "auxiliary_loss_clip": 0.01119152, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.02019715, + "balance_loss_mlp": 1.03360677, + "epoch": 0.12331279122200511, + "flos": 24422896909440.0, + "grad_norm": 2.177058190261101, + "language_loss": 0.80784744, + "learning_rate": 3.852031304930012e-06, + "loss": 0.82948804, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.8515625, + "step": 2051, + "time_per_iteration": 2.494203805923462 + }, + { + "auxiliary_loss_clip": 0.01118979, + "auxiliary_loss_mlp": 0.01048398, + "balance_loss_clip": 1.02320886, + "balance_loss_mlp": 1.0356648, + "epoch": 0.12337291447467308, + "flos": 25483104299520.0, + "grad_norm": 1.7588340293504667, + "language_loss": 0.80011177, + "learning_rate": 3.851888649579307e-06, + "loss": 0.82178557, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.83203125, + "step": 2052, + "time_per_iteration": 2.4480302333831787 + }, + { + "auxiliary_loss_clip": 0.01119589, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.02334929, + "balance_loss_mlp": 1.03246593, + "epoch": 0.12343303772734104, + "flos": 23548881133440.0, + "grad_norm": 2.062989969890706, + "language_loss": 0.7362048, + "learning_rate": 3.85174592813946e-06, + "loss": 0.75789428, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.87109375, + "step": 2053, + "time_per_iteration": 2.395052909851074 + }, + { + "auxiliary_loss_clip": 0.01117386, + "auxiliary_loss_mlp": 0.01044747, + "balance_loss_clip": 1.02097631, + "balance_loss_mlp": 1.02949238, + "epoch": 0.12349316098000902, + "flos": 47555299900800.0, + "grad_norm": 1.7078781870035997, + "language_loss": 0.74674809, + "learning_rate": 3.851603140615564e-06, + "loss": 0.76836938, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.87890625, + "step": 2054, + "time_per_iteration": 2.6298487186431885 + }, + { + "auxiliary_loss_clip": 0.0111469, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.01553583, + "balance_loss_mlp": 1.02974188, + "epoch": 0.12355328423267699, + "flos": 25044804691200.0, + "grad_norm": 2.2983846207671887, + "language_loss": 0.84969324, + "learning_rate": 3.851460287012714e-06, + "loss": 0.87121534, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.84765625, + "step": 2055, + "time_per_iteration": 2.4238319396972656 + }, + { + "auxiliary_loss_clip": 0.01117869, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.02550793, + "balance_loss_mlp": 1.03191376, + "epoch": 0.12361340748534495, + "flos": 27707909644800.0, + "grad_norm": 2.378386507328866, + "language_loss": 0.77205324, + "learning_rate": 3.85131736733601e-06, + "loss": 0.79370862, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.859375, + "step": 2056, + "time_per_iteration": 2.454680919647217 + }, + { + "auxiliary_loss_clip": 0.0111708, + "auxiliary_loss_mlp": 0.01043758, + "balance_loss_clip": 1.0191288, + "balance_loss_mlp": 1.03185117, + "epoch": 0.12367353073801293, + "flos": 26139401637120.0, + "grad_norm": 2.3364790556535238, + "language_loss": 0.83136255, + "learning_rate": 3.851174381590551e-06, + "loss": 0.85297096, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.8515625, + "step": 2057, + "time_per_iteration": 2.4133453369140625 + }, + { + "auxiliary_loss_clip": 0.01122137, + "auxiliary_loss_mlp": 0.0104733, + "balance_loss_clip": 1.02342844, + "balance_loss_mlp": 1.03349555, + "epoch": 0.1237336539906809, + "flos": 25154850896640.0, + "grad_norm": 1.7560536613187636, + "language_loss": 0.78054428, + "learning_rate": 3.85103132978144e-06, + "loss": 0.802239, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.88671875, + "step": 2058, + "time_per_iteration": 2.4437592029571533 + }, + { + "auxiliary_loss_clip": 0.01117888, + "auxiliary_loss_mlp": 0.0104837, + "balance_loss_clip": 1.02399194, + "balance_loss_mlp": 1.03020191, + "epoch": 0.12379377724334886, + "flos": 15303687851520.0, + "grad_norm": 2.113776375453416, + "language_loss": 0.83108556, + "learning_rate": 3.850888211913782e-06, + "loss": 0.85274816, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.87890625, + "step": 2059, + "time_per_iteration": 2.3637967109680176 + }, + { + "auxiliary_loss_clip": 0.01122157, + "auxiliary_loss_mlp": 0.0105333, + "balance_loss_clip": 1.02692533, + "balance_loss_mlp": 1.03408313, + "epoch": 0.12385390049601683, + "flos": 21315871618560.0, + "grad_norm": 2.294771563660056, + "language_loss": 0.8141284, + "learning_rate": 3.8507450279926856e-06, + "loss": 0.83588326, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.8828125, + "step": 2060, + "time_per_iteration": 2.4067494869232178 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01048251, + "balance_loss_clip": 1.02343154, + "balance_loss_mlp": 1.03006387, + "epoch": 0.1239140237486848, + "flos": 15115576112640.0, + "grad_norm": 2.2533304529817393, + "language_loss": 0.85985982, + "learning_rate": 3.850601778023259e-06, + "loss": 0.88150042, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.859375, + "step": 2061, + "time_per_iteration": 2.3487155437469482 + }, + { + "auxiliary_loss_clip": 0.01117927, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_clip": 1.02045834, + "balance_loss_mlp": 1.03339148, + "epoch": 0.12397414700135277, + "flos": 21975834648960.0, + "grad_norm": 1.807106900031189, + "language_loss": 0.84427786, + "learning_rate": 3.850458462010615e-06, + "loss": 0.86590421, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.84375, + "step": 2062, + "time_per_iteration": 2.4029412269592285 + }, + { + "auxiliary_loss_clip": 0.0111791, + "auxiliary_loss_mlp": 0.01050326, + "balance_loss_clip": 1.02610242, + "balance_loss_mlp": 1.03355742, + "epoch": 0.12403427025402074, + "flos": 13400223459840.0, + "grad_norm": 1.7333092179216898, + "language_loss": 0.78806698, + "learning_rate": 3.850315079959869e-06, + "loss": 0.80974936, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.84375, + "step": 2063, + "time_per_iteration": 2.377467393875122 + }, + { + "auxiliary_loss_clip": 0.01117095, + "auxiliary_loss_mlp": 0.01046062, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03245807, + "epoch": 0.12409439350668872, + "flos": 15303478383360.0, + "grad_norm": 2.240657909027672, + "language_loss": 0.79231298, + "learning_rate": 3.850171631876137e-06, + "loss": 0.81394458, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.84375, + "step": 2064, + "time_per_iteration": 2.3639748096466064 + }, + { + "auxiliary_loss_clip": 0.01116078, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.02820563, + "balance_loss_mlp": 1.03158522, + "epoch": 0.12415451675935668, + "flos": 25008215719680.0, + "grad_norm": 3.4474827998857696, + "language_loss": 0.92303932, + "learning_rate": 3.850028117764539e-06, + "loss": 0.94472188, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.84375, + "step": 2065, + "time_per_iteration": 2.410625696182251 + }, + { + "auxiliary_loss_clip": 0.01117973, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.01925051, + "balance_loss_mlp": 1.03079259, + "epoch": 0.12421464001202465, + "flos": 23658543313920.0, + "grad_norm": 1.8623053813568275, + "language_loss": 0.80406475, + "learning_rate": 3.849884537630196e-06, + "loss": 0.82568353, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.875, + "step": 2066, + "time_per_iteration": 2.4238030910491943 + }, + { + "auxiliary_loss_clip": 0.01035428, + "auxiliary_loss_mlp": 0.01008362, + "balance_loss_clip": 1.00404668, + "balance_loss_mlp": 1.00907445, + "epoch": 0.12427476326469263, + "flos": 65729440604160.0, + "grad_norm": 0.8776658503758344, + "language_loss": 0.63336056, + "learning_rate": 3.849740891478233e-06, + "loss": 0.65379852, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.04321289, + "router_z_loss_mlp": 0.26367188, + "step": 2067, + "time_per_iteration": 2.973073720932007 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01042225, + "balance_loss_clip": 1.01896691, + "balance_loss_mlp": 1.03140807, + "epoch": 0.12433488651736059, + "flos": 24534269746560.0, + "grad_norm": 2.794207783635333, + "language_loss": 0.9301703, + "learning_rate": 3.849597179313775e-06, + "loss": 0.95175815, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8515625, + "step": 2068, + "time_per_iteration": 2.4296932220458984 + }, + { + "auxiliary_loss_clip": 0.01119675, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.02181268, + "balance_loss_mlp": 1.03442335, + "epoch": 0.12439500977002856, + "flos": 21030630877440.0, + "grad_norm": 1.8576322137631927, + "language_loss": 0.81259358, + "learning_rate": 3.849453401141952e-06, + "loss": 0.83422422, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.8515625, + "step": 2069, + "time_per_iteration": 3.7766854763031006 + }, + { + "auxiliary_loss_clip": 0.0111926, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.02696347, + "balance_loss_mlp": 1.03305292, + "epoch": 0.12445513302269653, + "flos": 26829495037440.0, + "grad_norm": 1.798662539204355, + "language_loss": 0.77407026, + "learning_rate": 3.8493095569678945e-06, + "loss": 0.79577243, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.86328125, + "step": 2070, + "time_per_iteration": 2.4453954696655273 + }, + { + "auxiliary_loss_clip": 0.01119335, + "auxiliary_loss_mlp": 0.01041812, + "balance_loss_clip": 1.01684988, + "balance_loss_mlp": 1.03424931, + "epoch": 0.1245152562753645, + "flos": 18367944860160.0, + "grad_norm": 2.31678494920857, + "language_loss": 0.8035953, + "learning_rate": 3.849165646796735e-06, + "loss": 0.82520676, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.8515625, + "step": 2071, + "time_per_iteration": 2.3660426139831543 + }, + { + "auxiliary_loss_clip": 0.01118492, + "auxiliary_loss_mlp": 0.01049068, + "balance_loss_clip": 1.02392673, + "balance_loss_mlp": 1.03493595, + "epoch": 0.12457537952803246, + "flos": 33106634179200.0, + "grad_norm": 1.6516694868819906, + "language_loss": 0.7830193, + "learning_rate": 3.849021670633611e-06, + "loss": 0.80469489, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.8359375, + "step": 2072, + "time_per_iteration": 5.2420947551727295 + }, + { + "auxiliary_loss_clip": 0.01117737, + "auxiliary_loss_mlp": 0.01051804, + "balance_loss_clip": 1.02922606, + "balance_loss_mlp": 1.03530848, + "epoch": 0.12463550278070043, + "flos": 22269209736960.0, + "grad_norm": 2.4421081395701836, + "language_loss": 0.74980325, + "learning_rate": 3.8488776284836595e-06, + "loss": 0.77149862, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.8203125, + "step": 2073, + "time_per_iteration": 3.8428778648376465 + }, + { + "auxiliary_loss_clip": 0.01116143, + "auxiliary_loss_mlp": 0.01049777, + "balance_loss_clip": 1.02605462, + "balance_loss_mlp": 1.03189266, + "epoch": 0.12469562603336841, + "flos": 14678288933760.0, + "grad_norm": 2.1570640646911725, + "language_loss": 0.90657204, + "learning_rate": 3.8487335203520215e-06, + "loss": 0.92823124, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.84375, + "step": 2074, + "time_per_iteration": 2.376634359359741 + }, + { + "auxiliary_loss_clip": 0.01118504, + "auxiliary_loss_mlp": 0.01048479, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.03194141, + "epoch": 0.12475574928603637, + "flos": 24643617724800.0, + "grad_norm": 2.37343051951324, + "language_loss": 0.83716631, + "learning_rate": 3.84858934624384e-06, + "loss": 0.85883617, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.86328125, + "step": 2075, + "time_per_iteration": 2.3994948863983154 + }, + { + "auxiliary_loss_clip": 0.01117635, + "auxiliary_loss_mlp": 0.01050142, + "balance_loss_clip": 1.02473879, + "balance_loss_mlp": 1.03250575, + "epoch": 0.12481587253870434, + "flos": 21761886637440.0, + "grad_norm": 2.710930549668586, + "language_loss": 0.73307014, + "learning_rate": 3.8484451061642585e-06, + "loss": 0.75474799, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.8515625, + "step": 2076, + "time_per_iteration": 2.40032696723938 + }, + { + "auxiliary_loss_clip": 0.01115359, + "auxiliary_loss_mlp": 0.0104041, + "balance_loss_clip": 1.01834464, + "balance_loss_mlp": 1.03534937, + "epoch": 0.12487599579137232, + "flos": 21431503641600.0, + "grad_norm": 1.7213159578200155, + "language_loss": 0.75646508, + "learning_rate": 3.8483008001184275e-06, + "loss": 0.77802277, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.80078125, + "step": 2077, + "time_per_iteration": 2.4050636291503906 + }, + { + "auxiliary_loss_clip": 0.01114992, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.01846504, + "balance_loss_mlp": 1.03148556, + "epoch": 0.12493611904404028, + "flos": 16106690188800.0, + "grad_norm": 2.7697005372158348, + "language_loss": 0.81957054, + "learning_rate": 3.848156428111495e-06, + "loss": 0.84114563, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8359375, + "step": 2078, + "time_per_iteration": 2.392759323120117 + }, + { + "auxiliary_loss_clip": 0.01119904, + "auxiliary_loss_mlp": 0.01045197, + "balance_loss_clip": 1.02247524, + "balance_loss_mlp": 1.0352838, + "epoch": 0.12499624229670825, + "flos": 21579186159360.0, + "grad_norm": 1.691066522987815, + "language_loss": 0.75001132, + "learning_rate": 3.8480119901486135e-06, + "loss": 0.77166235, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.84765625, + "step": 2079, + "time_per_iteration": 2.3888022899627686 + }, + { + "auxiliary_loss_clip": 0.01123983, + "auxiliary_loss_mlp": 0.01051593, + "balance_loss_clip": 1.02683294, + "balance_loss_mlp": 1.03642082, + "epoch": 0.1250563655493762, + "flos": 25697960006400.0, + "grad_norm": 2.096393689326478, + "language_loss": 0.8320049, + "learning_rate": 3.847867486234937e-06, + "loss": 0.8537606, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.875, + "step": 2080, + "time_per_iteration": 2.4352903366088867 + }, + { + "auxiliary_loss_clip": 0.01116862, + "auxiliary_loss_mlp": 0.01050311, + "balance_loss_clip": 1.02584922, + "balance_loss_mlp": 1.03381598, + "epoch": 0.12511648880204418, + "flos": 16908575362560.0, + "grad_norm": 2.0247473194895234, + "language_loss": 0.84366202, + "learning_rate": 3.847722916375624e-06, + "loss": 0.8653338, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.828125, + "step": 2081, + "time_per_iteration": 2.3475122451782227 + }, + { + "auxiliary_loss_clip": 0.01116416, + "auxiliary_loss_mlp": 0.01041025, + "balance_loss_clip": 1.01849461, + "balance_loss_mlp": 1.03229547, + "epoch": 0.12517661205471217, + "flos": 17566513534080.0, + "grad_norm": 1.7118357765139873, + "language_loss": 0.76701432, + "learning_rate": 3.847578280575832e-06, + "loss": 0.78858876, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.83984375, + "step": 2082, + "time_per_iteration": 2.3789234161376953 + }, + { + "auxiliary_loss_clip": 0.01127754, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.01657128, + "balance_loss_mlp": 1.03636873, + "epoch": 0.12523673530738014, + "flos": 16032883841280.0, + "grad_norm": 2.459898948127579, + "language_loss": 0.79046977, + "learning_rate": 3.847433578840725e-06, + "loss": 0.81217635, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.9140625, + "step": 2083, + "time_per_iteration": 2.355137586593628 + }, + { + "auxiliary_loss_clip": 0.01117899, + "auxiliary_loss_mlp": 0.01042609, + "balance_loss_clip": 1.0177182, + "balance_loss_mlp": 1.0319339, + "epoch": 0.1252968585600481, + "flos": 18806733227520.0, + "grad_norm": 3.531654342694931, + "language_loss": 0.90191615, + "learning_rate": 3.847288811175465e-06, + "loss": 0.92352128, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.859375, + "step": 2084, + "time_per_iteration": 2.3714373111724854 + }, + { + "auxiliary_loss_clip": 0.01118012, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.01832533, + "balance_loss_mlp": 1.03389311, + "epoch": 0.12535698181271607, + "flos": 27270343175040.0, + "grad_norm": 1.9371363425061896, + "language_loss": 0.7973994, + "learning_rate": 3.84714397758522e-06, + "loss": 0.81899923, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.84375, + "step": 2085, + "time_per_iteration": 2.435650587081909 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_clip": 1.0222826, + "balance_loss_mlp": 1.0319972, + "epoch": 0.12541710506538403, + "flos": 22053027398400.0, + "grad_norm": 1.941421465247455, + "language_loss": 0.88167977, + "learning_rate": 3.846999078075156e-06, + "loss": 0.90329695, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.8359375, + "step": 2086, + "time_per_iteration": 2.3734970092773438 + }, + { + "auxiliary_loss_clip": 0.01112312, + "auxiliary_loss_mlp": 0.01044926, + "balance_loss_clip": 1.02293229, + "balance_loss_mlp": 1.03155839, + "epoch": 0.125477228318052, + "flos": 12602388003840.0, + "grad_norm": 2.0588020364240602, + "language_loss": 0.87000966, + "learning_rate": 3.8468541126504476e-06, + "loss": 0.89158201, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.80859375, + "step": 2087, + "time_per_iteration": 2.3449716567993164 + }, + { + "auxiliary_loss_clip": 0.01116882, + "auxiliary_loss_mlp": 0.01043699, + "balance_loss_clip": 1.0192132, + "balance_loss_mlp": 1.03264987, + "epoch": 0.12553735157071996, + "flos": 23877413827200.0, + "grad_norm": 1.8887299433909066, + "language_loss": 0.73232102, + "learning_rate": 3.846709081316266e-06, + "loss": 0.75392687, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.84375, + "step": 2088, + "time_per_iteration": 2.399785280227661 + }, + { + "auxiliary_loss_clip": 0.01038205, + "auxiliary_loss_mlp": 0.01002933, + "balance_loss_clip": 0.9987132, + "balance_loss_mlp": 1.01034844, + "epoch": 0.12559747482338796, + "flos": 69917482321920.0, + "grad_norm": 0.7564133700942366, + "language_loss": 0.61721826, + "learning_rate": 3.846563984077788e-06, + "loss": 0.63762963, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.04223633, + "router_z_loss_mlp": 0.27734375, + "step": 2089, + "time_per_iteration": 3.0131125450134277 + }, + { + "auxiliary_loss_clip": 0.01112258, + "auxiliary_loss_mlp": 0.01043145, + "balance_loss_clip": 1.02019763, + "balance_loss_mlp": 1.03125, + "epoch": 0.12565759807605592, + "flos": 24278426236800.0, + "grad_norm": 3.37125939911503, + "language_loss": 0.77657014, + "learning_rate": 3.846418820940191e-06, + "loss": 0.79812419, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.80859375, + "step": 2090, + "time_per_iteration": 2.400271415710449 + }, + { + "auxiliary_loss_clip": 0.01036273, + "auxiliary_loss_mlp": 0.01004513, + "balance_loss_clip": 1.00050783, + "balance_loss_mlp": 1.00848293, + "epoch": 0.12571772132872389, + "flos": 56448375016320.0, + "grad_norm": 0.7524356481684067, + "language_loss": 0.59461302, + "learning_rate": 3.846273591908656e-06, + "loss": 0.61502087, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.27734375, + "step": 2091, + "time_per_iteration": 2.971236228942871 + }, + { + "auxiliary_loss_clip": 0.0111571, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.01947236, + "balance_loss_mlp": 1.03398085, + "epoch": 0.12577784458139185, + "flos": 41244225050880.0, + "grad_norm": 2.0347088973012024, + "language_loss": 0.6315937, + "learning_rate": 3.846128296988365e-06, + "loss": 0.65316451, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.81640625, + "step": 2092, + "time_per_iteration": 2.5666489601135254 + }, + { + "auxiliary_loss_clip": 0.01119389, + "auxiliary_loss_mlp": 0.0105116, + "balance_loss_clip": 1.02620912, + "balance_loss_mlp": 1.03349912, + "epoch": 0.12583796783405982, + "flos": 19754485528320.0, + "grad_norm": 4.127986859516993, + "language_loss": 0.80670291, + "learning_rate": 3.845982936184505e-06, + "loss": 0.82840842, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.859375, + "step": 2093, + "time_per_iteration": 2.372065305709839 + }, + { + "auxiliary_loss_clip": 0.01117134, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_clip": 1.02943611, + "balance_loss_mlp": 1.03341269, + "epoch": 0.12589809108672778, + "flos": 22600989187200.0, + "grad_norm": 1.8013595234153057, + "language_loss": 0.75998724, + "learning_rate": 3.845837509502262e-06, + "loss": 0.78168249, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.8359375, + "step": 2094, + "time_per_iteration": 2.4176156520843506 + }, + { + "auxiliary_loss_clip": 0.01112457, + "auxiliary_loss_mlp": 0.01048578, + "balance_loss_clip": 1.02617884, + "balance_loss_mlp": 1.03079164, + "epoch": 0.12595821433939577, + "flos": 45221111665920.0, + "grad_norm": 1.9134547374868065, + "language_loss": 0.76899022, + "learning_rate": 3.845692016946826e-06, + "loss": 0.79060054, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.81640625, + "step": 2095, + "time_per_iteration": 2.5881059169769287 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.01988709, + "balance_loss_mlp": 1.03235626, + "epoch": 0.12601833759206374, + "flos": 14318927642880.0, + "grad_norm": 2.3136884654316052, + "language_loss": 0.82832527, + "learning_rate": 3.845546458523391e-06, + "loss": 0.84991997, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.84375, + "step": 2096, + "time_per_iteration": 2.38016676902771 + }, + { + "auxiliary_loss_clip": 0.01114748, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_clip": 1.02244925, + "balance_loss_mlp": 1.03167081, + "epoch": 0.1260784608447317, + "flos": 21287172614400.0, + "grad_norm": 2.0683992218288885, + "language_loss": 0.7564081, + "learning_rate": 3.845400834237148e-06, + "loss": 0.7780143, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.83203125, + "step": 2097, + "time_per_iteration": 2.400761127471924 + }, + { + "auxiliary_loss_clip": 0.01115307, + "auxiliary_loss_mlp": 0.01049884, + "balance_loss_clip": 1.0281167, + "balance_loss_mlp": 1.0332402, + "epoch": 0.12613858409739967, + "flos": 26250076247040.0, + "grad_norm": 3.3089693939595457, + "language_loss": 0.8743059, + "learning_rate": 3.8452551440932975e-06, + "loss": 0.89595783, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.8203125, + "step": 2098, + "time_per_iteration": 2.4255878925323486 + }, + { + "auxiliary_loss_clip": 0.01119626, + "auxiliary_loss_mlp": 0.01057123, + "balance_loss_clip": 1.03171968, + "balance_loss_mlp": 1.03246355, + "epoch": 0.12619870735006763, + "flos": 21578906868480.0, + "grad_norm": 1.9861322343369732, + "language_loss": 0.69507301, + "learning_rate": 3.8451093880970365e-06, + "loss": 0.71684051, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.87109375, + "step": 2099, + "time_per_iteration": 2.3840246200561523 + }, + { + "auxiliary_loss_clip": 0.01116001, + "auxiliary_loss_mlp": 0.0105122, + "balance_loss_clip": 1.02680588, + "balance_loss_mlp": 1.03224981, + "epoch": 0.1262588306027356, + "flos": 23365936275840.0, + "grad_norm": 2.4534387303493603, + "language_loss": 0.81588322, + "learning_rate": 3.844963566253569e-06, + "loss": 0.83755541, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.83984375, + "step": 2100, + "time_per_iteration": 2.4223971366882324 + }, + { + "auxiliary_loss_clip": 0.01119178, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_clip": 1.0248704, + "balance_loss_mlp": 1.03249133, + "epoch": 0.12631895385540357, + "flos": 23948113063680.0, + "grad_norm": 1.9546686395482318, + "language_loss": 0.80489665, + "learning_rate": 3.844817678568097e-06, + "loss": 0.82657468, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.8671875, + "step": 2101, + "time_per_iteration": 2.4068095684051514 + }, + { + "auxiliary_loss_clip": 0.01033474, + "auxiliary_loss_mlp": 0.01005857, + "balance_loss_clip": 1.0021373, + "balance_loss_mlp": 1.00711823, + "epoch": 0.12637907710807156, + "flos": 70278868471680.0, + "grad_norm": 0.7011092592635109, + "language_loss": 0.57050014, + "learning_rate": 3.8446717250458275e-06, + "loss": 0.59089339, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.26367188, + "step": 2102, + "time_per_iteration": 3.098021984100342 + }, + { + "auxiliary_loss_clip": 0.01117671, + "auxiliary_loss_mlp": 0.01044396, + "balance_loss_clip": 1.02066135, + "balance_loss_mlp": 1.03210449, + "epoch": 0.12643920036073952, + "flos": 18914126169600.0, + "grad_norm": 2.1890385512611754, + "language_loss": 0.93189907, + "learning_rate": 3.844525705691969e-06, + "loss": 0.9535197, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.85546875, + "step": 2103, + "time_per_iteration": 2.3683717250823975 + }, + { + "auxiliary_loss_clip": 0.01109523, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.01373827, + "balance_loss_mlp": 1.03146935, + "epoch": 0.1264993236134075, + "flos": 27781227233280.0, + "grad_norm": 2.0237792358659945, + "language_loss": 0.77780366, + "learning_rate": 3.844379620511733e-06, + "loss": 0.7992444, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.78125, + "step": 2104, + "time_per_iteration": 2.473353385925293 + }, + { + "auxiliary_loss_clip": 0.011191, + "auxiliary_loss_mlp": 0.01046612, + "balance_loss_clip": 1.02412975, + "balance_loss_mlp": 1.03558755, + "epoch": 0.12655944686607545, + "flos": 24753524284800.0, + "grad_norm": 3.309421676063958, + "language_loss": 0.81139278, + "learning_rate": 3.844233469510333e-06, + "loss": 0.83304989, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.8359375, + "step": 2105, + "time_per_iteration": 2.4122915267944336 + }, + { + "auxiliary_loss_clip": 0.01121015, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_clip": 1.0182066, + "balance_loss_mlp": 1.03295863, + "epoch": 0.12661957011874342, + "flos": 24131930705280.0, + "grad_norm": 2.622898736258822, + "language_loss": 0.83117187, + "learning_rate": 3.844087252692984e-06, + "loss": 0.85281229, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.8828125, + "step": 2106, + "time_per_iteration": 2.431649684906006 + }, + { + "auxiliary_loss_clip": 0.01115561, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.01909912, + "balance_loss_mlp": 1.03455234, + "epoch": 0.12667969337141138, + "flos": 24568519656960.0, + "grad_norm": 1.8570276402480308, + "language_loss": 0.7331838, + "learning_rate": 3.843940970064904e-06, + "loss": 0.75476825, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.80859375, + "step": 2107, + "time_per_iteration": 2.404420852661133 + }, + { + "auxiliary_loss_clip": 0.0111377, + "auxiliary_loss_mlp": 0.01038722, + "balance_loss_clip": 1.01615608, + "balance_loss_mlp": 1.03279042, + "epoch": 0.12673981662407935, + "flos": 22960699591680.0, + "grad_norm": 1.886671777421692, + "language_loss": 0.78911781, + "learning_rate": 3.843794621631314e-06, + "loss": 0.81064278, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.8125, + "step": 2108, + "time_per_iteration": 2.4051084518432617 + }, + { + "auxiliary_loss_clip": 0.01114943, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_clip": 1.02022719, + "balance_loss_mlp": 1.03060329, + "epoch": 0.12679993987674734, + "flos": 17273906496000.0, + "grad_norm": 1.9965757902599248, + "language_loss": 0.75592458, + "learning_rate": 3.843648207397438e-06, + "loss": 0.7775023, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.84375, + "step": 2109, + "time_per_iteration": 3.766605854034424 + }, + { + "auxiliary_loss_clip": 0.0111483, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.02223563, + "balance_loss_mlp": 1.0322988, + "epoch": 0.1268600631294153, + "flos": 17274115964160.0, + "grad_norm": 1.7273210348148718, + "language_loss": 0.8307693, + "learning_rate": 3.843501727368498e-06, + "loss": 0.85236728, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.82421875, + "step": 2110, + "time_per_iteration": 2.4035935401916504 + }, + { + "auxiliary_loss_clip": 0.01112926, + "auxiliary_loss_mlp": 0.01041728, + "balance_loss_clip": 1.01898241, + "balance_loss_mlp": 1.03165388, + "epoch": 0.12692018638208327, + "flos": 24059904837120.0, + "grad_norm": 1.7158466888913007, + "language_loss": 0.78610981, + "learning_rate": 3.8433551815497255e-06, + "loss": 0.80765629, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.81640625, + "step": 2111, + "time_per_iteration": 5.266538381576538 + }, + { + "auxiliary_loss_clip": 0.01123097, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.027632, + "balance_loss_mlp": 1.03377807, + "epoch": 0.12698030963475124, + "flos": 31830558652800.0, + "grad_norm": 2.513129431042387, + "language_loss": 0.76426053, + "learning_rate": 3.843208569946347e-06, + "loss": 0.78603077, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.89453125, + "step": 2112, + "time_per_iteration": 2.4973158836364746 + }, + { + "auxiliary_loss_clip": 0.01114453, + "auxiliary_loss_mlp": 0.0105147, + "balance_loss_clip": 1.0288676, + "balance_loss_mlp": 1.03119397, + "epoch": 0.1270404328874192, + "flos": 25186691923200.0, + "grad_norm": 1.7756504212212387, + "language_loss": 0.85513252, + "learning_rate": 3.843061892563596e-06, + "loss": 0.87679178, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.83203125, + "step": 2113, + "time_per_iteration": 3.8007261753082275 + }, + { + "auxiliary_loss_clip": 0.0111456, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.02329206, + "balance_loss_mlp": 1.03161609, + "epoch": 0.12710055614008717, + "flos": 15996434515200.0, + "grad_norm": 2.1560867935703585, + "language_loss": 0.73853689, + "learning_rate": 3.842915149406707e-06, + "loss": 0.76015228, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.828125, + "step": 2114, + "time_per_iteration": 2.369175672531128 + }, + { + "auxiliary_loss_clip": 0.01118727, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_clip": 1.02104712, + "balance_loss_mlp": 1.03434312, + "epoch": 0.12716067939275516, + "flos": 15084747515520.0, + "grad_norm": 1.9327626940509444, + "language_loss": 0.83024955, + "learning_rate": 3.842768340480917e-06, + "loss": 0.85187531, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.84375, + "step": 2115, + "time_per_iteration": 2.3644826412200928 + }, + { + "auxiliary_loss_clip": 0.0111886, + "auxiliary_loss_mlp": 0.01048808, + "balance_loss_clip": 1.02586019, + "balance_loss_mlp": 1.03423762, + "epoch": 0.12722080264542313, + "flos": 28365463791360.0, + "grad_norm": 1.6253823017253595, + "language_loss": 0.86538076, + "learning_rate": 3.8426214657914656e-06, + "loss": 0.88705742, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.84375, + "step": 2116, + "time_per_iteration": 2.4714713096618652 + }, + { + "auxiliary_loss_clip": 0.01114616, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.01851034, + "balance_loss_mlp": 1.03221858, + "epoch": 0.1272809258980911, + "flos": 32378520441600.0, + "grad_norm": 1.7531392172531437, + "language_loss": 0.70339799, + "learning_rate": 3.842474525343594e-06, + "loss": 0.72495955, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.82421875, + "step": 2117, + "time_per_iteration": 2.485377788543701 + }, + { + "auxiliary_loss_clip": 0.01114204, + "auxiliary_loss_mlp": 0.01045505, + "balance_loss_clip": 1.02188969, + "balance_loss_mlp": 1.03143668, + "epoch": 0.12734104915075906, + "flos": 16033477334400.0, + "grad_norm": 1.9788775424089131, + "language_loss": 0.86027038, + "learning_rate": 3.842327519142545e-06, + "loss": 0.88186753, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.828125, + "step": 2118, + "time_per_iteration": 2.361762523651123 + }, + { + "auxiliary_loss_clip": 0.01112817, + "auxiliary_loss_mlp": 0.01045058, + "balance_loss_clip": 1.02202654, + "balance_loss_mlp": 1.03070664, + "epoch": 0.12740117240342702, + "flos": 18259330020480.0, + "grad_norm": 2.068224738756466, + "language_loss": 0.87385684, + "learning_rate": 3.842180447193566e-06, + "loss": 0.89543557, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8203125, + "step": 2119, + "time_per_iteration": 2.36799693107605 + }, + { + "auxiliary_loss_clip": 0.01116089, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_clip": 1.01886976, + "balance_loss_mlp": 1.031811, + "epoch": 0.127461295656095, + "flos": 12121215379200.0, + "grad_norm": 4.792706132267744, + "language_loss": 0.87717366, + "learning_rate": 3.842033309501905e-06, + "loss": 0.89876205, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.84375, + "step": 2120, + "time_per_iteration": 2.314236640930176 + }, + { + "auxiliary_loss_clip": 0.01111922, + "auxiliary_loss_mlp": 0.01036893, + "balance_loss_clip": 1.01433849, + "balance_loss_mlp": 1.03037643, + "epoch": 0.12752141890876295, + "flos": 23147973457920.0, + "grad_norm": 1.9133798996998994, + "language_loss": 0.75380892, + "learning_rate": 3.841886106072815e-06, + "loss": 0.77529705, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.8125, + "step": 2121, + "time_per_iteration": 2.3969669342041016 + }, + { + "auxiliary_loss_clip": 0.01111711, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.02122521, + "balance_loss_mlp": 1.03040743, + "epoch": 0.12758154216143094, + "flos": 21614937258240.0, + "grad_norm": 2.479920683180096, + "language_loss": 0.83177739, + "learning_rate": 3.841738836911547e-06, + "loss": 0.85333467, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.8125, + "step": 2122, + "time_per_iteration": 2.3829691410064697 + }, + { + "auxiliary_loss_clip": 0.01115723, + "auxiliary_loss_mlp": 0.01043598, + "balance_loss_clip": 1.02047181, + "balance_loss_mlp": 1.03254235, + "epoch": 0.1276416654140989, + "flos": 15923954799360.0, + "grad_norm": 4.716823850097833, + "language_loss": 0.79111636, + "learning_rate": 3.8415915020233574e-06, + "loss": 0.81270957, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.83203125, + "step": 2123, + "time_per_iteration": 2.3637309074401855 + }, + { + "auxiliary_loss_clip": 0.01115774, + "auxiliary_loss_mlp": 0.01051057, + "balance_loss_clip": 1.02757287, + "balance_loss_mlp": 1.03227031, + "epoch": 0.12770178866676687, + "flos": 22381595003520.0, + "grad_norm": 1.6150056217743856, + "language_loss": 0.78939128, + "learning_rate": 3.8414441014135045e-06, + "loss": 0.81105959, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.8359375, + "step": 2124, + "time_per_iteration": 2.377854108810425 + }, + { + "auxiliary_loss_clip": 0.01115417, + "auxiliary_loss_mlp": 0.01044485, + "balance_loss_clip": 1.02244282, + "balance_loss_mlp": 1.02979326, + "epoch": 0.12776191191943484, + "flos": 21651421495680.0, + "grad_norm": 2.004458347345125, + "language_loss": 0.75415641, + "learning_rate": 3.8412966350872475e-06, + "loss": 0.77575541, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.85546875, + "step": 2125, + "time_per_iteration": 2.3942389488220215 + }, + { + "auxiliary_loss_clip": 0.01112755, + "auxiliary_loss_mlp": 0.01042692, + "balance_loss_clip": 1.02105582, + "balance_loss_mlp": 1.03033376, + "epoch": 0.1278220351721028, + "flos": 25734479155200.0, + "grad_norm": 2.3313958258670318, + "language_loss": 0.77859557, + "learning_rate": 3.841149103049851e-06, + "loss": 0.80015004, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.82421875, + "step": 2126, + "time_per_iteration": 2.4090161323547363 + }, + { + "auxiliary_loss_clip": 0.01112666, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.0235647, + "balance_loss_mlp": 1.03223729, + "epoch": 0.12788215842477077, + "flos": 41241676521600.0, + "grad_norm": 1.5875149616149478, + "language_loss": 0.69364333, + "learning_rate": 3.8410015053065785e-06, + "loss": 0.71522772, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8046875, + "step": 2127, + "time_per_iteration": 2.5647130012512207 + }, + { + "auxiliary_loss_clip": 0.01032074, + "auxiliary_loss_mlp": 0.01005287, + "balance_loss_clip": 1.00142455, + "balance_loss_mlp": 1.00704312, + "epoch": 0.12794228167743876, + "flos": 70873822817280.0, + "grad_norm": 0.8476597553826426, + "language_loss": 0.62836862, + "learning_rate": 3.8408538418626985e-06, + "loss": 0.6487422, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.03857422, + "router_z_loss_mlp": 0.25, + "step": 2128, + "time_per_iteration": 3.063953399658203 + }, + { + "auxiliary_loss_clip": 0.01113648, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.01316237, + "balance_loss_mlp": 1.03077292, + "epoch": 0.12800240493010673, + "flos": 16288797173760.0, + "grad_norm": 2.753633175846378, + "language_loss": 0.77115464, + "learning_rate": 3.840706112723479e-06, + "loss": 0.79265857, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.828125, + "step": 2129, + "time_per_iteration": 2.3833889961242676 + }, + { + "auxiliary_loss_clip": 0.0112229, + "auxiliary_loss_mlp": 0.01046874, + "balance_loss_clip": 1.02174425, + "balance_loss_mlp": 1.03358889, + "epoch": 0.1280625281827747, + "flos": 20630491251840.0, + "grad_norm": 2.0155953904423067, + "language_loss": 0.79008496, + "learning_rate": 3.840558317894194e-06, + "loss": 0.81177664, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.88671875, + "step": 2130, + "time_per_iteration": 2.378575325012207 + }, + { + "auxiliary_loss_clip": 0.01115906, + "auxiliary_loss_mlp": 0.01041164, + "balance_loss_clip": 1.01815629, + "balance_loss_mlp": 1.03099012, + "epoch": 0.12812265143544266, + "flos": 22637124311040.0, + "grad_norm": 2.2626543637482257, + "language_loss": 0.70868599, + "learning_rate": 3.840410457380117e-06, + "loss": 0.73025668, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.8515625, + "step": 2131, + "time_per_iteration": 2.4017488956451416 + }, + { + "auxiliary_loss_clip": 0.01116169, + "auxiliary_loss_mlp": 0.01041838, + "balance_loss_clip": 1.01891422, + "balance_loss_mlp": 1.03228021, + "epoch": 0.12818277468811062, + "flos": 34713267258240.0, + "grad_norm": 2.4095980582332497, + "language_loss": 0.72449213, + "learning_rate": 3.840262531186525e-06, + "loss": 0.74607217, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.83984375, + "step": 2132, + "time_per_iteration": 2.497260332107544 + }, + { + "auxiliary_loss_clip": 0.01112032, + "auxiliary_loss_mlp": 0.01044994, + "balance_loss_clip": 1.02172446, + "balance_loss_mlp": 1.03138447, + "epoch": 0.1282428979407786, + "flos": 23111000461440.0, + "grad_norm": 2.3302926949069236, + "language_loss": 0.82523346, + "learning_rate": 3.840114539318697e-06, + "loss": 0.84680378, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.80859375, + "step": 2133, + "time_per_iteration": 2.3946878910064697 + }, + { + "auxiliary_loss_clip": 0.01120332, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_clip": 1.02770627, + "balance_loss_mlp": 1.03165603, + "epoch": 0.12830302119344655, + "flos": 20885461977600.0, + "grad_norm": 2.325376780580096, + "language_loss": 0.79481399, + "learning_rate": 3.839966481781914e-06, + "loss": 0.81653935, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.890625, + "step": 2134, + "time_per_iteration": 2.3721272945404053 + }, + { + "auxiliary_loss_clip": 0.01116204, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_clip": 1.02231169, + "balance_loss_mlp": 1.03271592, + "epoch": 0.12836314444611455, + "flos": 21396695149440.0, + "grad_norm": 1.9570237272098825, + "language_loss": 0.82733893, + "learning_rate": 3.83981835858146e-06, + "loss": 0.84895641, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8359375, + "step": 2135, + "time_per_iteration": 2.391282081604004 + }, + { + "auxiliary_loss_clip": 0.01112881, + "auxiliary_loss_mlp": 0.01050256, + "balance_loss_clip": 1.02685499, + "balance_loss_mlp": 1.03148592, + "epoch": 0.1284232676987825, + "flos": 13661617875840.0, + "grad_norm": 2.3054997403547475, + "language_loss": 0.77193314, + "learning_rate": 3.839670169722622e-06, + "loss": 0.79356444, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8125, + "step": 2136, + "time_per_iteration": 2.342874050140381 + }, + { + "auxiliary_loss_clip": 0.01033939, + "auxiliary_loss_mlp": 0.01001595, + "balance_loss_clip": 0.99828076, + "balance_loss_mlp": 1.00933623, + "epoch": 0.12848339095145048, + "flos": 59991709968000.0, + "grad_norm": 0.891959578830437, + "language_loss": 0.59144431, + "learning_rate": 3.839521915210688e-06, + "loss": 0.6117996, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.24609375, + "step": 2137, + "time_per_iteration": 3.1718711853027344 + }, + { + "auxiliary_loss_clip": 0.01112519, + "auxiliary_loss_mlp": 0.01044892, + "balance_loss_clip": 1.02251649, + "balance_loss_mlp": 1.02958333, + "epoch": 0.12854351420411844, + "flos": 13880523300480.0, + "grad_norm": 3.2122022524145843, + "language_loss": 0.82409132, + "learning_rate": 3.839373595050948e-06, + "loss": 0.84566545, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.828125, + "step": 2138, + "time_per_iteration": 2.3435263633728027 + }, + { + "auxiliary_loss_clip": 0.01116734, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.02316427, + "balance_loss_mlp": 1.03054476, + "epoch": 0.1286036374567864, + "flos": 22636845020160.0, + "grad_norm": 2.599591190734799, + "language_loss": 0.78714335, + "learning_rate": 3.839225209248696e-06, + "loss": 0.80877751, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.859375, + "step": 2139, + "time_per_iteration": 2.3830668926239014 + }, + { + "auxiliary_loss_clip": 0.0111658, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_clip": 1.02007365, + "balance_loss_mlp": 1.03078604, + "epoch": 0.12866376070945437, + "flos": 16323884956800.0, + "grad_norm": 2.1733964329112556, + "language_loss": 0.85316467, + "learning_rate": 3.839076757809228e-06, + "loss": 0.87475795, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.859375, + "step": 2140, + "time_per_iteration": 2.3494696617126465 + }, + { + "auxiliary_loss_clip": 0.01112102, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.01962209, + "balance_loss_mlp": 1.02963066, + "epoch": 0.12872388396212234, + "flos": 11873750595840.0, + "grad_norm": 2.4315846790103257, + "language_loss": 0.85440862, + "learning_rate": 3.83892824073784e-06, + "loss": 0.87593126, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.82421875, + "step": 2141, + "time_per_iteration": 2.3729195594787598 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01043187, + "balance_loss_clip": 1.01964295, + "balance_loss_mlp": 1.03118086, + "epoch": 0.12878400721479033, + "flos": 28365428880000.0, + "grad_norm": 2.0216933033230786, + "language_loss": 0.6776073, + "learning_rate": 3.838779658039834e-06, + "loss": 0.69922036, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.87109375, + "step": 2142, + "time_per_iteration": 2.4438652992248535 + }, + { + "auxiliary_loss_clip": 0.01119154, + "auxiliary_loss_mlp": 0.01039734, + "balance_loss_clip": 1.01579714, + "balance_loss_mlp": 1.03259099, + "epoch": 0.1288441304674583, + "flos": 25884430911360.0, + "grad_norm": 1.9742993299275668, + "language_loss": 0.83022559, + "learning_rate": 3.838631009720513e-06, + "loss": 0.85181445, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8671875, + "step": 2143, + "time_per_iteration": 2.43601131439209 + }, + { + "auxiliary_loss_clip": 0.01118479, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_clip": 1.0251298, + "balance_loss_mlp": 1.03445256, + "epoch": 0.12890425372012626, + "flos": 20812737882240.0, + "grad_norm": 1.8165525936827422, + "language_loss": 0.81771183, + "learning_rate": 3.83848229578518e-06, + "loss": 0.83937776, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.83984375, + "step": 2144, + "time_per_iteration": 2.4065892696380615 + }, + { + "auxiliary_loss_clip": 0.01114425, + "auxiliary_loss_mlp": 0.01048728, + "balance_loss_clip": 1.02498186, + "balance_loss_mlp": 1.02971935, + "epoch": 0.12896437697279423, + "flos": 22564749329280.0, + "grad_norm": 2.18530696944292, + "language_loss": 0.78207135, + "learning_rate": 3.838333516239142e-06, + "loss": 0.80370283, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.84765625, + "step": 2145, + "time_per_iteration": 2.3868374824523926 + }, + { + "auxiliary_loss_clip": 0.01117629, + "auxiliary_loss_mlp": 0.01048487, + "balance_loss_clip": 1.02367997, + "balance_loss_mlp": 1.03120184, + "epoch": 0.1290245002254622, + "flos": 17492811920640.0, + "grad_norm": 2.506205522025062, + "language_loss": 0.82470876, + "learning_rate": 3.83818467108771e-06, + "loss": 0.84636986, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.8671875, + "step": 2146, + "time_per_iteration": 2.360330820083618 + }, + { + "auxiliary_loss_clip": 0.01117362, + "auxiliary_loss_mlp": 0.01044412, + "balance_loss_clip": 1.02117813, + "balance_loss_mlp": 1.03334451, + "epoch": 0.12908462347813016, + "flos": 36314593810560.0, + "grad_norm": 3.324351171390452, + "language_loss": 0.71070415, + "learning_rate": 3.838035760336196e-06, + "loss": 0.73232186, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.83984375, + "step": 2147, + "time_per_iteration": 2.5244247913360596 + }, + { + "auxiliary_loss_clip": 0.01115186, + "auxiliary_loss_mlp": 0.01044679, + "balance_loss_clip": 1.02238691, + "balance_loss_mlp": 1.03071284, + "epoch": 0.12914474673079815, + "flos": 22527601776000.0, + "grad_norm": 2.3566456445030193, + "language_loss": 0.71779263, + "learning_rate": 3.837886783989914e-06, + "loss": 0.73939127, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.84375, + "step": 2148, + "time_per_iteration": 3.7509665489196777 + }, + { + "auxiliary_loss_clip": 0.01112953, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.01605821, + "balance_loss_mlp": 1.03363156, + "epoch": 0.12920486998346611, + "flos": 21470780787840.0, + "grad_norm": 1.4622631739764205, + "language_loss": 0.78898561, + "learning_rate": 3.837737742054179e-06, + "loss": 0.81048834, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.79296875, + "step": 2149, + "time_per_iteration": 2.397650718688965 + }, + { + "auxiliary_loss_clip": 0.0111474, + "auxiliary_loss_mlp": 0.01038862, + "balance_loss_clip": 1.01599813, + "balance_loss_mlp": 1.0322175, + "epoch": 0.12926499323613408, + "flos": 27307316171520.0, + "grad_norm": 1.9198554036163238, + "language_loss": 0.76388699, + "learning_rate": 3.837588634534312e-06, + "loss": 0.78542304, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.82421875, + "step": 2150, + "time_per_iteration": 3.8543319702148438 + }, + { + "auxiliary_loss_clip": 0.01115536, + "auxiliary_loss_mlp": 0.01046366, + "balance_loss_clip": 1.0236088, + "balance_loss_mlp": 1.03230286, + "epoch": 0.12932511648880204, + "flos": 22090035306240.0, + "grad_norm": 2.7470104760426186, + "language_loss": 0.70360446, + "learning_rate": 3.837439461435634e-06, + "loss": 0.72522342, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.83203125, + "step": 2151, + "time_per_iteration": 3.801199197769165 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01038906, + "balance_loss_clip": 1.0167923, + "balance_loss_mlp": 1.03347862, + "epoch": 0.12938523974147, + "flos": 20301749089920.0, + "grad_norm": 1.893864815881546, + "language_loss": 0.84205532, + "learning_rate": 3.837290222763467e-06, + "loss": 0.86359888, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.8203125, + "step": 2152, + "time_per_iteration": 3.7916481494903564 + }, + { + "auxiliary_loss_clip": 0.01115379, + "auxiliary_loss_mlp": 0.01048931, + "balance_loss_clip": 1.02667511, + "balance_loss_mlp": 1.03076506, + "epoch": 0.12944536299413797, + "flos": 19498956220800.0, + "grad_norm": 1.7597086564245399, + "language_loss": 0.78322285, + "learning_rate": 3.837140918523139e-06, + "loss": 0.80486596, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.84765625, + "step": 2153, + "time_per_iteration": 2.3810129165649414 + }, + { + "auxiliary_loss_clip": 0.01114693, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.02345061, + "balance_loss_mlp": 1.03024554, + "epoch": 0.12950548624680594, + "flos": 27706722658560.0, + "grad_norm": 1.5810401249289232, + "language_loss": 0.80105108, + "learning_rate": 3.836991548719977e-06, + "loss": 0.82265842, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.84375, + "step": 2154, + "time_per_iteration": 2.4317073822021484 + }, + { + "auxiliary_loss_clip": 0.01117873, + "auxiliary_loss_mlp": 0.01045428, + "balance_loss_clip": 1.02213466, + "balance_loss_mlp": 1.03387988, + "epoch": 0.12956560949947393, + "flos": 17564802877440.0, + "grad_norm": 1.883418700162283, + "language_loss": 0.83143741, + "learning_rate": 3.836842113359312e-06, + "loss": 0.85307044, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.83984375, + "step": 2155, + "time_per_iteration": 2.3684208393096924 + }, + { + "auxiliary_loss_clip": 0.01116601, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_clip": 1.01961398, + "balance_loss_mlp": 1.03140187, + "epoch": 0.1296257327521419, + "flos": 20739664673280.0, + "grad_norm": 2.60337306720873, + "language_loss": 0.75139713, + "learning_rate": 3.836692612446477e-06, + "loss": 0.77299178, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8515625, + "step": 2156, + "time_per_iteration": 2.3783843517303467 + }, + { + "auxiliary_loss_clip": 0.01110463, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.01924503, + "balance_loss_mlp": 1.03018153, + "epoch": 0.12968585600480986, + "flos": 16394898395520.0, + "grad_norm": 1.8736014366258005, + "language_loss": 0.86187625, + "learning_rate": 3.836543045986806e-06, + "loss": 0.88339609, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.80078125, + "step": 2157, + "time_per_iteration": 2.371201515197754 + }, + { + "auxiliary_loss_clip": 0.01114903, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_clip": 1.02079701, + "balance_loss_mlp": 1.02920556, + "epoch": 0.12974597925747783, + "flos": 28328281326720.0, + "grad_norm": 2.5809651804015252, + "language_loss": 0.80159575, + "learning_rate": 3.836393413985639e-06, + "loss": 0.8232044, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.85546875, + "step": 2158, + "time_per_iteration": 2.4271130561828613 + }, + { + "auxiliary_loss_clip": 0.01119188, + "auxiliary_loss_mlp": 0.01044262, + "balance_loss_clip": 1.02031255, + "balance_loss_mlp": 1.03189647, + "epoch": 0.1298061025101458, + "flos": 9682357288320.0, + "grad_norm": 2.3233885540699752, + "language_loss": 0.74530011, + "learning_rate": 3.836243716448315e-06, + "loss": 0.76693463, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.875, + "step": 2159, + "time_per_iteration": 2.3568336963653564 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.0205555, + "balance_loss_mlp": 1.02931619, + "epoch": 0.12986622576281376, + "flos": 27708293669760.0, + "grad_norm": 1.9939973787459886, + "language_loss": 0.82547617, + "learning_rate": 3.8360939533801755e-06, + "loss": 0.84701967, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.81640625, + "step": 2160, + "time_per_iteration": 2.436048984527588 + }, + { + "auxiliary_loss_clip": 0.01111275, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.01673508, + "balance_loss_mlp": 1.03352499, + "epoch": 0.12992634901548175, + "flos": 18801845637120.0, + "grad_norm": 1.6115067908069767, + "language_loss": 0.8194257, + "learning_rate": 3.835944124786566e-06, + "loss": 0.84093148, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.77734375, + "step": 2161, + "time_per_iteration": 2.3671443462371826 + }, + { + "auxiliary_loss_clip": 0.0111232, + "auxiliary_loss_mlp": 0.01045213, + "balance_loss_clip": 1.02225292, + "balance_loss_mlp": 1.03120661, + "epoch": 0.12998647226814972, + "flos": 29126430984960.0, + "grad_norm": 2.48374558455811, + "language_loss": 0.82309949, + "learning_rate": 3.835794230672833e-06, + "loss": 0.84467483, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.8125, + "step": 2162, + "time_per_iteration": 2.451164722442627 + }, + { + "auxiliary_loss_clip": 0.01114268, + "auxiliary_loss_mlp": 0.01039767, + "balance_loss_clip": 1.01642549, + "balance_loss_mlp": 1.02928841, + "epoch": 0.13004659552081768, + "flos": 19572657834240.0, + "grad_norm": 2.722292118622819, + "language_loss": 0.73022962, + "learning_rate": 3.8356442710443264e-06, + "loss": 0.75177002, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.8515625, + "step": 2163, + "time_per_iteration": 2.3654446601867676 + }, + { + "auxiliary_loss_clip": 0.01118636, + "auxiliary_loss_mlp": 0.01045333, + "balance_loss_clip": 1.0205729, + "balance_loss_mlp": 1.03252673, + "epoch": 0.13010671877348565, + "flos": 22489651261440.0, + "grad_norm": 2.095764827754389, + "language_loss": 0.80722785, + "learning_rate": 3.835494245906398e-06, + "loss": 0.82886755, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.859375, + "step": 2164, + "time_per_iteration": 2.3870291709899902 + }, + { + "auxiliary_loss_clip": 0.01114403, + "auxiliary_loss_mlp": 0.01043411, + "balance_loss_clip": 1.02221584, + "balance_loss_mlp": 1.03097296, + "epoch": 0.1301668420261536, + "flos": 23877099624960.0, + "grad_norm": 3.8116589712356306, + "language_loss": 0.72372723, + "learning_rate": 3.835344155264401e-06, + "loss": 0.74530542, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.83203125, + "step": 2165, + "time_per_iteration": 2.4053943157196045 + }, + { + "auxiliary_loss_clip": 0.01116581, + "auxiliary_loss_mlp": 0.0104437, + "balance_loss_clip": 1.02080274, + "balance_loss_mlp": 1.03170514, + "epoch": 0.13022696527882158, + "flos": 23148916064640.0, + "grad_norm": 1.9997241389468778, + "language_loss": 0.74730682, + "learning_rate": 3.835193999123692e-06, + "loss": 0.76891643, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.84765625, + "step": 2166, + "time_per_iteration": 2.3938140869140625 + }, + { + "auxiliary_loss_clip": 0.01114391, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_clip": 1.02433074, + "balance_loss_mlp": 1.03019023, + "epoch": 0.13028708853148954, + "flos": 26907281280000.0, + "grad_norm": 2.0620299613784137, + "language_loss": 0.83216614, + "learning_rate": 3.83504377748963e-06, + "loss": 0.853782, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.84375, + "step": 2167, + "time_per_iteration": 2.4385018348693848 + }, + { + "auxiliary_loss_clip": 0.01115013, + "auxiliary_loss_mlp": 0.01049261, + "balance_loss_clip": 1.02614653, + "balance_loss_mlp": 1.03209972, + "epoch": 0.13034721178415754, + "flos": 21250409086080.0, + "grad_norm": 1.513359311460835, + "language_loss": 0.8302232, + "learning_rate": 3.834893490367576e-06, + "loss": 0.85186589, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.828125, + "step": 2168, + "time_per_iteration": 2.40877628326416 + }, + { + "auxiliary_loss_clip": 0.0111569, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.02275646, + "balance_loss_mlp": 1.0308814, + "epoch": 0.1304073350368255, + "flos": 18766338917760.0, + "grad_norm": 1.984090395510942, + "language_loss": 0.80360681, + "learning_rate": 3.834743137762894e-06, + "loss": 0.82522798, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.84765625, + "step": 2169, + "time_per_iteration": 2.3826045989990234 + }, + { + "auxiliary_loss_clip": 0.0111662, + "auxiliary_loss_mlp": 0.01039846, + "balance_loss_clip": 1.01742303, + "balance_loss_mlp": 1.03232527, + "epoch": 0.13046745828949347, + "flos": 28363438932480.0, + "grad_norm": 2.2305857081038893, + "language_loss": 0.6652239, + "learning_rate": 3.834592719680948e-06, + "loss": 0.68678856, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.84375, + "step": 2170, + "time_per_iteration": 2.461083173751831 + }, + { + "auxiliary_loss_clip": 0.01114168, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_clip": 1.01999474, + "balance_loss_mlp": 1.03133297, + "epoch": 0.13052758154216143, + "flos": 29603798271360.0, + "grad_norm": 1.7907177832851473, + "language_loss": 0.66911954, + "learning_rate": 3.834442236127107e-06, + "loss": 0.69069207, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.828125, + "step": 2171, + "time_per_iteration": 2.4362990856170654 + }, + { + "auxiliary_loss_clip": 0.01113873, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.01773369, + "balance_loss_mlp": 1.03114748, + "epoch": 0.1305877047948294, + "flos": 19389852622080.0, + "grad_norm": 3.7675128109353753, + "language_loss": 0.71806937, + "learning_rate": 3.834291687106741e-06, + "loss": 0.73961735, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.828125, + "step": 2172, + "time_per_iteration": 2.380793333053589 + }, + { + "auxiliary_loss_clip": 0.01112477, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.0148859, + "balance_loss_mlp": 1.03321958, + "epoch": 0.13064782804749736, + "flos": 16872579884160.0, + "grad_norm": 2.3902403008806186, + "language_loss": 0.75815773, + "learning_rate": 3.834141072625224e-06, + "loss": 0.77964425, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.79296875, + "step": 2173, + "time_per_iteration": 2.3549892902374268 + }, + { + "auxiliary_loss_clip": 0.01115196, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.0247401, + "balance_loss_mlp": 1.03040016, + "epoch": 0.13070795130016533, + "flos": 24497925154560.0, + "grad_norm": 2.9225325642837494, + "language_loss": 0.70756316, + "learning_rate": 3.833990392687929e-06, + "loss": 0.72921574, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.84765625, + "step": 2174, + "time_per_iteration": 2.4198758602142334 + }, + { + "auxiliary_loss_clip": 0.01028906, + "auxiliary_loss_mlp": 0.01013085, + "balance_loss_clip": 1.00998521, + "balance_loss_mlp": 1.00515437, + "epoch": 0.13076807455283332, + "flos": 71051042211840.0, + "grad_norm": 0.7942536445789119, + "language_loss": 0.59030503, + "learning_rate": 3.833839647300235e-06, + "loss": 0.61072493, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.03088379, + "router_z_loss_mlp": 0.23828125, + "step": 2175, + "time_per_iteration": 3.144157648086548 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01043894, + "balance_loss_clip": 1.02068412, + "balance_loss_mlp": 1.03204215, + "epoch": 0.13082819780550128, + "flos": 20263519284480.0, + "grad_norm": 2.0487518159718525, + "language_loss": 0.79935825, + "learning_rate": 3.8336888364675215e-06, + "loss": 0.82094073, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8203125, + "step": 2176, + "time_per_iteration": 2.3896164894104004 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.02339339, + "balance_loss_mlp": 1.03182983, + "epoch": 0.13088832105816925, + "flos": 34202034086400.0, + "grad_norm": 1.8814432995606216, + "language_loss": 0.74356544, + "learning_rate": 3.83353796019517e-06, + "loss": 0.76518464, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.828125, + "step": 2177, + "time_per_iteration": 2.495523691177368 + }, + { + "auxiliary_loss_clip": 0.01109811, + "auxiliary_loss_mlp": 0.01038365, + "balance_loss_clip": 1.0149163, + "balance_loss_mlp": 1.02974164, + "epoch": 0.13094844431083721, + "flos": 17893998887040.0, + "grad_norm": 3.209198226754352, + "language_loss": 0.81699485, + "learning_rate": 3.833387018488565e-06, + "loss": 0.8384766, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.80078125, + "step": 2178, + "time_per_iteration": 2.3795485496520996 + }, + { + "auxiliary_loss_clip": 0.01114659, + "auxiliary_loss_mlp": 0.01045616, + "balance_loss_clip": 1.02328825, + "balance_loss_mlp": 1.03250599, + "epoch": 0.13100856756350518, + "flos": 17310355822080.0, + "grad_norm": 2.5560615218726506, + "language_loss": 0.74238646, + "learning_rate": 3.833236011353094e-06, + "loss": 0.76398921, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8203125, + "step": 2179, + "time_per_iteration": 2.342864751815796 + }, + { + "auxiliary_loss_clip": 0.01110019, + "auxiliary_loss_mlp": 0.01040483, + "balance_loss_clip": 1.0178926, + "balance_loss_mlp": 1.03046405, + "epoch": 0.13106869081617314, + "flos": 22199453107200.0, + "grad_norm": 2.048661509886946, + "language_loss": 0.84644121, + "learning_rate": 3.833084938794144e-06, + "loss": 0.86794627, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.79296875, + "step": 2180, + "time_per_iteration": 2.407223701477051 + }, + { + "auxiliary_loss_clip": 0.01112943, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.02467299, + "balance_loss_mlp": 1.0332936, + "epoch": 0.13112881406884114, + "flos": 21762026282880.0, + "grad_norm": 1.9753502228991404, + "language_loss": 0.89866793, + "learning_rate": 3.832933800817109e-06, + "loss": 0.92027295, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.796875, + "step": 2181, + "time_per_iteration": 2.3756368160247803 + }, + { + "auxiliary_loss_clip": 0.01114895, + "auxiliary_loss_mlp": 0.01039846, + "balance_loss_clip": 1.01674366, + "balance_loss_mlp": 1.03142238, + "epoch": 0.1311889373215091, + "flos": 23329975708800.0, + "grad_norm": 1.9743377326784572, + "language_loss": 0.68522978, + "learning_rate": 3.832782597427381e-06, + "loss": 0.70677722, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8359375, + "step": 2182, + "time_per_iteration": 2.4033896923065186 + }, + { + "auxiliary_loss_clip": 0.01112122, + "auxiliary_loss_mlp": 0.01042636, + "balance_loss_clip": 1.01881838, + "balance_loss_mlp": 1.03059769, + "epoch": 0.13124906057417707, + "flos": 21466381956480.0, + "grad_norm": 2.201104097311233, + "language_loss": 0.78646314, + "learning_rate": 3.832631328630357e-06, + "loss": 0.8080107, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.81640625, + "step": 2183, + "time_per_iteration": 2.3812484741210938 + }, + { + "auxiliary_loss_clip": 0.01112053, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.01770616, + "balance_loss_mlp": 1.03083646, + "epoch": 0.13130918382684503, + "flos": 23254284147840.0, + "grad_norm": 1.775200535092224, + "language_loss": 0.85511321, + "learning_rate": 3.832479994431435e-06, + "loss": 0.87664509, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8125, + "step": 2184, + "time_per_iteration": 2.4094576835632324 + }, + { + "auxiliary_loss_clip": 0.01112983, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.02326775, + "balance_loss_mlp": 1.02956557, + "epoch": 0.131369307079513, + "flos": 20849222119680.0, + "grad_norm": 1.8957297581875063, + "language_loss": 0.81803644, + "learning_rate": 3.8323285948360155e-06, + "loss": 0.83965051, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.83203125, + "step": 2185, + "time_per_iteration": 2.3804616928100586 + }, + { + "auxiliary_loss_clip": 0.01114125, + "auxiliary_loss_mlp": 0.01049463, + "balance_loss_clip": 1.02526331, + "balance_loss_mlp": 1.02938843, + "epoch": 0.13142943033218096, + "flos": 17857375004160.0, + "grad_norm": 2.2495711618574887, + "language_loss": 0.73018312, + "learning_rate": 3.832177129849501e-06, + "loss": 0.75181901, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.84765625, + "step": 2186, + "time_per_iteration": 2.3832266330718994 + }, + { + "auxiliary_loss_clip": 0.01111766, + "auxiliary_loss_mlp": 0.01042853, + "balance_loss_clip": 1.01833165, + "balance_loss_mlp": 1.03020239, + "epoch": 0.13148955358484893, + "flos": 20994984512640.0, + "grad_norm": 1.9049824607495724, + "language_loss": 0.79982936, + "learning_rate": 3.832025599477299e-06, + "loss": 0.82137549, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.81640625, + "step": 2187, + "time_per_iteration": 3.7775983810424805 + }, + { + "auxiliary_loss_clip": 0.01029256, + "auxiliary_loss_mlp": 0.01001701, + "balance_loss_clip": 0.99886429, + "balance_loss_mlp": 1.00545847, + "epoch": 0.13154967683751692, + "flos": 70169206291200.0, + "grad_norm": 0.8418638121387123, + "language_loss": 0.59020334, + "learning_rate": 3.831874003724815e-06, + "loss": 0.61051291, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.02832031, + "router_z_loss_mlp": 0.23828125, + "step": 2188, + "time_per_iteration": 3.027845859527588 + }, + { + "auxiliary_loss_clip": 0.01113306, + "auxiliary_loss_mlp": 0.0104065, + "balance_loss_clip": 1.0176661, + "balance_loss_mlp": 1.03258336, + "epoch": 0.1316098000901849, + "flos": 20375101589760.0, + "grad_norm": 2.7635906312789493, + "language_loss": 0.74211311, + "learning_rate": 3.83172234259746e-06, + "loss": 0.76365268, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.80859375, + "step": 2189, + "time_per_iteration": 2.404848575592041 + }, + { + "auxiliary_loss_clip": 0.01111108, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.02089787, + "balance_loss_mlp": 1.03086686, + "epoch": 0.13166992334285285, + "flos": 23220034237440.0, + "grad_norm": 2.841583407847499, + "language_loss": 0.72652352, + "learning_rate": 3.831570616100646e-06, + "loss": 0.74806881, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.80078125, + "step": 2190, + "time_per_iteration": 3.9414379596710205 + }, + { + "auxiliary_loss_clip": 0.01114807, + "auxiliary_loss_mlp": 0.01043525, + "balance_loss_clip": 1.02108979, + "balance_loss_mlp": 1.03255475, + "epoch": 0.13173004659552082, + "flos": 23329836063360.0, + "grad_norm": 2.0700281427836646, + "language_loss": 0.74798489, + "learning_rate": 3.831418824239789e-06, + "loss": 0.7695682, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.82421875, + "step": 2191, + "time_per_iteration": 2.4002647399902344 + }, + { + "auxiliary_loss_clip": 0.01113795, + "auxiliary_loss_mlp": 0.01042197, + "balance_loss_clip": 1.0187366, + "balance_loss_mlp": 1.03179884, + "epoch": 0.13179016984818878, + "flos": 21250443997440.0, + "grad_norm": 1.8918926065518962, + "language_loss": 0.79094386, + "learning_rate": 3.831266967020304e-06, + "loss": 0.81250381, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8203125, + "step": 2192, + "time_per_iteration": 3.7655372619628906 + }, + { + "auxiliary_loss_clip": 0.01113748, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.02704287, + "balance_loss_mlp": 1.03156579, + "epoch": 0.13185029310085675, + "flos": 17777913016320.0, + "grad_norm": 1.8315977299639683, + "language_loss": 0.85026896, + "learning_rate": 3.831115044447613e-06, + "loss": 0.87190259, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.8203125, + "step": 2193, + "time_per_iteration": 2.3712708950042725 + }, + { + "auxiliary_loss_clip": 0.01027596, + "auxiliary_loss_mlp": 0.01006696, + "balance_loss_clip": 1.00401342, + "balance_loss_mlp": 1.00445843, + "epoch": 0.1319104163535247, + "flos": 69848319185280.0, + "grad_norm": 0.7516439486959761, + "language_loss": 0.5407998, + "learning_rate": 3.830963056527136e-06, + "loss": 0.56114268, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.23242188, + "step": 2194, + "time_per_iteration": 2.982822895050049 + }, + { + "auxiliary_loss_clip": 0.01111447, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.0185287, + "balance_loss_mlp": 1.02902436, + "epoch": 0.1319705396061927, + "flos": 25191893715840.0, + "grad_norm": 2.706274089240783, + "language_loss": 0.72682089, + "learning_rate": 3.830811003264296e-06, + "loss": 0.74834168, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.82421875, + "step": 2195, + "time_per_iteration": 2.4383349418640137 + }, + { + "auxiliary_loss_clip": 0.01116518, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.01201487, + "balance_loss_mlp": 1.03085577, + "epoch": 0.13203066285886067, + "flos": 20739420293760.0, + "grad_norm": 2.188057333145781, + "language_loss": 0.77745765, + "learning_rate": 3.830658884664522e-06, + "loss": 0.79898763, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.859375, + "step": 2196, + "time_per_iteration": 2.4020564556121826 + }, + { + "auxiliary_loss_clip": 0.01112904, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02268314, + "balance_loss_mlp": 1.03074789, + "epoch": 0.13209078611152864, + "flos": 22053306689280.0, + "grad_norm": 2.3314154498043687, + "language_loss": 0.74964809, + "learning_rate": 3.830506700733241e-06, + "loss": 0.77123868, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8203125, + "step": 2197, + "time_per_iteration": 2.3787591457366943 + }, + { + "auxiliary_loss_clip": 0.01112553, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.01301527, + "balance_loss_mlp": 1.03028679, + "epoch": 0.1321509093641966, + "flos": 16284153962880.0, + "grad_norm": 1.9713334442069481, + "language_loss": 0.79398841, + "learning_rate": 3.830354451475884e-06, + "loss": 0.81547713, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8203125, + "step": 2198, + "time_per_iteration": 2.3466055393218994 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_clip": 1.02369249, + "balance_loss_mlp": 1.0311594, + "epoch": 0.13221103261686457, + "flos": 16982067507840.0, + "grad_norm": 2.16452769131162, + "language_loss": 0.82794964, + "learning_rate": 3.830202136897886e-06, + "loss": 0.84950697, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.79296875, + "step": 2199, + "time_per_iteration": 2.3588204383850098 + }, + { + "auxiliary_loss_clip": 0.01114966, + "auxiliary_loss_mlp": 0.01045151, + "balance_loss_clip": 1.02260888, + "balance_loss_mlp": 1.03232229, + "epoch": 0.13227115586953253, + "flos": 34232373924480.0, + "grad_norm": 2.0674267205343058, + "language_loss": 0.69267744, + "learning_rate": 3.8300497570046804e-06, + "loss": 0.71427858, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.82421875, + "step": 2200, + "time_per_iteration": 2.498288631439209 + }, + { + "auxiliary_loss_clip": 0.01110224, + "auxiliary_loss_mlp": 0.01044977, + "balance_loss_clip": 1.02214885, + "balance_loss_mlp": 1.02931237, + "epoch": 0.13233127912220052, + "flos": 20703599372160.0, + "grad_norm": 1.72394146433636, + "language_loss": 0.84412003, + "learning_rate": 3.829897311801707e-06, + "loss": 0.86567205, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.80859375, + "step": 2201, + "time_per_iteration": 2.41595721244812 + }, + { + "auxiliary_loss_clip": 0.0111353, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.01606727, + "balance_loss_mlp": 1.03146267, + "epoch": 0.1323914023748685, + "flos": 25804061228160.0, + "grad_norm": 1.8939885495026298, + "language_loss": 0.8684684, + "learning_rate": 3.829744801294406e-06, + "loss": 0.88999289, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8203125, + "step": 2202, + "time_per_iteration": 2.4088144302368164 + }, + { + "auxiliary_loss_clip": 0.01108057, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_clip": 1.02104235, + "balance_loss_mlp": 1.02929723, + "epoch": 0.13245152562753645, + "flos": 21250478908800.0, + "grad_norm": 1.9619794150131111, + "language_loss": 0.72687638, + "learning_rate": 3.8295922254882186e-06, + "loss": 0.74838626, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.7890625, + "step": 2203, + "time_per_iteration": 2.4206366539001465 + }, + { + "auxiliary_loss_clip": 0.011129, + "auxiliary_loss_mlp": 0.01045435, + "balance_loss_clip": 1.02400172, + "balance_loss_mlp": 1.03066564, + "epoch": 0.13251164888020442, + "flos": 26609856474240.0, + "grad_norm": 2.3983479091674726, + "language_loss": 0.73204589, + "learning_rate": 3.829439584388591e-06, + "loss": 0.75362927, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.82421875, + "step": 2204, + "time_per_iteration": 2.413806676864624 + }, + { + "auxiliary_loss_clip": 0.01114894, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.02656865, + "balance_loss_mlp": 1.03270948, + "epoch": 0.13257177213287238, + "flos": 29825217313920.0, + "grad_norm": 1.627487404452735, + "language_loss": 0.78527379, + "learning_rate": 3.8292868780009715e-06, + "loss": 0.8069191, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8203125, + "step": 2205, + "time_per_iteration": 2.448495626449585 + }, + { + "auxiliary_loss_clip": 0.01112746, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_clip": 1.01917768, + "balance_loss_mlp": 1.03346515, + "epoch": 0.13263189538554035, + "flos": 21287382082560.0, + "grad_norm": 3.000510399921291, + "language_loss": 0.78886485, + "learning_rate": 3.829134106330809e-06, + "loss": 0.81041896, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.79296875, + "step": 2206, + "time_per_iteration": 2.371365785598755 + }, + { + "auxiliary_loss_clip": 0.01112996, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_clip": 1.02140486, + "balance_loss_mlp": 1.03161049, + "epoch": 0.13269201863820831, + "flos": 16873138465920.0, + "grad_norm": 1.9451013166631212, + "language_loss": 0.74432611, + "learning_rate": 3.828981269383554e-06, + "loss": 0.76588583, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.8125, + "step": 2207, + "time_per_iteration": 2.374790668487549 + }, + { + "auxiliary_loss_clip": 0.01109145, + "auxiliary_loss_mlp": 0.01041137, + "balance_loss_clip": 1.01826119, + "balance_loss_mlp": 1.02947807, + "epoch": 0.1327521418908763, + "flos": 23767786558080.0, + "grad_norm": 1.6835914547074657, + "language_loss": 0.8392238, + "learning_rate": 3.828828367164663e-06, + "loss": 0.86072659, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.796875, + "step": 2208, + "time_per_iteration": 2.4047040939331055 + }, + { + "auxiliary_loss_clip": 0.01109737, + "auxiliary_loss_mlp": 0.010438, + "balance_loss_clip": 1.02266455, + "balance_loss_mlp": 1.03286314, + "epoch": 0.13281226514354427, + "flos": 26504383656960.0, + "grad_norm": 1.640844661454858, + "language_loss": 0.84896123, + "learning_rate": 3.828675399679592e-06, + "loss": 0.87049663, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.76953125, + "step": 2209, + "time_per_iteration": 2.446331024169922 + }, + { + "auxiliary_loss_clip": 0.01111576, + "auxiliary_loss_mlp": 0.01041452, + "balance_loss_clip": 1.02059054, + "balance_loss_mlp": 1.02965975, + "epoch": 0.13287238839621224, + "flos": 24497610952320.0, + "grad_norm": 3.8266139058343094, + "language_loss": 0.82185507, + "learning_rate": 3.8285223669337995e-06, + "loss": 0.84338534, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.8203125, + "step": 2210, + "time_per_iteration": 2.407039165496826 + }, + { + "auxiliary_loss_clip": 0.01028667, + "auxiliary_loss_mlp": 0.01008429, + "balance_loss_clip": 1.00596142, + "balance_loss_mlp": 1.0048151, + "epoch": 0.1329325116488802, + "flos": 67691071054080.0, + "grad_norm": 0.7631754503202972, + "language_loss": 0.57968318, + "learning_rate": 3.828369268932747e-06, + "loss": 0.60005414, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.0246582, + "router_z_loss_mlp": 0.23828125, + "step": 2211, + "time_per_iteration": 3.1298675537109375 + }, + { + "auxiliary_loss_clip": 0.01027464, + "auxiliary_loss_mlp": 0.01003969, + "balance_loss_clip": 1.00142968, + "balance_loss_mlp": 1.00369525, + "epoch": 0.13299263490154817, + "flos": 72258303715200.0, + "grad_norm": 0.7972035907439725, + "language_loss": 0.55318034, + "learning_rate": 3.828216105681899e-06, + "loss": 0.57349467, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.23828125, + "step": 2212, + "time_per_iteration": 3.120358943939209 + }, + { + "auxiliary_loss_clip": 0.01115866, + "auxiliary_loss_mlp": 0.01043653, + "balance_loss_clip": 1.02024066, + "balance_loss_mlp": 1.03086209, + "epoch": 0.13305275815421613, + "flos": 17930308567680.0, + "grad_norm": 3.686084098762373, + "language_loss": 0.74999905, + "learning_rate": 3.8280628771867205e-06, + "loss": 0.77159429, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8515625, + "step": 2213, + "time_per_iteration": 2.3642704486846924 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.0103986, + "balance_loss_clip": 1.01936781, + "balance_loss_mlp": 1.02920556, + "epoch": 0.13311288140688413, + "flos": 22339943884800.0, + "grad_norm": 1.9918070964647272, + "language_loss": 0.79267049, + "learning_rate": 3.8279095834526815e-06, + "loss": 0.81412941, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.765625, + "step": 2214, + "time_per_iteration": 2.381934881210327 + }, + { + "auxiliary_loss_clip": 0.01111644, + "auxiliary_loss_mlp": 0.01048696, + "balance_loss_clip": 1.02578413, + "balance_loss_mlp": 1.03067029, + "epoch": 0.1331730046595521, + "flos": 31867531649280.0, + "grad_norm": 1.8632679071922624, + "language_loss": 0.69134682, + "learning_rate": 3.8277562244852495e-06, + "loss": 0.71295023, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.8125, + "step": 2215, + "time_per_iteration": 2.469895601272583 + }, + { + "auxiliary_loss_clip": 0.0110936, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.01729095, + "balance_loss_mlp": 1.0285362, + "epoch": 0.13323312791222006, + "flos": 22565447556480.0, + "grad_norm": 1.7429587827665565, + "language_loss": 0.8103472, + "learning_rate": 3.827602800289901e-06, + "loss": 0.83182919, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.8046875, + "step": 2216, + "time_per_iteration": 2.4177145957946777 + }, + { + "auxiliary_loss_clip": 0.01110098, + "auxiliary_loss_mlp": 0.01050275, + "balance_loss_clip": 1.02679074, + "balance_loss_mlp": 1.02934813, + "epoch": 0.13329325116488802, + "flos": 15084433313280.0, + "grad_norm": 1.9884474111186918, + "language_loss": 0.87180638, + "learning_rate": 3.827449310872109e-06, + "loss": 0.89341009, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8046875, + "step": 2217, + "time_per_iteration": 2.3541815280914307 + }, + { + "auxiliary_loss_clip": 0.01110352, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.02001226, + "balance_loss_mlp": 1.03001714, + "epoch": 0.133353374417556, + "flos": 27452450160000.0, + "grad_norm": 2.0869599672048142, + "language_loss": 0.73178005, + "learning_rate": 3.827295756237351e-06, + "loss": 0.75331324, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8046875, + "step": 2218, + "time_per_iteration": 2.4269580841064453 + }, + { + "auxiliary_loss_clip": 0.01110731, + "auxiliary_loss_mlp": 0.01040425, + "balance_loss_clip": 1.0173105, + "balance_loss_mlp": 1.03010273, + "epoch": 0.13341349767022395, + "flos": 24093631077120.0, + "grad_norm": 1.9308602690623262, + "language_loss": 0.87036943, + "learning_rate": 3.8271421363911095e-06, + "loss": 0.89188099, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.8046875, + "step": 2219, + "time_per_iteration": 2.412137985229492 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_clip": 1.02320623, + "balance_loss_mlp": 1.03274846, + "epoch": 0.13347362092289192, + "flos": 24132209996160.0, + "grad_norm": 1.8041370165092492, + "language_loss": 0.77078104, + "learning_rate": 3.826988451338864e-06, + "loss": 0.79235017, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.79296875, + "step": 2220, + "time_per_iteration": 2.3967785835266113 + }, + { + "auxiliary_loss_clip": 0.01107377, + "auxiliary_loss_mlp": 0.01038252, + "balance_loss_clip": 1.01708031, + "balance_loss_mlp": 1.02819431, + "epoch": 0.1335337441755599, + "flos": 18435711542400.0, + "grad_norm": 7.832869738206291, + "language_loss": 0.78862309, + "learning_rate": 3.826834701086101e-06, + "loss": 0.8100794, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.7890625, + "step": 2221, + "time_per_iteration": 2.364459753036499 + }, + { + "auxiliary_loss_clip": 0.01027571, + "auxiliary_loss_mlp": 0.01008219, + "balance_loss_clip": 1.00550091, + "balance_loss_mlp": 1.00492895, + "epoch": 0.13359386742822787, + "flos": 50609395837440.0, + "grad_norm": 1.0235370816867682, + "language_loss": 0.69041914, + "learning_rate": 3.826680885638306e-06, + "loss": 0.71077704, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.2265625, + "step": 2222, + "time_per_iteration": 2.852614641189575 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.03144515, + "epoch": 0.13365399068089584, + "flos": 22777615088640.0, + "grad_norm": 2.5645377689459323, + "language_loss": 0.67273825, + "learning_rate": 3.826527005000969e-06, + "loss": 0.69428831, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.80078125, + "step": 2223, + "time_per_iteration": 2.4104299545288086 + }, + { + "auxiliary_loss_clip": 0.01111142, + "auxiliary_loss_mlp": 0.01041346, + "balance_loss_clip": 1.01845801, + "balance_loss_mlp": 1.02990484, + "epoch": 0.1337141139335638, + "flos": 12530781072000.0, + "grad_norm": 2.3977038475576817, + "language_loss": 0.82913315, + "learning_rate": 3.826373059179582e-06, + "loss": 0.85065806, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8125, + "step": 2224, + "time_per_iteration": 2.357208013534546 + }, + { + "auxiliary_loss_clip": 0.01115536, + "auxiliary_loss_mlp": 0.01043834, + "balance_loss_clip": 1.01978946, + "balance_loss_mlp": 1.03101516, + "epoch": 0.13377423718623177, + "flos": 23037857429760.0, + "grad_norm": 2.4352710360103487, + "language_loss": 0.6528067, + "learning_rate": 3.826219048179639e-06, + "loss": 0.67440045, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.84375, + "step": 2225, + "time_per_iteration": 2.411724090576172 + }, + { + "auxiliary_loss_clip": 0.01111973, + "auxiliary_loss_mlp": 0.01051142, + "balance_loss_clip": 1.02871895, + "balance_loss_mlp": 1.03108084, + "epoch": 0.13383436043889974, + "flos": 16215479585280.0, + "grad_norm": 2.2820423654325768, + "language_loss": 0.89218378, + "learning_rate": 3.826064972006635e-06, + "loss": 0.9138149, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.80859375, + "step": 2226, + "time_per_iteration": 2.3514645099639893 + }, + { + "auxiliary_loss_clip": 0.01111681, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02270985, + "balance_loss_mlp": 1.03023171, + "epoch": 0.1338944836915677, + "flos": 24278530970880.0, + "grad_norm": 2.3901396881263177, + "language_loss": 0.74010229, + "learning_rate": 3.825910830666069e-06, + "loss": 0.76167989, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.8125, + "step": 2227, + "time_per_iteration": 3.7839083671569824 + }, + { + "auxiliary_loss_clip": 0.01109886, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.02022851, + "balance_loss_mlp": 1.02917624, + "epoch": 0.1339546069442357, + "flos": 17597900712960.0, + "grad_norm": 1.9744666336955208, + "language_loss": 0.85193986, + "learning_rate": 3.825756624163443e-06, + "loss": 0.87346053, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.80859375, + "step": 2228, + "time_per_iteration": 2.4063632488250732 + }, + { + "auxiliary_loss_clip": 0.0111183, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_clip": 1.02082753, + "balance_loss_mlp": 1.03083122, + "epoch": 0.13401473019690366, + "flos": 18989049680640.0, + "grad_norm": 2.210839270971356, + "language_loss": 0.80781674, + "learning_rate": 3.825602352504259e-06, + "loss": 0.82936597, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.80859375, + "step": 2229, + "time_per_iteration": 3.8466081619262695 + }, + { + "auxiliary_loss_clip": 0.01112713, + "auxiliary_loss_mlp": 0.01054344, + "balance_loss_clip": 1.03212357, + "balance_loss_mlp": 1.03109515, + "epoch": 0.13407485344957162, + "flos": 26942578531200.0, + "grad_norm": 1.7333625897589784, + "language_loss": 0.73866439, + "learning_rate": 3.825448015694023e-06, + "loss": 0.76033497, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.81640625, + "step": 2230, + "time_per_iteration": 3.813413619995117 + }, + { + "auxiliary_loss_clip": 0.01114408, + "auxiliary_loss_mlp": 0.01052288, + "balance_loss_clip": 1.02911401, + "balance_loss_mlp": 1.03100502, + "epoch": 0.1341349767022396, + "flos": 20338338061440.0, + "grad_norm": 1.665509430892685, + "language_loss": 0.80048466, + "learning_rate": 3.8252936137382435e-06, + "loss": 0.8221516, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8359375, + "step": 2231, + "time_per_iteration": 3.74367356300354 + }, + { + "auxiliary_loss_clip": 0.01113262, + "auxiliary_loss_mlp": 0.01053751, + "balance_loss_clip": 1.03018284, + "balance_loss_mlp": 1.03137553, + "epoch": 0.13419509995490755, + "flos": 29860724033280.0, + "grad_norm": 1.6743044726072736, + "language_loss": 0.72241318, + "learning_rate": 3.82513914664243e-06, + "loss": 0.74408329, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.81640625, + "step": 2232, + "time_per_iteration": 2.4403839111328125 + }, + { + "auxiliary_loss_clip": 0.01116062, + "auxiliary_loss_mlp": 0.01045066, + "balance_loss_clip": 1.02165365, + "balance_loss_mlp": 1.0313139, + "epoch": 0.13425522320757552, + "flos": 26941775569920.0, + "grad_norm": 2.475825651129534, + "language_loss": 0.65877473, + "learning_rate": 3.824984614412095e-06, + "loss": 0.68038601, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.84765625, + "step": 2233, + "time_per_iteration": 2.420086145401001 + }, + { + "auxiliary_loss_clip": 0.01110733, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_clip": 1.01764095, + "balance_loss_mlp": 1.0282203, + "epoch": 0.1343153464602435, + "flos": 15776411927040.0, + "grad_norm": 2.6796130628290333, + "language_loss": 0.81137764, + "learning_rate": 3.824830017052753e-06, + "loss": 0.83289182, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.82421875, + "step": 2234, + "time_per_iteration": 2.3355088233947754 + }, + { + "auxiliary_loss_clip": 0.01114447, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.02279413, + "balance_loss_mlp": 1.03209209, + "epoch": 0.13437546971291148, + "flos": 24313653665280.0, + "grad_norm": 2.0611952609550626, + "language_loss": 0.82459158, + "learning_rate": 3.824675354569923e-06, + "loss": 0.8461951, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.82421875, + "step": 2235, + "time_per_iteration": 2.398684501647949 + }, + { + "auxiliary_loss_clip": 0.01113562, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.02316618, + "balance_loss_mlp": 1.03003097, + "epoch": 0.13443559296557944, + "flos": 26649482734080.0, + "grad_norm": 1.8639521609867127, + "language_loss": 0.86475575, + "learning_rate": 3.824520626969122e-06, + "loss": 0.88635147, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8359375, + "step": 2236, + "time_per_iteration": 2.4377503395080566 + }, + { + "auxiliary_loss_clip": 0.01115367, + "auxiliary_loss_mlp": 0.01041982, + "balance_loss_clip": 1.01909375, + "balance_loss_mlp": 1.03230786, + "epoch": 0.1344957162182474, + "flos": 21795193941120.0, + "grad_norm": 1.6210711622806835, + "language_loss": 0.77399528, + "learning_rate": 3.824365834255874e-06, + "loss": 0.79556882, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.83203125, + "step": 2237, + "time_per_iteration": 2.3981072902679443 + }, + { + "auxiliary_loss_clip": 0.01115959, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.02498269, + "balance_loss_mlp": 1.0319252, + "epoch": 0.13455583947091537, + "flos": 19864531733760.0, + "grad_norm": 3.126998467904437, + "language_loss": 0.78480875, + "learning_rate": 3.824210976435702e-06, + "loss": 0.8064729, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.83984375, + "step": 2238, + "time_per_iteration": 2.369377851486206 + }, + { + "auxiliary_loss_clip": 0.01108949, + "auxiliary_loss_mlp": 0.01037434, + "balance_loss_clip": 1.01383018, + "balance_loss_mlp": 1.02933514, + "epoch": 0.13461596272358334, + "flos": 30845519153280.0, + "grad_norm": 2.5142370982128113, + "language_loss": 0.68518054, + "learning_rate": 3.824056053514132e-06, + "loss": 0.70664436, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.796875, + "step": 2239, + "time_per_iteration": 2.4480724334716797 + }, + { + "auxiliary_loss_clip": 0.01115627, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.02345467, + "balance_loss_mlp": 1.0310353, + "epoch": 0.1346760859762513, + "flos": 12493633518720.0, + "grad_norm": 2.4649535556244233, + "language_loss": 0.8140105, + "learning_rate": 3.823901065496693e-06, + "loss": 0.8356365, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.84765625, + "step": 2240, + "time_per_iteration": 2.3443541526794434 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01047804, + "balance_loss_clip": 1.02402151, + "balance_loss_mlp": 1.03058279, + "epoch": 0.1347362092289193, + "flos": 21834924935040.0, + "grad_norm": 1.6357289943000772, + "language_loss": 0.77624297, + "learning_rate": 3.823746012388918e-06, + "loss": 0.79786718, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.83984375, + "step": 2241, + "time_per_iteration": 2.386347532272339 + }, + { + "auxiliary_loss_clip": 0.01109833, + "auxiliary_loss_mlp": 0.0104163, + "balance_loss_clip": 1.0201124, + "balance_loss_mlp": 1.03068209, + "epoch": 0.13479633248158726, + "flos": 23508451912320.0, + "grad_norm": 1.659205909924718, + "language_loss": 0.82998061, + "learning_rate": 3.823590894196339e-06, + "loss": 0.85149527, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7890625, + "step": 2242, + "time_per_iteration": 2.4124395847320557 + }, + { + "auxiliary_loss_clip": 0.01114746, + "auxiliary_loss_mlp": 0.0104936, + "balance_loss_clip": 1.02486253, + "balance_loss_mlp": 1.03196406, + "epoch": 0.13485645573425523, + "flos": 29343241728000.0, + "grad_norm": 3.4559024996267147, + "language_loss": 0.64423156, + "learning_rate": 3.823435710924491e-06, + "loss": 0.66587257, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.828125, + "step": 2243, + "time_per_iteration": 2.446357250213623 + }, + { + "auxiliary_loss_clip": 0.01107438, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.01361525, + "balance_loss_mlp": 1.02859759, + "epoch": 0.1349165789869232, + "flos": 28035883756800.0, + "grad_norm": 1.9068844032861496, + "language_loss": 0.72579181, + "learning_rate": 3.823280462578913e-06, + "loss": 0.7472204, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.7890625, + "step": 2244, + "time_per_iteration": 2.431729316711426 + }, + { + "auxiliary_loss_clip": 0.01111567, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.02393365, + "balance_loss_mlp": 1.03124416, + "epoch": 0.13497670223959116, + "flos": 22852713156480.0, + "grad_norm": 1.5985867586427198, + "language_loss": 0.85773522, + "learning_rate": 3.8231251491651455e-06, + "loss": 0.87931001, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.8046875, + "step": 2245, + "time_per_iteration": 2.402890920639038 + }, + { + "auxiliary_loss_clip": 0.0110876, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.01528263, + "balance_loss_mlp": 1.03117847, + "epoch": 0.13503682549225912, + "flos": 16503757614720.0, + "grad_norm": 3.0225897100770207, + "language_loss": 0.7903704, + "learning_rate": 3.822969770688732e-06, + "loss": 0.81183338, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.7734375, + "step": 2246, + "time_per_iteration": 2.375706911087036 + }, + { + "auxiliary_loss_clip": 0.01026554, + "auxiliary_loss_mlp": 0.01003031, + "balance_loss_clip": 1.00014651, + "balance_loss_mlp": 1.00370598, + "epoch": 0.1350969487449271, + "flos": 70753023912960.0, + "grad_norm": 0.7460429799412394, + "language_loss": 0.6049459, + "learning_rate": 3.8228143271552154e-06, + "loss": 0.62524176, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.02880859, + "router_z_loss_mlp": 0.22851562, + "step": 2247, + "time_per_iteration": 3.1216773986816406 + }, + { + "auxiliary_loss_clip": 0.01116568, + "auxiliary_loss_mlp": 0.01046591, + "balance_loss_clip": 1.02258193, + "balance_loss_mlp": 1.03152978, + "epoch": 0.13515707199759508, + "flos": 23074865337600.0, + "grad_norm": 1.9198332175671928, + "language_loss": 0.81013012, + "learning_rate": 3.822658818570145e-06, + "loss": 0.83176172, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8515625, + "step": 2248, + "time_per_iteration": 2.382345199584961 + }, + { + "auxiliary_loss_clip": 0.01108966, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.0176791, + "balance_loss_mlp": 1.03090119, + "epoch": 0.13521719525026304, + "flos": 23185225745280.0, + "grad_norm": 1.7824571080205176, + "language_loss": 0.76665759, + "learning_rate": 3.822503244939069e-06, + "loss": 0.78813815, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.78125, + "step": 2249, + "time_per_iteration": 2.39690899848938 + }, + { + "auxiliary_loss_clip": 0.01112266, + "auxiliary_loss_mlp": 0.01047254, + "balance_loss_clip": 1.02599943, + "balance_loss_mlp": 1.03164351, + "epoch": 0.135277318502931, + "flos": 24789764142720.0, + "grad_norm": 1.4487850858130753, + "language_loss": 0.84145266, + "learning_rate": 3.822347606267541e-06, + "loss": 0.86304784, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.8046875, + "step": 2250, + "time_per_iteration": 2.437000274658203 + }, + { + "auxiliary_loss_clip": 0.01112671, + "auxiliary_loss_mlp": 0.01045562, + "balance_loss_clip": 1.02204239, + "balance_loss_mlp": 1.03052807, + "epoch": 0.13533744175559898, + "flos": 21907439562240.0, + "grad_norm": 2.9768659822997896, + "language_loss": 0.82101446, + "learning_rate": 3.8221919025611145e-06, + "loss": 0.84259683, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.8203125, + "step": 2251, + "time_per_iteration": 2.400810480117798 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01045632, + "balance_loss_clip": 1.02289891, + "balance_loss_mlp": 1.02972174, + "epoch": 0.13539756500826694, + "flos": 21210678092160.0, + "grad_norm": 1.6350101086519406, + "language_loss": 0.85983527, + "learning_rate": 3.822036133825346e-06, + "loss": 0.88140088, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.8125, + "step": 2252, + "time_per_iteration": 2.3785557746887207 + }, + { + "auxiliary_loss_clip": 0.01026414, + "auxiliary_loss_mlp": 0.01003823, + "balance_loss_clip": 1.00139153, + "balance_loss_mlp": 1.00329792, + "epoch": 0.1354576882609349, + "flos": 63238981656960.0, + "grad_norm": 0.7738025226790045, + "language_loss": 0.61805081, + "learning_rate": 3.821880300065794e-06, + "loss": 0.63835323, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.23144531, + "step": 2253, + "time_per_iteration": 3.105794906616211 + }, + { + "auxiliary_loss_clip": 0.01112569, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.02329731, + "balance_loss_mlp": 1.03244042, + "epoch": 0.1355178115136029, + "flos": 25481882401920.0, + "grad_norm": 1.8286536160423945, + "language_loss": 0.89134341, + "learning_rate": 3.821724401288022e-06, + "loss": 0.91292465, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8046875, + "step": 2254, + "time_per_iteration": 2.444282054901123 + }, + { + "auxiliary_loss_clip": 0.01115733, + "auxiliary_loss_mlp": 0.01052232, + "balance_loss_clip": 1.02819943, + "balance_loss_mlp": 1.02996039, + "epoch": 0.13557793476627086, + "flos": 21615879864960.0, + "grad_norm": 1.874734162542784, + "language_loss": 0.84478366, + "learning_rate": 3.821568437497592e-06, + "loss": 0.86646336, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.859375, + "step": 2255, + "time_per_iteration": 2.3827905654907227 + }, + { + "auxiliary_loss_clip": 0.01110767, + "auxiliary_loss_mlp": 0.0104296, + "balance_loss_clip": 1.02015567, + "balance_loss_mlp": 1.02874386, + "epoch": 0.13563805801893883, + "flos": 24927322366080.0, + "grad_norm": 2.5800977448637177, + "language_loss": 0.74805433, + "learning_rate": 3.821412408700069e-06, + "loss": 0.76959157, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8203125, + "step": 2256, + "time_per_iteration": 2.392350912094116 + }, + { + "auxiliary_loss_clip": 0.01113995, + "auxiliary_loss_mlp": 0.01049828, + "balance_loss_clip": 1.02698743, + "balance_loss_mlp": 1.03051257, + "epoch": 0.1356981812716068, + "flos": 14749581663360.0, + "grad_norm": 2.636399814040291, + "language_loss": 0.83367229, + "learning_rate": 3.821256314901023e-06, + "loss": 0.8553105, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8359375, + "step": 2257, + "time_per_iteration": 2.3453638553619385 + }, + { + "auxiliary_loss_clip": 0.01117427, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02176523, + "balance_loss_mlp": 1.03027248, + "epoch": 0.13575830452427476, + "flos": 11107791077760.0, + "grad_norm": 2.4127163814424946, + "language_loss": 0.81851101, + "learning_rate": 3.821100156106024e-06, + "loss": 0.84013999, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.875, + "step": 2258, + "time_per_iteration": 2.3260347843170166 + }, + { + "auxiliary_loss_clip": 0.01112168, + "auxiliary_loss_mlp": 0.01047014, + "balance_loss_clip": 1.02205133, + "balance_loss_mlp": 1.03000212, + "epoch": 0.13581842777694272, + "flos": 17959531242240.0, + "grad_norm": 2.4992820719003985, + "language_loss": 0.82302582, + "learning_rate": 3.820943932320644e-06, + "loss": 0.84461761, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.8203125, + "step": 2259, + "time_per_iteration": 2.3578600883483887 + }, + { + "auxiliary_loss_clip": 0.01115546, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_clip": 1.02334094, + "balance_loss_mlp": 1.03380466, + "epoch": 0.1358785510296107, + "flos": 22856029735680.0, + "grad_norm": 1.8211233775954654, + "language_loss": 0.73700893, + "learning_rate": 3.82078764355046e-06, + "loss": 0.75860929, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.81640625, + "step": 2260, + "time_per_iteration": 2.391852378845215 + }, + { + "auxiliary_loss_clip": 0.01109938, + "auxiliary_loss_mlp": 0.01051966, + "balance_loss_clip": 1.03000736, + "balance_loss_mlp": 1.02989888, + "epoch": 0.13593867428227868, + "flos": 25738214670720.0, + "grad_norm": 2.3528590007555854, + "language_loss": 0.75280863, + "learning_rate": 3.820631289801048e-06, + "loss": 0.77442765, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.80078125, + "step": 2261, + "time_per_iteration": 2.457080602645874 + }, + { + "auxiliary_loss_clip": 0.0111298, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.01767623, + "balance_loss_mlp": 1.03044224, + "epoch": 0.13599879753494665, + "flos": 31247858194560.0, + "grad_norm": 3.7731977894452378, + "language_loss": 0.63001621, + "learning_rate": 3.82047487107799e-06, + "loss": 0.65154088, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.82421875, + "step": 2262, + "time_per_iteration": 2.46036696434021 + }, + { + "auxiliary_loss_clip": 0.01111991, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.01963282, + "balance_loss_mlp": 1.03004408, + "epoch": 0.1360589207876146, + "flos": 23913898064640.0, + "grad_norm": 2.647698818670158, + "language_loss": 0.82917178, + "learning_rate": 3.820318387386865e-06, + "loss": 0.85071099, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8203125, + "step": 2263, + "time_per_iteration": 2.4298808574676514 + }, + { + "auxiliary_loss_clip": 0.01116094, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.0279026, + "balance_loss_mlp": 1.03200746, + "epoch": 0.13611904404028258, + "flos": 19973181484800.0, + "grad_norm": 2.037974726999726, + "language_loss": 0.87724793, + "learning_rate": 3.8201618387332605e-06, + "loss": 0.8989296, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.84375, + "step": 2264, + "time_per_iteration": 2.3881897926330566 + }, + { + "auxiliary_loss_clip": 0.01116603, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.01901174, + "balance_loss_mlp": 1.03215957, + "epoch": 0.13617916729295054, + "flos": 15339753152640.0, + "grad_norm": 3.0086405021950764, + "language_loss": 0.71634519, + "learning_rate": 3.82000522512276e-06, + "loss": 0.73794365, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.84375, + "step": 2265, + "time_per_iteration": 2.362116813659668 + }, + { + "auxiliary_loss_clip": 0.01110729, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.01580715, + "balance_loss_mlp": 1.03179741, + "epoch": 0.1362392905456185, + "flos": 27450285655680.0, + "grad_norm": 2.2099575569314935, + "language_loss": 0.66132319, + "learning_rate": 3.819848546560957e-06, + "loss": 0.68279195, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7890625, + "step": 2266, + "time_per_iteration": 2.4455068111419678 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.01050643, + "balance_loss_clip": 1.02792215, + "balance_loss_mlp": 1.03137374, + "epoch": 0.1362994137982865, + "flos": 25007866606080.0, + "grad_norm": 1.585383208781827, + "language_loss": 0.76206291, + "learning_rate": 3.819691803053439e-06, + "loss": 0.78368604, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.8046875, + "step": 2267, + "time_per_iteration": 3.8275153636932373 + }, + { + "auxiliary_loss_clip": 0.01110651, + "auxiliary_loss_mlp": 0.01041567, + "balance_loss_clip": 1.01919103, + "balance_loss_mlp": 1.02962959, + "epoch": 0.13635953705095447, + "flos": 20301993469440.0, + "grad_norm": 2.200944711480025, + "language_loss": 0.81724751, + "learning_rate": 3.819534994605802e-06, + "loss": 0.83876967, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.80859375, + "step": 2268, + "time_per_iteration": 2.3766119480133057 + }, + { + "auxiliary_loss_clip": 0.01109573, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.01694036, + "balance_loss_mlp": 1.03026772, + "epoch": 0.13641966030362243, + "flos": 31357066527360.0, + "grad_norm": 1.7898561510552156, + "language_loss": 0.75749362, + "learning_rate": 3.819378121223641e-06, + "loss": 0.77897686, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.79296875, + "step": 2269, + "time_per_iteration": 3.867666244506836 + }, + { + "auxiliary_loss_clip": 0.01115323, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.01519012, + "balance_loss_mlp": 1.03264654, + "epoch": 0.1364797835562904, + "flos": 20477257827840.0, + "grad_norm": 2.098228893476109, + "language_loss": 0.80965889, + "learning_rate": 3.819221182912555e-06, + "loss": 0.8311832, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.828125, + "step": 2270, + "time_per_iteration": 5.221672773361206 + }, + { + "auxiliary_loss_clip": 0.01115496, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.02586031, + "balance_loss_mlp": 1.03060257, + "epoch": 0.13653990680895836, + "flos": 13077520963200.0, + "grad_norm": 2.722045196078793, + "language_loss": 0.75869644, + "learning_rate": 3.819064179678145e-06, + "loss": 0.78033966, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.84765625, + "step": 2271, + "time_per_iteration": 2.361807107925415 + }, + { + "auxiliary_loss_clip": 0.01114825, + "auxiliary_loss_mlp": 0.01044332, + "balance_loss_clip": 1.02134812, + "balance_loss_mlp": 1.03116345, + "epoch": 0.13660003006162633, + "flos": 16945757827200.0, + "grad_norm": 1.8230049855742485, + "language_loss": 0.80149591, + "learning_rate": 3.8189071115260134e-06, + "loss": 0.82308745, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8359375, + "step": 2272, + "time_per_iteration": 2.3897011280059814 + }, + { + "auxiliary_loss_clip": 0.01027994, + "auxiliary_loss_mlp": 0.01020793, + "balance_loss_clip": 1.01800334, + "balance_loss_mlp": 1.00485504, + "epoch": 0.1366601533142943, + "flos": 68679357310080.0, + "grad_norm": 0.6956821962683516, + "language_loss": 0.60680348, + "learning_rate": 3.818749978461765e-06, + "loss": 0.62729138, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.0279541, + "router_z_loss_mlp": 0.23144531, + "step": 2273, + "time_per_iteration": 3.1359124183654785 + }, + { + "auxiliary_loss_clip": 0.01109442, + "auxiliary_loss_mlp": 0.01042235, + "balance_loss_clip": 1.02008581, + "balance_loss_mlp": 1.02961373, + "epoch": 0.13672027656696228, + "flos": 19243252356480.0, + "grad_norm": 1.6551537076379452, + "language_loss": 0.77221978, + "learning_rate": 3.8185927804910096e-06, + "loss": 0.79373658, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.796875, + "step": 2274, + "time_per_iteration": 2.3578431606292725 + }, + { + "auxiliary_loss_clip": 0.01112591, + "auxiliary_loss_mlp": 0.01045059, + "balance_loss_clip": 1.02202773, + "balance_loss_mlp": 1.03039908, + "epoch": 0.13678039981963025, + "flos": 24533780987520.0, + "grad_norm": 2.473963726005356, + "language_loss": 0.7832284, + "learning_rate": 3.818435517619355e-06, + "loss": 0.80480492, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8203125, + "step": 2275, + "time_per_iteration": 2.43884539604187 + }, + { + "auxiliary_loss_clip": 0.01110935, + "auxiliary_loss_mlp": 0.01041719, + "balance_loss_clip": 1.02028525, + "balance_loss_mlp": 1.0302285, + "epoch": 0.13684052307229821, + "flos": 15668425491840.0, + "grad_norm": 2.7714124972923755, + "language_loss": 0.81413603, + "learning_rate": 3.818278189852415e-06, + "loss": 0.83566254, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.80859375, + "step": 2276, + "time_per_iteration": 2.362184524536133 + }, + { + "auxiliary_loss_clip": 0.01119973, + "auxiliary_loss_mlp": 0.01049788, + "balance_loss_clip": 1.02260852, + "balance_loss_mlp": 1.03208447, + "epoch": 0.13690064632496618, + "flos": 28363473843840.0, + "grad_norm": 2.480011971937364, + "language_loss": 0.69309795, + "learning_rate": 3.8181207971958025e-06, + "loss": 0.71479559, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.87890625, + "step": 2277, + "time_per_iteration": 2.4605045318603516 + }, + { + "auxiliary_loss_clip": 0.01112767, + "auxiliary_loss_mlp": 0.01054563, + "balance_loss_clip": 1.03097129, + "balance_loss_mlp": 1.03087139, + "epoch": 0.13696076957763414, + "flos": 23403642410880.0, + "grad_norm": 2.1244063974828564, + "language_loss": 0.80648291, + "learning_rate": 3.817963339655137e-06, + "loss": 0.82815623, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.8203125, + "step": 2278, + "time_per_iteration": 2.3916478157043457 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.01930034, + "balance_loss_mlp": 1.03335357, + "epoch": 0.1370208928303021, + "flos": 37195068188160.0, + "grad_norm": 2.334388181097841, + "language_loss": 0.7501992, + "learning_rate": 3.8178058172360346e-06, + "loss": 0.77174109, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.7890625, + "step": 2279, + "time_per_iteration": 2.517383098602295 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.02408552, + "balance_loss_mlp": 1.03076339, + "epoch": 0.13708101608297008, + "flos": 26975187607680.0, + "grad_norm": 1.8940635031675936, + "language_loss": 0.76659471, + "learning_rate": 3.817648229944119e-06, + "loss": 0.78821135, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.84375, + "step": 2280, + "time_per_iteration": 2.4143829345703125 + }, + { + "auxiliary_loss_clip": 0.01107492, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.01880753, + "balance_loss_mlp": 1.02764416, + "epoch": 0.13714113933563807, + "flos": 32555635102080.0, + "grad_norm": 1.73368062530258, + "language_loss": 0.79739249, + "learning_rate": 3.817490577785014e-06, + "loss": 0.81888437, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.796875, + "step": 2281, + "time_per_iteration": 2.465832233428955 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01047146, + "balance_loss_clip": 1.0238409, + "balance_loss_mlp": 1.03159499, + "epoch": 0.13720126258830603, + "flos": 16100510878080.0, + "grad_norm": 1.7304683509529122, + "language_loss": 0.83738309, + "learning_rate": 3.817332860764346e-06, + "loss": 0.85902822, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.859375, + "step": 2282, + "time_per_iteration": 2.3765079975128174 + }, + { + "auxiliary_loss_clip": 0.01109256, + "auxiliary_loss_mlp": 0.01042945, + "balance_loss_clip": 1.02089167, + "balance_loss_mlp": 1.02886319, + "epoch": 0.137261385840974, + "flos": 18952530531840.0, + "grad_norm": 1.6378516218141752, + "language_loss": 0.73454171, + "learning_rate": 3.817175078887742e-06, + "loss": 0.75606376, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.8046875, + "step": 2283, + "time_per_iteration": 2.3719022274017334 + }, + { + "auxiliary_loss_clip": 0.01113893, + "auxiliary_loss_mlp": 0.01046806, + "balance_loss_clip": 1.02537262, + "balance_loss_mlp": 1.03361201, + "epoch": 0.13732150909364196, + "flos": 23294224609920.0, + "grad_norm": 2.343842559962333, + "language_loss": 0.83827215, + "learning_rate": 3.8170172321608345e-06, + "loss": 0.85987914, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.8046875, + "step": 2284, + "time_per_iteration": 2.398379325866699 + }, + { + "auxiliary_loss_clip": 0.01116226, + "auxiliary_loss_mlp": 0.01046771, + "balance_loss_clip": 1.02210665, + "balance_loss_mlp": 1.03020883, + "epoch": 0.13738163234630993, + "flos": 29349979620480.0, + "grad_norm": 1.769826224992319, + "language_loss": 0.74995393, + "learning_rate": 3.816859320589255e-06, + "loss": 0.77158391, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.859375, + "step": 2285, + "time_per_iteration": 2.5012168884277344 + }, + { + "auxiliary_loss_clip": 0.0111146, + "auxiliary_loss_mlp": 0.01044058, + "balance_loss_clip": 1.02106261, + "balance_loss_mlp": 1.03127074, + "epoch": 0.1374417555989779, + "flos": 26650111138560.0, + "grad_norm": 1.7714291008538752, + "language_loss": 0.74398136, + "learning_rate": 3.81670134417864e-06, + "loss": 0.76553655, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.80078125, + "step": 2286, + "time_per_iteration": 2.4326276779174805 + }, + { + "auxiliary_loss_clip": 0.01117541, + "auxiliary_loss_mlp": 0.01050704, + "balance_loss_clip": 1.02468061, + "balance_loss_mlp": 1.03174639, + "epoch": 0.1375018788516459, + "flos": 28402122585600.0, + "grad_norm": 2.0022977447187134, + "language_loss": 0.86365223, + "learning_rate": 3.8165433029346276e-06, + "loss": 0.88533461, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.859375, + "step": 2287, + "time_per_iteration": 2.4316582679748535 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.01047324, + "balance_loss_clip": 1.02416182, + "balance_loss_mlp": 1.03153014, + "epoch": 0.13756200210431385, + "flos": 37412297867520.0, + "grad_norm": 1.8374540548779694, + "language_loss": 0.68856287, + "learning_rate": 3.816385196862858e-06, + "loss": 0.71017849, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.828125, + "step": 2288, + "time_per_iteration": 2.5205066204071045 + }, + { + "auxiliary_loss_clip": 0.01114996, + "auxiliary_loss_mlp": 0.01044784, + "balance_loss_clip": 1.02221727, + "balance_loss_mlp": 1.03295159, + "epoch": 0.13762212535698182, + "flos": 22709918229120.0, + "grad_norm": 2.3650727449351887, + "language_loss": 0.86925477, + "learning_rate": 3.816227025968972e-06, + "loss": 0.89085257, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.8203125, + "step": 2289, + "time_per_iteration": 2.3779609203338623 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01043833, + "balance_loss_clip": 1.02186263, + "balance_loss_mlp": 1.02881837, + "epoch": 0.13768224860964978, + "flos": 23950975795200.0, + "grad_norm": 1.8917479365362528, + "language_loss": 0.74836767, + "learning_rate": 3.8160687902586155e-06, + "loss": 0.76989353, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.796875, + "step": 2290, + "time_per_iteration": 2.411607027053833 + }, + { + "auxiliary_loss_clip": 0.01026061, + "auxiliary_loss_mlp": 0.01013156, + "balance_loss_clip": 1.01010406, + "balance_loss_mlp": 1.00359797, + "epoch": 0.13774237186231775, + "flos": 63586750510080.0, + "grad_norm": 0.7022014569495892, + "language_loss": 0.51588422, + "learning_rate": 3.815910489737436e-06, + "loss": 0.5362764, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.03051758, + "router_z_loss_mlp": 0.22460938, + "step": 2291, + "time_per_iteration": 3.0735068321228027 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.01936054, + "balance_loss_mlp": 1.03058958, + "epoch": 0.1378024951149857, + "flos": 24278321502720.0, + "grad_norm": 1.7932632724464097, + "language_loss": 0.70804548, + "learning_rate": 3.815752124411081e-06, + "loss": 0.72959673, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.81640625, + "step": 2292, + "time_per_iteration": 2.408328056335449 + }, + { + "auxiliary_loss_clip": 0.01112536, + "auxiliary_loss_mlp": 0.01049368, + "balance_loss_clip": 1.02622998, + "balance_loss_mlp": 1.03175652, + "epoch": 0.13786261836765368, + "flos": 14020839521280.0, + "grad_norm": 2.592992239259999, + "language_loss": 0.80301976, + "learning_rate": 3.815593694285204e-06, + "loss": 0.82463878, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.80859375, + "step": 2293, + "time_per_iteration": 2.3571887016296387 + }, + { + "auxiliary_loss_clip": 0.01113389, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.0256741, + "balance_loss_mlp": 1.03126013, + "epoch": 0.13792274162032167, + "flos": 28877360279040.0, + "grad_norm": 2.1590697829436465, + "language_loss": 0.78471428, + "learning_rate": 3.815435199365459e-06, + "loss": 0.80634189, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.8203125, + "step": 2294, + "time_per_iteration": 2.459261894226074 + }, + { + "auxiliary_loss_clip": 0.01113719, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.02364314, + "balance_loss_mlp": 1.03316617, + "epoch": 0.13798286487298964, + "flos": 21140118501120.0, + "grad_norm": 2.209995404447119, + "language_loss": 0.80169517, + "learning_rate": 3.815276639657501e-06, + "loss": 0.82327884, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.8046875, + "step": 2295, + "time_per_iteration": 2.372267484664917 + }, + { + "auxiliary_loss_clip": 0.01111241, + "auxiliary_loss_mlp": 0.0104571, + "balance_loss_clip": 1.02134347, + "balance_loss_mlp": 1.02965164, + "epoch": 0.1380429881256576, + "flos": 22486509239040.0, + "grad_norm": 1.8298599221433658, + "language_loss": 0.78164601, + "learning_rate": 3.815118015166989e-06, + "loss": 0.80321556, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.81640625, + "step": 2296, + "time_per_iteration": 2.387343406677246 + }, + { + "auxiliary_loss_clip": 0.01116511, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_clip": 1.0243485, + "balance_loss_mlp": 1.0339613, + "epoch": 0.13810311137832557, + "flos": 21392715254400.0, + "grad_norm": 1.833720412786261, + "language_loss": 0.78415352, + "learning_rate": 3.814959325899584e-06, + "loss": 0.80579221, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.828125, + "step": 2297, + "time_per_iteration": 2.3892436027526855 + }, + { + "auxiliary_loss_clip": 0.01111782, + "auxiliary_loss_mlp": 0.01047091, + "balance_loss_clip": 1.02507257, + "balance_loss_mlp": 1.03123212, + "epoch": 0.13816323463099353, + "flos": 25988786565120.0, + "grad_norm": 2.3329139411238775, + "language_loss": 0.68648392, + "learning_rate": 3.81480057186095e-06, + "loss": 0.70807266, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.8046875, + "step": 2298, + "time_per_iteration": 2.412243604660034 + }, + { + "auxiliary_loss_clip": 0.01116407, + "auxiliary_loss_mlp": 0.01051851, + "balance_loss_clip": 1.02841473, + "balance_loss_mlp": 1.0311178, + "epoch": 0.1382233578836615, + "flos": 19243322179200.0, + "grad_norm": 2.047238548430911, + "language_loss": 0.86757356, + "learning_rate": 3.814641753056751e-06, + "loss": 0.88925612, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.8515625, + "step": 2299, + "time_per_iteration": 2.362157106399536 + }, + { + "auxiliary_loss_clip": 0.01111331, + "auxiliary_loss_mlp": 0.01052014, + "balance_loss_clip": 1.02799284, + "balance_loss_mlp": 1.02935147, + "epoch": 0.1382834811363295, + "flos": 25665106550400.0, + "grad_norm": 1.782685339291963, + "language_loss": 0.75776196, + "learning_rate": 3.8144828694926565e-06, + "loss": 0.77939546, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.8203125, + "step": 2300, + "time_per_iteration": 2.427091121673584 + }, + { + "auxiliary_loss_clip": 0.0111164, + "auxiliary_loss_mlp": 0.01046812, + "balance_loss_clip": 1.02509212, + "balance_loss_mlp": 1.03253913, + "epoch": 0.13834360438899745, + "flos": 19783394000640.0, + "grad_norm": 2.815439832138442, + "language_loss": 0.8307541, + "learning_rate": 3.814323921174335e-06, + "loss": 0.85233855, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.7890625, + "step": 2301, + "time_per_iteration": 2.347304582595825 + }, + { + "auxiliary_loss_clip": 0.01109228, + "auxiliary_loss_mlp": 0.0104721, + "balance_loss_clip": 1.0243578, + "balance_loss_mlp": 1.03007984, + "epoch": 0.13840372764166542, + "flos": 26650634808960.0, + "grad_norm": 1.8641862102081654, + "language_loss": 0.85776269, + "learning_rate": 3.81416490810746e-06, + "loss": 0.87932712, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.7890625, + "step": 2302, + "time_per_iteration": 2.4252429008483887 + }, + { + "auxiliary_loss_clip": 0.01025471, + "auxiliary_loss_mlp": 0.01006036, + "balance_loss_clip": 1.00288916, + "balance_loss_mlp": 1.00358677, + "epoch": 0.13846385089433338, + "flos": 70507444343040.0, + "grad_norm": 0.7557448061426103, + "language_loss": 0.65586698, + "learning_rate": 3.814005830297706e-06, + "loss": 0.67618203, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.03149414, + "router_z_loss_mlp": 0.21875, + "step": 2303, + "time_per_iteration": 3.1661269664764404 + }, + { + "auxiliary_loss_clip": 0.01109203, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.02403975, + "balance_loss_mlp": 1.03107214, + "epoch": 0.13852397414700135, + "flos": 17347747754880.0, + "grad_norm": 1.7502725857789592, + "language_loss": 0.78403562, + "learning_rate": 3.81384668775075e-06, + "loss": 0.80559289, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.78125, + "step": 2304, + "time_per_iteration": 2.371805191040039 + }, + { + "auxiliary_loss_clip": 0.0111554, + "auxiliary_loss_mlp": 0.01040256, + "balance_loss_clip": 1.01833344, + "balance_loss_mlp": 1.03259957, + "epoch": 0.13858409739966931, + "flos": 21542701921920.0, + "grad_norm": 2.023868147563291, + "language_loss": 0.77400017, + "learning_rate": 3.8136874804722724e-06, + "loss": 0.79555821, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.83203125, + "step": 2305, + "time_per_iteration": 2.439689874649048 + }, + { + "auxiliary_loss_clip": 0.01109113, + "auxiliary_loss_mlp": 0.01044396, + "balance_loss_clip": 1.02222347, + "balance_loss_mlp": 1.02947581, + "epoch": 0.13864422065233728, + "flos": 21578837045760.0, + "grad_norm": 1.7596167687772786, + "language_loss": 0.87383056, + "learning_rate": 3.813528208467953e-06, + "loss": 0.89536566, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.796875, + "step": 2306, + "time_per_iteration": 2.4078853130340576 + }, + { + "auxiliary_loss_clip": 0.01024399, + "auxiliary_loss_mlp": 0.01003031, + "balance_loss_clip": 1.00005126, + "balance_loss_mlp": 1.00275683, + "epoch": 0.13870434390500527, + "flos": 53368861743360.0, + "grad_norm": 0.8668211906086138, + "language_loss": 0.58999717, + "learning_rate": 3.813368871743477e-06, + "loss": 0.61027151, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.02978516, + "router_z_loss_mlp": 0.21679688, + "step": 2307, + "time_per_iteration": 4.485899925231934 + }, + { + "auxiliary_loss_clip": 0.0111669, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.02103209, + "balance_loss_mlp": 1.03269196, + "epoch": 0.13876446715767324, + "flos": 22564784240640.0, + "grad_norm": 2.4277061628479344, + "language_loss": 0.79327637, + "learning_rate": 3.813209470304531e-06, + "loss": 0.81489497, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.83984375, + "step": 2308, + "time_per_iteration": 3.7915420532226562 + }, + { + "auxiliary_loss_clip": 0.01112279, + "auxiliary_loss_mlp": 0.01039478, + "balance_loss_clip": 1.01644635, + "balance_loss_mlp": 1.03122795, + "epoch": 0.1388245904103412, + "flos": 20704157953920.0, + "grad_norm": 2.7725072846561845, + "language_loss": 0.77483279, + "learning_rate": 3.813050004156802e-06, + "loss": 0.79635036, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.80859375, + "step": 2309, + "time_per_iteration": 3.744729995727539 + }, + { + "auxiliary_loss_clip": 0.01115686, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.01646543, + "balance_loss_mlp": 1.03120661, + "epoch": 0.13888471366300917, + "flos": 20553787261440.0, + "grad_norm": 1.9923646727733035, + "language_loss": 0.67644227, + "learning_rate": 3.812890473305983e-06, + "loss": 0.6979934, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.84375, + "step": 2310, + "time_per_iteration": 3.7314958572387695 + }, + { + "auxiliary_loss_clip": 0.01114428, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.01878774, + "balance_loss_mlp": 1.03177369, + "epoch": 0.13894483691567713, + "flos": 13837370993280.0, + "grad_norm": 1.927902290859768, + "language_loss": 0.83659101, + "learning_rate": 3.812730877757766e-06, + "loss": 0.85816479, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.828125, + "step": 2311, + "time_per_iteration": 2.350458860397339 + }, + { + "auxiliary_loss_clip": 0.01116978, + "auxiliary_loss_mlp": 0.01043033, + "balance_loss_clip": 1.01932216, + "balance_loss_mlp": 1.03216636, + "epoch": 0.1390049601683451, + "flos": 28030123382400.0, + "grad_norm": 1.9602687914704884, + "language_loss": 0.81861597, + "learning_rate": 3.812571217517847e-06, + "loss": 0.84021604, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.84765625, + "step": 2312, + "time_per_iteration": 2.4183192253112793 + }, + { + "auxiliary_loss_clip": 0.01115909, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.02151084, + "balance_loss_mlp": 1.03175282, + "epoch": 0.13906508342101306, + "flos": 26755758512640.0, + "grad_norm": 1.6884825216715873, + "language_loss": 0.86466634, + "learning_rate": 3.8124114925919234e-06, + "loss": 0.88625932, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.83984375, + "step": 2313, + "time_per_iteration": 2.4165217876434326 + }, + { + "auxiliary_loss_clip": 0.01113991, + "auxiliary_loss_mlp": 0.01050481, + "balance_loss_clip": 1.02727151, + "balance_loss_mlp": 1.03228283, + "epoch": 0.13912520667368106, + "flos": 24533955544320.0, + "grad_norm": 1.9420178761846076, + "language_loss": 0.79697347, + "learning_rate": 3.812251702985696e-06, + "loss": 0.81861818, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8203125, + "step": 2314, + "time_per_iteration": 2.4216740131378174 + }, + { + "auxiliary_loss_clip": 0.01115102, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.01780772, + "balance_loss_mlp": 1.03328633, + "epoch": 0.13918532992634902, + "flos": 19382416502400.0, + "grad_norm": 6.510030730031837, + "language_loss": 0.85251737, + "learning_rate": 3.8120918487048673e-06, + "loss": 0.87408972, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.81640625, + "step": 2315, + "time_per_iteration": 2.3296632766723633 + }, + { + "auxiliary_loss_clip": 0.01113169, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.02389026, + "balance_loss_mlp": 1.03022432, + "epoch": 0.139245453179017, + "flos": 21322714245120.0, + "grad_norm": 2.1342939804528664, + "language_loss": 0.77397943, + "learning_rate": 3.8119319297551417e-06, + "loss": 0.79558593, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.828125, + "step": 2316, + "time_per_iteration": 2.3798413276672363 + }, + { + "auxiliary_loss_clip": 0.01112326, + "auxiliary_loss_mlp": 0.01044963, + "balance_loss_clip": 1.02010787, + "balance_loss_mlp": 1.03112698, + "epoch": 0.13930557643168495, + "flos": 19499584625280.0, + "grad_norm": 1.6088289410619419, + "language_loss": 0.76960433, + "learning_rate": 3.811771946142226e-06, + "loss": 0.79117715, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.8125, + "step": 2317, + "time_per_iteration": 2.3655471801757812 + }, + { + "auxiliary_loss_clip": 0.01115677, + "auxiliary_loss_mlp": 0.01044519, + "balance_loss_clip": 1.02172649, + "balance_loss_mlp": 1.03252137, + "epoch": 0.13936569968435292, + "flos": 25409647065600.0, + "grad_norm": 1.8353313151848425, + "language_loss": 0.80771768, + "learning_rate": 3.8116118978718298e-06, + "loss": 0.82931966, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.83203125, + "step": 2318, + "time_per_iteration": 2.4340999126434326 + }, + { + "auxiliary_loss_clip": 0.01022548, + "auxiliary_loss_mlp": 0.01004856, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00148022, + "epoch": 0.13942582293702088, + "flos": 70767372481920.0, + "grad_norm": 0.8539343675442279, + "language_loss": 0.59066468, + "learning_rate": 3.811451784949665e-06, + "loss": 0.61093873, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.2109375, + "step": 2319, + "time_per_iteration": 3.0526671409606934 + }, + { + "auxiliary_loss_clip": 0.01116876, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_clip": 1.02567124, + "balance_loss_mlp": 1.03278899, + "epoch": 0.13948594618968888, + "flos": 35589412627200.0, + "grad_norm": 2.446227278608528, + "language_loss": 0.65113854, + "learning_rate": 3.811291607381446e-06, + "loss": 0.67279446, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.84375, + "step": 2320, + "time_per_iteration": 2.50424861907959 + }, + { + "auxiliary_loss_clip": 0.01112892, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.01576197, + "balance_loss_mlp": 1.03142297, + "epoch": 0.13954606944235684, + "flos": 21104157934080.0, + "grad_norm": 1.5250669828234587, + "language_loss": 0.70898479, + "learning_rate": 3.8111313651728887e-06, + "loss": 0.73050439, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.81640625, + "step": 2321, + "time_per_iteration": 2.387446165084839 + }, + { + "auxiliary_loss_clip": 0.0111377, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_clip": 1.02609396, + "balance_loss_mlp": 1.03025997, + "epoch": 0.1396061926950248, + "flos": 25043303502720.0, + "grad_norm": 1.8167873724049057, + "language_loss": 0.85633826, + "learning_rate": 3.810971058329712e-06, + "loss": 0.8779645, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.8359375, + "step": 2322, + "time_per_iteration": 2.4280426502227783 + }, + { + "auxiliary_loss_clip": 0.01107724, + "auxiliary_loss_mlp": 0.010402, + "balance_loss_clip": 1.01770544, + "balance_loss_mlp": 1.02950573, + "epoch": 0.13966631594769277, + "flos": 37632495012480.0, + "grad_norm": 1.7811434630357614, + "language_loss": 0.67362523, + "learning_rate": 3.810810686857636e-06, + "loss": 0.69510448, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.78125, + "step": 2323, + "time_per_iteration": 2.543933629989624 + }, + { + "auxiliary_loss_clip": 0.01119111, + "auxiliary_loss_mlp": 0.01041933, + "balance_loss_clip": 1.01809096, + "balance_loss_mlp": 1.03224778, + "epoch": 0.13972643920036074, + "flos": 16690053962880.0, + "grad_norm": 1.88168631675888, + "language_loss": 0.88742232, + "learning_rate": 3.8106502507623847e-06, + "loss": 0.90903276, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.8671875, + "step": 2324, + "time_per_iteration": 2.3862104415893555 + }, + { + "auxiliary_loss_clip": 0.01114269, + "auxiliary_loss_mlp": 0.01046201, + "balance_loss_clip": 1.02154899, + "balance_loss_mlp": 1.02962196, + "epoch": 0.1397865624530287, + "flos": 23329940797440.0, + "grad_norm": 2.4616348831774024, + "language_loss": 0.70485055, + "learning_rate": 3.810489750049684e-06, + "loss": 0.72645521, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.84375, + "step": 2325, + "time_per_iteration": 2.3760411739349365 + }, + { + "auxiliary_loss_clip": 0.01114765, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_clip": 1.02260661, + "balance_loss_mlp": 1.03328538, + "epoch": 0.13984668570569667, + "flos": 22777370709120.0, + "grad_norm": 2.194120627576144, + "language_loss": 0.81632841, + "learning_rate": 3.810329184725261e-06, + "loss": 0.83792573, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.81640625, + "step": 2326, + "time_per_iteration": 2.393697500228882 + }, + { + "auxiliary_loss_clip": 0.01113238, + "auxiliary_loss_mlp": 0.01043437, + "balance_loss_clip": 1.02263474, + "balance_loss_mlp": 1.0323596, + "epoch": 0.13990680895836466, + "flos": 19463519324160.0, + "grad_norm": 1.679143758752739, + "language_loss": 0.88916981, + "learning_rate": 3.8101685547948456e-06, + "loss": 0.91073656, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.80859375, + "step": 2327, + "time_per_iteration": 2.3681604862213135 + }, + { + "auxiliary_loss_clip": 0.01111782, + "auxiliary_loss_mlp": 0.01051522, + "balance_loss_clip": 1.02977872, + "balance_loss_mlp": 1.03227198, + "epoch": 0.13996693221103262, + "flos": 20302237848960.0, + "grad_norm": 2.3029894270625078, + "language_loss": 0.84684706, + "learning_rate": 3.8100078602641714e-06, + "loss": 0.86848009, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.796875, + "step": 2328, + "time_per_iteration": 2.3757174015045166 + }, + { + "auxiliary_loss_clip": 0.01113621, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.02246439, + "balance_loss_mlp": 1.03034067, + "epoch": 0.1400270554637006, + "flos": 26616419809920.0, + "grad_norm": 1.5164445313294197, + "language_loss": 0.74061275, + "learning_rate": 3.8098471011389723e-06, + "loss": 0.7622025, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.83203125, + "step": 2329, + "time_per_iteration": 2.428548574447632 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01044525, + "balance_loss_clip": 1.02216148, + "balance_loss_mlp": 1.02910054, + "epoch": 0.14008717871636855, + "flos": 19390446115200.0, + "grad_norm": 2.297219445526394, + "language_loss": 0.7825973, + "learning_rate": 3.809686277424986e-06, + "loss": 0.80416381, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.828125, + "step": 2330, + "time_per_iteration": 2.3700814247131348 + }, + { + "auxiliary_loss_clip": 0.01110938, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.01670766, + "balance_loss_mlp": 1.03057289, + "epoch": 0.14014730196903652, + "flos": 15303373649280.0, + "grad_norm": 2.6356683996312147, + "language_loss": 0.71626061, + "learning_rate": 3.809525389127951e-06, + "loss": 0.7377547, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.8046875, + "step": 2331, + "time_per_iteration": 2.3808858394622803 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01043533, + "balance_loss_clip": 1.02311277, + "balance_loss_mlp": 1.03137755, + "epoch": 0.14020742522170448, + "flos": 14938810565760.0, + "grad_norm": 1.8915342608415047, + "language_loss": 0.7251972, + "learning_rate": 3.8093644362536094e-06, + "loss": 0.74671763, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.7734375, + "step": 2332, + "time_per_iteration": 2.3650176525115967 + }, + { + "auxiliary_loss_clip": 0.01022938, + "auxiliary_loss_mlp": 0.01007537, + "balance_loss_clip": 1.00495028, + "balance_loss_mlp": 1.00183344, + "epoch": 0.14026754847437245, + "flos": 48822017316480.0, + "grad_norm": 0.8126341124100568, + "language_loss": 0.56089938, + "learning_rate": 3.809203418807706e-06, + "loss": 0.58120418, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.2109375, + "step": 2333, + "time_per_iteration": 2.8726682662963867 + }, + { + "auxiliary_loss_clip": 0.01113125, + "auxiliary_loss_mlp": 0.01046742, + "balance_loss_clip": 1.02356756, + "balance_loss_mlp": 1.03146529, + "epoch": 0.14032767172704044, + "flos": 25772150378880.0, + "grad_norm": 1.6596230994853203, + "language_loss": 0.82564056, + "learning_rate": 3.8090423367959862e-06, + "loss": 0.84723926, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.81640625, + "step": 2334, + "time_per_iteration": 2.431256055831909 + }, + { + "auxiliary_loss_clip": 0.01109192, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.01901162, + "balance_loss_mlp": 1.02880728, + "epoch": 0.1403877949797084, + "flos": 21215216568960.0, + "grad_norm": 1.8070704326569684, + "language_loss": 0.81511354, + "learning_rate": 3.8088811902241984e-06, + "loss": 0.83660233, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.8046875, + "step": 2335, + "time_per_iteration": 2.418689012527466 + }, + { + "auxiliary_loss_clip": 0.01118209, + "auxiliary_loss_mlp": 0.01050244, + "balance_loss_clip": 1.02465022, + "balance_loss_mlp": 1.03229046, + "epoch": 0.14044791823237637, + "flos": 22746856314240.0, + "grad_norm": 1.5857080281952594, + "language_loss": 0.8213681, + "learning_rate": 3.8087199790980943e-06, + "loss": 0.84305263, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.859375, + "step": 2336, + "time_per_iteration": 2.3965723514556885 + }, + { + "auxiliary_loss_clip": 0.01112836, + "auxiliary_loss_mlp": 0.01041006, + "balance_loss_clip": 1.01849961, + "balance_loss_mlp": 1.03063273, + "epoch": 0.14050804148504434, + "flos": 22963387766400.0, + "grad_norm": 1.6075359232070303, + "language_loss": 0.80349731, + "learning_rate": 3.8085587034234268e-06, + "loss": 0.82503575, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.8203125, + "step": 2337, + "time_per_iteration": 2.393561363220215 + }, + { + "auxiliary_loss_clip": 0.01115161, + "auxiliary_loss_mlp": 0.01051235, + "balance_loss_clip": 1.02847803, + "balance_loss_mlp": 1.03135502, + "epoch": 0.1405681647377123, + "flos": 22199243639040.0, + "grad_norm": 3.0750946669038184, + "language_loss": 0.79212838, + "learning_rate": 3.8083973632059507e-06, + "loss": 0.81379235, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.83984375, + "step": 2338, + "time_per_iteration": 2.384571075439453 + }, + { + "auxiliary_loss_clip": 0.01119153, + "auxiliary_loss_mlp": 0.01042763, + "balance_loss_clip": 1.01802742, + "balance_loss_mlp": 1.03551793, + "epoch": 0.14062828799038027, + "flos": 23731651434240.0, + "grad_norm": 2.5063985567556615, + "language_loss": 0.79717278, + "learning_rate": 3.8082359584514254e-06, + "loss": 0.81879199, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.8359375, + "step": 2339, + "time_per_iteration": 2.4049720764160156 + }, + { + "auxiliary_loss_clip": 0.01113783, + "auxiliary_loss_mlp": 0.01044787, + "balance_loss_clip": 1.02204204, + "balance_loss_mlp": 1.03190649, + "epoch": 0.14068841124304826, + "flos": 39200933197440.0, + "grad_norm": 2.0345611069085847, + "language_loss": 0.65627486, + "learning_rate": 3.8080744891656095e-06, + "loss": 0.67786056, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.8203125, + "step": 2340, + "time_per_iteration": 2.5427424907684326 + }, + { + "auxiliary_loss_clip": 0.01112406, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.01995158, + "balance_loss_mlp": 1.03261447, + "epoch": 0.14074853449571623, + "flos": 20191283948160.0, + "grad_norm": 2.969123965413322, + "language_loss": 0.77967715, + "learning_rate": 3.807912955354266e-06, + "loss": 0.80123174, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.796875, + "step": 2341, + "time_per_iteration": 2.3721840381622314 + }, + { + "auxiliary_loss_clip": 0.0110888, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.02176309, + "balance_loss_mlp": 1.03030443, + "epoch": 0.1408086577483842, + "flos": 18404882945280.0, + "grad_norm": 1.8885129295917387, + "language_loss": 0.80213922, + "learning_rate": 3.80775135702316e-06, + "loss": 0.8236773, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.78515625, + "step": 2342, + "time_per_iteration": 2.3720855712890625 + }, + { + "auxiliary_loss_clip": 0.01110901, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_clip": 1.0253576, + "balance_loss_mlp": 1.03243005, + "epoch": 0.14086878100105216, + "flos": 25263430824960.0, + "grad_norm": 1.9349407206328697, + "language_loss": 0.78248572, + "learning_rate": 3.8075896941780576e-06, + "loss": 0.80406016, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.78515625, + "step": 2343, + "time_per_iteration": 2.4285085201263428 + }, + { + "auxiliary_loss_clip": 0.01024611, + "auxiliary_loss_mlp": 0.01005424, + "balance_loss_clip": 1.00275409, + "balance_loss_mlp": 1.00350428, + "epoch": 0.14092890425372012, + "flos": 65975194730880.0, + "grad_norm": 0.9113845422778512, + "language_loss": 0.61496663, + "learning_rate": 3.807427966824729e-06, + "loss": 0.63526696, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.2109375, + "step": 2344, + "time_per_iteration": 2.959873914718628 + }, + { + "auxiliary_loss_clip": 0.01110289, + "auxiliary_loss_mlp": 0.0104094, + "balance_loss_clip": 1.01946998, + "balance_loss_mlp": 1.03021097, + "epoch": 0.1409890275063881, + "flos": 23693875476480.0, + "grad_norm": 1.5345214861639942, + "language_loss": 0.644485, + "learning_rate": 3.807266174968946e-06, + "loss": 0.66599727, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.80078125, + "step": 2345, + "time_per_iteration": 2.4022178649902344 + }, + { + "auxiliary_loss_clip": 0.0111492, + "auxiliary_loss_mlp": 0.0103954, + "balance_loss_clip": 1.01692581, + "balance_loss_mlp": 1.02984154, + "epoch": 0.14104915075905605, + "flos": 23622024165120.0, + "grad_norm": 3.732364891862398, + "language_loss": 0.72913074, + "learning_rate": 3.8071043186164813e-06, + "loss": 0.75067532, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.8515625, + "step": 2346, + "time_per_iteration": 3.8668501377105713 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01048066, + "balance_loss_clip": 1.02484357, + "balance_loss_mlp": 1.03207099, + "epoch": 0.14110927401172405, + "flos": 20594111748480.0, + "grad_norm": 3.1926601726259403, + "language_loss": 0.77061605, + "learning_rate": 3.8069423977731123e-06, + "loss": 0.79224718, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.828125, + "step": 2347, + "time_per_iteration": 3.761927366256714 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02202344, + "balance_loss_mlp": 1.02968144, + "epoch": 0.141169397264392, + "flos": 28546802726400.0, + "grad_norm": 2.338335469689385, + "language_loss": 0.76286185, + "learning_rate": 3.8067804124446167e-06, + "loss": 0.78442234, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.828125, + "step": 2348, + "time_per_iteration": 2.4172322750091553 + }, + { + "auxiliary_loss_clip": 0.01113536, + "auxiliary_loss_mlp": 0.01045398, + "balance_loss_clip": 1.02209258, + "balance_loss_mlp": 1.03189969, + "epoch": 0.14122952051705998, + "flos": 17091310752000.0, + "grad_norm": 1.714294255083456, + "language_loss": 0.86320311, + "learning_rate": 3.806618362636776e-06, + "loss": 0.88479245, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.8203125, + "step": 2349, + "time_per_iteration": 3.782675266265869 + }, + { + "auxiliary_loss_clip": 0.01112457, + "auxiliary_loss_mlp": 0.01042913, + "balance_loss_clip": 1.02048922, + "balance_loss_mlp": 1.03230882, + "epoch": 0.14128964376972794, + "flos": 28945615720320.0, + "grad_norm": 1.6260149264212769, + "language_loss": 0.89123261, + "learning_rate": 3.806456248355373e-06, + "loss": 0.91278625, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.80078125, + "step": 2350, + "time_per_iteration": 3.825753927230835 + }, + { + "auxiliary_loss_clip": 0.01117982, + "auxiliary_loss_mlp": 0.01044442, + "balance_loss_clip": 1.02080297, + "balance_loss_mlp": 1.03451514, + "epoch": 0.1413497670223959, + "flos": 18988770389760.0, + "grad_norm": 1.6992135889614395, + "language_loss": 0.81226486, + "learning_rate": 3.806294069606194e-06, + "loss": 0.83388907, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.8359375, + "step": 2351, + "time_per_iteration": 2.3727540969848633 + }, + { + "auxiliary_loss_clip": 0.01115487, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.01912642, + "balance_loss_mlp": 1.03287506, + "epoch": 0.14140989027506387, + "flos": 29860933501440.0, + "grad_norm": 2.4282329595428696, + "language_loss": 0.83351785, + "learning_rate": 3.806131826395025e-06, + "loss": 0.85508567, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.828125, + "step": 2352, + "time_per_iteration": 2.443225860595703 + }, + { + "auxiliary_loss_clip": 0.01023041, + "auxiliary_loss_mlp": 0.01003801, + "balance_loss_clip": 1.001441, + "balance_loss_mlp": 1.00168443, + "epoch": 0.14147001352773186, + "flos": 62076303826560.0, + "grad_norm": 0.9060145135068637, + "language_loss": 0.61919022, + "learning_rate": 3.805969518727658e-06, + "loss": 0.63945866, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.21289062, + "step": 2353, + "time_per_iteration": 2.951414108276367 + }, + { + "auxiliary_loss_clip": 0.01110853, + "auxiliary_loss_mlp": 0.01044185, + "balance_loss_clip": 1.02258432, + "balance_loss_mlp": 1.0318141, + "epoch": 0.14153013678039983, + "flos": 22016438426880.0, + "grad_norm": 1.6978982977159032, + "language_loss": 0.87054855, + "learning_rate": 3.805807146609884e-06, + "loss": 0.8920989, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.7890625, + "step": 2354, + "time_per_iteration": 2.403468132019043 + }, + { + "auxiliary_loss_clip": 0.01113959, + "auxiliary_loss_mlp": 0.0104906, + "balance_loss_clip": 1.02529013, + "balance_loss_mlp": 1.03166056, + "epoch": 0.1415902600330678, + "flos": 19719048631680.0, + "grad_norm": 2.191536556128632, + "language_loss": 0.74257559, + "learning_rate": 3.8056447100474976e-06, + "loss": 0.76420581, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.82421875, + "step": 2355, + "time_per_iteration": 2.3797683715820312 + }, + { + "auxiliary_loss_clip": 0.01022638, + "auxiliary_loss_mlp": 0.01004672, + "balance_loss_clip": 1.00213301, + "balance_loss_mlp": 1.00106001, + "epoch": 0.14165038328573576, + "flos": 65897862336000.0, + "grad_norm": 0.6813695692162474, + "language_loss": 0.51837111, + "learning_rate": 3.8054822090462963e-06, + "loss": 0.53864413, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.21484375, + "step": 2356, + "time_per_iteration": 2.9988350868225098 + }, + { + "auxiliary_loss_clip": 0.01111905, + "auxiliary_loss_mlp": 0.01049433, + "balance_loss_clip": 1.0279392, + "balance_loss_mlp": 1.03195858, + "epoch": 0.14171050653840372, + "flos": 12129349726080.0, + "grad_norm": 2.2499940840965778, + "language_loss": 0.71124399, + "learning_rate": 3.80531964361208e-06, + "loss": 0.73285735, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.796875, + "step": 2357, + "time_per_iteration": 2.3525867462158203 + }, + { + "auxiliary_loss_clip": 0.01115533, + "auxiliary_loss_mlp": 0.01045419, + "balance_loss_clip": 1.02374637, + "balance_loss_mlp": 1.0324074, + "epoch": 0.1417706297910717, + "flos": 20411446181760.0, + "grad_norm": 3.382316394087526, + "language_loss": 0.81723762, + "learning_rate": 3.8051570137506485e-06, + "loss": 0.8388471, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.83203125, + "step": 2358, + "time_per_iteration": 2.3947958946228027 + }, + { + "auxiliary_loss_clip": 0.01116677, + "auxiliary_loss_mlp": 0.0104764, + "balance_loss_clip": 1.02472782, + "balance_loss_mlp": 1.03283024, + "epoch": 0.14183075304373965, + "flos": 22379570144640.0, + "grad_norm": 2.021207632014741, + "language_loss": 0.71728957, + "learning_rate": 3.804994319467807e-06, + "loss": 0.73893273, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.8359375, + "step": 2359, + "time_per_iteration": 2.4015228748321533 + }, + { + "auxiliary_loss_clip": 0.01110608, + "auxiliary_loss_mlp": 0.01036418, + "balance_loss_clip": 1.01448345, + "balance_loss_mlp": 1.03044295, + "epoch": 0.14189087629640765, + "flos": 21579814563840.0, + "grad_norm": 2.0088275417241963, + "language_loss": 0.75609106, + "learning_rate": 3.804831560769361e-06, + "loss": 0.77756137, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.80078125, + "step": 2360, + "time_per_iteration": 2.3866117000579834 + }, + { + "auxiliary_loss_clip": 0.01111825, + "auxiliary_loss_mlp": 0.01045388, + "balance_loss_clip": 1.02290535, + "balance_loss_mlp": 1.0320853, + "epoch": 0.1419509995490756, + "flos": 20007605952000.0, + "grad_norm": 1.9198538889155847, + "language_loss": 0.81491876, + "learning_rate": 3.8046687376611196e-06, + "loss": 0.83649093, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.796875, + "step": 2361, + "time_per_iteration": 2.364715337753296 + }, + { + "auxiliary_loss_clip": 0.0111196, + "auxiliary_loss_mlp": 0.01043956, + "balance_loss_clip": 1.02096045, + "balance_loss_mlp": 1.0317812, + "epoch": 0.14201112280174358, + "flos": 31940116099200.0, + "grad_norm": 1.9110472055203933, + "language_loss": 0.74089873, + "learning_rate": 3.8045058501488927e-06, + "loss": 0.76245791, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.8046875, + "step": 2362, + "time_per_iteration": 2.4871034622192383 + }, + { + "auxiliary_loss_clip": 0.01113443, + "auxiliary_loss_mlp": 0.01040609, + "balance_loss_clip": 1.01844823, + "balance_loss_mlp": 1.03260601, + "epoch": 0.14207124605441154, + "flos": 41462536982400.0, + "grad_norm": 1.6877583599104975, + "language_loss": 0.73817307, + "learning_rate": 3.804342898238494e-06, + "loss": 0.75971359, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.80859375, + "step": 2363, + "time_per_iteration": 2.5701701641082764 + }, + { + "auxiliary_loss_clip": 0.01111354, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_clip": 1.02141619, + "balance_loss_mlp": 1.03112841, + "epoch": 0.1421313693070795, + "flos": 31903736595840.0, + "grad_norm": 1.77503551669461, + "language_loss": 0.72893143, + "learning_rate": 3.8041798819357386e-06, + "loss": 0.75046718, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.8046875, + "step": 2364, + "time_per_iteration": 2.4738662242889404 + }, + { + "auxiliary_loss_clip": 0.01108281, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.01900244, + "balance_loss_mlp": 1.03060389, + "epoch": 0.14219149255974747, + "flos": 26869924258560.0, + "grad_norm": 2.37641305853089, + "language_loss": 0.9059546, + "learning_rate": 3.804016801246444e-06, + "loss": 0.92743063, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.77734375, + "step": 2365, + "time_per_iteration": 2.4367516040802 + }, + { + "auxiliary_loss_clip": 0.01110698, + "auxiliary_loss_mlp": 0.01040614, + "balance_loss_clip": 1.01842928, + "balance_loss_mlp": 1.02955818, + "epoch": 0.14225161581241544, + "flos": 27453183298560.0, + "grad_norm": 1.680012895617236, + "language_loss": 0.65590346, + "learning_rate": 3.80385365617643e-06, + "loss": 0.67741668, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.8125, + "step": 2366, + "time_per_iteration": 2.4501473903656006 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01038612, + "balance_loss_clip": 1.01609302, + "balance_loss_mlp": 1.02917778, + "epoch": 0.14231173906508343, + "flos": 10560667161600.0, + "grad_norm": 2.2568984621358297, + "language_loss": 0.80072278, + "learning_rate": 3.8036904467315196e-06, + "loss": 0.82218438, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.78125, + "step": 2367, + "time_per_iteration": 2.36179256439209 + }, + { + "auxiliary_loss_clip": 0.01115014, + "auxiliary_loss_mlp": 0.0105425, + "balance_loss_clip": 1.03032434, + "balance_loss_mlp": 1.03221178, + "epoch": 0.1423718623177514, + "flos": 28359773239680.0, + "grad_norm": 2.2982741802170543, + "language_loss": 0.82969856, + "learning_rate": 3.8035271729175366e-06, + "loss": 0.8513912, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.828125, + "step": 2368, + "time_per_iteration": 2.4400994777679443 + }, + { + "auxiliary_loss_clip": 0.01111815, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.02657127, + "balance_loss_mlp": 1.03188169, + "epoch": 0.14243198557041936, + "flos": 19353228739200.0, + "grad_norm": 2.225072538149742, + "language_loss": 0.86660296, + "learning_rate": 3.803363834740308e-06, + "loss": 0.88822114, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.796875, + "step": 2369, + "time_per_iteration": 2.3709802627563477 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.0104525, + "balance_loss_clip": 1.021909, + "balance_loss_mlp": 1.0288223, + "epoch": 0.14249210882308733, + "flos": 28805508967680.0, + "grad_norm": 1.5397197418609387, + "language_loss": 0.7586726, + "learning_rate": 3.8032004322056627e-06, + "loss": 0.7802394, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.828125, + "step": 2370, + "time_per_iteration": 2.4590070247650146 + }, + { + "auxiliary_loss_clip": 0.01111055, + "auxiliary_loss_mlp": 0.01046763, + "balance_loss_clip": 1.02507842, + "balance_loss_mlp": 1.03099585, + "epoch": 0.1425522320757553, + "flos": 21833947416960.0, + "grad_norm": 1.7583329430782595, + "language_loss": 0.83001393, + "learning_rate": 3.8030369653194326e-06, + "loss": 0.85159212, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.80078125, + "step": 2371, + "time_per_iteration": 2.3887088298797607 + }, + { + "auxiliary_loss_clip": 0.01113674, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02160835, + "balance_loss_mlp": 1.03213429, + "epoch": 0.14261235532842326, + "flos": 17310495467520.0, + "grad_norm": 1.9612660560625004, + "language_loss": 0.82770348, + "learning_rate": 3.802873434087451e-06, + "loss": 0.84928668, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.81640625, + "step": 2372, + "time_per_iteration": 2.3595197200775146 + }, + { + "auxiliary_loss_clip": 0.01111961, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.02153802, + "balance_loss_mlp": 1.03225899, + "epoch": 0.14267247858109125, + "flos": 18805755709440.0, + "grad_norm": 3.171101260252954, + "language_loss": 0.84952039, + "learning_rate": 3.8027098385155546e-06, + "loss": 0.87107772, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.796875, + "step": 2373, + "time_per_iteration": 2.3554599285125732 + }, + { + "auxiliary_loss_clip": 0.01108555, + "auxiliary_loss_mlp": 0.01047167, + "balance_loss_clip": 1.0272826, + "balance_loss_mlp": 1.02993953, + "epoch": 0.14273260183375922, + "flos": 11358258238080.0, + "grad_norm": 1.9497593446538968, + "language_loss": 0.85978901, + "learning_rate": 3.802546178609581e-06, + "loss": 0.88134623, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.78515625, + "step": 2374, + "time_per_iteration": 2.3471994400024414 + }, + { + "auxiliary_loss_clip": 0.01116023, + "auxiliary_loss_mlp": 0.01048151, + "balance_loss_clip": 1.02371335, + "balance_loss_mlp": 1.03108776, + "epoch": 0.14279272508642718, + "flos": 27566336615040.0, + "grad_norm": 1.6567888506182693, + "language_loss": 0.79175425, + "learning_rate": 3.8023824543753706e-06, + "loss": 0.81339598, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.8515625, + "step": 2375, + "time_per_iteration": 2.4538514614105225 + }, + { + "auxiliary_loss_clip": 0.01114544, + "auxiliary_loss_mlp": 0.01052575, + "balance_loss_clip": 1.0297699, + "balance_loss_mlp": 1.03305233, + "epoch": 0.14285284833909515, + "flos": 16251649620480.0, + "grad_norm": 2.5280824056824294, + "language_loss": 0.76490855, + "learning_rate": 3.802218665818767e-06, + "loss": 0.78657973, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.81640625, + "step": 2376, + "time_per_iteration": 2.356001138687134 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.0195061, + "balance_loss_mlp": 1.03157842, + "epoch": 0.1429129715917631, + "flos": 19754590262400.0, + "grad_norm": 1.822378925866651, + "language_loss": 0.93101025, + "learning_rate": 3.802054812945615e-06, + "loss": 0.95253944, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.8046875, + "step": 2377, + "time_per_iteration": 2.403977870941162 + }, + { + "auxiliary_loss_clip": 0.01109632, + "auxiliary_loss_mlp": 0.0104081, + "balance_loss_clip": 1.01652718, + "balance_loss_mlp": 1.02863622, + "epoch": 0.14297309484443108, + "flos": 21136173517440.0, + "grad_norm": 2.0131222186116404, + "language_loss": 0.91564405, + "learning_rate": 3.801890895761762e-06, + "loss": 0.93714845, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.8125, + "step": 2378, + "time_per_iteration": 2.373382091522217 + }, + { + "auxiliary_loss_clip": 0.01113144, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.0161612, + "balance_loss_mlp": 1.02985239, + "epoch": 0.14303321809709904, + "flos": 23585539927680.0, + "grad_norm": 1.6098071087419281, + "language_loss": 0.73419136, + "learning_rate": 3.8017269142730584e-06, + "loss": 0.75571024, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.83203125, + "step": 2379, + "time_per_iteration": 2.3976776599884033 + }, + { + "auxiliary_loss_clip": 0.01108895, + "auxiliary_loss_mlp": 0.01051303, + "balance_loss_clip": 1.02926159, + "balance_loss_mlp": 1.0289185, + "epoch": 0.14309334134976703, + "flos": 15887365827840.0, + "grad_norm": 1.961125634825258, + "language_loss": 0.78647882, + "learning_rate": 3.801562868485355e-06, + "loss": 0.80808079, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.80078125, + "step": 2380, + "time_per_iteration": 2.3603386878967285 + }, + { + "auxiliary_loss_clip": 0.01115999, + "auxiliary_loss_mlp": 0.01042334, + "balance_loss_clip": 1.0200417, + "balance_loss_mlp": 1.03406525, + "epoch": 0.143153464602435, + "flos": 16324687918080.0, + "grad_norm": 2.1120644519583975, + "language_loss": 0.88181722, + "learning_rate": 3.801398758404508e-06, + "loss": 0.90340054, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8203125, + "step": 2381, + "time_per_iteration": 2.377686023712158 + }, + { + "auxiliary_loss_clip": 0.01109959, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.0317328, + "epoch": 0.14321358785510296, + "flos": 17091136195200.0, + "grad_norm": 2.218149750101264, + "language_loss": 0.76782626, + "learning_rate": 3.801234584036372e-06, + "loss": 0.78933787, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.78125, + "step": 2382, + "time_per_iteration": 2.4079976081848145 + }, + { + "auxiliary_loss_clip": 0.01110099, + "auxiliary_loss_mlp": 0.01046025, + "balance_loss_clip": 1.02367353, + "balance_loss_mlp": 1.02952552, + "epoch": 0.14327371110777093, + "flos": 26321718090240.0, + "grad_norm": 2.2360123700006933, + "language_loss": 0.76984197, + "learning_rate": 3.801070345386808e-06, + "loss": 0.79140317, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.8046875, + "step": 2383, + "time_per_iteration": 2.426485776901245 + }, + { + "auxiliary_loss_clip": 0.01112413, + "auxiliary_loss_mlp": 0.01048505, + "balance_loss_clip": 1.02380538, + "balance_loss_mlp": 1.02988279, + "epoch": 0.1433338343604389, + "flos": 18075512378880.0, + "grad_norm": 2.3204216584220494, + "language_loss": 0.87647116, + "learning_rate": 3.8009060424616757e-06, + "loss": 0.89808035, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.82421875, + "step": 2384, + "time_per_iteration": 2.3518261909484863 + }, + { + "auxiliary_loss_clip": 0.01115697, + "auxiliary_loss_mlp": 0.01045557, + "balance_loss_clip": 1.02165604, + "balance_loss_mlp": 1.03101289, + "epoch": 0.14339395761310686, + "flos": 15521895048960.0, + "grad_norm": 2.254275462304245, + "language_loss": 0.79344857, + "learning_rate": 3.800741675266839e-06, + "loss": 0.81506115, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.84375, + "step": 2385, + "time_per_iteration": 3.7864675521850586 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01043029, + "balance_loss_clip": 1.0214045, + "balance_loss_mlp": 1.03019714, + "epoch": 0.14345408086577485, + "flos": 28547500953600.0, + "grad_norm": 1.680801224843066, + "language_loss": 0.75024277, + "learning_rate": 3.8005772438081645e-06, + "loss": 0.77176881, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.79296875, + "step": 2386, + "time_per_iteration": 3.8261818885803223 + }, + { + "auxiliary_loss_clip": 0.01111145, + "auxiliary_loss_mlp": 0.01040607, + "balance_loss_clip": 1.01887536, + "balance_loss_mlp": 1.03130329, + "epoch": 0.14351420411844282, + "flos": 20229024994560.0, + "grad_norm": 2.069877764197071, + "language_loss": 0.78709936, + "learning_rate": 3.80041274809152e-06, + "loss": 0.80861688, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.796875, + "step": 2387, + "time_per_iteration": 2.3673291206359863 + }, + { + "auxiliary_loss_clip": 0.01109196, + "auxiliary_loss_mlp": 0.01045684, + "balance_loss_clip": 1.02303433, + "balance_loss_mlp": 1.02931619, + "epoch": 0.14357432737111078, + "flos": 19864008063360.0, + "grad_norm": 2.107675298355465, + "language_loss": 0.82349843, + "learning_rate": 3.8002481881227753e-06, + "loss": 0.84504724, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.796875, + "step": 2388, + "time_per_iteration": 2.386894702911377 + }, + { + "auxiliary_loss_clip": 0.01111769, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.02164018, + "balance_loss_mlp": 1.03092527, + "epoch": 0.14363445062377875, + "flos": 28255557231360.0, + "grad_norm": 2.801989469982729, + "language_loss": 0.82503819, + "learning_rate": 3.8000835639078038e-06, + "loss": 0.8465848, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.80859375, + "step": 2389, + "time_per_iteration": 5.149309873580933 + }, + { + "auxiliary_loss_clip": 0.01113311, + "auxiliary_loss_mlp": 0.01045806, + "balance_loss_clip": 1.02178502, + "balance_loss_mlp": 1.03083026, + "epoch": 0.1436945738764467, + "flos": 18185698229760.0, + "grad_norm": 1.9683773180119561, + "language_loss": 0.83158261, + "learning_rate": 3.79991887545248e-06, + "loss": 0.85317379, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.82421875, + "step": 2390, + "time_per_iteration": 2.348984479904175 + }, + { + "auxiliary_loss_clip": 0.01108689, + "auxiliary_loss_mlp": 0.01044275, + "balance_loss_clip": 1.02272224, + "balance_loss_mlp": 1.02935779, + "epoch": 0.14375469712911468, + "flos": 27306687767040.0, + "grad_norm": 1.530518587021261, + "language_loss": 0.74943447, + "learning_rate": 3.799754122762682e-06, + "loss": 0.77096415, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.79296875, + "step": 2391, + "time_per_iteration": 2.4368858337402344 + }, + { + "auxiliary_loss_clip": 0.0102576, + "auxiliary_loss_mlp": 0.01002298, + "balance_loss_clip": 0.99986637, + "balance_loss_mlp": 1.00419617, + "epoch": 0.14381482038178264, + "flos": 56888559838080.0, + "grad_norm": 1.0154385386077425, + "language_loss": 0.61786532, + "learning_rate": 3.7995893058442886e-06, + "loss": 0.63814592, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.21582031, + "step": 2392, + "time_per_iteration": 2.861618995666504 + }, + { + "auxiliary_loss_clip": 0.01111459, + "auxiliary_loss_mlp": 0.01049624, + "balance_loss_clip": 1.02554417, + "balance_loss_mlp": 1.02828789, + "epoch": 0.14387494363445064, + "flos": 14281326241920.0, + "grad_norm": 2.069828487465517, + "language_loss": 0.76887888, + "learning_rate": 3.7994244247031814e-06, + "loss": 0.79048973, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.83203125, + "step": 2393, + "time_per_iteration": 2.358253002166748 + }, + { + "auxiliary_loss_clip": 0.01112962, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02052498, + "balance_loss_mlp": 1.03079224, + "epoch": 0.1439350668871186, + "flos": 26760262078080.0, + "grad_norm": 1.8514424364669433, + "language_loss": 0.76460016, + "learning_rate": 3.799259479345246e-06, + "loss": 0.78615165, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.8203125, + "step": 2394, + "time_per_iteration": 2.4324076175689697 + }, + { + "auxiliary_loss_clip": 0.01108887, + "auxiliary_loss_mlp": 0.01045604, + "balance_loss_clip": 1.02238178, + "balance_loss_mlp": 1.02888703, + "epoch": 0.14399519013978657, + "flos": 40698392855040.0, + "grad_norm": 1.6178149714271428, + "language_loss": 0.86226803, + "learning_rate": 3.799094469776367e-06, + "loss": 0.8838129, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.80078125, + "step": 2395, + "time_per_iteration": 2.5615274906158447 + }, + { + "auxiliary_loss_clip": 0.01108712, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.02436662, + "balance_loss_mlp": 1.03135109, + "epoch": 0.14405531339245453, + "flos": 20556510347520.0, + "grad_norm": 1.5520876876903358, + "language_loss": 0.82945615, + "learning_rate": 3.7989293960024353e-06, + "loss": 0.85100126, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7734375, + "step": 2396, + "time_per_iteration": 2.386688232421875 + }, + { + "auxiliary_loss_clip": 0.01106862, + "auxiliary_loss_mlp": 0.01039103, + "balance_loss_clip": 1.01921928, + "balance_loss_mlp": 1.02991748, + "epoch": 0.1441154366451225, + "flos": 19571924695680.0, + "grad_norm": 2.649614566155054, + "language_loss": 0.82483745, + "learning_rate": 3.79876425802934e-06, + "loss": 0.84629709, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.76953125, + "step": 2397, + "time_per_iteration": 2.375566005706787 + }, + { + "auxiliary_loss_clip": 0.0111361, + "auxiliary_loss_mlp": 0.01051843, + "balance_loss_clip": 1.02984858, + "balance_loss_mlp": 1.03093338, + "epoch": 0.14417555989779046, + "flos": 18514719682560.0, + "grad_norm": 1.6649212933750084, + "language_loss": 0.79688466, + "learning_rate": 3.798599055862976e-06, + "loss": 0.8185392, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.828125, + "step": 2398, + "time_per_iteration": 2.3601315021514893 + }, + { + "auxiliary_loss_clip": 0.01106993, + "auxiliary_loss_mlp": 0.01051333, + "balance_loss_clip": 1.02938616, + "balance_loss_mlp": 1.03000951, + "epoch": 0.14423568315045843, + "flos": 26030472595200.0, + "grad_norm": 10.976560248022666, + "language_loss": 0.64585006, + "learning_rate": 3.798433789509238e-06, + "loss": 0.66743332, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.76953125, + "step": 2399, + "time_per_iteration": 2.4182305335998535 + }, + { + "auxiliary_loss_clip": 0.01110507, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02336943, + "balance_loss_mlp": 1.03213882, + "epoch": 0.14429580640312642, + "flos": 21287661373440.0, + "grad_norm": 2.09594890333685, + "language_loss": 0.82169539, + "learning_rate": 3.798268458974024e-06, + "loss": 0.84325469, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.78515625, + "step": 2400, + "time_per_iteration": 2.3954548835754395 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01048561, + "balance_loss_clip": 1.02368224, + "balance_loss_mlp": 1.03122401, + "epoch": 0.14435592965579438, + "flos": 25626737099520.0, + "grad_norm": 2.025461941102729, + "language_loss": 0.74472535, + "learning_rate": 3.7981030642632348e-06, + "loss": 0.76633936, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.81640625, + "step": 2401, + "time_per_iteration": 2.4465906620025635 + }, + { + "auxiliary_loss_clip": 0.01109409, + "auxiliary_loss_mlp": 0.01038422, + "balance_loss_clip": 1.01783419, + "balance_loss_mlp": 1.02983093, + "epoch": 0.14441605290846235, + "flos": 22963981259520.0, + "grad_norm": 1.8874126218600147, + "language_loss": 0.8074652, + "learning_rate": 3.797937605382772e-06, + "loss": 0.82894349, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.796875, + "step": 2402, + "time_per_iteration": 2.3857922554016113 + }, + { + "auxiliary_loss_clip": 0.01109774, + "auxiliary_loss_mlp": 0.01045489, + "balance_loss_clip": 1.02326858, + "balance_loss_mlp": 1.03010798, + "epoch": 0.14447617616113032, + "flos": 17346700414080.0, + "grad_norm": 2.4552598884483037, + "language_loss": 0.84383583, + "learning_rate": 3.7977720823385413e-06, + "loss": 0.86538851, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.796875, + "step": 2403, + "time_per_iteration": 2.3651437759399414 + }, + { + "auxiliary_loss_clip": 0.01108275, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.02614224, + "balance_loss_mlp": 1.02868319, + "epoch": 0.14453629941379828, + "flos": 24059066964480.0, + "grad_norm": 1.9221461120669485, + "language_loss": 0.69886154, + "learning_rate": 3.797606495136449e-06, + "loss": 0.72042048, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.796875, + "step": 2404, + "time_per_iteration": 2.394455909729004 + }, + { + "auxiliary_loss_clip": 0.01105719, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02257442, + "balance_loss_mlp": 1.02929091, + "epoch": 0.14459642266646625, + "flos": 14428659646080.0, + "grad_norm": 1.97815403774669, + "language_loss": 0.73047113, + "learning_rate": 3.7974408437824055e-06, + "loss": 0.75196624, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.765625, + "step": 2405, + "time_per_iteration": 2.354947566986084 + }, + { + "auxiliary_loss_clip": 0.01106846, + "auxiliary_loss_mlp": 0.01041665, + "balance_loss_clip": 1.0204097, + "balance_loss_mlp": 1.03113937, + "epoch": 0.14465654591913424, + "flos": 9866314575360.0, + "grad_norm": 4.372737942354544, + "language_loss": 0.73072457, + "learning_rate": 3.7972751282823216e-06, + "loss": 0.7522096, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.7578125, + "step": 2406, + "time_per_iteration": 2.3535666465759277 + }, + { + "auxiliary_loss_clip": 0.01110209, + "auxiliary_loss_mlp": 0.01046813, + "balance_loss_clip": 1.02440178, + "balance_loss_mlp": 1.03055358, + "epoch": 0.1447166691718022, + "flos": 24971766393600.0, + "grad_norm": 2.2657112076268757, + "language_loss": 0.75740147, + "learning_rate": 3.797109348642111e-06, + "loss": 0.77897167, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.796875, + "step": 2407, + "time_per_iteration": 2.403923273086548 + }, + { + "auxiliary_loss_clip": 0.01107285, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.01682675, + "balance_loss_mlp": 1.02851439, + "epoch": 0.14477679242447017, + "flos": 21906950803200.0, + "grad_norm": 1.4982171274409408, + "language_loss": 0.79570168, + "learning_rate": 3.796943504867691e-06, + "loss": 0.81715131, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.78515625, + "step": 2408, + "time_per_iteration": 2.3764305114746094 + }, + { + "auxiliary_loss_clip": 0.01110048, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_clip": 1.0226922, + "balance_loss_mlp": 1.0321908, + "epoch": 0.14483691567713813, + "flos": 20739699584640.0, + "grad_norm": 25.981433853447278, + "language_loss": 0.82397455, + "learning_rate": 3.7967775969649796e-06, + "loss": 0.84553862, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.78125, + "step": 2409, + "time_per_iteration": 2.367091178894043 + }, + { + "auxiliary_loss_clip": 0.01109693, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.02637005, + "balance_loss_mlp": 1.03146958, + "epoch": 0.1448970389298061, + "flos": 35406258301440.0, + "grad_norm": 1.7803194277442216, + "language_loss": 0.74579203, + "learning_rate": 3.7966116249398974e-06, + "loss": 0.76736754, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.78125, + "step": 2410, + "time_per_iteration": 2.5116896629333496 + }, + { + "auxiliary_loss_clip": 0.01107985, + "auxiliary_loss_mlp": 0.01041762, + "balance_loss_clip": 1.02059031, + "balance_loss_mlp": 1.02840614, + "epoch": 0.14495716218247406, + "flos": 15413454766080.0, + "grad_norm": 1.8276983278577725, + "language_loss": 0.8123709, + "learning_rate": 3.7964455887983675e-06, + "loss": 0.83386838, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.796875, + "step": 2411, + "time_per_iteration": 2.356724739074707 + }, + { + "auxiliary_loss_clip": 0.01108443, + "auxiliary_loss_mlp": 0.01043607, + "balance_loss_clip": 1.0213623, + "balance_loss_mlp": 1.03147197, + "epoch": 0.14501728543514203, + "flos": 33691813344000.0, + "grad_norm": 2.289403854800444, + "language_loss": 0.70229125, + "learning_rate": 3.7962794885463165e-06, + "loss": 0.72381175, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.76953125, + "step": 2412, + "time_per_iteration": 2.4828953742980957 + }, + { + "auxiliary_loss_clip": 0.01109675, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.0168761, + "balance_loss_mlp": 1.03181338, + "epoch": 0.14507740868781002, + "flos": 15595212637440.0, + "grad_norm": 2.979334482442582, + "language_loss": 0.76595402, + "learning_rate": 3.7961133241896706e-06, + "loss": 0.78743023, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.78125, + "step": 2413, + "time_per_iteration": 2.352808713912964 + }, + { + "auxiliary_loss_clip": 0.01028559, + "auxiliary_loss_mlp": 0.01003957, + "balance_loss_clip": 1.00168025, + "balance_loss_mlp": 1.00727987, + "epoch": 0.145137531940478, + "flos": 66672026023680.0, + "grad_norm": 0.8823553427400396, + "language_loss": 0.58824027, + "learning_rate": 3.79594709573436e-06, + "loss": 0.60856533, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.21289062, + "step": 2414, + "time_per_iteration": 2.949816942214966 + }, + { + "auxiliary_loss_clip": 0.01026435, + "auxiliary_loss_mlp": 0.01006749, + "balance_loss_clip": 1.00451934, + "balance_loss_mlp": 1.00521815, + "epoch": 0.14519765519314595, + "flos": 67518041022720.0, + "grad_norm": 0.8364423695325559, + "language_loss": 0.62246674, + "learning_rate": 3.7957808031863173e-06, + "loss": 0.64279854, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.21191406, + "step": 2415, + "time_per_iteration": 3.0504634380340576 + }, + { + "auxiliary_loss_clip": 0.01106679, + "auxiliary_loss_mlp": 0.01037006, + "balance_loss_clip": 1.01550126, + "balance_loss_mlp": 1.02929401, + "epoch": 0.14525777844581392, + "flos": 17198040378240.0, + "grad_norm": 1.999221124672743, + "language_loss": 0.71876603, + "learning_rate": 3.7956144465514775e-06, + "loss": 0.7402029, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7734375, + "step": 2416, + "time_per_iteration": 2.359178304672241 + }, + { + "auxiliary_loss_clip": 0.01025716, + "auxiliary_loss_mlp": 0.01003898, + "balance_loss_clip": 1.00165713, + "balance_loss_mlp": 1.00491476, + "epoch": 0.14531790169848188, + "flos": 65401152289920.0, + "grad_norm": 0.7093771423964166, + "language_loss": 0.60392392, + "learning_rate": 3.7954480258357765e-06, + "loss": 0.62422007, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.20800781, + "step": 2417, + "time_per_iteration": 3.0886611938476562 + }, + { + "auxiliary_loss_clip": 0.01111104, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.02648282, + "balance_loss_mlp": 1.02903223, + "epoch": 0.14537802495114985, + "flos": 32561081274240.0, + "grad_norm": 2.5029232264615304, + "language_loss": 0.78740788, + "learning_rate": 3.7952815410451542e-06, + "loss": 0.80900639, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8203125, + "step": 2418, + "time_per_iteration": 2.4675533771514893 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.0103804, + "balance_loss_clip": 1.01740527, + "balance_loss_mlp": 1.02998948, + "epoch": 0.1454381482038178, + "flos": 20225743326720.0, + "grad_norm": 2.3543541080721186, + "language_loss": 0.71558362, + "learning_rate": 3.7951149921855515e-06, + "loss": 0.73703194, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.765625, + "step": 2419, + "time_per_iteration": 2.4007863998413086 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.01862967, + "balance_loss_mlp": 1.03081453, + "epoch": 0.1454982714564858, + "flos": 22892025214080.0, + "grad_norm": 2.509133605470454, + "language_loss": 0.89308, + "learning_rate": 3.794948379262913e-06, + "loss": 0.91457152, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.765625, + "step": 2420, + "time_per_iteration": 2.3839590549468994 + }, + { + "auxiliary_loss_clip": 0.01108402, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.01805139, + "balance_loss_mlp": 1.03007388, + "epoch": 0.14555839470915377, + "flos": 20228815526400.0, + "grad_norm": 1.9597872167404442, + "language_loss": 0.82786453, + "learning_rate": 3.794781702283183e-06, + "loss": 0.84934974, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.78125, + "step": 2421, + "time_per_iteration": 2.3867626190185547 + }, + { + "auxiliary_loss_clip": 0.01110605, + "auxiliary_loss_mlp": 0.01042945, + "balance_loss_clip": 1.0207957, + "balance_loss_mlp": 1.03116322, + "epoch": 0.14561851796182174, + "flos": 22235204206080.0, + "grad_norm": 1.6068226050987182, + "language_loss": 0.81600267, + "learning_rate": 3.7946149612523116e-06, + "loss": 0.83753818, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.79296875, + "step": 2422, + "time_per_iteration": 2.388953447341919 + }, + { + "auxiliary_loss_clip": 0.01027698, + "auxiliary_loss_mlp": 0.01003126, + "balance_loss_clip": 1.00078976, + "balance_loss_mlp": 1.00744081, + "epoch": 0.1456786412144897, + "flos": 52633624222080.0, + "grad_norm": 0.9167793141958676, + "language_loss": 0.63315821, + "learning_rate": 3.794448156176248e-06, + "loss": 0.65346646, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.203125, + "step": 2423, + "time_per_iteration": 3.024770975112915 + }, + { + "auxiliary_loss_clip": 0.01109042, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.01955104, + "balance_loss_mlp": 1.0321672, + "epoch": 0.14573876446715767, + "flos": 23220557907840.0, + "grad_norm": 3.561868481747033, + "language_loss": 0.82309031, + "learning_rate": 3.794281287060946e-06, + "loss": 0.84457195, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.76953125, + "step": 2424, + "time_per_iteration": 3.805325984954834 + }, + { + "auxiliary_loss_clip": 0.01108945, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.01880586, + "balance_loss_mlp": 1.03246284, + "epoch": 0.14579888771982563, + "flos": 18113393070720.0, + "grad_norm": 2.3867775837286835, + "language_loss": 0.78662455, + "learning_rate": 3.7941143539123596e-06, + "loss": 0.80811965, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.765625, + "step": 2425, + "time_per_iteration": 2.3818471431732178 + }, + { + "auxiliary_loss_clip": 0.011079, + "auxiliary_loss_mlp": 0.01040786, + "balance_loss_clip": 1.01992452, + "balance_loss_mlp": 1.03184569, + "epoch": 0.14585901097249362, + "flos": 23000046560640.0, + "grad_norm": 2.004185816551749, + "language_loss": 0.8373462, + "learning_rate": 3.7939473567364473e-06, + "loss": 0.85883307, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.76171875, + "step": 2426, + "time_per_iteration": 3.758450746536255 + }, + { + "auxiliary_loss_clip": 0.01106852, + "auxiliary_loss_mlp": 0.01039837, + "balance_loss_clip": 1.01924932, + "balance_loss_mlp": 1.03191352, + "epoch": 0.1459191342251616, + "flos": 21907579207680.0, + "grad_norm": 1.9805063390766124, + "language_loss": 0.87306172, + "learning_rate": 3.793780295539169e-06, + "loss": 0.89452857, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.74609375, + "step": 2427, + "time_per_iteration": 3.782193899154663 + }, + { + "auxiliary_loss_clip": 0.01112837, + "auxiliary_loss_mlp": 0.01038514, + "balance_loss_clip": 1.01620984, + "balance_loss_mlp": 1.03108454, + "epoch": 0.14597925747782955, + "flos": 14974631487360.0, + "grad_norm": 2.3468969024196586, + "language_loss": 0.66859877, + "learning_rate": 3.793613170326485e-06, + "loss": 0.69011229, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8203125, + "step": 2428, + "time_per_iteration": 3.758338689804077 + }, + { + "auxiliary_loss_clip": 0.01106052, + "auxiliary_loss_mlp": 0.01040569, + "balance_loss_clip": 1.01875365, + "balance_loss_mlp": 1.02963817, + "epoch": 0.14603938073049752, + "flos": 21067848253440.0, + "grad_norm": 2.593160435012226, + "language_loss": 0.83403075, + "learning_rate": 3.793445981104362e-06, + "loss": 0.855497, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.765625, + "step": 2429, + "time_per_iteration": 2.4017221927642822 + }, + { + "auxiliary_loss_clip": 0.0110459, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.01366925, + "balance_loss_mlp": 1.02881503, + "epoch": 0.14609950398316549, + "flos": 19863763683840.0, + "grad_norm": 1.7183100895627228, + "language_loss": 0.79013276, + "learning_rate": 3.7932787278787643e-06, + "loss": 0.81152231, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.7578125, + "step": 2430, + "time_per_iteration": 2.3640503883361816 + }, + { + "auxiliary_loss_clip": 0.01108524, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_clip": 1.02367413, + "balance_loss_mlp": 1.0301621, + "epoch": 0.14615962723583345, + "flos": 22417765038720.0, + "grad_norm": 2.056080596525592, + "language_loss": 0.82878721, + "learning_rate": 3.7931114106556618e-06, + "loss": 0.85032403, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.78515625, + "step": 2431, + "time_per_iteration": 2.3871424198150635 + }, + { + "auxiliary_loss_clip": 0.01110717, + "auxiliary_loss_mlp": 0.0104409, + "balance_loss_clip": 1.02068925, + "balance_loss_mlp": 1.03180575, + "epoch": 0.14621975048850142, + "flos": 22345145677440.0, + "grad_norm": 1.8340758882246042, + "language_loss": 0.78707421, + "learning_rate": 3.7929440294410256e-06, + "loss": 0.80862224, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.7890625, + "step": 2432, + "time_per_iteration": 2.3856136798858643 + }, + { + "auxiliary_loss_clip": 0.01104668, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.02277184, + "balance_loss_mlp": 1.02831507, + "epoch": 0.1462798737411694, + "flos": 24388018594560.0, + "grad_norm": 2.069365885011866, + "language_loss": 0.79872191, + "learning_rate": 3.792776584240829e-06, + "loss": 0.82022727, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.765625, + "step": 2433, + "time_per_iteration": 2.4183566570281982 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.01039623, + "balance_loss_clip": 1.01834464, + "balance_loss_mlp": 1.03108001, + "epoch": 0.14633999699383737, + "flos": 19243671292800.0, + "grad_norm": 1.8863344199181091, + "language_loss": 0.78056562, + "learning_rate": 3.7926090750610477e-06, + "loss": 0.80200571, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.734375, + "step": 2434, + "time_per_iteration": 2.362306594848633 + }, + { + "auxiliary_loss_clip": 0.01023121, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_clip": 0.9998247, + "balance_loss_mlp": 1.00283957, + "epoch": 0.14640012024650534, + "flos": 62657468184960.0, + "grad_norm": 0.856942128852628, + "language_loss": 0.58509767, + "learning_rate": 3.7924415019076593e-06, + "loss": 0.60535133, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.203125, + "step": 2435, + "time_per_iteration": 2.991661787033081 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01040238, + "balance_loss_clip": 1.01990104, + "balance_loss_mlp": 1.02923751, + "epoch": 0.1464602434991733, + "flos": 12275426321280.0, + "grad_norm": 2.1762567484660567, + "language_loss": 0.88035542, + "learning_rate": 3.7922738647866447e-06, + "loss": 0.90180486, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.75390625, + "step": 2436, + "time_per_iteration": 2.359917640686035 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.01778746, + "balance_loss_mlp": 1.03151584, + "epoch": 0.14652036675184127, + "flos": 20921282899200.0, + "grad_norm": 2.0799352703185243, + "language_loss": 0.74614632, + "learning_rate": 3.792106163703986e-06, + "loss": 0.7676214, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.78125, + "step": 2437, + "time_per_iteration": 2.401500701904297 + }, + { + "auxiliary_loss_clip": 0.01107899, + "auxiliary_loss_mlp": 0.01047454, + "balance_loss_clip": 1.02295673, + "balance_loss_mlp": 1.02991128, + "epoch": 0.14658049000450923, + "flos": 27702603118080.0, + "grad_norm": 2.836756276761295, + "language_loss": 0.73944908, + "learning_rate": 3.791938398665668e-06, + "loss": 0.76100266, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.78125, + "step": 2438, + "time_per_iteration": 2.4427995681762695 + }, + { + "auxiliary_loss_clip": 0.01107115, + "auxiliary_loss_mlp": 0.01039244, + "balance_loss_clip": 1.01975298, + "balance_loss_mlp": 1.03227019, + "epoch": 0.14664061325717723, + "flos": 24935351978880.0, + "grad_norm": 2.120152516942651, + "language_loss": 0.74749863, + "learning_rate": 3.7917705696776786e-06, + "loss": 0.76896226, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.74609375, + "step": 2439, + "time_per_iteration": 2.42877459526062 + }, + { + "auxiliary_loss_clip": 0.01105312, + "auxiliary_loss_mlp": 0.01045753, + "balance_loss_clip": 1.02427197, + "balance_loss_mlp": 1.0307641, + "epoch": 0.1467007365098452, + "flos": 40296053813760.0, + "grad_norm": 1.8759258491755313, + "language_loss": 0.74690181, + "learning_rate": 3.7916026767460067e-06, + "loss": 0.76841247, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.74609375, + "step": 2440, + "time_per_iteration": 2.543079376220703 + }, + { + "auxiliary_loss_clip": 0.01104207, + "auxiliary_loss_mlp": 0.01042874, + "balance_loss_clip": 1.02344286, + "balance_loss_mlp": 1.03002143, + "epoch": 0.14676085976251316, + "flos": 26539890376320.0, + "grad_norm": 1.5325917258297075, + "language_loss": 0.83222544, + "learning_rate": 3.791434719876643e-06, + "loss": 0.85369635, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.7421875, + "step": 2441, + "time_per_iteration": 2.4438955783843994 + }, + { + "auxiliary_loss_clip": 0.01112085, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_clip": 1.01980221, + "balance_loss_mlp": 1.03033793, + "epoch": 0.14682098301518112, + "flos": 23548985867520.0, + "grad_norm": 2.030479993575067, + "language_loss": 0.72118711, + "learning_rate": 3.7912666990755825e-06, + "loss": 0.74273717, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.81640625, + "step": 2442, + "time_per_iteration": 2.3936851024627686 + }, + { + "auxiliary_loss_clip": 0.0111204, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.02048922, + "balance_loss_mlp": 1.03172112, + "epoch": 0.1468811062678491, + "flos": 11650411428480.0, + "grad_norm": 2.8190333055980825, + "language_loss": 0.81732076, + "learning_rate": 3.791098614348821e-06, + "loss": 0.83886111, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.8046875, + "step": 2443, + "time_per_iteration": 2.3456430435180664 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01042506, + "balance_loss_clip": 1.02182293, + "balance_loss_mlp": 1.03075087, + "epoch": 0.14694122952051705, + "flos": 23001512837760.0, + "grad_norm": 1.7544449726777838, + "language_loss": 0.82959914, + "learning_rate": 3.790930465702358e-06, + "loss": 0.85110176, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.7734375, + "step": 2444, + "time_per_iteration": 2.3839950561523438 + }, + { + "auxiliary_loss_clip": 0.01108247, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.01883376, + "balance_loss_mlp": 1.03084493, + "epoch": 0.14700135277318502, + "flos": 26501835127680.0, + "grad_norm": 1.6459977299007913, + "language_loss": 0.70786947, + "learning_rate": 3.790762253142193e-06, + "loss": 0.72935021, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.7734375, + "step": 2445, + "time_per_iteration": 2.4207019805908203 + }, + { + "auxiliary_loss_clip": 0.01025382, + "auxiliary_loss_mlp": 0.01003005, + "balance_loss_clip": 1.00078809, + "balance_loss_mlp": 1.00529063, + "epoch": 0.147061476025853, + "flos": 59446366531200.0, + "grad_norm": 0.8105754912794436, + "language_loss": 0.6305809, + "learning_rate": 3.7905939766743296e-06, + "loss": 0.65086478, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.20117188, + "step": 2446, + "time_per_iteration": 2.908247232437134 + }, + { + "auxiliary_loss_clip": 0.01108616, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02487803, + "balance_loss_mlp": 1.02990222, + "epoch": 0.14712159927852098, + "flos": 28329607958400.0, + "grad_norm": 1.57403669728487, + "language_loss": 0.74675715, + "learning_rate": 3.790425636304773e-06, + "loss": 0.76831472, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.7890625, + "step": 2447, + "time_per_iteration": 2.44604754447937 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01038763, + "balance_loss_clip": 1.01849759, + "balance_loss_mlp": 1.02910519, + "epoch": 0.14718172253118894, + "flos": 27088585303680.0, + "grad_norm": 2.1219490618768657, + "language_loss": 0.85836643, + "learning_rate": 3.7902572320395313e-06, + "loss": 0.87979835, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.75390625, + "step": 2448, + "time_per_iteration": 2.4199423789978027 + }, + { + "auxiliary_loss_clip": 0.01023192, + "auxiliary_loss_mlp": 0.01010136, + "balance_loss_clip": 1.00791848, + "balance_loss_mlp": 1.00305104, + "epoch": 0.1472418457838569, + "flos": 66703587759360.0, + "grad_norm": 0.7662270412089266, + "language_loss": 0.56839919, + "learning_rate": 3.790088763884614e-06, + "loss": 0.58873248, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.20117188, + "step": 2449, + "time_per_iteration": 2.975759983062744 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.01940989, + "balance_loss_mlp": 1.03197777, + "epoch": 0.14730196903652487, + "flos": 19572553100160.0, + "grad_norm": 1.8227387694827935, + "language_loss": 0.85032845, + "learning_rate": 3.789920231846033e-06, + "loss": 0.87179542, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.7421875, + "step": 2450, + "time_per_iteration": 2.370150566101074 + }, + { + "auxiliary_loss_clip": 0.01109353, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.0171082, + "balance_loss_mlp": 1.03172064, + "epoch": 0.14736209228919284, + "flos": 16070101217280.0, + "grad_norm": 2.111219050806394, + "language_loss": 0.74913561, + "learning_rate": 3.7897516359298034e-06, + "loss": 0.77062279, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.7734375, + "step": 2451, + "time_per_iteration": 2.3699350357055664 + }, + { + "auxiliary_loss_clip": 0.01102903, + "auxiliary_loss_mlp": 0.01041274, + "balance_loss_clip": 1.02171206, + "balance_loss_mlp": 1.02980554, + "epoch": 0.1474222155418608, + "flos": 23038346188800.0, + "grad_norm": 1.5959023829120984, + "language_loss": 0.82293332, + "learning_rate": 3.7895829761419417e-06, + "loss": 0.84437507, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.73046875, + "step": 2452, + "time_per_iteration": 2.4012458324432373 + }, + { + "auxiliary_loss_clip": 0.01104116, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.01787043, + "balance_loss_mlp": 1.0311923, + "epoch": 0.1474823387945288, + "flos": 17017713872640.0, + "grad_norm": 1.9016400293559543, + "language_loss": 0.74453282, + "learning_rate": 3.789414252488467e-06, + "loss": 0.76595336, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.73046875, + "step": 2453, + "time_per_iteration": 2.3514039516448975 + }, + { + "auxiliary_loss_clip": 0.011081, + "auxiliary_loss_mlp": 0.01037958, + "balance_loss_clip": 1.01665509, + "balance_loss_mlp": 1.03024554, + "epoch": 0.14754246204719676, + "flos": 17894068709760.0, + "grad_norm": 2.005732480324612, + "language_loss": 0.75826025, + "learning_rate": 3.7892454649754006e-06, + "loss": 0.77972078, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.77734375, + "step": 2454, + "time_per_iteration": 2.351875066757202 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.01739502, + "balance_loss_mlp": 1.02999902, + "epoch": 0.14760258529986472, + "flos": 13078254101760.0, + "grad_norm": 1.874940662609717, + "language_loss": 0.83087826, + "learning_rate": 3.789076613608766e-06, + "loss": 0.85235482, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.796875, + "step": 2455, + "time_per_iteration": 2.3562862873077393 + }, + { + "auxiliary_loss_clip": 0.01110501, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.02000332, + "balance_loss_mlp": 1.02969289, + "epoch": 0.1476627085525327, + "flos": 30805194666240.0, + "grad_norm": 2.15398892222712, + "language_loss": 0.83898067, + "learning_rate": 3.788907698394589e-06, + "loss": 0.8605023, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.80859375, + "step": 2456, + "time_per_iteration": 2.4573004245758057 + }, + { + "auxiliary_loss_clip": 0.0110531, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.012959, + "balance_loss_mlp": 1.02993011, + "epoch": 0.14772283180520066, + "flos": 21688359580800.0, + "grad_norm": 1.8900714922399362, + "language_loss": 0.84265876, + "learning_rate": 3.788738719338898e-06, + "loss": 0.86404693, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.75390625, + "step": 2457, + "time_per_iteration": 2.378192901611328 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.01481414, + "balance_loss_mlp": 1.03054821, + "epoch": 0.14778295505786862, + "flos": 18769411117440.0, + "grad_norm": 1.9567926522310815, + "language_loss": 0.74974108, + "learning_rate": 3.788569676447723e-06, + "loss": 0.77113551, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.734375, + "step": 2458, + "time_per_iteration": 2.3949851989746094 + }, + { + "auxiliary_loss_clip": 0.01111074, + "auxiliary_loss_mlp": 0.01041301, + "balance_loss_clip": 1.01924753, + "balance_loss_mlp": 1.03039289, + "epoch": 0.1478430783105366, + "flos": 22892444150400.0, + "grad_norm": 1.8526540964536895, + "language_loss": 0.83802259, + "learning_rate": 3.7884005697270976e-06, + "loss": 0.85954636, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.8046875, + "step": 2459, + "time_per_iteration": 2.386124849319458 + }, + { + "auxiliary_loss_clip": 0.01102934, + "auxiliary_loss_mlp": 0.01039875, + "balance_loss_clip": 1.02070618, + "balance_loss_mlp": 1.03005552, + "epoch": 0.14790320156320458, + "flos": 15084433313280.0, + "grad_norm": 2.3751272671493258, + "language_loss": 0.75208676, + "learning_rate": 3.7882313991830553e-06, + "loss": 0.77351487, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.7265625, + "step": 2460, + "time_per_iteration": 2.3616836071014404 + }, + { + "auxiliary_loss_clip": 0.01109286, + "auxiliary_loss_mlp": 0.01040809, + "balance_loss_clip": 1.01885045, + "balance_loss_mlp": 1.03090692, + "epoch": 0.14796332481587254, + "flos": 26503580695680.0, + "grad_norm": 1.695323430815126, + "language_loss": 0.81721282, + "learning_rate": 3.788062164821635e-06, + "loss": 0.83871377, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.78515625, + "step": 2461, + "time_per_iteration": 2.438246250152588 + }, + { + "auxiliary_loss_clip": 0.01108934, + "auxiliary_loss_mlp": 0.01041782, + "balance_loss_clip": 1.01964498, + "balance_loss_mlp": 1.03071451, + "epoch": 0.1480234480685405, + "flos": 17562324170880.0, + "grad_norm": 3.2664391766448513, + "language_loss": 0.65738714, + "learning_rate": 3.7878928666488755e-06, + "loss": 0.67889428, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.78125, + "step": 2462, + "time_per_iteration": 2.344008207321167 + }, + { + "auxiliary_loss_clip": 0.01106101, + "auxiliary_loss_mlp": 0.01047585, + "balance_loss_clip": 1.02500653, + "balance_loss_mlp": 1.02892792, + "epoch": 0.14808357132120847, + "flos": 53580121580160.0, + "grad_norm": 2.134080966410557, + "language_loss": 0.67159075, + "learning_rate": 3.787723504670818e-06, + "loss": 0.69312757, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.7734375, + "step": 2463, + "time_per_iteration": 2.6848130226135254 + }, + { + "auxiliary_loss_clip": 0.01104801, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_clip": 1.02301788, + "balance_loss_mlp": 1.02841341, + "epoch": 0.14814369457387644, + "flos": 19828152230400.0, + "grad_norm": 1.6024968555364416, + "language_loss": 0.76646018, + "learning_rate": 3.7875540788935076e-06, + "loss": 0.78795666, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.765625, + "step": 2464, + "time_per_iteration": 3.7671191692352295 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.01981044, + "balance_loss_mlp": 1.03094208, + "epoch": 0.1482038178265444, + "flos": 23913828241920.0, + "grad_norm": 1.6632214555847842, + "language_loss": 0.79578698, + "learning_rate": 3.7873845893229896e-06, + "loss": 0.81722486, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.73046875, + "step": 2465, + "time_per_iteration": 2.411268711090088 + }, + { + "auxiliary_loss_clip": 0.01109104, + "auxiliary_loss_mlp": 0.01042305, + "balance_loss_clip": 1.02058506, + "balance_loss_mlp": 1.0299412, + "epoch": 0.1482639410792124, + "flos": 24169357549440.0, + "grad_norm": 2.5067273738761964, + "language_loss": 0.76638633, + "learning_rate": 3.7872150359653143e-06, + "loss": 0.78790045, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.79296875, + "step": 2466, + "time_per_iteration": 3.780672788619995 + }, + { + "auxiliary_loss_clip": 0.01025631, + "auxiliary_loss_mlp": 0.01002746, + "balance_loss_clip": 1.00039744, + "balance_loss_mlp": 1.00557256, + "epoch": 0.14832406433188036, + "flos": 66188025578880.0, + "grad_norm": 0.7811321793737293, + "language_loss": 0.6012544, + "learning_rate": 3.787045418826531e-06, + "loss": 0.62153816, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.20117188, + "step": 2467, + "time_per_iteration": 4.413452863693237 + }, + { + "auxiliary_loss_clip": 0.01102364, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.01523924, + "balance_loss_mlp": 1.02893591, + "epoch": 0.14838418758454833, + "flos": 25410066001920.0, + "grad_norm": 2.1207902730080836, + "language_loss": 0.87758505, + "learning_rate": 3.7868757379126938e-06, + "loss": 0.89896071, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.734375, + "step": 2468, + "time_per_iteration": 3.744333505630493 + }, + { + "auxiliary_loss_clip": 0.01106776, + "auxiliary_loss_mlp": 0.01040982, + "balance_loss_clip": 1.01863039, + "balance_loss_mlp": 1.02884674, + "epoch": 0.1484443108372163, + "flos": 23288918083200.0, + "grad_norm": 2.031910697410879, + "language_loss": 0.76093054, + "learning_rate": 3.7867059932298578e-06, + "loss": 0.78240806, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.78125, + "step": 2469, + "time_per_iteration": 2.4004931449890137 + }, + { + "auxiliary_loss_clip": 0.01105523, + "auxiliary_loss_mlp": 0.01045265, + "balance_loss_clip": 1.02372372, + "balance_loss_mlp": 1.0301652, + "epoch": 0.14850443408988426, + "flos": 14646797020800.0, + "grad_norm": 2.357346122684486, + "language_loss": 0.75776291, + "learning_rate": 3.786536184784081e-06, + "loss": 0.77927077, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.75390625, + "step": 2470, + "time_per_iteration": 2.352248430252075 + }, + { + "auxiliary_loss_clip": 0.01102071, + "auxiliary_loss_mlp": 0.01042208, + "balance_loss_clip": 1.02152479, + "balance_loss_mlp": 1.02874684, + "epoch": 0.14856455734255222, + "flos": 23547240299520.0, + "grad_norm": 1.958258548759413, + "language_loss": 0.72282279, + "learning_rate": 3.786366312581423e-06, + "loss": 0.74426562, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.734375, + "step": 2471, + "time_per_iteration": 2.4054150581359863 + }, + { + "auxiliary_loss_clip": 0.01108779, + "auxiliary_loss_mlp": 0.01039437, + "balance_loss_clip": 1.01688242, + "balance_loss_mlp": 1.02943885, + "epoch": 0.1486246805952202, + "flos": 18076315340160.0, + "grad_norm": 2.567091590849504, + "language_loss": 0.89306957, + "learning_rate": 3.786196376627947e-06, + "loss": 0.91455173, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.796875, + "step": 2472, + "time_per_iteration": 2.360830783843994 + }, + { + "auxiliary_loss_clip": 0.01105657, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_clip": 1.02275848, + "balance_loss_mlp": 1.02937889, + "epoch": 0.14868480384788818, + "flos": 19352635246080.0, + "grad_norm": 3.2100663064713113, + "language_loss": 0.80462313, + "learning_rate": 3.7860263769297163e-06, + "loss": 0.82611787, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.76171875, + "step": 2473, + "time_per_iteration": 2.360668420791626 + }, + { + "auxiliary_loss_clip": 0.01109533, + "auxiliary_loss_mlp": 0.01041752, + "balance_loss_clip": 1.02066338, + "balance_loss_mlp": 1.02998114, + "epoch": 0.14874492710055615, + "flos": 22199103993600.0, + "grad_norm": 2.565127652122542, + "language_loss": 0.77484328, + "learning_rate": 3.7858563134927985e-06, + "loss": 0.79635614, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.796875, + "step": 2474, + "time_per_iteration": 2.384188413619995 + }, + { + "auxiliary_loss_clip": 0.01106567, + "auxiliary_loss_mlp": 0.01043922, + "balance_loss_clip": 1.02127266, + "balance_loss_mlp": 1.02885747, + "epoch": 0.1488050503532241, + "flos": 21102447277440.0, + "grad_norm": 3.3138585561777703, + "language_loss": 0.82387251, + "learning_rate": 3.785686186323263e-06, + "loss": 0.84537739, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.77734375, + "step": 2475, + "time_per_iteration": 2.36897611618042 + }, + { + "auxiliary_loss_clip": 0.01108567, + "auxiliary_loss_mlp": 0.01047392, + "balance_loss_clip": 1.0271976, + "balance_loss_mlp": 1.03358066, + "epoch": 0.14886517360589208, + "flos": 12785751797760.0, + "grad_norm": 1.8469100074926545, + "language_loss": 0.80300593, + "learning_rate": 3.785515995427181e-06, + "loss": 0.82456547, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.75, + "step": 2476, + "time_per_iteration": 2.3844544887542725 + }, + { + "auxiliary_loss_clip": 0.01100131, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.02424812, + "balance_loss_mlp": 1.02915752, + "epoch": 0.14892529685856004, + "flos": 29021586572160.0, + "grad_norm": 1.650390794059293, + "language_loss": 0.77724421, + "learning_rate": 3.7853457408106257e-06, + "loss": 0.79868805, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.7109375, + "step": 2477, + "time_per_iteration": 2.4232230186462402 + }, + { + "auxiliary_loss_clip": 0.01023444, + "auxiliary_loss_mlp": 0.01002709, + "balance_loss_clip": 1.0001694, + "balance_loss_mlp": 1.00380075, + "epoch": 0.148985420111228, + "flos": 61923105313920.0, + "grad_norm": 0.8200409354329088, + "language_loss": 0.60067445, + "learning_rate": 3.785175422479673e-06, + "loss": 0.62093598, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.19628906, + "step": 2478, + "time_per_iteration": 3.0559191703796387 + }, + { + "auxiliary_loss_clip": 0.01106866, + "auxiliary_loss_mlp": 0.01040803, + "balance_loss_clip": 1.02027547, + "balance_loss_mlp": 1.03009081, + "epoch": 0.149045543363896, + "flos": 23913967887360.0, + "grad_norm": 3.065683946321891, + "language_loss": 0.70812583, + "learning_rate": 3.785005040440402e-06, + "loss": 0.72960258, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.765625, + "step": 2479, + "time_per_iteration": 2.4103593826293945 + }, + { + "auxiliary_loss_clip": 0.01105955, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.01666939, + "balance_loss_mlp": 1.03049088, + "epoch": 0.14910566661656396, + "flos": 23653411344000.0, + "grad_norm": 1.8489014792152119, + "language_loss": 0.81115156, + "learning_rate": 3.784834594698892e-06, + "loss": 0.83257723, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.7578125, + "step": 2480, + "time_per_iteration": 2.4197895526885986 + }, + { + "auxiliary_loss_clip": 0.01108557, + "auxiliary_loss_mlp": 0.01041216, + "balance_loss_clip": 1.02054477, + "balance_loss_mlp": 1.03099465, + "epoch": 0.14916578986923193, + "flos": 20514440292480.0, + "grad_norm": 2.551710077764799, + "language_loss": 0.84128141, + "learning_rate": 3.7846640852612275e-06, + "loss": 0.86277914, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.77734375, + "step": 2481, + "time_per_iteration": 2.365886926651001 + }, + { + "auxiliary_loss_clip": 0.01106418, + "auxiliary_loss_mlp": 0.01048773, + "balance_loss_clip": 1.02586138, + "balance_loss_mlp": 1.02866387, + "epoch": 0.1492259131218999, + "flos": 22490733513600.0, + "grad_norm": 2.197025405151839, + "language_loss": 0.7761029, + "learning_rate": 3.7844935121334917e-06, + "loss": 0.79765475, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.77734375, + "step": 2482, + "time_per_iteration": 2.3777050971984863 + }, + { + "auxiliary_loss_clip": 0.01113147, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.01711154, + "balance_loss_mlp": 1.03127205, + "epoch": 0.14928603637456786, + "flos": 23184736986240.0, + "grad_norm": 2.387801286820446, + "language_loss": 0.78827822, + "learning_rate": 3.7843228753217726e-06, + "loss": 0.8098048, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.8203125, + "step": 2483, + "time_per_iteration": 2.389840841293335 + }, + { + "auxiliary_loss_clip": 0.01102808, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.01655495, + "balance_loss_mlp": 1.02996683, + "epoch": 0.14934615962723582, + "flos": 21652154634240.0, + "grad_norm": 1.7036932947225292, + "language_loss": 0.70134556, + "learning_rate": 3.784152174832161e-06, + "loss": 0.72272271, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.7265625, + "step": 2484, + "time_per_iteration": 2.3964152336120605 + }, + { + "auxiliary_loss_clip": 0.0110853, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.01690102, + "balance_loss_mlp": 1.02992082, + "epoch": 0.1494062828799038, + "flos": 27009018581760.0, + "grad_norm": 1.9461525369762271, + "language_loss": 0.84372914, + "learning_rate": 3.783981410670747e-06, + "loss": 0.86521399, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.78515625, + "step": 2485, + "time_per_iteration": 2.395015239715576 + }, + { + "auxiliary_loss_clip": 0.01109191, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_clip": 1.01866639, + "balance_loss_mlp": 1.03184628, + "epoch": 0.14946640613257178, + "flos": 21213889937280.0, + "grad_norm": 2.3685290325684805, + "language_loss": 0.85049736, + "learning_rate": 3.7838105828436246e-06, + "loss": 0.87200332, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.7734375, + "step": 2486, + "time_per_iteration": 2.380772590637207 + }, + { + "auxiliary_loss_clip": 0.01102168, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.01717949, + "balance_loss_mlp": 1.0275681, + "epoch": 0.14952652938523975, + "flos": 13370023267200.0, + "grad_norm": 2.4486603015435375, + "language_loss": 0.74685532, + "learning_rate": 3.7836396913568924e-06, + "loss": 0.7682327, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.74609375, + "step": 2487, + "time_per_iteration": 2.348187208175659 + }, + { + "auxiliary_loss_clip": 0.01105893, + "auxiliary_loss_mlp": 0.01045336, + "balance_loss_clip": 1.0232228, + "balance_loss_mlp": 1.0319078, + "epoch": 0.1495866526379077, + "flos": 35516234684160.0, + "grad_norm": 1.970596711140622, + "language_loss": 0.71655691, + "learning_rate": 3.783468736216647e-06, + "loss": 0.73806924, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.7421875, + "step": 2488, + "time_per_iteration": 2.507781982421875 + }, + { + "auxiliary_loss_clip": 0.01108562, + "auxiliary_loss_mlp": 0.01045898, + "balance_loss_clip": 1.02378464, + "balance_loss_mlp": 1.02948689, + "epoch": 0.14964677589057568, + "flos": 17631976066560.0, + "grad_norm": 3.4016547959130485, + "language_loss": 0.70374882, + "learning_rate": 3.78329771742899e-06, + "loss": 0.7252934, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.79296875, + "step": 2489, + "time_per_iteration": 2.3227648735046387 + }, + { + "auxiliary_loss_clip": 0.01107973, + "auxiliary_loss_mlp": 0.01046761, + "balance_loss_clip": 1.02527964, + "balance_loss_mlp": 1.02966833, + "epoch": 0.14970689914324364, + "flos": 20184476232960.0, + "grad_norm": 3.089408897003641, + "language_loss": 0.82155168, + "learning_rate": 3.7831266350000246e-06, + "loss": 0.843099, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.78515625, + "step": 2490, + "time_per_iteration": 2.3514554500579834 + }, + { + "auxiliary_loss_clip": 0.01109634, + "auxiliary_loss_mlp": 0.01042063, + "balance_loss_clip": 1.02196455, + "balance_loss_mlp": 1.03133702, + "epoch": 0.1497670223959116, + "flos": 37227293239680.0, + "grad_norm": 1.8708118573962265, + "language_loss": 0.79381263, + "learning_rate": 3.7829554889358566e-06, + "loss": 0.81532961, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.78515625, + "step": 2491, + "time_per_iteration": 2.5021839141845703 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01040376, + "balance_loss_clip": 1.01766706, + "balance_loss_mlp": 1.02717793, + "epoch": 0.1498271456485796, + "flos": 24454877581440.0, + "grad_norm": 1.7980417862337945, + "language_loss": 0.9063071, + "learning_rate": 3.782784279242593e-06, + "loss": 0.92778003, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.796875, + "step": 2492, + "time_per_iteration": 2.4079651832580566 + }, + { + "auxiliary_loss_clip": 0.01105784, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.02727377, + "balance_loss_mlp": 1.02955317, + "epoch": 0.14988726890124757, + "flos": 16252662049920.0, + "grad_norm": 4.15489378567907, + "language_loss": 0.78564751, + "learning_rate": 3.782613005926345e-06, + "loss": 0.80719274, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.76171875, + "step": 2493, + "time_per_iteration": 2.36659836769104 + }, + { + "auxiliary_loss_clip": 0.01106706, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.02074707, + "balance_loss_mlp": 1.02827418, + "epoch": 0.14994739215391553, + "flos": 20665544123520.0, + "grad_norm": 2.2290031291976846, + "language_loss": 0.80210996, + "learning_rate": 3.7824416689932236e-06, + "loss": 0.82361138, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.78515625, + "step": 2494, + "time_per_iteration": 2.386484146118164 + }, + { + "auxiliary_loss_clip": 0.01104888, + "auxiliary_loss_mlp": 0.01045692, + "balance_loss_clip": 1.02410316, + "balance_loss_mlp": 1.02876639, + "epoch": 0.1500075154065835, + "flos": 70649961845760.0, + "grad_norm": 16.878931362562213, + "language_loss": 0.6622541, + "learning_rate": 3.782270268449345e-06, + "loss": 0.68375999, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.76171875, + "step": 2495, + "time_per_iteration": 2.770857572555542 + }, + { + "auxiliary_loss_clip": 0.01022901, + "auxiliary_loss_mlp": 0.01007046, + "balance_loss_clip": 1.00465024, + "balance_loss_mlp": 1.00302672, + "epoch": 0.15006763865925146, + "flos": 68008955783040.0, + "grad_norm": 0.8867353178954155, + "language_loss": 0.59524918, + "learning_rate": 3.7820988043008242e-06, + "loss": 0.61554861, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.19921875, + "step": 2496, + "time_per_iteration": 3.040221691131592 + }, + { + "auxiliary_loss_clip": 0.01107105, + "auxiliary_loss_mlp": 0.0104946, + "balance_loss_clip": 1.02659595, + "balance_loss_mlp": 1.02788699, + "epoch": 0.15012776191191943, + "flos": 18915278244480.0, + "grad_norm": 1.9285751832320503, + "language_loss": 0.65387583, + "learning_rate": 3.7819272765537817e-06, + "loss": 0.6754415, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.7890625, + "step": 2497, + "time_per_iteration": 2.3551132678985596 + }, + { + "auxiliary_loss_clip": 0.01109416, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.01925039, + "balance_loss_mlp": 1.03077793, + "epoch": 0.1501878851645874, + "flos": 23700054787200.0, + "grad_norm": 1.4946450898699988, + "language_loss": 0.75399512, + "learning_rate": 3.781755685214338e-06, + "loss": 0.77549291, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.78515625, + "step": 2498, + "time_per_iteration": 2.451472759246826 + }, + { + "auxiliary_loss_clip": 0.01110941, + "auxiliary_loss_mlp": 0.01048753, + "balance_loss_clip": 1.02396941, + "balance_loss_mlp": 1.03037, + "epoch": 0.15024800841725539, + "flos": 20411481093120.0, + "grad_norm": 3.5897244410704174, + "language_loss": 0.72171801, + "learning_rate": 3.7815840302886174e-06, + "loss": 0.74331498, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.8046875, + "step": 2499, + "time_per_iteration": 2.3646938800811768 + }, + { + "auxiliary_loss_clip": 0.01106481, + "auxiliary_loss_mlp": 0.01044837, + "balance_loss_clip": 1.02206814, + "balance_loss_mlp": 1.02982831, + "epoch": 0.15030813166992335, + "flos": 31829685868800.0, + "grad_norm": 2.078369693629021, + "language_loss": 0.71735597, + "learning_rate": 3.7814123117827446e-06, + "loss": 0.73886919, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.76953125, + "step": 2500, + "time_per_iteration": 2.465881586074829 + }, + { + "auxiliary_loss_clip": 0.0110495, + "auxiliary_loss_mlp": 0.01051187, + "balance_loss_clip": 1.0273335, + "balance_loss_mlp": 1.02872014, + "epoch": 0.15036825492259132, + "flos": 35656515993600.0, + "grad_norm": 1.7714213938379884, + "language_loss": 0.64922321, + "learning_rate": 3.7812405297028496e-06, + "loss": 0.67078459, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.76171875, + "step": 2501, + "time_per_iteration": 2.4886040687561035 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01047496, + "balance_loss_clip": 1.02550197, + "balance_loss_mlp": 1.02860188, + "epoch": 0.15042837817525928, + "flos": 18837317445120.0, + "grad_norm": 3.2510901341753407, + "language_loss": 0.83437526, + "learning_rate": 3.7810686840550627e-06, + "loss": 0.85587859, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.7421875, + "step": 2502, + "time_per_iteration": 2.3517165184020996 + }, + { + "auxiliary_loss_clip": 0.01101775, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.02318525, + "balance_loss_mlp": 1.02661479, + "epoch": 0.15048850142792725, + "flos": 19534567674240.0, + "grad_norm": 1.8809738964560827, + "language_loss": 0.77416849, + "learning_rate": 3.780896774845515e-06, + "loss": 0.79561961, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.75, + "step": 2503, + "time_per_iteration": 3.7314107418060303 + }, + { + "auxiliary_loss_clip": 0.01104913, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.01751566, + "balance_loss_mlp": 1.02834249, + "epoch": 0.1505486246805952, + "flos": 22016473338240.0, + "grad_norm": 1.886192327546137, + "language_loss": 0.8547039, + "learning_rate": 3.780724802080342e-06, + "loss": 0.8761375, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.765625, + "step": 2504, + "time_per_iteration": 2.382795810699463 + }, + { + "auxiliary_loss_clip": 0.01104583, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.01629364, + "balance_loss_mlp": 1.02975178, + "epoch": 0.15060874793326318, + "flos": 20742038645760.0, + "grad_norm": 1.6319880378542269, + "language_loss": 0.83300298, + "learning_rate": 3.780552765765682e-06, + "loss": 0.85441315, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.75, + "step": 2505, + "time_per_iteration": 2.389429807662964 + }, + { + "auxiliary_loss_clip": 0.01102799, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.02066219, + "balance_loss_mlp": 1.02746129, + "epoch": 0.15066887118593117, + "flos": 16470973981440.0, + "grad_norm": 2.5789060434706066, + "language_loss": 0.75912398, + "learning_rate": 3.7803806659076736e-06, + "loss": 0.78056848, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.75390625, + "step": 2506, + "time_per_iteration": 3.8077516555786133 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01043255, + "balance_loss_clip": 1.02203536, + "balance_loss_mlp": 1.02894378, + "epoch": 0.15072899443859913, + "flos": 19858457157120.0, + "grad_norm": 3.620360501858079, + "language_loss": 0.76408052, + "learning_rate": 3.7802085025124596e-06, + "loss": 0.78558534, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.78125, + "step": 2507, + "time_per_iteration": 3.6984262466430664 + }, + { + "auxiliary_loss_clip": 0.01101382, + "auxiliary_loss_mlp": 0.01037609, + "balance_loss_clip": 1.01672363, + "balance_loss_mlp": 1.02788413, + "epoch": 0.1507891176912671, + "flos": 20775206304000.0, + "grad_norm": 1.8528607674820419, + "language_loss": 0.68635213, + "learning_rate": 3.780036275586183e-06, + "loss": 0.70774198, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.734375, + "step": 2508, + "time_per_iteration": 2.3735949993133545 + }, + { + "auxiliary_loss_clip": 0.0110717, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.01870346, + "balance_loss_mlp": 1.02987397, + "epoch": 0.15084924094393506, + "flos": 23585505016320.0, + "grad_norm": 1.703027506053113, + "language_loss": 0.77397841, + "learning_rate": 3.77986398513499e-06, + "loss": 0.79544818, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.7734375, + "step": 2509, + "time_per_iteration": 2.409031867980957 + }, + { + "auxiliary_loss_clip": 0.01111394, + "auxiliary_loss_mlp": 0.01046605, + "balance_loss_clip": 1.02228665, + "balance_loss_mlp": 1.02996719, + "epoch": 0.15090936419660303, + "flos": 18910460476800.0, + "grad_norm": 2.139432909344303, + "language_loss": 0.79934525, + "learning_rate": 3.7796916311650306e-06, + "loss": 0.82092535, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.8125, + "step": 2510, + "time_per_iteration": 2.332805871963501 + }, + { + "auxiliary_loss_clip": 0.0110901, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_clip": 1.0243609, + "balance_loss_mlp": 1.03031814, + "epoch": 0.150969487449271, + "flos": 17927341102080.0, + "grad_norm": 2.0902943664394393, + "language_loss": 0.73790503, + "learning_rate": 3.779519213682454e-06, + "loss": 0.75946862, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.7890625, + "step": 2511, + "time_per_iteration": 2.363016128540039 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01040421, + "balance_loss_clip": 1.02032197, + "balance_loss_mlp": 1.03014183, + "epoch": 0.151029610701939, + "flos": 24241941999360.0, + "grad_norm": 1.9770655402867399, + "language_loss": 0.69080341, + "learning_rate": 3.7793467326934147e-06, + "loss": 0.71226764, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.7578125, + "step": 2512, + "time_per_iteration": 2.397308349609375 + }, + { + "auxiliary_loss_clip": 0.0110867, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.02091765, + "balance_loss_mlp": 1.0323503, + "epoch": 0.15108973395460695, + "flos": 30261212772480.0, + "grad_norm": 2.6598247798626575, + "language_loss": 0.73773617, + "learning_rate": 3.7791741882040677e-06, + "loss": 0.75924897, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.765625, + "step": 2513, + "time_per_iteration": 2.45658802986145 + }, + { + "auxiliary_loss_clip": 0.01020814, + "auxiliary_loss_mlp": 0.010182, + "balance_loss_clip": 1.01548231, + "balance_loss_mlp": 1.00186658, + "epoch": 0.15114985720727492, + "flos": 60434443319040.0, + "grad_norm": 0.8834424408720046, + "language_loss": 0.64798552, + "learning_rate": 3.7790015802205703e-06, + "loss": 0.66837567, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.18945312, + "step": 2514, + "time_per_iteration": 2.968867778778076 + }, + { + "auxiliary_loss_clip": 0.01103398, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.01708698, + "balance_loss_mlp": 1.02875233, + "epoch": 0.15120998045994288, + "flos": 20520654514560.0, + "grad_norm": 5.903596646191268, + "language_loss": 0.73183751, + "learning_rate": 3.778828908749082e-06, + "loss": 0.75326133, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.74609375, + "step": 2515, + "time_per_iteration": 2.3934273719787598 + }, + { + "auxiliary_loss_clip": 0.0110383, + "auxiliary_loss_mlp": 0.01039183, + "balance_loss_clip": 1.0183332, + "balance_loss_mlp": 1.02974045, + "epoch": 0.15127010371261085, + "flos": 21177824636160.0, + "grad_norm": 1.8364367976316958, + "language_loss": 0.7763139, + "learning_rate": 3.7786561737957664e-06, + "loss": 0.79774404, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7421875, + "step": 2516, + "time_per_iteration": 2.3890445232391357 + }, + { + "auxiliary_loss_clip": 0.01022081, + "auxiliary_loss_mlp": 0.01006885, + "balance_loss_clip": 1.00452423, + "balance_loss_mlp": 1.00267529, + "epoch": 0.1513302269652788, + "flos": 65317500938880.0, + "grad_norm": 0.7261458198856392, + "language_loss": 0.54662186, + "learning_rate": 3.7784833753667867e-06, + "loss": 0.56691152, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.19335938, + "step": 2517, + "time_per_iteration": 3.0439090728759766 + }, + { + "auxiliary_loss_clip": 0.01105508, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_clip": 1.01917112, + "balance_loss_mlp": 1.02838504, + "epoch": 0.15139035021794678, + "flos": 19134812073600.0, + "grad_norm": 2.059296618904718, + "language_loss": 0.78328919, + "learning_rate": 3.7783105134683108e-06, + "loss": 0.80475998, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.7734375, + "step": 2518, + "time_per_iteration": 2.396969795227051 + }, + { + "auxiliary_loss_clip": 0.01109758, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.02384138, + "balance_loss_mlp": 1.03013384, + "epoch": 0.15145047347061477, + "flos": 26577352131840.0, + "grad_norm": 2.0281620108176632, + "language_loss": 0.69986463, + "learning_rate": 3.7781375881065066e-06, + "loss": 0.7214241, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.796875, + "step": 2519, + "time_per_iteration": 2.4198665618896484 + }, + { + "auxiliary_loss_clip": 0.01106441, + "auxiliary_loss_mlp": 0.01041322, + "balance_loss_clip": 1.02094889, + "balance_loss_mlp": 1.02969408, + "epoch": 0.15151059672328274, + "flos": 20301923646720.0, + "grad_norm": 2.499711858783702, + "language_loss": 0.78746629, + "learning_rate": 3.7779645992875453e-06, + "loss": 0.80894399, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.76953125, + "step": 2520, + "time_per_iteration": 2.375433921813965 + }, + { + "auxiliary_loss_clip": 0.0111098, + "auxiliary_loss_mlp": 0.01047152, + "balance_loss_clip": 1.02419233, + "balance_loss_mlp": 1.03058589, + "epoch": 0.1515707199759507, + "flos": 27227330513280.0, + "grad_norm": 1.7823679850582101, + "language_loss": 0.74635911, + "learning_rate": 3.7777915470176013e-06, + "loss": 0.7679404, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.8046875, + "step": 2521, + "time_per_iteration": 2.4408998489379883 + }, + { + "auxiliary_loss_clip": 0.01110329, + "auxiliary_loss_mlp": 0.01043944, + "balance_loss_clip": 1.0210675, + "balance_loss_mlp": 1.03026247, + "epoch": 0.15163084322861867, + "flos": 23586203243520.0, + "grad_norm": 1.9815758785267137, + "language_loss": 0.81626248, + "learning_rate": 3.7776184313028504e-06, + "loss": 0.83780521, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.80078125, + "step": 2522, + "time_per_iteration": 2.3857479095458984 + }, + { + "auxiliary_loss_clip": 0.01027446, + "auxiliary_loss_mlp": 0.01011436, + "balance_loss_clip": 1.00920677, + "balance_loss_mlp": 1.00768375, + "epoch": 0.15169096648128663, + "flos": 66886427882880.0, + "grad_norm": 0.8246270249677684, + "language_loss": 0.57857478, + "learning_rate": 3.7774452521494703e-06, + "loss": 0.59896362, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.19726562, + "step": 2523, + "time_per_iteration": 2.964157819747925 + }, + { + "auxiliary_loss_clip": 0.01105539, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.02152777, + "balance_loss_mlp": 1.02922678, + "epoch": 0.1517510897339546, + "flos": 29094171022080.0, + "grad_norm": 1.7355717654770246, + "language_loss": 0.74829638, + "learning_rate": 3.777272009563641e-06, + "loss": 0.76979029, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.765625, + "step": 2524, + "time_per_iteration": 2.434436559677124 + }, + { + "auxiliary_loss_clip": 0.01103708, + "auxiliary_loss_mlp": 0.01045557, + "balance_loss_clip": 1.02325308, + "balance_loss_mlp": 1.0275718, + "epoch": 0.1518112129866226, + "flos": 18405546261120.0, + "grad_norm": 2.2503896736230757, + "language_loss": 0.74488342, + "learning_rate": 3.7770987035515454e-06, + "loss": 0.76637608, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.76171875, + "step": 2525, + "time_per_iteration": 2.3705458641052246 + }, + { + "auxiliary_loss_clip": 0.01110369, + "auxiliary_loss_mlp": 0.01043974, + "balance_loss_clip": 1.02100289, + "balance_loss_mlp": 1.03120804, + "epoch": 0.15187133623929056, + "flos": 19424451646080.0, + "grad_norm": 1.7240981580721493, + "language_loss": 0.79515433, + "learning_rate": 3.7769253341193677e-06, + "loss": 0.81669778, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.79296875, + "step": 2526, + "time_per_iteration": 2.3717384338378906 + }, + { + "auxiliary_loss_clip": 0.01101619, + "auxiliary_loss_mlp": 0.01044334, + "balance_loss_clip": 1.0238061, + "balance_loss_mlp": 1.0290668, + "epoch": 0.15193145949195852, + "flos": 17565256725120.0, + "grad_norm": 1.7227662650603675, + "language_loss": 0.84842026, + "learning_rate": 3.7767519012732968e-06, + "loss": 0.86987978, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.7265625, + "step": 2527, + "time_per_iteration": 2.3688910007476807 + }, + { + "auxiliary_loss_clip": 0.01107457, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.01920068, + "balance_loss_mlp": 1.0299511, + "epoch": 0.15199158274462649, + "flos": 36174731437440.0, + "grad_norm": 2.117843194012049, + "language_loss": 0.69113839, + "learning_rate": 3.77657840501952e-06, + "loss": 0.71260917, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.77734375, + "step": 2528, + "time_per_iteration": 2.517273187637329 + }, + { + "auxiliary_loss_clip": 0.01107373, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.02377367, + "balance_loss_mlp": 1.03085923, + "epoch": 0.15205170599729445, + "flos": 23072980124160.0, + "grad_norm": 1.8420005400037076, + "language_loss": 0.86785525, + "learning_rate": 3.77640484536423e-06, + "loss": 0.88938451, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.765625, + "step": 2529, + "time_per_iteration": 2.3985602855682373 + }, + { + "auxiliary_loss_clip": 0.01104006, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.0161376, + "balance_loss_mlp": 1.02957439, + "epoch": 0.15211182924996242, + "flos": 21907299916800.0, + "grad_norm": 1.9081949275850647, + "language_loss": 0.83878064, + "learning_rate": 3.7762312223136206e-06, + "loss": 0.86019295, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.74609375, + "step": 2530, + "time_per_iteration": 2.369981050491333 + }, + { + "auxiliary_loss_clip": 0.01107426, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.01987326, + "balance_loss_mlp": 1.03080463, + "epoch": 0.15217195250263038, + "flos": 13880662945920.0, + "grad_norm": 2.195533786066148, + "language_loss": 0.80004138, + "learning_rate": 3.7760575358738885e-06, + "loss": 0.82153153, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.765625, + "step": 2531, + "time_per_iteration": 2.3685219287872314 + }, + { + "auxiliary_loss_clip": 0.011084, + "auxiliary_loss_mlp": 0.01044006, + "balance_loss_clip": 1.02345395, + "balance_loss_mlp": 1.03170586, + "epoch": 0.15223207575529837, + "flos": 24534165012480.0, + "grad_norm": 1.812882691072394, + "language_loss": 0.78945243, + "learning_rate": 3.7758837860512306e-06, + "loss": 0.81097651, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.765625, + "step": 2532, + "time_per_iteration": 2.4467689990997314 + }, + { + "auxiliary_loss_clip": 0.01106842, + "auxiliary_loss_mlp": 0.01041883, + "balance_loss_clip": 1.02085471, + "balance_loss_mlp": 1.03198981, + "epoch": 0.15229219900796634, + "flos": 25555618926720.0, + "grad_norm": 4.311532411440558, + "language_loss": 0.87922168, + "learning_rate": 3.775709972851849e-06, + "loss": 0.90070897, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.75, + "step": 2533, + "time_per_iteration": 2.43026065826416 + }, + { + "auxiliary_loss_clip": 0.01106862, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02447379, + "balance_loss_mlp": 1.03032994, + "epoch": 0.1523523222606343, + "flos": 18216980674560.0, + "grad_norm": 2.273913263189555, + "language_loss": 0.78457522, + "learning_rate": 3.775536096281946e-06, + "loss": 0.80610883, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.765625, + "step": 2534, + "time_per_iteration": 2.351072072982788 + }, + { + "auxiliary_loss_clip": 0.0110932, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.01833248, + "balance_loss_mlp": 1.02856922, + "epoch": 0.15241244551330227, + "flos": 13259278834560.0, + "grad_norm": 3.0967219246986604, + "language_loss": 0.76634681, + "learning_rate": 3.7753621563477268e-06, + "loss": 0.78784585, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.80859375, + "step": 2535, + "time_per_iteration": 2.3607687950134277 + }, + { + "auxiliary_loss_clip": 0.01112375, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.01929736, + "balance_loss_mlp": 1.03086329, + "epoch": 0.15247256876597023, + "flos": 19714649800320.0, + "grad_norm": 1.9995141359856081, + "language_loss": 0.78141522, + "learning_rate": 3.7751881530553993e-06, + "loss": 0.80295384, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.81640625, + "step": 2536, + "time_per_iteration": 2.3622381687164307 + }, + { + "auxiliary_loss_clip": 0.01105197, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02139473, + "balance_loss_mlp": 1.03205538, + "epoch": 0.1525326920186382, + "flos": 20374822298880.0, + "grad_norm": 3.0883739704224134, + "language_loss": 0.76058221, + "learning_rate": 3.775014086411173e-06, + "loss": 0.78205609, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.73046875, + "step": 2537, + "time_per_iteration": 2.370582342147827 + }, + { + "auxiliary_loss_clip": 0.0110693, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_clip": 1.02166939, + "balance_loss_mlp": 1.03100705, + "epoch": 0.15259281527130616, + "flos": 13589103248640.0, + "grad_norm": 2.696911866493913, + "language_loss": 0.77871943, + "learning_rate": 3.7748399564212595e-06, + "loss": 0.80021489, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7578125, + "step": 2538, + "time_per_iteration": 2.358154058456421 + }, + { + "auxiliary_loss_clip": 0.01102209, + "auxiliary_loss_mlp": 0.0103443, + "balance_loss_clip": 1.01554728, + "balance_loss_mlp": 1.02940416, + "epoch": 0.15265293852397416, + "flos": 22859171758080.0, + "grad_norm": 2.018316998131727, + "language_loss": 0.89714652, + "learning_rate": 3.7746657630918735e-06, + "loss": 0.91851294, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.7265625, + "step": 2539, + "time_per_iteration": 2.408341646194458 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.0104616, + "balance_loss_clip": 1.02464235, + "balance_loss_mlp": 1.02910089, + "epoch": 0.15271306177664212, + "flos": 29236931038080.0, + "grad_norm": 2.0070992879303664, + "language_loss": 0.93298948, + "learning_rate": 3.7744915064292313e-06, + "loss": 0.95451379, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7734375, + "step": 2540, + "time_per_iteration": 2.4411098957061768 + }, + { + "auxiliary_loss_clip": 0.01100278, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.01820827, + "balance_loss_mlp": 1.02696204, + "epoch": 0.1527731850293101, + "flos": 31244995463040.0, + "grad_norm": 1.6043658348108991, + "language_loss": 0.75354832, + "learning_rate": 3.7743171864395524e-06, + "loss": 0.77492845, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.734375, + "step": 2541, + "time_per_iteration": 2.463897466659546 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02097011, + "balance_loss_mlp": 1.02875042, + "epoch": 0.15283330828197805, + "flos": 22381001510400.0, + "grad_norm": 1.6350841919306038, + "language_loss": 0.81249166, + "learning_rate": 3.774142803129057e-06, + "loss": 0.83392459, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.73046875, + "step": 2542, + "time_per_iteration": 2.385455369949341 + }, + { + "auxiliary_loss_clip": 0.01106315, + "auxiliary_loss_mlp": 0.01043719, + "balance_loss_clip": 1.02259505, + "balance_loss_mlp": 1.02995694, + "epoch": 0.15289343153464602, + "flos": 25518960132480.0, + "grad_norm": 1.7618678381994837, + "language_loss": 0.77379119, + "learning_rate": 3.7739683565039674e-06, + "loss": 0.79529154, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.765625, + "step": 2543, + "time_per_iteration": 3.8703291416168213 + }, + { + "auxiliary_loss_clip": 0.01103589, + "auxiliary_loss_mlp": 0.01037816, + "balance_loss_clip": 1.01696634, + "balance_loss_mlp": 1.02916515, + "epoch": 0.15295355478731398, + "flos": 22708940711040.0, + "grad_norm": 1.898209789518137, + "language_loss": 0.86182797, + "learning_rate": 3.7737938465705115e-06, + "loss": 0.88324201, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.74609375, + "step": 2544, + "time_per_iteration": 2.3836936950683594 + }, + { + "auxiliary_loss_clip": 0.01105948, + "auxiliary_loss_mlp": 0.01043145, + "balance_loss_clip": 1.02056634, + "balance_loss_mlp": 1.02821922, + "epoch": 0.15301367803998198, + "flos": 23250967568640.0, + "grad_norm": 2.012905755527917, + "language_loss": 0.8258521, + "learning_rate": 3.773619273334916e-06, + "loss": 0.84734297, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.77734375, + "step": 2545, + "time_per_iteration": 3.8111298084259033 + }, + { + "auxiliary_loss_clip": 0.01103459, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.01785302, + "balance_loss_mlp": 1.02929902, + "epoch": 0.15307380129264994, + "flos": 25885059315840.0, + "grad_norm": 2.5491668962390683, + "language_loss": 0.71097058, + "learning_rate": 3.77344463680341e-06, + "loss": 0.73239911, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7421875, + "step": 2546, + "time_per_iteration": 2.455937147140503 + }, + { + "auxiliary_loss_clip": 0.01103855, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.02050447, + "balance_loss_mlp": 1.02750611, + "epoch": 0.1531339245453179, + "flos": 46971482279040.0, + "grad_norm": 1.8089644794002437, + "language_loss": 0.77684152, + "learning_rate": 3.7732699369822276e-06, + "loss": 0.79831123, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.76171875, + "step": 2547, + "time_per_iteration": 3.963167190551758 + }, + { + "auxiliary_loss_clip": 0.01105039, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_clip": 1.02424145, + "balance_loss_mlp": 1.02820408, + "epoch": 0.15319404779798587, + "flos": 35880588299520.0, + "grad_norm": 2.501128504882286, + "language_loss": 0.74221045, + "learning_rate": 3.7730951738776025e-06, + "loss": 0.76373148, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.76953125, + "step": 2548, + "time_per_iteration": 2.5057740211486816 + }, + { + "auxiliary_loss_clip": 0.01107012, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.01805377, + "balance_loss_mlp": 1.02945113, + "epoch": 0.15325417105065384, + "flos": 25663500627840.0, + "grad_norm": 1.3539366184629202, + "language_loss": 0.80474466, + "learning_rate": 3.7729203474957715e-06, + "loss": 0.82621646, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.7734375, + "step": 2549, + "time_per_iteration": 2.4154052734375 + }, + { + "auxiliary_loss_clip": 0.01105433, + "auxiliary_loss_mlp": 0.01038403, + "balance_loss_clip": 1.0184834, + "balance_loss_mlp": 1.02960038, + "epoch": 0.1533142943033218, + "flos": 18769830053760.0, + "grad_norm": 1.7346121941856547, + "language_loss": 0.74965739, + "learning_rate": 3.7727454578429735e-06, + "loss": 0.77109581, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.7578125, + "step": 2550, + "time_per_iteration": 2.369154691696167 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01047447, + "balance_loss_clip": 1.0250597, + "balance_loss_mlp": 1.02971649, + "epoch": 0.15337441755598977, + "flos": 23106392161920.0, + "grad_norm": 2.2196697060420028, + "language_loss": 0.77113855, + "learning_rate": 3.7725705049254507e-06, + "loss": 0.79268789, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.77734375, + "step": 2551, + "time_per_iteration": 2.379380941390991 + }, + { + "auxiliary_loss_clip": 0.01020893, + "auxiliary_loss_mlp": 0.01003217, + "balance_loss_clip": 1.00111914, + "balance_loss_mlp": 1.00219727, + "epoch": 0.15343454080865776, + "flos": 59857712703360.0, + "grad_norm": 0.9447315928812187, + "language_loss": 0.56754923, + "learning_rate": 3.7723954887494457e-06, + "loss": 0.58779031, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.1875, + "step": 2552, + "time_per_iteration": 2.9446704387664795 + }, + { + "auxiliary_loss_clip": 0.01107918, + "auxiliary_loss_mlp": 0.01040425, + "balance_loss_clip": 1.01816869, + "balance_loss_mlp": 1.02918124, + "epoch": 0.15349466406132573, + "flos": 11910095187840.0, + "grad_norm": 2.3160070802126898, + "language_loss": 0.88518476, + "learning_rate": 3.772220409321205e-06, + "loss": 0.90666825, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.7890625, + "step": 2553, + "time_per_iteration": 2.35170841217041 + }, + { + "auxiliary_loss_clip": 0.01108115, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.01832306, + "balance_loss_mlp": 1.02948189, + "epoch": 0.1535547873139937, + "flos": 24095795581440.0, + "grad_norm": 3.7418927186046984, + "language_loss": 0.78330117, + "learning_rate": 3.7720452666469766e-06, + "loss": 0.80478442, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.78515625, + "step": 2554, + "time_per_iteration": 2.4206840991973877 + }, + { + "auxiliary_loss_clip": 0.01111027, + "auxiliary_loss_mlp": 0.01044822, + "balance_loss_clip": 1.02211225, + "balance_loss_mlp": 1.03178763, + "epoch": 0.15361491056666166, + "flos": 17565501104640.0, + "grad_norm": 2.5973429021719974, + "language_loss": 0.77826989, + "learning_rate": 3.7718700607330114e-06, + "loss": 0.79982841, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.79296875, + "step": 2555, + "time_per_iteration": 2.356719970703125 + }, + { + "auxiliary_loss_clip": 0.01102546, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.02055502, + "balance_loss_mlp": 1.02782702, + "epoch": 0.15367503381932962, + "flos": 25044874513920.0, + "grad_norm": 1.6601553386463048, + "language_loss": 0.77098221, + "learning_rate": 3.7716947915855607e-06, + "loss": 0.7924068, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.74609375, + "step": 2556, + "time_per_iteration": 2.433222532272339 + }, + { + "auxiliary_loss_clip": 0.01102502, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.01278019, + "balance_loss_mlp": 1.0288049, + "epoch": 0.15373515707199759, + "flos": 21506252595840.0, + "grad_norm": 1.9197042729823908, + "language_loss": 0.89976764, + "learning_rate": 3.7715194592108805e-06, + "loss": 0.92112815, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.734375, + "step": 2557, + "time_per_iteration": 2.376099109649658 + }, + { + "auxiliary_loss_clip": 0.01105636, + "auxiliary_loss_mlp": 0.01042414, + "balance_loss_clip": 1.01971674, + "balance_loss_mlp": 1.02806902, + "epoch": 0.15379528032466555, + "flos": 25993534510080.0, + "grad_norm": 1.9933591767860965, + "language_loss": 0.71279323, + "learning_rate": 3.7713440636152276e-06, + "loss": 0.73427367, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.7734375, + "step": 2558, + "time_per_iteration": 2.4223055839538574 + }, + { + "auxiliary_loss_clip": 0.0110847, + "auxiliary_loss_mlp": 0.01044002, + "balance_loss_clip": 1.02246046, + "balance_loss_mlp": 1.03012931, + "epoch": 0.15385540357733354, + "flos": 19276420014720.0, + "grad_norm": 2.4142626335273087, + "language_loss": 0.91885328, + "learning_rate": 3.7711686048048613e-06, + "loss": 0.94037807, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.78125, + "step": 2559, + "time_per_iteration": 2.371500253677368 + }, + { + "auxiliary_loss_clip": 0.01106853, + "auxiliary_loss_mlp": 0.01046784, + "balance_loss_clip": 1.02265644, + "balance_loss_mlp": 1.02879667, + "epoch": 0.1539155268300015, + "flos": 28547850067200.0, + "grad_norm": 2.5031426019798815, + "language_loss": 0.63263065, + "learning_rate": 3.7709930827860445e-06, + "loss": 0.65416706, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.78125, + "step": 2560, + "time_per_iteration": 2.436124324798584 + }, + { + "auxiliary_loss_clip": 0.01105785, + "auxiliary_loss_mlp": 0.01048935, + "balance_loss_clip": 1.02601123, + "balance_loss_mlp": 1.02788079, + "epoch": 0.15397565008266947, + "flos": 23546821363200.0, + "grad_norm": 1.8593641817635, + "language_loss": 0.88214654, + "learning_rate": 3.770817497565039e-06, + "loss": 0.90369374, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.78125, + "step": 2561, + "time_per_iteration": 2.4040687084198 + }, + { + "auxiliary_loss_clip": 0.01103428, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.01612425, + "balance_loss_mlp": 1.02934492, + "epoch": 0.15403577333533744, + "flos": 17128842330240.0, + "grad_norm": 1.8613872011541217, + "language_loss": 0.8302772, + "learning_rate": 3.770641849148113e-06, + "loss": 0.85167301, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.7421875, + "step": 2562, + "time_per_iteration": 2.362842559814453 + }, + { + "auxiliary_loss_clip": 0.01111663, + "auxiliary_loss_mlp": 0.01048927, + "balance_loss_clip": 1.024418, + "balance_loss_mlp": 1.03037024, + "epoch": 0.1540958965880054, + "flos": 17893545039360.0, + "grad_norm": 2.6615220319396173, + "language_loss": 0.74593759, + "learning_rate": 3.7704661375415336e-06, + "loss": 0.76754344, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.8125, + "step": 2563, + "time_per_iteration": 2.3713762760162354 + }, + { + "auxiliary_loss_clip": 0.01106939, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.01505017, + "balance_loss_mlp": 1.02760148, + "epoch": 0.15415601984067337, + "flos": 32159684839680.0, + "grad_norm": 2.127160502792017, + "language_loss": 0.7599268, + "learning_rate": 3.770290362751572e-06, + "loss": 0.78137439, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.79296875, + "step": 2564, + "time_per_iteration": 2.4816644191741943 + }, + { + "auxiliary_loss_clip": 0.01103838, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.02382302, + "balance_loss_mlp": 1.02820563, + "epoch": 0.15421614309334136, + "flos": 24023280954240.0, + "grad_norm": 2.3929769772717817, + "language_loss": 0.70904052, + "learning_rate": 3.7701145247845006e-06, + "loss": 0.73052388, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.7578125, + "step": 2565, + "time_per_iteration": 2.4122776985168457 + }, + { + "auxiliary_loss_clip": 0.01102412, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.02420139, + "balance_loss_mlp": 1.02667618, + "epoch": 0.15427626634600933, + "flos": 24385225685760.0, + "grad_norm": 128.62132603744894, + "language_loss": 0.72072661, + "learning_rate": 3.7699386236465954e-06, + "loss": 0.74220693, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.7578125, + "step": 2566, + "time_per_iteration": 2.411787271499634 + }, + { + "auxiliary_loss_clip": 0.01101311, + "auxiliary_loss_mlp": 0.01036585, + "balance_loss_clip": 1.01612926, + "balance_loss_mlp": 1.02717113, + "epoch": 0.1543363895986773, + "flos": 23330394645120.0, + "grad_norm": 1.7865310614845986, + "language_loss": 0.85019439, + "learning_rate": 3.769762659344134e-06, + "loss": 0.87157333, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.7421875, + "step": 2567, + "time_per_iteration": 2.410080671310425 + }, + { + "auxiliary_loss_clip": 0.01107123, + "auxiliary_loss_mlp": 0.01041647, + "balance_loss_clip": 1.02020097, + "balance_loss_mlp": 1.0298934, + "epoch": 0.15439651285134526, + "flos": 24273294266880.0, + "grad_norm": 1.8003980210100428, + "language_loss": 0.78207928, + "learning_rate": 3.7695866318833946e-06, + "loss": 0.80356699, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.7734375, + "step": 2568, + "time_per_iteration": 2.424752950668335 + }, + { + "auxiliary_loss_clip": 0.01105023, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.0147258, + "balance_loss_mlp": 1.02836204, + "epoch": 0.15445663610401322, + "flos": 22455052237440.0, + "grad_norm": 1.8000672053146936, + "language_loss": 0.70164311, + "learning_rate": 3.769410541270661e-06, + "loss": 0.72306484, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.765625, + "step": 2569, + "time_per_iteration": 2.3858604431152344 + }, + { + "auxiliary_loss_clip": 0.01100376, + "auxiliary_loss_mlp": 0.0104229, + "balance_loss_clip": 1.02138114, + "balance_loss_mlp": 1.02779078, + "epoch": 0.1545167593566812, + "flos": 22048558744320.0, + "grad_norm": 1.6523212296947989, + "language_loss": 0.76557863, + "learning_rate": 3.7692343875122167e-06, + "loss": 0.78700531, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7265625, + "step": 2570, + "time_per_iteration": 2.38033390045166 + }, + { + "auxiliary_loss_clip": 0.01103751, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.0186975, + "balance_loss_mlp": 1.02916908, + "epoch": 0.15457688260934915, + "flos": 19317233260800.0, + "grad_norm": 2.519584689727318, + "language_loss": 0.77578133, + "learning_rate": 3.769058170614348e-06, + "loss": 0.79722065, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.74609375, + "step": 2571, + "time_per_iteration": 2.377727508544922 + }, + { + "auxiliary_loss_clip": 0.01104421, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02193546, + "balance_loss_mlp": 1.02845848, + "epoch": 0.15463700586201715, + "flos": 24132838400640.0, + "grad_norm": 2.585517759659818, + "language_loss": 0.82445037, + "learning_rate": 3.768881890583344e-06, + "loss": 0.84592372, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.7578125, + "step": 2572, + "time_per_iteration": 2.402364730834961 + }, + { + "auxiliary_loss_clip": 0.01107173, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.01581907, + "balance_loss_mlp": 1.02824247, + "epoch": 0.1546971291146851, + "flos": 22419789897600.0, + "grad_norm": 1.5434582100933483, + "language_loss": 0.90369272, + "learning_rate": 3.7687055474254946e-06, + "loss": 0.92513418, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.7890625, + "step": 2573, + "time_per_iteration": 2.402705669403076 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.02113163, + "balance_loss_mlp": 1.02890539, + "epoch": 0.15475725236735308, + "flos": 17529261246720.0, + "grad_norm": 1.746927606225564, + "language_loss": 0.70083201, + "learning_rate": 3.7685291411470946e-06, + "loss": 0.72231728, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.77734375, + "step": 2574, + "time_per_iteration": 2.3665170669555664 + }, + { + "auxiliary_loss_clip": 0.01105633, + "auxiliary_loss_mlp": 0.01043801, + "balance_loss_clip": 1.02161622, + "balance_loss_mlp": 1.02813995, + "epoch": 0.15481737562002104, + "flos": 22560734522880.0, + "grad_norm": 1.749430047223734, + "language_loss": 0.82673597, + "learning_rate": 3.768352671754439e-06, + "loss": 0.84823036, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.7734375, + "step": 2575, + "time_per_iteration": 2.394659996032715 + }, + { + "auxiliary_loss_clip": 0.01105269, + "auxiliary_loss_mlp": 0.01038444, + "balance_loss_clip": 1.0179522, + "balance_loss_mlp": 1.02768183, + "epoch": 0.154877498872689, + "flos": 24899391411840.0, + "grad_norm": 2.0641206323001935, + "language_loss": 0.85018152, + "learning_rate": 3.7681761392538246e-06, + "loss": 0.87161869, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.7734375, + "step": 2576, + "time_per_iteration": 2.4128005504608154 + }, + { + "auxiliary_loss_clip": 0.01102041, + "auxiliary_loss_mlp": 0.01040264, + "balance_loss_clip": 1.019629, + "balance_loss_mlp": 1.026214, + "epoch": 0.15493762212535697, + "flos": 28146244164480.0, + "grad_norm": 1.7035648936845607, + "language_loss": 0.82154602, + "learning_rate": 3.7679995436515525e-06, + "loss": 0.84296906, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.7578125, + "step": 2577, + "time_per_iteration": 2.43742036819458 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.01047088, + "balance_loss_clip": 1.02558279, + "balance_loss_mlp": 1.03000402, + "epoch": 0.15499774537802496, + "flos": 25409891445120.0, + "grad_norm": 2.681903882832599, + "language_loss": 0.76673013, + "learning_rate": 3.7678228849539244e-06, + "loss": 0.78827906, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.78125, + "step": 2578, + "time_per_iteration": 2.4083006381988525 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.01773357, + "balance_loss_mlp": 1.02976203, + "epoch": 0.15505786863069293, + "flos": 22090454242560.0, + "grad_norm": 2.1068175774035662, + "language_loss": 0.82854289, + "learning_rate": 3.767646163167245e-06, + "loss": 0.85001969, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.76953125, + "step": 2579, + "time_per_iteration": 2.4026007652282715 + }, + { + "auxiliary_loss_clip": 0.01104262, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.02056837, + "balance_loss_mlp": 1.03090668, + "epoch": 0.1551179918833609, + "flos": 18916116117120.0, + "grad_norm": 2.178197886575931, + "language_loss": 0.80735964, + "learning_rate": 3.7674693782978206e-06, + "loss": 0.82880276, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.734375, + "step": 2580, + "time_per_iteration": 2.367403268814087 + }, + { + "auxiliary_loss_clip": 0.01024101, + "auxiliary_loss_mlp": 0.01002831, + "balance_loss_clip": 1.00063789, + "balance_loss_mlp": 1.00525832, + "epoch": 0.15517811513602886, + "flos": 66235821096960.0, + "grad_norm": 0.8381763744793062, + "language_loss": 0.58836788, + "learning_rate": 3.7672925303519605e-06, + "loss": 0.60863721, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.18847656, + "step": 2581, + "time_per_iteration": 3.164335250854492 + }, + { + "auxiliary_loss_clip": 0.01108483, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.01849318, + "balance_loss_mlp": 1.02883208, + "epoch": 0.15523823838869683, + "flos": 24020034197760.0, + "grad_norm": 2.419429780114484, + "language_loss": 0.8530618, + "learning_rate": 3.7671156193359764e-06, + "loss": 0.87454134, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.796875, + "step": 2582, + "time_per_iteration": 3.7996675968170166 + }, + { + "auxiliary_loss_clip": 0.01103487, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.02330661, + "balance_loss_mlp": 1.02763653, + "epoch": 0.1552983616413648, + "flos": 20484030631680.0, + "grad_norm": 2.3637131348154115, + "language_loss": 0.78676498, + "learning_rate": 3.766938645256182e-06, + "loss": 0.80824739, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7578125, + "step": 2583, + "time_per_iteration": 2.4146738052368164 + }, + { + "auxiliary_loss_clip": 0.01103412, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.02308095, + "balance_loss_mlp": 1.02806306, + "epoch": 0.15535848489403276, + "flos": 32122362729600.0, + "grad_norm": 1.787448485889266, + "language_loss": 0.76157773, + "learning_rate": 3.766761608118892e-06, + "loss": 0.78304148, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.75390625, + "step": 2584, + "time_per_iteration": 2.497995376586914 + }, + { + "auxiliary_loss_clip": 0.01102306, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.01690185, + "balance_loss_mlp": 1.02709401, + "epoch": 0.15541860814670075, + "flos": 19097455052160.0, + "grad_norm": 2.1192931896292055, + "language_loss": 0.75837165, + "learning_rate": 3.766584507930424e-06, + "loss": 0.77978659, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.75, + "step": 2585, + "time_per_iteration": 5.134189605712891 + }, + { + "auxiliary_loss_clip": 0.01100754, + "auxiliary_loss_mlp": 0.01039866, + "balance_loss_clip": 1.01944494, + "balance_loss_mlp": 1.02741408, + "epoch": 0.1554787313993687, + "flos": 19171086842880.0, + "grad_norm": 2.61309060370953, + "language_loss": 0.61490977, + "learning_rate": 3.7664073446971e-06, + "loss": 0.63631594, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.734375, + "step": 2586, + "time_per_iteration": 3.731452465057373 + }, + { + "auxiliary_loss_clip": 0.011037, + "auxiliary_loss_mlp": 0.0103897, + "balance_loss_clip": 1.01838279, + "balance_loss_mlp": 1.02729452, + "epoch": 0.15553885465203668, + "flos": 16142895135360.0, + "grad_norm": 1.5537971677562452, + "language_loss": 0.80979955, + "learning_rate": 3.7662301184252413e-06, + "loss": 0.83122623, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.76171875, + "step": 2587, + "time_per_iteration": 2.4011423587799072 + }, + { + "auxiliary_loss_clip": 0.01106464, + "auxiliary_loss_mlp": 0.01045561, + "balance_loss_clip": 1.02371013, + "balance_loss_mlp": 1.02809954, + "epoch": 0.15559897790470464, + "flos": 25336608768000.0, + "grad_norm": 1.796175972885769, + "language_loss": 0.88489425, + "learning_rate": 3.766052829121173e-06, + "loss": 0.90641451, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.78125, + "step": 2588, + "time_per_iteration": 2.447763442993164 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.02462912, + "balance_loss_mlp": 1.0295099, + "epoch": 0.1556591011573726, + "flos": 23147659255680.0, + "grad_norm": 2.333558389343231, + "language_loss": 0.64972603, + "learning_rate": 3.765875476791222e-06, + "loss": 0.67124742, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.75390625, + "step": 2589, + "time_per_iteration": 2.421673536300659 + }, + { + "auxiliary_loss_clip": 0.01104252, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.01425052, + "balance_loss_mlp": 1.02719223, + "epoch": 0.15571922441004057, + "flos": 25369811337600.0, + "grad_norm": 1.7241537957140867, + "language_loss": 0.82660699, + "learning_rate": 3.765698061441718e-06, + "loss": 0.84801698, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.76953125, + "step": 2590, + "time_per_iteration": 2.4393231868743896 + }, + { + "auxiliary_loss_clip": 0.01103367, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.01496434, + "balance_loss_mlp": 1.02695584, + "epoch": 0.15577934766270854, + "flos": 14500510957440.0, + "grad_norm": 1.987275243360299, + "language_loss": 0.79317725, + "learning_rate": 3.7655205830789918e-06, + "loss": 0.81457454, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.765625, + "step": 2591, + "time_per_iteration": 2.373990774154663 + }, + { + "auxiliary_loss_clip": 0.01102398, + "auxiliary_loss_mlp": 0.01047158, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.02724028, + "epoch": 0.15583947091537653, + "flos": 37413031006080.0, + "grad_norm": 3.0056621303756965, + "language_loss": 0.64931399, + "learning_rate": 3.7653430417093777e-06, + "loss": 0.67080957, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.75390625, + "step": 2592, + "time_per_iteration": 2.553753614425659 + }, + { + "auxiliary_loss_clip": 0.01107732, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.01964641, + "balance_loss_mlp": 1.02997577, + "epoch": 0.1558995941680445, + "flos": 21833668126080.0, + "grad_norm": 1.990849898249795, + "language_loss": 0.81727475, + "learning_rate": 3.765165437339211e-06, + "loss": 0.83877718, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.77734375, + "step": 2593, + "time_per_iteration": 2.397907018661499 + }, + { + "auxiliary_loss_clip": 0.01100068, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.01808691, + "balance_loss_mlp": 1.02801895, + "epoch": 0.15595971742071246, + "flos": 19791598170240.0, + "grad_norm": 2.0859177285149797, + "language_loss": 0.73165357, + "learning_rate": 3.764987769974831e-06, + "loss": 0.75304604, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.71875, + "step": 2594, + "time_per_iteration": 2.378899335861206 + }, + { + "auxiliary_loss_clip": 0.01098355, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.01486969, + "balance_loss_mlp": 1.02698159, + "epoch": 0.15601984067338043, + "flos": 26720984931840.0, + "grad_norm": 3.6423690182773587, + "language_loss": 0.81098974, + "learning_rate": 3.764810039622577e-06, + "loss": 0.83232594, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.71484375, + "step": 2595, + "time_per_iteration": 2.4521374702453613 + }, + { + "auxiliary_loss_clip": 0.01100865, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.01794171, + "balance_loss_mlp": 1.02678692, + "epoch": 0.1560799639260484, + "flos": 18368293973760.0, + "grad_norm": 1.9787245185617681, + "language_loss": 0.86365926, + "learning_rate": 3.7646322462887927e-06, + "loss": 0.88505626, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7421875, + "step": 2596, + "time_per_iteration": 2.3852343559265137 + }, + { + "auxiliary_loss_clip": 0.01099539, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.01727879, + "balance_loss_mlp": 1.02835238, + "epoch": 0.15614008717871636, + "flos": 22597951898880.0, + "grad_norm": 1.657866688818024, + "language_loss": 0.68346548, + "learning_rate": 3.764454389979822e-06, + "loss": 0.70483381, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.7109375, + "step": 2597, + "time_per_iteration": 2.417179822921753 + }, + { + "auxiliary_loss_clip": 0.01097967, + "auxiliary_loss_mlp": 0.01040386, + "balance_loss_clip": 1.02059722, + "balance_loss_mlp": 1.02725589, + "epoch": 0.15620021043138435, + "flos": 22745774062080.0, + "grad_norm": 1.783937869785717, + "language_loss": 0.79627144, + "learning_rate": 3.7642764707020134e-06, + "loss": 0.81765497, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.70703125, + "step": 2598, + "time_per_iteration": 2.415555238723755 + }, + { + "auxiliary_loss_clip": 0.01096528, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.01294494, + "balance_loss_mlp": 1.02566028, + "epoch": 0.15626033368405232, + "flos": 13114109934720.0, + "grad_norm": 2.19423517852279, + "language_loss": 0.82752991, + "learning_rate": 3.764098488461716e-06, + "loss": 0.84882104, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.7109375, + "step": 2599, + "time_per_iteration": 2.381692409515381 + }, + { + "auxiliary_loss_clip": 0.01107132, + "auxiliary_loss_mlp": 0.01037977, + "balance_loss_clip": 1.01541018, + "balance_loss_mlp": 1.02894783, + "epoch": 0.15632045693672028, + "flos": 16471358006400.0, + "grad_norm": 2.8044434070259467, + "language_loss": 0.81662029, + "learning_rate": 3.7639204432652808e-06, + "loss": 0.83807135, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.78125, + "step": 2600, + "time_per_iteration": 2.3765108585357666 + }, + { + "auxiliary_loss_clip": 0.01106237, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.01822448, + "balance_loss_mlp": 1.02960777, + "epoch": 0.15638058018938825, + "flos": 20849291942400.0, + "grad_norm": 1.8183566185622821, + "language_loss": 0.884462, + "learning_rate": 3.7637423351190628e-06, + "loss": 0.9059099, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.765625, + "step": 2601, + "time_per_iteration": 2.4123878479003906 + }, + { + "auxiliary_loss_clip": 0.01107621, + "auxiliary_loss_mlp": 0.01056924, + "balance_loss_clip": 1.03384519, + "balance_loss_mlp": 1.03044391, + "epoch": 0.1564407034420562, + "flos": 21871129881600.0, + "grad_norm": 1.6946203892584524, + "language_loss": 0.78171384, + "learning_rate": 3.7635641640294177e-06, + "loss": 0.80335927, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.7734375, + "step": 2602, + "time_per_iteration": 2.4028422832489014 + }, + { + "auxiliary_loss_clip": 0.01101798, + "auxiliary_loss_mlp": 0.01039578, + "balance_loss_clip": 1.01946712, + "balance_loss_mlp": 1.02789998, + "epoch": 0.15650082669472418, + "flos": 21833493569280.0, + "grad_norm": 3.598417753453261, + "language_loss": 0.73629385, + "learning_rate": 3.7633859300027036e-06, + "loss": 0.7577076, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.73828125, + "step": 2603, + "time_per_iteration": 2.4165844917297363 + }, + { + "auxiliary_loss_clip": 0.01103958, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.02513182, + "balance_loss_mlp": 1.02844143, + "epoch": 0.15656094994739214, + "flos": 13799909237760.0, + "grad_norm": 2.672069342832569, + "language_loss": 0.87356353, + "learning_rate": 3.7632076330452823e-06, + "loss": 0.89506054, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.75390625, + "step": 2604, + "time_per_iteration": 2.3788928985595703 + }, + { + "auxiliary_loss_clip": 0.01102621, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.02092218, + "balance_loss_mlp": 1.0275898, + "epoch": 0.15662107320006013, + "flos": 27306967057920.0, + "grad_norm": 1.9506636409082554, + "language_loss": 0.85097289, + "learning_rate": 3.7630292731635155e-06, + "loss": 0.87241483, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.75, + "step": 2605, + "time_per_iteration": 2.4615745544433594 + }, + { + "auxiliary_loss_clip": 0.01107971, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.02082968, + "balance_loss_mlp": 1.02778459, + "epoch": 0.1566811964527281, + "flos": 26683942112640.0, + "grad_norm": 2.1938220667631048, + "language_loss": 0.73083031, + "learning_rate": 3.762850850363769e-06, + "loss": 0.75233209, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.80078125, + "step": 2606, + "time_per_iteration": 2.446239471435547 + }, + { + "auxiliary_loss_clip": 0.01104533, + "auxiliary_loss_mlp": 0.0103651, + "balance_loss_clip": 1.01601839, + "balance_loss_mlp": 1.0298152, + "epoch": 0.15674131970539606, + "flos": 16102605559680.0, + "grad_norm": 2.216983009685828, + "language_loss": 0.76660913, + "learning_rate": 3.7626723646524107e-06, + "loss": 0.78801954, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.74609375, + "step": 2607, + "time_per_iteration": 2.374535083770752 + }, + { + "auxiliary_loss_clip": 0.01101664, + "auxiliary_loss_mlp": 0.01040982, + "balance_loss_clip": 1.02083576, + "balance_loss_mlp": 1.02859378, + "epoch": 0.15680144295806403, + "flos": 19168747781760.0, + "grad_norm": 2.121054199236041, + "language_loss": 0.81724632, + "learning_rate": 3.7624938160358096e-06, + "loss": 0.83867276, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.73046875, + "step": 2608, + "time_per_iteration": 2.399024486541748 + }, + { + "auxiliary_loss_clip": 0.01107222, + "auxiliary_loss_mlp": 0.01045988, + "balance_loss_clip": 1.02292061, + "balance_loss_mlp": 1.02944684, + "epoch": 0.156861566210732, + "flos": 20812388768640.0, + "grad_norm": 2.286626616381914, + "language_loss": 0.72848833, + "learning_rate": 3.762315204520338e-06, + "loss": 0.75002038, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.77734375, + "step": 2609, + "time_per_iteration": 2.435127019882202 + }, + { + "auxiliary_loss_clip": 0.01103131, + "auxiliary_loss_mlp": 0.01038042, + "balance_loss_clip": 1.01733494, + "balance_loss_mlp": 1.02717757, + "epoch": 0.15692168946339996, + "flos": 20046883098240.0, + "grad_norm": 2.2163270139322533, + "language_loss": 0.71791583, + "learning_rate": 3.7621365301123696e-06, + "loss": 0.73932755, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.7578125, + "step": 2610, + "time_per_iteration": 2.417283296585083 + }, + { + "auxiliary_loss_clip": 0.0110366, + "auxiliary_loss_mlp": 0.01042168, + "balance_loss_clip": 1.02022147, + "balance_loss_mlp": 1.02626252, + "epoch": 0.15698181271606793, + "flos": 21396939528960.0, + "grad_norm": 1.6604171307479039, + "language_loss": 0.72618192, + "learning_rate": 3.7619577928182816e-06, + "loss": 0.74764025, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.7734375, + "step": 2611, + "time_per_iteration": 2.42850399017334 + }, + { + "auxiliary_loss_clip": 0.01102773, + "auxiliary_loss_mlp": 0.0104017, + "balance_loss_clip": 1.02003598, + "balance_loss_mlp": 1.02798891, + "epoch": 0.15704193596873592, + "flos": 20844858199680.0, + "grad_norm": 2.1770139587214623, + "language_loss": 0.70722824, + "learning_rate": 3.7617789926444525e-06, + "loss": 0.72865766, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.74609375, + "step": 2612, + "time_per_iteration": 2.4021122455596924 + }, + { + "auxiliary_loss_clip": 0.01105137, + "auxiliary_loss_mlp": 0.01046943, + "balance_loss_clip": 1.02634406, + "balance_loss_mlp": 1.02847147, + "epoch": 0.15710205922140388, + "flos": 21761816814720.0, + "grad_norm": 1.971352266797598, + "language_loss": 0.75976723, + "learning_rate": 3.761600129597262e-06, + "loss": 0.78128803, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.765625, + "step": 2613, + "time_per_iteration": 2.4084362983703613 + }, + { + "auxiliary_loss_clip": 0.01103495, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_clip": 1.02672625, + "balance_loss_mlp": 1.02705002, + "epoch": 0.15716218247407185, + "flos": 25006644708480.0, + "grad_norm": 1.6309071429618132, + "language_loss": 0.65967524, + "learning_rate": 3.761421203683095e-06, + "loss": 0.68119335, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.765625, + "step": 2614, + "time_per_iteration": 2.4340224266052246 + }, + { + "auxiliary_loss_clip": 0.01106212, + "auxiliary_loss_mlp": 0.01040308, + "balance_loss_clip": 1.01833797, + "balance_loss_mlp": 1.02878881, + "epoch": 0.1572223057267398, + "flos": 20190795189120.0, + "grad_norm": 2.356254018131287, + "language_loss": 0.74882823, + "learning_rate": 3.7612422149083362e-06, + "loss": 0.77029347, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.7734375, + "step": 2615, + "time_per_iteration": 2.355894088745117 + }, + { + "auxiliary_loss_clip": 0.01100076, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02350664, + "balance_loss_mlp": 1.02799809, + "epoch": 0.15728242897940778, + "flos": 20958465363840.0, + "grad_norm": 2.0172598137008455, + "language_loss": 0.73029327, + "learning_rate": 3.761063163279373e-06, + "loss": 0.75172544, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.72265625, + "step": 2616, + "time_per_iteration": 2.3996548652648926 + }, + { + "auxiliary_loss_clip": 0.01103443, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02121794, + "balance_loss_mlp": 1.02759457, + "epoch": 0.15734255223207574, + "flos": 23037194113920.0, + "grad_norm": 1.952295697980959, + "language_loss": 0.72702718, + "learning_rate": 3.7608840488025955e-06, + "loss": 0.748487, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.7578125, + "step": 2617, + "time_per_iteration": 2.3851678371429443 + }, + { + "auxiliary_loss_clip": 0.01102768, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.01672435, + "balance_loss_mlp": 1.02899408, + "epoch": 0.15740267548474374, + "flos": 20550435770880.0, + "grad_norm": 2.7305945018535875, + "language_loss": 0.74240804, + "learning_rate": 3.760704871484396e-06, + "loss": 0.76380801, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.734375, + "step": 2618, + "time_per_iteration": 2.3909361362457275 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01043279, + "balance_loss_clip": 1.01898444, + "balance_loss_mlp": 1.02754009, + "epoch": 0.1574627987374117, + "flos": 22666032783360.0, + "grad_norm": 1.918600653750494, + "language_loss": 0.78889054, + "learning_rate": 3.7605256313311684e-06, + "loss": 0.8103891, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.7890625, + "step": 2619, + "time_per_iteration": 2.376849889755249 + }, + { + "auxiliary_loss_clip": 0.01100153, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.01598394, + "balance_loss_mlp": 1.02774501, + "epoch": 0.15752292199007967, + "flos": 16799716143360.0, + "grad_norm": 1.9631199295647428, + "language_loss": 0.76334906, + "learning_rate": 3.7603463283493093e-06, + "loss": 0.78470463, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.72265625, + "step": 2620, + "time_per_iteration": 2.378913640975952 + }, + { + "auxiliary_loss_clip": 0.01105, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.01451433, + "balance_loss_mlp": 1.02806258, + "epoch": 0.15758304524274763, + "flos": 29824693643520.0, + "grad_norm": 1.6950638831849694, + "language_loss": 0.71077681, + "learning_rate": 3.760166962545219e-06, + "loss": 0.7321943, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.76953125, + "step": 2621, + "time_per_iteration": 2.445683002471924 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.01920557, + "balance_loss_mlp": 1.02917194, + "epoch": 0.1576431684954156, + "flos": 53575478369280.0, + "grad_norm": 2.043786453576383, + "language_loss": 0.72216332, + "learning_rate": 3.7599875339252962e-06, + "loss": 0.74362087, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.76171875, + "step": 2622, + "time_per_iteration": 4.0393757820129395 + }, + { + "auxiliary_loss_clip": 0.01102739, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.01675987, + "balance_loss_mlp": 1.0277462, + "epoch": 0.15770329174808356, + "flos": 20812563325440.0, + "grad_norm": 1.728780941941876, + "language_loss": 0.87419021, + "learning_rate": 3.759808042495947e-06, + "loss": 0.89558357, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.75, + "step": 2623, + "time_per_iteration": 2.4093070030212402 + }, + { + "auxiliary_loss_clip": 0.01103387, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.01710296, + "balance_loss_mlp": 1.02887702, + "epoch": 0.15776341500075153, + "flos": 24972813734400.0, + "grad_norm": 1.685512866388488, + "language_loss": 0.81717169, + "learning_rate": 3.7596284882635746e-06, + "loss": 0.83857095, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.74609375, + "step": 2624, + "time_per_iteration": 5.201894044876099 + }, + { + "auxiliary_loss_clip": 0.01103976, + "auxiliary_loss_mlp": 0.01039816, + "balance_loss_clip": 1.01782215, + "balance_loss_mlp": 1.02759087, + "epoch": 0.15782353825341952, + "flos": 21906846069120.0, + "grad_norm": 2.6155308847246554, + "language_loss": 0.7979489, + "learning_rate": 3.7594488712345878e-06, + "loss": 0.81938678, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.765625, + "step": 2625, + "time_per_iteration": 2.397899866104126 + }, + { + "auxiliary_loss_clip": 0.0110183, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.01994491, + "balance_loss_mlp": 1.0284586, + "epoch": 0.15788366150608749, + "flos": 26175990608640.0, + "grad_norm": 3.0363684040067476, + "language_loss": 0.80167592, + "learning_rate": 3.7592691914153967e-06, + "loss": 0.82309252, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.734375, + "step": 2626, + "time_per_iteration": 3.8083722591400146 + }, + { + "auxiliary_loss_clip": 0.01104302, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.01789784, + "balance_loss_mlp": 1.03113103, + "epoch": 0.15794378475875545, + "flos": 27708572960640.0, + "grad_norm": 1.8168098782618698, + "language_loss": 0.73536825, + "learning_rate": 3.7590894488124134e-06, + "loss": 0.75679517, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.73046875, + "step": 2627, + "time_per_iteration": 2.457184076309204 + }, + { + "auxiliary_loss_clip": 0.01102751, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_clip": 1.02078581, + "balance_loss_mlp": 1.02858937, + "epoch": 0.15800390801142342, + "flos": 12129349726080.0, + "grad_norm": 2.1391588192881947, + "language_loss": 0.85239929, + "learning_rate": 3.7589096434320534e-06, + "loss": 0.87384582, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.7421875, + "step": 2628, + "time_per_iteration": 2.3664088249206543 + }, + { + "auxiliary_loss_clip": 0.01099321, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.01702857, + "balance_loss_mlp": 1.0267477, + "epoch": 0.15806403126409138, + "flos": 20703669194880.0, + "grad_norm": 1.8327200788202531, + "language_loss": 0.76718879, + "learning_rate": 3.7587297752807315e-06, + "loss": 0.78854489, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.7265625, + "step": 2629, + "time_per_iteration": 2.407407283782959 + }, + { + "auxiliary_loss_clip": 0.01103757, + "auxiliary_loss_mlp": 0.01045324, + "balance_loss_clip": 1.02343762, + "balance_loss_mlp": 1.02720749, + "epoch": 0.15812415451675935, + "flos": 17820751121280.0, + "grad_norm": 2.4884600869974265, + "language_loss": 0.73892325, + "learning_rate": 3.758549844364869e-06, + "loss": 0.76041412, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.765625, + "step": 2630, + "time_per_iteration": 2.4013473987579346 + }, + { + "auxiliary_loss_clip": 0.0110509, + "auxiliary_loss_mlp": 0.01041841, + "balance_loss_clip": 1.0194056, + "balance_loss_mlp": 1.02786446, + "epoch": 0.15818427776942734, + "flos": 20083018222080.0, + "grad_norm": 5.646154236075837, + "language_loss": 0.83460271, + "learning_rate": 3.7583698506908854e-06, + "loss": 0.85607207, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.76953125, + "step": 2631, + "time_per_iteration": 2.435514450073242 + }, + { + "auxiliary_loss_clip": 0.01101833, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.01524854, + "balance_loss_mlp": 1.02746201, + "epoch": 0.1582444010220953, + "flos": 21213855025920.0, + "grad_norm": 1.702038878764565, + "language_loss": 0.78231049, + "learning_rate": 3.7581897942652046e-06, + "loss": 0.80369091, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7421875, + "step": 2632, + "time_per_iteration": 2.405518054962158 + }, + { + "auxiliary_loss_clip": 0.01104133, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_clip": 1.0299412, + "balance_loss_mlp": 1.0280633, + "epoch": 0.15830452427476327, + "flos": 17857375004160.0, + "grad_norm": 2.156080894809283, + "language_loss": 0.83225524, + "learning_rate": 3.7580096750942535e-06, + "loss": 0.85380399, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.76171875, + "step": 2633, + "time_per_iteration": 2.364070177078247 + }, + { + "auxiliary_loss_clip": 0.01104832, + "auxiliary_loss_mlp": 0.01040668, + "balance_loss_clip": 1.02018774, + "balance_loss_mlp": 1.02880466, + "epoch": 0.15836464752743123, + "flos": 24533815898880.0, + "grad_norm": 1.6509486986117194, + "language_loss": 0.77444232, + "learning_rate": 3.7578294931844584e-06, + "loss": 0.79589731, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.7578125, + "step": 2634, + "time_per_iteration": 2.4315414428710938 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.01715338, + "balance_loss_mlp": 1.02793598, + "epoch": 0.1584247707800992, + "flos": 20119781750400.0, + "grad_norm": 3.32648958753033, + "language_loss": 0.88971549, + "learning_rate": 3.757649248542251e-06, + "loss": 0.91114974, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.76953125, + "step": 2635, + "time_per_iteration": 2.402858018875122 + }, + { + "auxiliary_loss_clip": 0.01104143, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02177715, + "balance_loss_mlp": 1.02616, + "epoch": 0.15848489403276717, + "flos": 20374927032960.0, + "grad_norm": 2.140323991383923, + "language_loss": 0.75747037, + "learning_rate": 3.757468941174063e-06, + "loss": 0.77894843, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.78125, + "step": 2636, + "time_per_iteration": 2.39902400970459 + }, + { + "auxiliary_loss_clip": 0.01107464, + "auxiliary_loss_mlp": 0.01042391, + "balance_loss_clip": 1.02092123, + "balance_loss_mlp": 1.02948594, + "epoch": 0.15854501728543513, + "flos": 39345368958720.0, + "grad_norm": 2.2281739836646084, + "language_loss": 0.71450502, + "learning_rate": 3.7572885710863293e-06, + "loss": 0.73600358, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.78125, + "step": 2637, + "time_per_iteration": 2.563585042953491 + }, + { + "auxiliary_loss_clip": 0.01102063, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.01518965, + "balance_loss_mlp": 1.02702391, + "epoch": 0.15860514053810312, + "flos": 24863046819840.0, + "grad_norm": 1.9987676543931971, + "language_loss": 0.77517295, + "learning_rate": 3.7571081382854866e-06, + "loss": 0.79654485, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.75, + "step": 2638, + "time_per_iteration": 2.4379286766052246 + }, + { + "auxiliary_loss_clip": 0.01105014, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_clip": 1.0185771, + "balance_loss_mlp": 1.02786207, + "epoch": 0.1586652637907711, + "flos": 26176479367680.0, + "grad_norm": 1.7938691790713672, + "language_loss": 0.75311208, + "learning_rate": 3.756927642777974e-06, + "loss": 0.77457821, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.7734375, + "step": 2639, + "time_per_iteration": 2.448011636734009 + }, + { + "auxiliary_loss_clip": 0.01106245, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.02793968, + "balance_loss_mlp": 1.0296948, + "epoch": 0.15872538704343905, + "flos": 19791039588480.0, + "grad_norm": 1.8529297947319283, + "language_loss": 0.81090569, + "learning_rate": 3.7567470845702337e-06, + "loss": 0.83246183, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.765625, + "step": 2640, + "time_per_iteration": 2.4345805644989014 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.0104355, + "balance_loss_clip": 1.02324867, + "balance_loss_mlp": 1.02789259, + "epoch": 0.15878551029610702, + "flos": 28474113542400.0, + "grad_norm": 2.238978024847191, + "language_loss": 0.66688108, + "learning_rate": 3.756566463668709e-06, + "loss": 0.68834043, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.74609375, + "step": 2641, + "time_per_iteration": 2.429933547973633 + }, + { + "auxiliary_loss_clip": 0.01110032, + "auxiliary_loss_mlp": 0.01047067, + "balance_loss_clip": 1.02529955, + "balance_loss_mlp": 1.02991748, + "epoch": 0.15884563354877498, + "flos": 24205562496000.0, + "grad_norm": 2.0593057564250232, + "language_loss": 0.75106114, + "learning_rate": 3.756385780079845e-06, + "loss": 0.77263212, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.80078125, + "step": 2642, + "time_per_iteration": 2.417956590652466 + }, + { + "auxiliary_loss_clip": 0.01099275, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_clip": 1.02241588, + "balance_loss_mlp": 1.02721715, + "epoch": 0.15890575680144295, + "flos": 23948706556800.0, + "grad_norm": 1.763166913728333, + "language_loss": 0.70588106, + "learning_rate": 3.756205033810091e-06, + "loss": 0.72731048, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.72265625, + "step": 2643, + "time_per_iteration": 2.4096288681030273 + }, + { + "auxiliary_loss_clip": 0.01099905, + "auxiliary_loss_mlp": 0.01037241, + "balance_loss_clip": 1.01777434, + "balance_loss_mlp": 1.02727807, + "epoch": 0.15896588005411091, + "flos": 21213959760000.0, + "grad_norm": 2.136780877812778, + "language_loss": 0.77865797, + "learning_rate": 3.7560242248658963e-06, + "loss": 0.8000294, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.7265625, + "step": 2644, + "time_per_iteration": 2.4319334030151367 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.0104081, + "balance_loss_clip": 1.02156985, + "balance_loss_mlp": 1.02712774, + "epoch": 0.1590260033067789, + "flos": 24351255066240.0, + "grad_norm": 1.8506680923764118, + "language_loss": 0.8223685, + "learning_rate": 3.7558433532537145e-06, + "loss": 0.84378505, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.734375, + "step": 2645, + "time_per_iteration": 2.4091849327087402 + }, + { + "auxiliary_loss_clip": 0.01103085, + "auxiliary_loss_mlp": 0.01038907, + "balance_loss_clip": 1.01693726, + "balance_loss_mlp": 1.02752018, + "epoch": 0.15908612655944687, + "flos": 32047648686720.0, + "grad_norm": 2.162556548938065, + "language_loss": 0.70025808, + "learning_rate": 3.75566241898e-06, + "loss": 0.72167802, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.7578125, + "step": 2646, + "time_per_iteration": 2.494476079940796 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.01734889, + "balance_loss_mlp": 1.0268023, + "epoch": 0.15914624981211484, + "flos": 17784406529280.0, + "grad_norm": 2.392693770113908, + "language_loss": 0.62278962, + "learning_rate": 3.7554814220512095e-06, + "loss": 0.64415455, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.72265625, + "step": 2647, + "time_per_iteration": 2.339232921600342 + }, + { + "auxiliary_loss_clip": 0.01101412, + "auxiliary_loss_mlp": 0.01039351, + "balance_loss_clip": 1.01848936, + "balance_loss_mlp": 1.02895546, + "epoch": 0.1592063730647828, + "flos": 17711542788480.0, + "grad_norm": 2.094533690090337, + "language_loss": 0.89786607, + "learning_rate": 3.755300362473803e-06, + "loss": 0.91927373, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7265625, + "step": 2648, + "time_per_iteration": 2.3754818439483643 + }, + { + "auxiliary_loss_clip": 0.01099667, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.01740479, + "balance_loss_mlp": 1.02806485, + "epoch": 0.15926649631745077, + "flos": 18802648598400.0, + "grad_norm": 1.784209771623308, + "language_loss": 0.91517699, + "learning_rate": 3.7551192402542418e-06, + "loss": 0.93653977, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.71484375, + "step": 2649, + "time_per_iteration": 2.3740234375 + }, + { + "auxiliary_loss_clip": 0.01108733, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.01682544, + "balance_loss_mlp": 1.02754092, + "epoch": 0.15932661957011873, + "flos": 17565291636480.0, + "grad_norm": 2.4707960811613074, + "language_loss": 0.71221823, + "learning_rate": 3.7549380553989893e-06, + "loss": 0.73369676, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.8125, + "step": 2650, + "time_per_iteration": 2.391444206237793 + }, + { + "auxiliary_loss_clip": 0.01097245, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.01448572, + "balance_loss_mlp": 1.02721882, + "epoch": 0.15938674282278673, + "flos": 13333504118400.0, + "grad_norm": 1.84575017835478, + "language_loss": 0.71013993, + "learning_rate": 3.7547568079145116e-06, + "loss": 0.73144376, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.69921875, + "step": 2651, + "time_per_iteration": 2.355515956878662 + }, + { + "auxiliary_loss_clip": 0.01102709, + "auxiliary_loss_mlp": 0.01037868, + "balance_loss_clip": 1.01563537, + "balance_loss_mlp": 1.02707005, + "epoch": 0.1594468660754547, + "flos": 22487835870720.0, + "grad_norm": 1.9484677562824262, + "language_loss": 0.79622519, + "learning_rate": 3.754575497807278e-06, + "loss": 0.81763101, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.7578125, + "step": 2652, + "time_per_iteration": 2.4259936809539795 + }, + { + "auxiliary_loss_clip": 0.01100165, + "auxiliary_loss_mlp": 0.01039507, + "balance_loss_clip": 1.01925349, + "balance_loss_mlp": 1.02832174, + "epoch": 0.15950698932812266, + "flos": 15006577248000.0, + "grad_norm": 2.95076321606993, + "language_loss": 0.69801968, + "learning_rate": 3.7543941250837578e-06, + "loss": 0.71941638, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.71875, + "step": 2653, + "time_per_iteration": 2.325303554534912 + }, + { + "auxiliary_loss_clip": 0.011016, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.01498032, + "balance_loss_mlp": 1.02698135, + "epoch": 0.15956711258079062, + "flos": 30153715096320.0, + "grad_norm": 2.0871788872937076, + "language_loss": 0.77066928, + "learning_rate": 3.7542126897504235e-06, + "loss": 0.79204607, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.74609375, + "step": 2654, + "time_per_iteration": 2.4599967002868652 + }, + { + "auxiliary_loss_clip": 0.01098148, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.01642323, + "balance_loss_mlp": 1.02560902, + "epoch": 0.1596272358334586, + "flos": 21031643306880.0, + "grad_norm": 1.8948480458854995, + "language_loss": 0.81581485, + "learning_rate": 3.754031191813752e-06, + "loss": 0.83716679, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.7265625, + "step": 2655, + "time_per_iteration": 2.3683502674102783 + }, + { + "auxiliary_loss_clip": 0.01099878, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.01475358, + "balance_loss_mlp": 1.02669549, + "epoch": 0.15968735908612655, + "flos": 15267133791360.0, + "grad_norm": 1.9719336390073554, + "language_loss": 0.73297918, + "learning_rate": 3.753849631280218e-06, + "loss": 0.75431132, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.73046875, + "step": 2656, + "time_per_iteration": 2.367332696914673 + }, + { + "auxiliary_loss_clip": 0.01095125, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.01929212, + "balance_loss_mlp": 1.02536428, + "epoch": 0.15974748233879452, + "flos": 52663791369600.0, + "grad_norm": 2.1035566022409644, + "language_loss": 0.77869081, + "learning_rate": 3.7536680081563023e-06, + "loss": 0.80001956, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.69921875, + "step": 2657, + "time_per_iteration": 2.6401469707489014 + }, + { + "auxiliary_loss_clip": 0.01099954, + "auxiliary_loss_mlp": 0.01039954, + "balance_loss_clip": 1.0214889, + "balance_loss_mlp": 1.02863574, + "epoch": 0.1598076055914625, + "flos": 18732263564160.0, + "grad_norm": 1.7498684972558385, + "language_loss": 0.74488926, + "learning_rate": 3.753486322448487e-06, + "loss": 0.76628828, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.71484375, + "step": 2658, + "time_per_iteration": 2.3692216873168945 + }, + { + "auxiliary_loss_clip": 0.01099768, + "auxiliary_loss_mlp": 0.01035301, + "balance_loss_clip": 1.01396298, + "balance_loss_mlp": 1.02635539, + "epoch": 0.15986772884413047, + "flos": 34347831390720.0, + "grad_norm": 1.7291313209764942, + "language_loss": 0.75411272, + "learning_rate": 3.753304574163255e-06, + "loss": 0.77546334, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.734375, + "step": 2659, + "time_per_iteration": 2.4754250049591064 + }, + { + "auxiliary_loss_clip": 0.01099666, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.01914978, + "balance_loss_mlp": 1.0264492, + "epoch": 0.15992785209679844, + "flos": 22053865271040.0, + "grad_norm": 1.9408709358154512, + "language_loss": 0.90600204, + "learning_rate": 3.7531227633070924e-06, + "loss": 0.92740357, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.734375, + "step": 2660, + "time_per_iteration": 2.3919284343719482 + }, + { + "auxiliary_loss_clip": 0.01102115, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.01618242, + "balance_loss_mlp": 1.02813041, + "epoch": 0.1599879753494664, + "flos": 33065436908160.0, + "grad_norm": 1.6439194375650927, + "language_loss": 0.77577305, + "learning_rate": 3.7529408898864887e-06, + "loss": 0.79715973, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.7421875, + "step": 2661, + "time_per_iteration": 3.8710927963256836 + }, + { + "auxiliary_loss_clip": 0.01099562, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.01285183, + "balance_loss_mlp": 1.02597821, + "epoch": 0.16004809860213437, + "flos": 28036756540800.0, + "grad_norm": 2.155112005459171, + "language_loss": 0.74525195, + "learning_rate": 3.752758953907933e-06, + "loss": 0.7665717, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.734375, + "step": 2662, + "time_per_iteration": 2.450744390487671 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01042846, + "balance_loss_clip": 1.02243745, + "balance_loss_mlp": 1.02744985, + "epoch": 0.16010822185480234, + "flos": 22779116277120.0, + "grad_norm": 1.948664428006108, + "language_loss": 0.82199454, + "learning_rate": 3.7525769553779192e-06, + "loss": 0.84343511, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.73828125, + "step": 2663, + "time_per_iteration": 3.7367546558380127 + }, + { + "auxiliary_loss_clip": 0.01104684, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.01894271, + "balance_loss_mlp": 1.02900457, + "epoch": 0.16016834510747033, + "flos": 20082983310720.0, + "grad_norm": 1.9397599102893541, + "language_loss": 0.80063188, + "learning_rate": 3.7523948943029424e-06, + "loss": 0.82207096, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7578125, + "step": 2664, + "time_per_iteration": 3.814304828643799 + }, + { + "auxiliary_loss_clip": 0.01099688, + "auxiliary_loss_mlp": 0.01039963, + "balance_loss_clip": 1.01994777, + "balance_loss_mlp": 1.02588677, + "epoch": 0.1602284683601383, + "flos": 21172902134400.0, + "grad_norm": 1.604862513318294, + "language_loss": 0.93802118, + "learning_rate": 3.752212770689499e-06, + "loss": 0.9594177, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.73828125, + "step": 2665, + "time_per_iteration": 2.383065938949585 + }, + { + "auxiliary_loss_clip": 0.01101918, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.01403761, + "balance_loss_mlp": 1.02683282, + "epoch": 0.16028859161280626, + "flos": 14646692286720.0, + "grad_norm": 2.328704262901842, + "language_loss": 0.84797919, + "learning_rate": 3.752030584544089e-06, + "loss": 0.86934257, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.75, + "step": 2666, + "time_per_iteration": 3.7724173069000244 + }, + { + "auxiliary_loss_clip": 0.01099267, + "auxiliary_loss_mlp": 0.01039921, + "balance_loss_clip": 1.02032328, + "balance_loss_mlp": 1.02699661, + "epoch": 0.16034871486547422, + "flos": 20989433606400.0, + "grad_norm": 2.2293109702379645, + "language_loss": 0.81689608, + "learning_rate": 3.7518483358732142e-06, + "loss": 0.83828795, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.72265625, + "step": 2667, + "time_per_iteration": 2.3819265365600586 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01045485, + "balance_loss_clip": 1.02395546, + "balance_loss_mlp": 1.02797079, + "epoch": 0.1604088381181422, + "flos": 21396660238080.0, + "grad_norm": 2.223020140601549, + "language_loss": 0.74172294, + "learning_rate": 3.751666024683379e-06, + "loss": 0.76318473, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7265625, + "step": 2668, + "time_per_iteration": 2.4038949012756348 + }, + { + "auxiliary_loss_clip": 0.01101106, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.02031231, + "balance_loss_mlp": 1.02621579, + "epoch": 0.16046896137081015, + "flos": 23875947550080.0, + "grad_norm": 1.5919316620720776, + "language_loss": 0.77043045, + "learning_rate": 3.751483650981089e-06, + "loss": 0.79185653, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.75, + "step": 2669, + "time_per_iteration": 2.435511350631714 + }, + { + "auxiliary_loss_clip": 0.01026355, + "auxiliary_loss_mlp": 0.01005916, + "balance_loss_clip": 1.00378191, + "balance_loss_mlp": 1.00718212, + "epoch": 0.16052908462347812, + "flos": 59803842608640.0, + "grad_norm": 0.8001425358573404, + "language_loss": 0.55502141, + "learning_rate": 3.7513012147728527e-06, + "loss": 0.57534409, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.19140625, + "step": 2670, + "time_per_iteration": 2.9251914024353027 + }, + { + "auxiliary_loss_clip": 0.011007, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.01755571, + "balance_loss_mlp": 1.02633071, + "epoch": 0.1605892078761461, + "flos": 18295569878400.0, + "grad_norm": 1.9109288235358965, + "language_loss": 0.77216643, + "learning_rate": 3.751118716065181e-06, + "loss": 0.79355395, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.7421875, + "step": 2671, + "time_per_iteration": 2.3948540687561035 + }, + { + "auxiliary_loss_clip": 0.01101305, + "auxiliary_loss_mlp": 0.01034247, + "balance_loss_clip": 1.01454139, + "balance_loss_mlp": 1.02747202, + "epoch": 0.16064933112881408, + "flos": 32159370637440.0, + "grad_norm": 2.0845246797628487, + "language_loss": 0.65131581, + "learning_rate": 3.750936154864587e-06, + "loss": 0.67267138, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.73828125, + "step": 2672, + "time_per_iteration": 2.483060598373413 + }, + { + "auxiliary_loss_clip": 0.01101782, + "auxiliary_loss_mlp": 0.01037121, + "balance_loss_clip": 1.0153178, + "balance_loss_mlp": 1.02615297, + "epoch": 0.16070945438148204, + "flos": 19827768205440.0, + "grad_norm": 2.084943627089922, + "language_loss": 0.85613823, + "learning_rate": 3.750753531177586e-06, + "loss": 0.8775273, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.7578125, + "step": 2673, + "time_per_iteration": 2.3847758769989014 + }, + { + "auxiliary_loss_clip": 0.01102021, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02426529, + "balance_loss_mlp": 1.02886093, + "epoch": 0.16076957763415, + "flos": 18912240956160.0, + "grad_norm": 2.5506654545037857, + "language_loss": 0.73004067, + "learning_rate": 3.750570845010694e-06, + "loss": 0.75149977, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.73046875, + "step": 2674, + "time_per_iteration": 2.352660894393921 + }, + { + "auxiliary_loss_clip": 0.01099736, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.01462245, + "balance_loss_mlp": 1.0261476, + "epoch": 0.16082970088681797, + "flos": 16763406462720.0, + "grad_norm": 1.5455839925373789, + "language_loss": 0.8386209, + "learning_rate": 3.7503880963704314e-06, + "loss": 0.85998923, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.734375, + "step": 2675, + "time_per_iteration": 2.3616671562194824 + }, + { + "auxiliary_loss_clip": 0.01103789, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.018749, + "balance_loss_mlp": 1.02890682, + "epoch": 0.16088982413948594, + "flos": 35148878691840.0, + "grad_norm": 1.8855339884645612, + "language_loss": 0.82327354, + "learning_rate": 3.7502052852633206e-06, + "loss": 0.84470832, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.75, + "step": 2676, + "time_per_iteration": 2.47837233543396 + }, + { + "auxiliary_loss_clip": 0.01099421, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.01689005, + "balance_loss_mlp": 1.02847695, + "epoch": 0.1609499473921539, + "flos": 18624102572160.0, + "grad_norm": 2.4144862954961335, + "language_loss": 0.73110569, + "learning_rate": 3.7500224116958856e-06, + "loss": 0.75245011, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.7109375, + "step": 2677, + "time_per_iteration": 2.363727569580078 + }, + { + "auxiliary_loss_clip": 0.01096931, + "auxiliary_loss_mlp": 0.01035353, + "balance_loss_clip": 1.01620793, + "balance_loss_mlp": 1.02643561, + "epoch": 0.1610100706448219, + "flos": 33144340314240.0, + "grad_norm": 1.700185238420181, + "language_loss": 0.7650227, + "learning_rate": 3.7498394756746522e-06, + "loss": 0.78634554, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.703125, + "step": 2678, + "time_per_iteration": 2.464813232421875 + }, + { + "auxiliary_loss_clip": 0.01101983, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.01424408, + "balance_loss_mlp": 1.0274899, + "epoch": 0.16107019389748986, + "flos": 34675316743680.0, + "grad_norm": 1.8492951103689623, + "language_loss": 0.70696336, + "learning_rate": 3.749656477206149e-06, + "loss": 0.72834557, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.7421875, + "step": 2679, + "time_per_iteration": 2.497814893722534 + }, + { + "auxiliary_loss_clip": 0.01025193, + "auxiliary_loss_mlp": 0.01011856, + "balance_loss_clip": 1.00947165, + "balance_loss_mlp": 1.00566459, + "epoch": 0.16113031715015783, + "flos": 65710483735680.0, + "grad_norm": 0.7914817310226664, + "language_loss": 0.51820886, + "learning_rate": 3.749473416296906e-06, + "loss": 0.53857934, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.1953125, + "step": 2680, + "time_per_iteration": 3.0602798461914062 + }, + { + "auxiliary_loss_clip": 0.01100346, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.01643753, + "balance_loss_mlp": 1.0259409, + "epoch": 0.1611904404028258, + "flos": 20809456214400.0, + "grad_norm": 1.8493612624825924, + "language_loss": 0.80466175, + "learning_rate": 3.749290292953458e-06, + "loss": 0.82606018, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.7421875, + "step": 2681, + "time_per_iteration": 2.3731982707977295 + }, + { + "auxiliary_loss_clip": 0.01098794, + "auxiliary_loss_mlp": 0.01043045, + "balance_loss_clip": 1.02272022, + "balance_loss_mlp": 1.02759242, + "epoch": 0.16125056365549376, + "flos": 27012195515520.0, + "grad_norm": 1.9213457875697393, + "language_loss": 0.68854344, + "learning_rate": 3.749107107182339e-06, + "loss": 0.70996189, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7109375, + "step": 2682, + "time_per_iteration": 2.4167189598083496 + }, + { + "auxiliary_loss_clip": 0.01102691, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02196264, + "balance_loss_mlp": 1.02975488, + "epoch": 0.16131068690816172, + "flos": 19275651964800.0, + "grad_norm": 2.0126592915452126, + "language_loss": 0.87158656, + "learning_rate": 3.7489238589900855e-06, + "loss": 0.89305949, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.73046875, + "step": 2683, + "time_per_iteration": 2.3900914192199707 + }, + { + "auxiliary_loss_clip": 0.01103146, + "auxiliary_loss_mlp": 0.0104813, + "balance_loss_clip": 1.02664828, + "balance_loss_mlp": 1.02847111, + "epoch": 0.16137081016082971, + "flos": 35336396937600.0, + "grad_norm": 1.9732873598850735, + "language_loss": 0.74236965, + "learning_rate": 3.7487405483832395e-06, + "loss": 0.7638824, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.74609375, + "step": 2684, + "time_per_iteration": 2.5172033309936523 + }, + { + "auxiliary_loss_clip": 0.01107227, + "auxiliary_loss_mlp": 0.01043533, + "balance_loss_clip": 1.02189636, + "balance_loss_mlp": 1.03007555, + "epoch": 0.16143093341349768, + "flos": 34233979847040.0, + "grad_norm": 3.002548692009738, + "language_loss": 0.70575935, + "learning_rate": 3.748557175368341e-06, + "loss": 0.72726703, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.7734375, + "step": 2685, + "time_per_iteration": 2.5088298320770264 + }, + { + "auxiliary_loss_clip": 0.01097151, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.01793098, + "balance_loss_mlp": 1.02681971, + "epoch": 0.16149105666616564, + "flos": 27998072887680.0, + "grad_norm": 1.8848704834199657, + "language_loss": 0.716102, + "learning_rate": 3.748373739951935e-06, + "loss": 0.73746157, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.703125, + "step": 2686, + "time_per_iteration": 2.4251043796539307 + }, + { + "auxiliary_loss_clip": 0.01103306, + "auxiliary_loss_mlp": 0.01045832, + "balance_loss_clip": 1.0247798, + "balance_loss_mlp": 1.03054428, + "epoch": 0.1615511799188336, + "flos": 19421344535040.0, + "grad_norm": 2.039853388909567, + "language_loss": 0.81668341, + "learning_rate": 3.7481902421405676e-06, + "loss": 0.83817482, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.7265625, + "step": 2687, + "time_per_iteration": 2.3885385990142822 + }, + { + "auxiliary_loss_clip": 0.01106753, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_clip": 1.01904213, + "balance_loss_mlp": 1.02707458, + "epoch": 0.16161130317150157, + "flos": 22853865231360.0, + "grad_norm": 1.776605083139513, + "language_loss": 0.71621692, + "learning_rate": 3.7480066819407876e-06, + "loss": 0.7377212, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.796875, + "step": 2688, + "time_per_iteration": 2.383474349975586 + }, + { + "auxiliary_loss_clip": 0.01101855, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.02196348, + "balance_loss_mlp": 1.02826595, + "epoch": 0.16167142642416954, + "flos": 26109201444480.0, + "grad_norm": 3.3158671752621918, + "language_loss": 0.75798613, + "learning_rate": 3.7478230593591448e-06, + "loss": 0.77942467, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.734375, + "step": 2689, + "time_per_iteration": 2.491985321044922 + }, + { + "auxiliary_loss_clip": 0.01101707, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.01946294, + "balance_loss_mlp": 1.02907157, + "epoch": 0.1617315496768375, + "flos": 22778662429440.0, + "grad_norm": 1.83149192820103, + "language_loss": 0.87536496, + "learning_rate": 3.747639374402193e-06, + "loss": 0.89679015, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.7265625, + "step": 2690, + "time_per_iteration": 2.417999029159546 + }, + { + "auxiliary_loss_clip": 0.01099128, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.02136016, + "balance_loss_mlp": 1.02734601, + "epoch": 0.1617916729295055, + "flos": 22016228958720.0, + "grad_norm": 1.8029494924009606, + "language_loss": 0.88038915, + "learning_rate": 3.7474556270764877e-06, + "loss": 0.90178472, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.71875, + "step": 2691, + "time_per_iteration": 2.3933136463165283 + }, + { + "auxiliary_loss_clip": 0.01106955, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.02844501, + "balance_loss_mlp": 1.02773464, + "epoch": 0.16185179618217346, + "flos": 23437194094080.0, + "grad_norm": 2.117898701803228, + "language_loss": 0.82161796, + "learning_rate": 3.7472718173885864e-06, + "loss": 0.84320498, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.79296875, + "step": 2692, + "time_per_iteration": 2.3905084133148193 + }, + { + "auxiliary_loss_clip": 0.01105115, + "auxiliary_loss_mlp": 0.01043661, + "balance_loss_clip": 1.02046311, + "balance_loss_mlp": 1.02833152, + "epoch": 0.16191191943484143, + "flos": 25664931993600.0, + "grad_norm": 2.1941735514482166, + "language_loss": 0.81331909, + "learning_rate": 3.747087945345048e-06, + "loss": 0.83480686, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.765625, + "step": 2693, + "time_per_iteration": 2.413591146469116 + }, + { + "auxiliary_loss_clip": 0.01100085, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.02325225, + "balance_loss_mlp": 1.02796292, + "epoch": 0.1619720426875094, + "flos": 23476226860800.0, + "grad_norm": 1.4848001684834402, + "language_loss": 0.83649707, + "learning_rate": 3.746904010952435e-06, + "loss": 0.85792935, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.71875, + "step": 2694, + "time_per_iteration": 2.411134958267212 + }, + { + "auxiliary_loss_clip": 0.01107016, + "auxiliary_loss_mlp": 0.01046708, + "balance_loss_clip": 1.02434468, + "balance_loss_mlp": 1.02975321, + "epoch": 0.16203216594017736, + "flos": 24132524198400.0, + "grad_norm": 1.9641912622310724, + "language_loss": 0.69131589, + "learning_rate": 3.7467200142173114e-06, + "loss": 0.71285313, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.7734375, + "step": 2695, + "time_per_iteration": 2.4020485877990723 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.0104104, + "balance_loss_clip": 1.01900971, + "balance_loss_mlp": 1.02939367, + "epoch": 0.16209228919284532, + "flos": 22339943884800.0, + "grad_norm": 2.057423905456492, + "language_loss": 0.82545096, + "learning_rate": 3.7465359551462438e-06, + "loss": 0.84692276, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.765625, + "step": 2696, + "time_per_iteration": 2.3817596435546875 + }, + { + "auxiliary_loss_clip": 0.01108746, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.02006698, + "balance_loss_mlp": 1.0291909, + "epoch": 0.1621524124455133, + "flos": 15814222796160.0, + "grad_norm": 2.223793290993585, + "language_loss": 0.88445479, + "learning_rate": 3.7463518337458006e-06, + "loss": 0.90597576, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.796875, + "step": 2697, + "time_per_iteration": 2.373197317123413 + }, + { + "auxiliary_loss_clip": 0.01097182, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.01657605, + "balance_loss_mlp": 1.02738893, + "epoch": 0.16221253569818128, + "flos": 30185486300160.0, + "grad_norm": 1.4813412941126236, + "language_loss": 0.80739617, + "learning_rate": 3.7461676500225522e-06, + "loss": 0.82871991, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.69921875, + "step": 2698, + "time_per_iteration": 2.4726316928863525 + }, + { + "auxiliary_loss_clip": 0.01098231, + "auxiliary_loss_mlp": 0.010466, + "balance_loss_clip": 1.02551174, + "balance_loss_mlp": 1.02775669, + "epoch": 0.16227265895084925, + "flos": 24604899160320.0, + "grad_norm": 1.7044726119749638, + "language_loss": 0.77323377, + "learning_rate": 3.7459834039830726e-06, + "loss": 0.79468215, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.703125, + "step": 2699, + "time_per_iteration": 2.424938678741455 + }, + { + "auxiliary_loss_clip": 0.01099858, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.01789641, + "balance_loss_mlp": 1.02728641, + "epoch": 0.1623327822035172, + "flos": 19572308720640.0, + "grad_norm": 2.732572726447219, + "language_loss": 0.85681903, + "learning_rate": 3.745799095633936e-06, + "loss": 0.87818706, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.7265625, + "step": 2700, + "time_per_iteration": 3.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.01099364, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02216864, + "balance_loss_mlp": 1.0269953, + "epoch": 0.16239290545618518, + "flos": 26467271015040.0, + "grad_norm": 3.9493981243958216, + "language_loss": 0.8032552, + "learning_rate": 3.7456147249817203e-06, + "loss": 0.82468712, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.72265625, + "step": 2701, + "time_per_iteration": 2.429082155227661 + }, + { + "auxiliary_loss_clip": 0.01101654, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.01992059, + "balance_loss_mlp": 1.02973723, + "epoch": 0.16245302870885314, + "flos": 15851021235840.0, + "grad_norm": 1.9193976785875857, + "language_loss": 0.73022813, + "learning_rate": 3.745430292033006e-06, + "loss": 0.75164747, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.71875, + "step": 2702, + "time_per_iteration": 3.7418477535247803 + }, + { + "auxiliary_loss_clip": 0.01101791, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_clip": 1.02016985, + "balance_loss_mlp": 1.02802634, + "epoch": 0.1625131519615211, + "flos": 14755656240000.0, + "grad_norm": 2.2755867486125743, + "language_loss": 0.72900951, + "learning_rate": 3.745245796794374e-06, + "loss": 0.75045222, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.73828125, + "step": 2703, + "time_per_iteration": 3.726701021194458 + }, + { + "auxiliary_loss_clip": 0.01102086, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.01637101, + "balance_loss_mlp": 1.02656317, + "epoch": 0.1625732752141891, + "flos": 28219247550720.0, + "grad_norm": 2.2852940608047865, + "language_loss": 0.70878398, + "learning_rate": 3.7450612392724084e-06, + "loss": 0.73018903, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.75390625, + "step": 2704, + "time_per_iteration": 2.432807683944702 + }, + { + "auxiliary_loss_clip": 0.01026434, + "auxiliary_loss_mlp": 0.01003702, + "balance_loss_clip": 1.00136578, + "balance_loss_mlp": 1.00683141, + "epoch": 0.16263339846685707, + "flos": 67324727491200.0, + "grad_norm": 0.7802057449767931, + "language_loss": 0.53309071, + "learning_rate": 3.7448766194736967e-06, + "loss": 0.55339205, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.19628906, + "step": 2705, + "time_per_iteration": 3.03957462310791 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01042252, + "balance_loss_clip": 1.02042484, + "balance_loss_mlp": 1.02761436, + "epoch": 0.16269352171952503, + "flos": 14318299238400.0, + "grad_norm": 2.8690243218573026, + "language_loss": 0.75459617, + "learning_rate": 3.7446919374048265e-06, + "loss": 0.77605796, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.765625, + "step": 2706, + "time_per_iteration": 3.706674814224243 + }, + { + "auxiliary_loss_clip": 0.01101461, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.01388717, + "balance_loss_mlp": 1.02723992, + "epoch": 0.162753644972193, + "flos": 28360087441920.0, + "grad_norm": 1.8757145565592657, + "language_loss": 0.72222096, + "learning_rate": 3.7445071930723888e-06, + "loss": 0.74357283, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.7421875, + "step": 2707, + "time_per_iteration": 2.4413931369781494 + }, + { + "auxiliary_loss_clip": 0.01103872, + "auxiliary_loss_mlp": 0.01043134, + "balance_loss_clip": 1.0217123, + "balance_loss_mlp": 1.02834046, + "epoch": 0.16281376822486096, + "flos": 19936836892800.0, + "grad_norm": 2.571560648137463, + "language_loss": 0.83571339, + "learning_rate": 3.7443223864829773e-06, + "loss": 0.85718346, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.75390625, + "step": 2708, + "time_per_iteration": 2.38787579536438 + }, + { + "auxiliary_loss_clip": 0.01107758, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.02829206, + "epoch": 0.16287389147752893, + "flos": 21250653465600.0, + "grad_norm": 2.0981569629576327, + "language_loss": 0.86046529, + "learning_rate": 3.7441375176431863e-06, + "loss": 0.88197893, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.796875, + "step": 2709, + "time_per_iteration": 2.376641273498535 + }, + { + "auxiliary_loss_clip": 0.01101736, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.02080548, + "balance_loss_mlp": 1.02731037, + "epoch": 0.1629340147301969, + "flos": 19243671292800.0, + "grad_norm": 1.6395461203937707, + "language_loss": 0.91247582, + "learning_rate": 3.7439525865596137e-06, + "loss": 0.93390268, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.7421875, + "step": 2710, + "time_per_iteration": 2.399232864379883 + }, + { + "auxiliary_loss_clip": 0.01102421, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_clip": 1.02130008, + "balance_loss_mlp": 1.02973938, + "epoch": 0.16299413798286488, + "flos": 21248803163520.0, + "grad_norm": 2.4521110439754237, + "language_loss": 0.81027466, + "learning_rate": 3.7437675932388596e-06, + "loss": 0.83173001, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.7265625, + "step": 2711, + "time_per_iteration": 2.367405891418457 + }, + { + "auxiliary_loss_clip": 0.01104288, + "auxiliary_loss_mlp": 0.01039016, + "balance_loss_clip": 1.01658082, + "balance_loss_mlp": 1.02587044, + "epoch": 0.16305426123553285, + "flos": 18769585674240.0, + "grad_norm": 2.1562163814010247, + "language_loss": 0.8089633, + "learning_rate": 3.7435825376875253e-06, + "loss": 0.83039629, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.78125, + "step": 2712, + "time_per_iteration": 2.3526713848114014 + }, + { + "auxiliary_loss_clip": 0.0110389, + "auxiliary_loss_mlp": 0.01043683, + "balance_loss_clip": 1.02184391, + "balance_loss_mlp": 1.02715445, + "epoch": 0.16311438448820081, + "flos": 22086648904320.0, + "grad_norm": 1.8909279713995406, + "language_loss": 0.77136874, + "learning_rate": 3.743397419912215e-06, + "loss": 0.79284441, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.765625, + "step": 2713, + "time_per_iteration": 2.3635666370391846 + }, + { + "auxiliary_loss_clip": 0.01103318, + "auxiliary_loss_mlp": 0.01045318, + "balance_loss_clip": 1.02394414, + "balance_loss_mlp": 1.03042424, + "epoch": 0.16317450774086878, + "flos": 16466889352320.0, + "grad_norm": 2.6818998512786365, + "language_loss": 0.7886489, + "learning_rate": 3.7432122399195365e-06, + "loss": 0.81013525, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.7265625, + "step": 2714, + "time_per_iteration": 2.347163200378418 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.02042222, + "balance_loss_mlp": 1.02938485, + "epoch": 0.16323463099353674, + "flos": 24351778736640.0, + "grad_norm": 1.650219947186336, + "language_loss": 0.77981454, + "learning_rate": 3.7430269977160956e-06, + "loss": 0.80126345, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.75, + "step": 2715, + "time_per_iteration": 2.402946949005127 + }, + { + "auxiliary_loss_clip": 0.01099133, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.01517773, + "balance_loss_mlp": 1.0264461, + "epoch": 0.1632947542462047, + "flos": 24899600880000.0, + "grad_norm": 2.4567311825744897, + "language_loss": 0.82195216, + "learning_rate": 3.742841693308506e-06, + "loss": 0.84329712, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.7265625, + "step": 2716, + "time_per_iteration": 2.417941093444824 + }, + { + "auxiliary_loss_clip": 0.01105238, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.02075291, + "balance_loss_mlp": 1.0306592, + "epoch": 0.1633548774988727, + "flos": 24899112120960.0, + "grad_norm": 1.936804784333361, + "language_loss": 0.86132491, + "learning_rate": 3.742656326703379e-06, + "loss": 0.88279974, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.74609375, + "step": 2717, + "time_per_iteration": 2.3981850147247314 + }, + { + "auxiliary_loss_clip": 0.01100908, + "auxiliary_loss_mlp": 0.01038622, + "balance_loss_clip": 1.01877379, + "balance_loss_mlp": 1.0287087, + "epoch": 0.16341500075154067, + "flos": 30440596671360.0, + "grad_norm": 1.706598655723777, + "language_loss": 0.76384556, + "learning_rate": 3.7424708979073306e-06, + "loss": 0.78524089, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.72265625, + "step": 2718, + "time_per_iteration": 2.4446098804473877 + }, + { + "auxiliary_loss_clip": 0.01102697, + "auxiliary_loss_mlp": 0.01037988, + "balance_loss_clip": 1.0174011, + "balance_loss_mlp": 1.02754319, + "epoch": 0.16347512400420863, + "flos": 22783410374400.0, + "grad_norm": 1.9844112790707533, + "language_loss": 0.73798156, + "learning_rate": 3.742285406926978e-06, + "loss": 0.75938845, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.75, + "step": 2719, + "time_per_iteration": 2.3830654621124268 + }, + { + "auxiliary_loss_clip": 0.0110243, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.01919889, + "balance_loss_mlp": 1.02733767, + "epoch": 0.1635352472568766, + "flos": 22632306543360.0, + "grad_norm": 1.6711539850838706, + "language_loss": 0.72027409, + "learning_rate": 3.7420998537689402e-06, + "loss": 0.74169153, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.75, + "step": 2720, + "time_per_iteration": 2.3898520469665527 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01038156, + "balance_loss_clip": 1.01648426, + "balance_loss_mlp": 1.02837658, + "epoch": 0.16359537050954456, + "flos": 15522104517120.0, + "grad_norm": 2.0201840408562517, + "language_loss": 0.75201935, + "learning_rate": 3.7419142384398404e-06, + "loss": 0.77339977, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.71484375, + "step": 2721, + "time_per_iteration": 2.3708364963531494 + }, + { + "auxiliary_loss_clip": 0.01102009, + "auxiliary_loss_mlp": 0.01038147, + "balance_loss_clip": 1.01720154, + "balance_loss_mlp": 1.02607942, + "epoch": 0.16365549376221253, + "flos": 22089092699520.0, + "grad_norm": 2.0125671119970114, + "language_loss": 0.79488349, + "learning_rate": 3.7417285609463026e-06, + "loss": 0.81628501, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7578125, + "step": 2722, + "time_per_iteration": 2.408007860183716 + }, + { + "auxiliary_loss_clip": 0.01104363, + "auxiliary_loss_mlp": 0.01044011, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.02764964, + "epoch": 0.1637156170148805, + "flos": 24059276432640.0, + "grad_norm": 3.464672780683019, + "language_loss": 0.84411418, + "learning_rate": 3.7415428212949524e-06, + "loss": 0.86559796, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.765625, + "step": 2723, + "time_per_iteration": 2.450230598449707 + }, + { + "auxiliary_loss_clip": 0.01097816, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.0164578, + "balance_loss_mlp": 1.02698934, + "epoch": 0.1637757402675485, + "flos": 26684221403520.0, + "grad_norm": 6.8876305134667035, + "language_loss": 0.71284223, + "learning_rate": 3.7413570194924183e-06, + "loss": 0.73419076, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.7109375, + "step": 2724, + "time_per_iteration": 2.427006721496582 + }, + { + "auxiliary_loss_clip": 0.01097508, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.01665354, + "balance_loss_mlp": 1.02673888, + "epoch": 0.16383586352021645, + "flos": 16106026872960.0, + "grad_norm": 2.22160942867275, + "language_loss": 0.70896482, + "learning_rate": 3.741171155545332e-06, + "loss": 0.73030376, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.70703125, + "step": 2725, + "time_per_iteration": 2.3558108806610107 + }, + { + "auxiliary_loss_clip": 0.01097869, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.01664102, + "balance_loss_mlp": 1.02822781, + "epoch": 0.16389598677288442, + "flos": 19165151911680.0, + "grad_norm": 2.929731921097319, + "language_loss": 0.88497961, + "learning_rate": 3.7409852294603255e-06, + "loss": 0.90632212, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6953125, + "step": 2726, + "time_per_iteration": 2.37483549118042 + }, + { + "auxiliary_loss_clip": 0.01105815, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.02011204, + "balance_loss_mlp": 1.03108537, + "epoch": 0.16395611002555238, + "flos": 21505938393600.0, + "grad_norm": 1.9957275556100098, + "language_loss": 0.74080288, + "learning_rate": 3.740799241244035e-06, + "loss": 0.76227391, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.75, + "step": 2727, + "time_per_iteration": 2.384979009628296 + }, + { + "auxiliary_loss_clip": 0.01097125, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.01969004, + "balance_loss_mlp": 1.02770567, + "epoch": 0.16401623327822035, + "flos": 21469838181120.0, + "grad_norm": 1.7100790204559277, + "language_loss": 0.82165432, + "learning_rate": 3.7406131909030972e-06, + "loss": 0.84301507, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6953125, + "step": 2728, + "time_per_iteration": 2.406728982925415 + }, + { + "auxiliary_loss_clip": 0.01102966, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.02002883, + "balance_loss_mlp": 1.02820706, + "epoch": 0.1640763565308883, + "flos": 13625378017920.0, + "grad_norm": 7.722774197557603, + "language_loss": 0.85067058, + "learning_rate": 3.740427078444152e-06, + "loss": 0.87210703, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.74609375, + "step": 2729, + "time_per_iteration": 2.3586716651916504 + }, + { + "auxiliary_loss_clip": 0.01100578, + "auxiliary_loss_mlp": 0.01044189, + "balance_loss_clip": 1.02415013, + "balance_loss_mlp": 1.02738476, + "epoch": 0.16413647978355628, + "flos": 15450532496640.0, + "grad_norm": 2.248498291358936, + "language_loss": 0.72755969, + "learning_rate": 3.7402409038738416e-06, + "loss": 0.74900734, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.73046875, + "step": 2730, + "time_per_iteration": 2.357088804244995 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01043043, + "balance_loss_clip": 1.02051187, + "balance_loss_mlp": 1.0260247, + "epoch": 0.16419660303622427, + "flos": 45876955155840.0, + "grad_norm": 1.7213639834879002, + "language_loss": 0.74439585, + "learning_rate": 3.7400546671988096e-06, + "loss": 0.76584756, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.76171875, + "step": 2731, + "time_per_iteration": 2.5883896350860596 + }, + { + "auxiliary_loss_clip": 0.01105179, + "auxiliary_loss_mlp": 0.0103957, + "balance_loss_clip": 1.01845753, + "balance_loss_mlp": 1.02862, + "epoch": 0.16425672628889224, + "flos": 18951832304640.0, + "grad_norm": 2.8613939021894943, + "language_loss": 0.79236877, + "learning_rate": 3.739868368425702e-06, + "loss": 0.81381625, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.765625, + "step": 2732, + "time_per_iteration": 2.36692476272583 + }, + { + "auxiliary_loss_clip": 0.0110444, + "auxiliary_loss_mlp": 0.01039292, + "balance_loss_clip": 1.01827526, + "balance_loss_mlp": 1.02979863, + "epoch": 0.1643168495415602, + "flos": 24311943008640.0, + "grad_norm": 2.5706923919132962, + "language_loss": 0.69387078, + "learning_rate": 3.7396820075611682e-06, + "loss": 0.71530807, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.74609375, + "step": 2733, + "time_per_iteration": 2.3920483589172363 + }, + { + "auxiliary_loss_clip": 0.01103058, + "auxiliary_loss_mlp": 0.01040968, + "balance_loss_clip": 1.01924813, + "balance_loss_mlp": 1.02886534, + "epoch": 0.16437697279422817, + "flos": 26427330552960.0, + "grad_norm": 2.0778114265675827, + "language_loss": 0.81116164, + "learning_rate": 3.7394955846118585e-06, + "loss": 0.83260185, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.7421875, + "step": 2734, + "time_per_iteration": 2.4219093322753906 + }, + { + "auxiliary_loss_clip": 0.01100096, + "auxiliary_loss_mlp": 0.01037873, + "balance_loss_clip": 1.01756001, + "balance_loss_mlp": 1.02732301, + "epoch": 0.16443709604689613, + "flos": 34530811159680.0, + "grad_norm": 2.1915788350221095, + "language_loss": 0.82217395, + "learning_rate": 3.739309099584426e-06, + "loss": 0.84355366, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7265625, + "step": 2735, + "time_per_iteration": 2.478828191757202 + }, + { + "auxiliary_loss_clip": 0.01099139, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.01740348, + "balance_loss_mlp": 1.0274241, + "epoch": 0.1644972192995641, + "flos": 23256937411200.0, + "grad_norm": 3.0648714549534146, + "language_loss": 0.78555602, + "learning_rate": 3.7391225524855256e-06, + "loss": 0.80691475, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.71875, + "step": 2736, + "time_per_iteration": 2.388718605041504 + }, + { + "auxiliary_loss_clip": 0.01103637, + "auxiliary_loss_mlp": 0.01040628, + "balance_loss_clip": 1.0214355, + "balance_loss_mlp": 1.03016293, + "epoch": 0.1645573425522321, + "flos": 26978329630080.0, + "grad_norm": 1.7862917661888835, + "language_loss": 0.81538427, + "learning_rate": 3.738935943321815e-06, + "loss": 0.83682692, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.734375, + "step": 2737, + "time_per_iteration": 2.411057472229004 + }, + { + "auxiliary_loss_clip": 0.0110028, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.01886678, + "balance_loss_mlp": 1.02628779, + "epoch": 0.16461746580490005, + "flos": 28730480722560.0, + "grad_norm": 1.9910116654991181, + "language_loss": 0.87328762, + "learning_rate": 3.7387492720999536e-06, + "loss": 0.89467835, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.7421875, + "step": 2738, + "time_per_iteration": 2.4444847106933594 + }, + { + "auxiliary_loss_clip": 0.01099441, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_clip": 1.02408099, + "balance_loss_mlp": 1.02695906, + "epoch": 0.16467758905756802, + "flos": 24929172668160.0, + "grad_norm": 1.6850865175004341, + "language_loss": 0.71940517, + "learning_rate": 3.7385625388266037e-06, + "loss": 0.7408458, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.7265625, + "step": 2739, + "time_per_iteration": 2.3921971321105957 + }, + { + "auxiliary_loss_clip": 0.01098789, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.01532221, + "balance_loss_mlp": 1.02643645, + "epoch": 0.16473771231023598, + "flos": 24825375596160.0, + "grad_norm": 3.8569228628265426, + "language_loss": 0.81790274, + "learning_rate": 3.7383757435084284e-06, + "loss": 0.83924282, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.72265625, + "step": 2740, + "time_per_iteration": 3.817305564880371 + }, + { + "auxiliary_loss_clip": 0.01106972, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_clip": 1.02619159, + "balance_loss_mlp": 1.0297575, + "epoch": 0.16479783556290395, + "flos": 39894482822400.0, + "grad_norm": 2.491280067494279, + "language_loss": 0.68863475, + "learning_rate": 3.7381888861520943e-06, + "loss": 0.71018815, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.7734375, + "step": 2741, + "time_per_iteration": 2.5311717987060547 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.01497364, + "balance_loss_mlp": 1.02653241, + "epoch": 0.16485795881557191, + "flos": 19896163292160.0, + "grad_norm": 1.7078130198013188, + "language_loss": 0.79608095, + "learning_rate": 3.73800196676427e-06, + "loss": 0.8174367, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.73828125, + "step": 2742, + "time_per_iteration": 3.780141592025757 + }, + { + "auxiliary_loss_clip": 0.01099181, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_clip": 1.02131319, + "balance_loss_mlp": 1.02702117, + "epoch": 0.16491808206823988, + "flos": 20555148804480.0, + "grad_norm": 2.675536099106907, + "language_loss": 0.8468293, + "learning_rate": 3.737814985351627e-06, + "loss": 0.86824381, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.72265625, + "step": 2743, + "time_per_iteration": 3.7769858837127686 + }, + { + "auxiliary_loss_clip": 0.0109797, + "auxiliary_loss_mlp": 0.01038289, + "balance_loss_clip": 1.01821387, + "balance_loss_mlp": 1.02596939, + "epoch": 0.16497820532090787, + "flos": 23799802141440.0, + "grad_norm": 1.614851134462598, + "language_loss": 0.85501188, + "learning_rate": 3.7376279419208367e-06, + "loss": 0.87637448, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.71875, + "step": 2744, + "time_per_iteration": 2.3961873054504395 + }, + { + "auxiliary_loss_clip": 0.0109583, + "auxiliary_loss_mlp": 0.0104075, + "balance_loss_clip": 1.02158082, + "balance_loss_mlp": 1.02610826, + "epoch": 0.16503832857357584, + "flos": 25481498376960.0, + "grad_norm": 2.0251563213959, + "language_loss": 0.82605666, + "learning_rate": 3.7374408364785744e-06, + "loss": 0.84742248, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.69921875, + "step": 2745, + "time_per_iteration": 3.801182508468628 + }, + { + "auxiliary_loss_clip": 0.01105267, + "auxiliary_loss_mlp": 0.01046314, + "balance_loss_clip": 1.02546382, + "balance_loss_mlp": 1.02833867, + "epoch": 0.1650984518262438, + "flos": 17675093462400.0, + "grad_norm": 2.203588025169381, + "language_loss": 0.76188481, + "learning_rate": 3.7372536690315187e-06, + "loss": 0.7834006, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.76953125, + "step": 2746, + "time_per_iteration": 2.367724895477295 + }, + { + "auxiliary_loss_clip": 0.0109833, + "auxiliary_loss_mlp": 0.01038929, + "balance_loss_clip": 1.01905644, + "balance_loss_mlp": 1.02701068, + "epoch": 0.16515857507891177, + "flos": 18697315426560.0, + "grad_norm": 1.5013443128957897, + "language_loss": 0.80648381, + "learning_rate": 3.737066439586348e-06, + "loss": 0.82785642, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.7109375, + "step": 2747, + "time_per_iteration": 2.379795789718628 + }, + { + "auxiliary_loss_clip": 0.01103344, + "auxiliary_loss_mlp": 0.01039918, + "balance_loss_clip": 1.01911581, + "balance_loss_mlp": 1.02899504, + "epoch": 0.16521869833157973, + "flos": 15009649447680.0, + "grad_norm": 2.0025581701827586, + "language_loss": 0.69230592, + "learning_rate": 3.7368791481497448e-06, + "loss": 0.71373856, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.7421875, + "step": 2748, + "time_per_iteration": 2.341280937194824 + }, + { + "auxiliary_loss_clip": 0.01101063, + "auxiliary_loss_mlp": 0.01046647, + "balance_loss_clip": 1.02620232, + "balance_loss_mlp": 1.02744234, + "epoch": 0.1652788215842477, + "flos": 22120235498880.0, + "grad_norm": 2.13358667618576, + "language_loss": 0.87971032, + "learning_rate": 3.736691794728392e-06, + "loss": 0.90118742, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.734375, + "step": 2749, + "time_per_iteration": 2.374248504638672 + }, + { + "auxiliary_loss_clip": 0.01100285, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.01435399, + "balance_loss_mlp": 1.0265491, + "epoch": 0.16533894483691566, + "flos": 18332089027200.0, + "grad_norm": 1.9698727226091124, + "language_loss": 0.79004288, + "learning_rate": 3.736504379328976e-06, + "loss": 0.81139457, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.73828125, + "step": 2750, + "time_per_iteration": 2.3773529529571533 + }, + { + "auxiliary_loss_clip": 0.01100886, + "auxiliary_loss_mlp": 0.01040034, + "balance_loss_clip": 1.019876, + "balance_loss_mlp": 1.02821505, + "epoch": 0.16539906808958366, + "flos": 22381036421760.0, + "grad_norm": 1.679330557616043, + "language_loss": 0.95238423, + "learning_rate": 3.7363169019581865e-06, + "loss": 0.97379339, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.7265625, + "step": 2751, + "time_per_iteration": 2.3926827907562256 + }, + { + "auxiliary_loss_clip": 0.01098458, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.01447558, + "balance_loss_mlp": 1.02918196, + "epoch": 0.16545919134225162, + "flos": 22709988051840.0, + "grad_norm": 3.6291009142097597, + "language_loss": 0.70971817, + "learning_rate": 3.7361293626227125e-06, + "loss": 0.73104578, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.69140625, + "step": 2752, + "time_per_iteration": 2.4085581302642822 + }, + { + "auxiliary_loss_clip": 0.01029316, + "auxiliary_loss_mlp": 0.01008623, + "balance_loss_clip": 1.00564301, + "balance_loss_mlp": 1.00707996, + "epoch": 0.1655193145949196, + "flos": 67799720805120.0, + "grad_norm": 0.8051340800737071, + "language_loss": 0.54032564, + "learning_rate": 3.735941761329248e-06, + "loss": 0.56070507, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.02978516, + "router_z_loss_mlp": 0.22265625, + "step": 2753, + "time_per_iteration": 3.1193695068359375 + }, + { + "auxiliary_loss_clip": 0.01099621, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.01318693, + "balance_loss_mlp": 1.02745223, + "epoch": 0.16557943784758755, + "flos": 24279229198080.0, + "grad_norm": 1.8369958358468492, + "language_loss": 0.75099742, + "learning_rate": 3.735754098084487e-06, + "loss": 0.77232993, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.71875, + "step": 2754, + "time_per_iteration": 2.4263463020324707 + }, + { + "auxiliary_loss_clip": 0.01108749, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.02282274, + "balance_loss_mlp": 1.03078508, + "epoch": 0.16563956110025552, + "flos": 20082599285760.0, + "grad_norm": 2.722434912623219, + "language_loss": 0.86311758, + "learning_rate": 3.7355663728951265e-06, + "loss": 0.88467097, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.77734375, + "step": 2755, + "time_per_iteration": 2.3921918869018555 + }, + { + "auxiliary_loss_clip": 0.01098502, + "auxiliary_loss_mlp": 0.01042226, + "balance_loss_clip": 1.02297413, + "balance_loss_mlp": 1.02710485, + "epoch": 0.16569968435292348, + "flos": 28033300316160.0, + "grad_norm": 1.9668023918212456, + "language_loss": 0.73244894, + "learning_rate": 3.7353785857678675e-06, + "loss": 0.75385618, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.71484375, + "step": 2756, + "time_per_iteration": 2.442095994949341 + }, + { + "auxiliary_loss_clip": 0.01097588, + "auxiliary_loss_mlp": 0.01038453, + "balance_loss_clip": 1.01904607, + "balance_loss_mlp": 1.02879632, + "epoch": 0.16575980760559147, + "flos": 26249028906240.0, + "grad_norm": 1.7749553241589369, + "language_loss": 0.74760187, + "learning_rate": 3.7351907367094105e-06, + "loss": 0.76896232, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6875, + "step": 2757, + "time_per_iteration": 2.4306015968322754 + }, + { + "auxiliary_loss_clip": 0.01101508, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.01654887, + "balance_loss_mlp": 1.02925587, + "epoch": 0.16581993085825944, + "flos": 26942718176640.0, + "grad_norm": 2.1596303075322982, + "language_loss": 0.84663153, + "learning_rate": 3.7350028257264593e-06, + "loss": 0.86801791, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.72265625, + "step": 2758, + "time_per_iteration": 2.438163995742798 + }, + { + "auxiliary_loss_clip": 0.01104018, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.0203855, + "balance_loss_mlp": 1.03096461, + "epoch": 0.1658800541109274, + "flos": 21652538659200.0, + "grad_norm": 1.886810125326837, + "language_loss": 0.79101157, + "learning_rate": 3.7348148528257202e-06, + "loss": 0.81244564, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.73046875, + "step": 2759, + "time_per_iteration": 2.3999645709991455 + }, + { + "auxiliary_loss_clip": 0.0109939, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.01654792, + "balance_loss_mlp": 1.02732992, + "epoch": 0.16594017736359537, + "flos": 16434559566720.0, + "grad_norm": 2.2047309594012026, + "language_loss": 0.75204885, + "learning_rate": 3.734626818013902e-06, + "loss": 0.77341634, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.72265625, + "step": 2760, + "time_per_iteration": 2.3776280879974365 + }, + { + "auxiliary_loss_clip": 0.01104055, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.01839638, + "balance_loss_mlp": 1.0285697, + "epoch": 0.16600030061626334, + "flos": 22636216615680.0, + "grad_norm": 2.606869656949303, + "language_loss": 0.73423386, + "learning_rate": 3.734438721297714e-06, + "loss": 0.75566459, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.75390625, + "step": 2761, + "time_per_iteration": 2.4225025177001953 + }, + { + "auxiliary_loss_clip": 0.01099037, + "auxiliary_loss_mlp": 0.01040729, + "balance_loss_clip": 1.02139318, + "balance_loss_mlp": 1.02757204, + "epoch": 0.1660604238689313, + "flos": 26395349880960.0, + "grad_norm": 3.788373493784275, + "language_loss": 0.8883667, + "learning_rate": 3.73425056268387e-06, + "loss": 0.90976429, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.71484375, + "step": 2762, + "time_per_iteration": 2.479235887527466 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.02078533, + "balance_loss_mlp": 1.02852178, + "epoch": 0.16612054712159927, + "flos": 23038869859200.0, + "grad_norm": 2.36889733150707, + "language_loss": 0.8771072, + "learning_rate": 3.7340623421790843e-06, + "loss": 0.89853942, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.734375, + "step": 2763, + "time_per_iteration": 2.483316421508789 + }, + { + "auxiliary_loss_clip": 0.0102727, + "auxiliary_loss_mlp": 0.01003247, + "balance_loss_clip": 1.00039816, + "balance_loss_mlp": 1.0044899, + "epoch": 0.16618067037426726, + "flos": 59237864691840.0, + "grad_norm": 0.7714551426644473, + "language_loss": 0.59774059, + "learning_rate": 3.733874059790074e-06, + "loss": 0.61804575, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.22753906, + "step": 2764, + "time_per_iteration": 3.048445224761963 + }, + { + "auxiliary_loss_clip": 0.01105783, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.01886737, + "balance_loss_mlp": 1.0312283, + "epoch": 0.16624079362693522, + "flos": 27197584168320.0, + "grad_norm": 1.7848183524532986, + "language_loss": 0.82096636, + "learning_rate": 3.733685715523559e-06, + "loss": 0.84242886, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.74609375, + "step": 2765, + "time_per_iteration": 2.4442572593688965 + }, + { + "auxiliary_loss_clip": 0.01107709, + "auxiliary_loss_mlp": 0.010432, + "balance_loss_clip": 1.01971591, + "balance_loss_mlp": 1.02863848, + "epoch": 0.1663009168796032, + "flos": 10924322549760.0, + "grad_norm": 2.682801917681024, + "language_loss": 0.69734764, + "learning_rate": 3.7334973093862595e-06, + "loss": 0.71885675, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.7890625, + "step": 2766, + "time_per_iteration": 2.3364741802215576 + }, + { + "auxiliary_loss_clip": 0.01100791, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02058244, + "balance_loss_mlp": 1.03000283, + "epoch": 0.16636104013227115, + "flos": 17893475216640.0, + "grad_norm": 2.7257714940608744, + "language_loss": 0.88355601, + "learning_rate": 3.7333088413849008e-06, + "loss": 0.9049691, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.70703125, + "step": 2767, + "time_per_iteration": 2.365118980407715 + }, + { + "auxiliary_loss_clip": 0.01026081, + "auxiliary_loss_mlp": 0.01000866, + "balance_loss_clip": 0.99827963, + "balance_loss_mlp": 1.00439978, + "epoch": 0.16642116338493912, + "flos": 66722335159680.0, + "grad_norm": 0.6389215842058035, + "language_loss": 0.52877498, + "learning_rate": 3.7331203115262078e-06, + "loss": 0.54904449, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.21679688, + "step": 2768, + "time_per_iteration": 3.1787824630737305 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.01367664, + "balance_loss_mlp": 1.02860165, + "epoch": 0.16648128663760708, + "flos": 19025045159040.0, + "grad_norm": 2.534206350664639, + "language_loss": 0.85295093, + "learning_rate": 3.7329317198169098e-06, + "loss": 0.87434894, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7578125, + "step": 2769, + "time_per_iteration": 2.354337692260742 + }, + { + "auxiliary_loss_clip": 0.01025435, + "auxiliary_loss_mlp": 0.01002373, + "balance_loss_clip": 0.99979842, + "balance_loss_mlp": 1.00393605, + "epoch": 0.16654140989027508, + "flos": 70131744535680.0, + "grad_norm": 0.806276761124177, + "language_loss": 0.57446808, + "learning_rate": 3.732743066263736e-06, + "loss": 0.59474611, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.21484375, + "step": 2770, + "time_per_iteration": 2.9205851554870605 + }, + { + "auxiliary_loss_clip": 0.01024973, + "auxiliary_loss_mlp": 0.01004411, + "balance_loss_clip": 1.0017761, + "balance_loss_mlp": 1.00355017, + "epoch": 0.16660153314294304, + "flos": 70269407493120.0, + "grad_norm": 0.8895709965521077, + "language_loss": 0.56245881, + "learning_rate": 3.7325543508734187e-06, + "loss": 0.58275265, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.21484375, + "step": 2771, + "time_per_iteration": 2.8889243602752686 + }, + { + "auxiliary_loss_clip": 0.01101051, + "auxiliary_loss_mlp": 0.0104363, + "balance_loss_clip": 1.02268481, + "balance_loss_mlp": 1.02889836, + "epoch": 0.166661656395611, + "flos": 23073957642240.0, + "grad_norm": 3.431203791623101, + "language_loss": 0.70461863, + "learning_rate": 3.732365573652694e-06, + "loss": 0.7260654, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.72265625, + "step": 2772, + "time_per_iteration": 2.4362192153930664 + }, + { + "auxiliary_loss_clip": 0.01099417, + "auxiliary_loss_mlp": 0.01039775, + "balance_loss_clip": 1.01942599, + "balance_loss_mlp": 1.02762997, + "epoch": 0.16672177964827897, + "flos": 28365079766400.0, + "grad_norm": 3.772922793979306, + "language_loss": 0.86091107, + "learning_rate": 3.7321767346082977e-06, + "loss": 0.882303, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.71875, + "step": 2773, + "time_per_iteration": 2.4295010566711426 + }, + { + "auxiliary_loss_clip": 0.01099592, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.01282382, + "balance_loss_mlp": 1.02890277, + "epoch": 0.16678190290094694, + "flos": 19090228400640.0, + "grad_norm": 2.2753054762817917, + "language_loss": 0.82221007, + "learning_rate": 3.7319878337469694e-06, + "loss": 0.84351367, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.70703125, + "step": 2774, + "time_per_iteration": 2.39690899848938 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01040889, + "balance_loss_clip": 1.02076638, + "balance_loss_mlp": 1.02918148, + "epoch": 0.1668420261536149, + "flos": 21798021761280.0, + "grad_norm": 2.396825643784463, + "language_loss": 0.8285411, + "learning_rate": 3.73179887107545e-06, + "loss": 0.84997118, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.7265625, + "step": 2775, + "time_per_iteration": 2.388744354248047 + }, + { + "auxiliary_loss_clip": 0.01099687, + "auxiliary_loss_mlp": 0.01039013, + "balance_loss_clip": 1.02028513, + "balance_loss_mlp": 1.02941787, + "epoch": 0.16690214940628287, + "flos": 19061529396480.0, + "grad_norm": 3.126882021668333, + "language_loss": 0.806099, + "learning_rate": 3.731609846600485e-06, + "loss": 0.82748598, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.703125, + "step": 2776, + "time_per_iteration": 2.386744976043701 + }, + { + "auxiliary_loss_clip": 0.01095309, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.0168184, + "balance_loss_mlp": 1.02750969, + "epoch": 0.16696227265895086, + "flos": 18587548512000.0, + "grad_norm": 2.1056546406434435, + "language_loss": 0.80016923, + "learning_rate": 3.731420760328818e-06, + "loss": 0.82147753, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.67578125, + "step": 2777, + "time_per_iteration": 2.3581535816192627 + }, + { + "auxiliary_loss_clip": 0.01100116, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.01821482, + "balance_loss_mlp": 1.02836227, + "epoch": 0.16702239591161883, + "flos": 23293037623680.0, + "grad_norm": 1.775130683428226, + "language_loss": 0.85230374, + "learning_rate": 3.7312316122671977e-06, + "loss": 0.87368405, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.71875, + "step": 2778, + "time_per_iteration": 2.4324114322662354 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.01637769, + "balance_loss_mlp": 1.02825439, + "epoch": 0.1670825191642868, + "flos": 24424502832000.0, + "grad_norm": 2.077255176239374, + "language_loss": 0.74045932, + "learning_rate": 3.731042402422375e-06, + "loss": 0.76186025, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.75, + "step": 2779, + "time_per_iteration": 3.8116507530212402 + }, + { + "auxiliary_loss_clip": 0.01099015, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_clip": 1.0210402, + "balance_loss_mlp": 1.02855587, + "epoch": 0.16714264241695476, + "flos": 26796292467840.0, + "grad_norm": 3.883328641370763, + "language_loss": 0.66294205, + "learning_rate": 3.730853130801101e-06, + "loss": 0.68433917, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.703125, + "step": 2780, + "time_per_iteration": 2.469529628753662 + }, + { + "auxiliary_loss_clip": 0.01098358, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.01637185, + "balance_loss_mlp": 1.02710223, + "epoch": 0.16720276566962272, + "flos": 21834226707840.0, + "grad_norm": 2.3808420385732205, + "language_loss": 0.78112018, + "learning_rate": 3.7306637974101312e-06, + "loss": 0.80247027, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7109375, + "step": 2781, + "time_per_iteration": 3.8163065910339355 + }, + { + "auxiliary_loss_clip": 0.01102115, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.01564074, + "balance_loss_mlp": 1.02904999, + "epoch": 0.1672628889222907, + "flos": 21469349422080.0, + "grad_norm": 1.7121057808521025, + "language_loss": 0.74994546, + "learning_rate": 3.730474402256223e-06, + "loss": 0.77130646, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.73046875, + "step": 2782, + "time_per_iteration": 3.775129556655884 + }, + { + "auxiliary_loss_clip": 0.01103694, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.01619673, + "balance_loss_mlp": 1.02874088, + "epoch": 0.16732301217495865, + "flos": 30772690323840.0, + "grad_norm": 4.691143884560107, + "language_loss": 0.67676735, + "learning_rate": 3.7302849453461337e-06, + "loss": 0.69816923, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.75, + "step": 2783, + "time_per_iteration": 2.4699010848999023 + }, + { + "auxiliary_loss_clip": 0.01101268, + "auxiliary_loss_mlp": 0.01038573, + "balance_loss_clip": 1.01932085, + "balance_loss_mlp": 1.02963662, + "epoch": 0.16738313542762664, + "flos": 23473573597440.0, + "grad_norm": 1.7539523891610789, + "language_loss": 0.70496118, + "learning_rate": 3.730095426686626e-06, + "loss": 0.72635961, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.71875, + "step": 2784, + "time_per_iteration": 3.8684651851654053 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.01042852, + "balance_loss_clip": 1.02014303, + "balance_loss_mlp": 1.02739811, + "epoch": 0.1674432586802946, + "flos": 29787790469760.0, + "grad_norm": 2.05039647654335, + "language_loss": 0.60617006, + "learning_rate": 3.729905846284463e-06, + "loss": 0.62762076, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.75, + "step": 2785, + "time_per_iteration": 2.4373538494110107 + }, + { + "auxiliary_loss_clip": 0.01025677, + "auxiliary_loss_mlp": 0.01008213, + "balance_loss_clip": 1.00566173, + "balance_loss_mlp": 1.00451803, + "epoch": 0.16750338193296258, + "flos": 66132547695360.0, + "grad_norm": 0.8260912271145021, + "language_loss": 0.58771896, + "learning_rate": 3.72971620414641e-06, + "loss": 0.60805786, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.2109375, + "step": 2786, + "time_per_iteration": 3.076218605041504 + }, + { + "auxiliary_loss_clip": 0.01101877, + "auxiliary_loss_mlp": 0.01039603, + "balance_loss_clip": 1.01880097, + "balance_loss_mlp": 1.02818251, + "epoch": 0.16756350518563054, + "flos": 25695760590720.0, + "grad_norm": 1.9732594496456566, + "language_loss": 0.76632226, + "learning_rate": 3.729526500279235e-06, + "loss": 0.78773701, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.73828125, + "step": 2787, + "time_per_iteration": 2.409895896911621 + }, + { + "auxiliary_loss_clip": 0.01101295, + "auxiliary_loss_mlp": 0.01038415, + "balance_loss_clip": 1.0182445, + "balance_loss_mlp": 1.02843809, + "epoch": 0.1676236284382985, + "flos": 23835134304000.0, + "grad_norm": 2.4665047237238906, + "language_loss": 0.76906705, + "learning_rate": 3.729336734689708e-06, + "loss": 0.7904641, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.73046875, + "step": 2788, + "time_per_iteration": 2.4283406734466553 + }, + { + "auxiliary_loss_clip": 0.01023516, + "auxiliary_loss_mlp": 0.01007237, + "balance_loss_clip": 1.00474536, + "balance_loss_mlp": 1.00275159, + "epoch": 0.16768375169096647, + "flos": 59872167872640.0, + "grad_norm": 0.8522369178846958, + "language_loss": 0.59424734, + "learning_rate": 3.7291469073846017e-06, + "loss": 0.61455488, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.20800781, + "step": 2789, + "time_per_iteration": 2.8966400623321533 + }, + { + "auxiliary_loss_clip": 0.01102404, + "auxiliary_loss_mlp": 0.0104348, + "balance_loss_clip": 1.02137899, + "balance_loss_mlp": 1.02846503, + "epoch": 0.16774387494363446, + "flos": 38434135806720.0, + "grad_norm": 1.596248254233782, + "language_loss": 0.69839656, + "learning_rate": 3.72895701837069e-06, + "loss": 0.71985543, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.73828125, + "step": 2790, + "time_per_iteration": 2.550748586654663 + }, + { + "auxiliary_loss_clip": 0.01102681, + "auxiliary_loss_mlp": 0.01043039, + "balance_loss_clip": 1.02257061, + "balance_loss_mlp": 1.0283494, + "epoch": 0.16780399819630243, + "flos": 22636530817920.0, + "grad_norm": 1.8564311737949704, + "language_loss": 0.79571879, + "learning_rate": 3.7287670676547495e-06, + "loss": 0.81717592, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.74609375, + "step": 2791, + "time_per_iteration": 2.3864128589630127 + }, + { + "auxiliary_loss_clip": 0.0110316, + "auxiliary_loss_mlp": 0.01046247, + "balance_loss_clip": 1.02599347, + "balance_loss_mlp": 1.02913141, + "epoch": 0.1678641214489704, + "flos": 32890102727040.0, + "grad_norm": 2.08072886880986, + "language_loss": 0.71467054, + "learning_rate": 3.7285770552435593e-06, + "loss": 0.73616463, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7421875, + "step": 2792, + "time_per_iteration": 2.4916250705718994 + }, + { + "auxiliary_loss_clip": 0.01102263, + "auxiliary_loss_mlp": 0.01038455, + "balance_loss_clip": 1.0188446, + "balance_loss_mlp": 1.02897751, + "epoch": 0.16792424470163836, + "flos": 19973879712000.0, + "grad_norm": 1.916949862508531, + "language_loss": 0.71492851, + "learning_rate": 3.7283869811439006e-06, + "loss": 0.73633564, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.734375, + "step": 2793, + "time_per_iteration": 2.388706922531128 + }, + { + "auxiliary_loss_clip": 0.01103298, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.01902735, + "balance_loss_mlp": 1.02974629, + "epoch": 0.16798436795430632, + "flos": 19718839163520.0, + "grad_norm": 2.0882395204511353, + "language_loss": 0.7694692, + "learning_rate": 3.728196845362557e-06, + "loss": 0.7908895, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.734375, + "step": 2794, + "time_per_iteration": 2.3973076343536377 + }, + { + "auxiliary_loss_clip": 0.01104216, + "auxiliary_loss_mlp": 0.01042443, + "balance_loss_clip": 1.02249885, + "balance_loss_mlp": 1.03091669, + "epoch": 0.1680444912069743, + "flos": 28103755173120.0, + "grad_norm": 3.660215334389666, + "language_loss": 0.7173906, + "learning_rate": 3.7280066479063128e-06, + "loss": 0.73885721, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.73046875, + "step": 2795, + "time_per_iteration": 2.4465954303741455 + }, + { + "auxiliary_loss_clip": 0.01100168, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.00934196, + "balance_loss_mlp": 1.02792013, + "epoch": 0.16810461445964225, + "flos": 18074290481280.0, + "grad_norm": 2.0025951283747716, + "language_loss": 0.83917654, + "learning_rate": 3.7278163887819565e-06, + "loss": 0.86047041, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.72265625, + "step": 2796, + "time_per_iteration": 2.364109992980957 + }, + { + "auxiliary_loss_clip": 0.01101342, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02031505, + "balance_loss_mlp": 1.02778924, + "epoch": 0.16816473771231025, + "flos": 23877518561280.0, + "grad_norm": 2.6026955903410967, + "language_loss": 0.81673908, + "learning_rate": 3.727626067996277e-06, + "loss": 0.83815849, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.734375, + "step": 2797, + "time_per_iteration": 2.405458450317383 + }, + { + "auxiliary_loss_clip": 0.01095684, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.01563668, + "balance_loss_mlp": 1.02797866, + "epoch": 0.1682248609649782, + "flos": 22782502679040.0, + "grad_norm": 1.5458723910185148, + "language_loss": 0.75072479, + "learning_rate": 3.727435685556068e-06, + "loss": 0.77201039, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.67578125, + "step": 2798, + "time_per_iteration": 2.412954330444336 + }, + { + "auxiliary_loss_clip": 0.01102921, + "auxiliary_loss_mlp": 0.01039167, + "balance_loss_clip": 1.0206418, + "balance_loss_mlp": 1.03008187, + "epoch": 0.16828498421764618, + "flos": 20704053219840.0, + "grad_norm": 2.087515366307674, + "language_loss": 0.79870963, + "learning_rate": 3.7272452414681227e-06, + "loss": 0.82013059, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.7265625, + "step": 2799, + "time_per_iteration": 2.3892929553985596 + }, + { + "auxiliary_loss_clip": 0.01103858, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.01366425, + "balance_loss_mlp": 1.02899194, + "epoch": 0.16834510747031414, + "flos": 29419422048000.0, + "grad_norm": 2.1088789988159067, + "language_loss": 0.70523083, + "learning_rate": 3.7270547357392375e-06, + "loss": 0.72662044, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.75, + "step": 2800, + "time_per_iteration": 2.461484909057617 + }, + { + "auxiliary_loss_clip": 0.01099733, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.01558852, + "balance_loss_mlp": 1.02740645, + "epoch": 0.1684052307229821, + "flos": 18144535870080.0, + "grad_norm": 1.724561753876366, + "language_loss": 0.83576268, + "learning_rate": 3.7268641683762113e-06, + "loss": 0.85712004, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.72265625, + "step": 2801, + "time_per_iteration": 2.348421335220337 + }, + { + "auxiliary_loss_clip": 0.01101752, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.02017236, + "balance_loss_mlp": 1.02821028, + "epoch": 0.16846535397565007, + "flos": 16574177560320.0, + "grad_norm": 2.7224930861772654, + "language_loss": 0.82470471, + "learning_rate": 3.7266735393858456e-06, + "loss": 0.84613317, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.734375, + "step": 2802, + "time_per_iteration": 2.3624260425567627 + }, + { + "auxiliary_loss_clip": 0.01102921, + "auxiliary_loss_mlp": 0.01043189, + "balance_loss_clip": 1.02175534, + "balance_loss_mlp": 1.02774191, + "epoch": 0.16852547722831807, + "flos": 30407568658560.0, + "grad_norm": 1.5945284759246205, + "language_loss": 0.80595237, + "learning_rate": 3.7264828487749422e-06, + "loss": 0.82741344, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.75390625, + "step": 2803, + "time_per_iteration": 2.461865186691284 + }, + { + "auxiliary_loss_clip": 0.01099747, + "auxiliary_loss_mlp": 0.01042445, + "balance_loss_clip": 1.0223465, + "balance_loss_mlp": 1.02886486, + "epoch": 0.16858560048098603, + "flos": 33506110488960.0, + "grad_norm": 2.253990294914759, + "language_loss": 0.76085579, + "learning_rate": 3.726292096550307e-06, + "loss": 0.78227776, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.70703125, + "step": 2804, + "time_per_iteration": 2.4685802459716797 + }, + { + "auxiliary_loss_clip": 0.01021925, + "auxiliary_loss_mlp": 0.0100264, + "balance_loss_clip": 1.00030327, + "balance_loss_mlp": 1.00205374, + "epoch": 0.168645723733654, + "flos": 67367111748480.0, + "grad_norm": 0.8313708537078875, + "language_loss": 0.55405569, + "learning_rate": 3.7261012827187477e-06, + "loss": 0.57430136, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.19921875, + "step": 2805, + "time_per_iteration": 2.996175765991211 + }, + { + "auxiliary_loss_clip": 0.01095749, + "auxiliary_loss_mlp": 0.01034293, + "balance_loss_clip": 1.01666236, + "balance_loss_mlp": 1.02667689, + "epoch": 0.16870584698632196, + "flos": 21323552117760.0, + "grad_norm": 2.471730494349587, + "language_loss": 0.72939378, + "learning_rate": 3.725910407287074e-06, + "loss": 0.75069416, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.69140625, + "step": 2806, + "time_per_iteration": 2.394117593765259 + }, + { + "auxiliary_loss_clip": 0.01098756, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02041769, + "balance_loss_mlp": 1.02883208, + "epoch": 0.16876597023898993, + "flos": 20739699584640.0, + "grad_norm": 2.0119891708960393, + "language_loss": 0.70036387, + "learning_rate": 3.7257194702620964e-06, + "loss": 0.72174084, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.69921875, + "step": 2807, + "time_per_iteration": 2.383373498916626 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.01041627, + "balance_loss_clip": 1.02217221, + "balance_loss_mlp": 1.03004301, + "epoch": 0.1688260934916579, + "flos": 20302447317120.0, + "grad_norm": 2.4472358434644166, + "language_loss": 0.70172656, + "learning_rate": 3.725528471650631e-06, + "loss": 0.72316158, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.71875, + "step": 2808, + "time_per_iteration": 2.3989601135253906 + }, + { + "auxiliary_loss_clip": 0.01100398, + "auxiliary_loss_mlp": 0.0104131, + "balance_loss_clip": 1.02078247, + "balance_loss_mlp": 1.02708447, + "epoch": 0.16888621674432586, + "flos": 20339629781760.0, + "grad_norm": 2.292779589312388, + "language_loss": 0.80167681, + "learning_rate": 3.7253374114594925e-06, + "loss": 0.82309389, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.734375, + "step": 2809, + "time_per_iteration": 2.369114398956299 + }, + { + "auxiliary_loss_clip": 0.01109348, + "auxiliary_loss_mlp": 0.0104127, + "balance_loss_clip": 1.02125466, + "balance_loss_mlp": 1.03182256, + "epoch": 0.16894633999699385, + "flos": 16244108766720.0, + "grad_norm": 2.8977700555594548, + "language_loss": 0.8793937, + "learning_rate": 3.7251462896955e-06, + "loss": 0.90089989, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.7734375, + "step": 2810, + "time_per_iteration": 2.3441340923309326 + }, + { + "auxiliary_loss_clip": 0.01103802, + "auxiliary_loss_mlp": 0.01047031, + "balance_loss_clip": 1.02674174, + "balance_loss_mlp": 1.02955842, + "epoch": 0.16900646324966181, + "flos": 19609142071680.0, + "grad_norm": 2.502849501120339, + "language_loss": 0.92641753, + "learning_rate": 3.724955106365474e-06, + "loss": 0.94792581, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7421875, + "step": 2811, + "time_per_iteration": 2.377276659011841 + }, + { + "auxiliary_loss_clip": 0.01100842, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.01937222, + "balance_loss_mlp": 1.02856755, + "epoch": 0.16906658650232978, + "flos": 22016997008640.0, + "grad_norm": 2.3749282258990276, + "language_loss": 0.78287768, + "learning_rate": 3.724763861476237e-06, + "loss": 0.80426866, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.72265625, + "step": 2812, + "time_per_iteration": 2.377631902694702 + }, + { + "auxiliary_loss_clip": 0.01100864, + "auxiliary_loss_mlp": 0.01040717, + "balance_loss_clip": 1.02241826, + "balance_loss_mlp": 1.02951097, + "epoch": 0.16912670975499774, + "flos": 11762936340480.0, + "grad_norm": 2.81241792316052, + "language_loss": 0.7505877, + "learning_rate": 3.724572555034615e-06, + "loss": 0.77200353, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.71484375, + "step": 2813, + "time_per_iteration": 2.3591530323028564 + }, + { + "auxiliary_loss_clip": 0.01101558, + "auxiliary_loss_mlp": 0.01041879, + "balance_loss_clip": 1.02145851, + "balance_loss_mlp": 1.02710545, + "epoch": 0.1691868330076657, + "flos": 17160543711360.0, + "grad_norm": 8.40670518179425, + "language_loss": 0.68826377, + "learning_rate": 3.7243811870474346e-06, + "loss": 0.70969814, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.7421875, + "step": 2814, + "time_per_iteration": 2.324979066848755 + }, + { + "auxiliary_loss_clip": 0.01100765, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.01856661, + "balance_loss_mlp": 1.02842844, + "epoch": 0.16924695626033368, + "flos": 22415530711680.0, + "grad_norm": 2.2746765096551487, + "language_loss": 0.61625373, + "learning_rate": 3.724189757521525e-06, + "loss": 0.63764107, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.72265625, + "step": 2815, + "time_per_iteration": 2.3952457904815674 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.01794755, + "balance_loss_mlp": 1.02694273, + "epoch": 0.16930707951300164, + "flos": 25738459050240.0, + "grad_norm": 3.3403623940253144, + "language_loss": 0.82395369, + "learning_rate": 3.7239982664637185e-06, + "loss": 0.84528393, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.703125, + "step": 2816, + "time_per_iteration": 2.4072141647338867 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01041241, + "balance_loss_clip": 1.02181005, + "balance_loss_mlp": 1.02807236, + "epoch": 0.16936720276566963, + "flos": 22745948618880.0, + "grad_norm": 3.3116096120669414, + "language_loss": 0.86611402, + "learning_rate": 3.7238067138808477e-06, + "loss": 0.88754761, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.7421875, + "step": 2817, + "time_per_iteration": 2.388659715652466 + }, + { + "auxiliary_loss_clip": 0.01100093, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.01989794, + "balance_loss_mlp": 1.03019714, + "epoch": 0.1694273260183376, + "flos": 19572937125120.0, + "grad_norm": 1.7000006864629023, + "language_loss": 0.8144446, + "learning_rate": 3.72361509977975e-06, + "loss": 0.83583695, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.69921875, + "step": 2818, + "time_per_iteration": 2.3735878467559814 + }, + { + "auxiliary_loss_clip": 0.01097576, + "auxiliary_loss_mlp": 0.01040857, + "balance_loss_clip": 1.02055573, + "balance_loss_mlp": 1.02641368, + "epoch": 0.16948744927100556, + "flos": 12457044547200.0, + "grad_norm": 2.4156942861080433, + "language_loss": 0.82009411, + "learning_rate": 3.7234234241672632e-06, + "loss": 0.84147841, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7109375, + "step": 2819, + "time_per_iteration": 3.769258737564087 + }, + { + "auxiliary_loss_clip": 0.01023284, + "auxiliary_loss_mlp": 0.01007441, + "balance_loss_clip": 1.00496185, + "balance_loss_mlp": 1.00381994, + "epoch": 0.16954757252367353, + "flos": 71288731814400.0, + "grad_norm": 0.9319188129893082, + "language_loss": 0.61062413, + "learning_rate": 3.7232316870502274e-06, + "loss": 0.63093144, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.1953125, + "step": 2820, + "time_per_iteration": 2.91005539894104 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_clip": 1.02462757, + "balance_loss_mlp": 1.02774501, + "epoch": 0.1696076957763415, + "flos": 29605229637120.0, + "grad_norm": 3.091531330886817, + "language_loss": 0.78350353, + "learning_rate": 3.723039888435485e-06, + "loss": 0.80494201, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.72265625, + "step": 2821, + "time_per_iteration": 3.8358261585235596 + }, + { + "auxiliary_loss_clip": 0.01102246, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.02509594, + "balance_loss_mlp": 1.03072417, + "epoch": 0.16966781902900946, + "flos": 24387460012800.0, + "grad_norm": 1.9821330134851807, + "language_loss": 0.78271604, + "learning_rate": 3.722848028329882e-06, + "loss": 0.80419791, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.71484375, + "step": 2822, + "time_per_iteration": 3.8631176948547363 + }, + { + "auxiliary_loss_clip": 0.01099396, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.01911986, + "balance_loss_mlp": 1.02830386, + "epoch": 0.16972794228167745, + "flos": 23037717784320.0, + "grad_norm": 4.088592431205343, + "language_loss": 0.75136393, + "learning_rate": 3.7226561067402638e-06, + "loss": 0.77273631, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.7109375, + "step": 2823, + "time_per_iteration": 3.7679522037506104 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.01039195, + "balance_loss_clip": 1.01944184, + "balance_loss_mlp": 1.0288341, + "epoch": 0.16978806553434542, + "flos": 35227153693440.0, + "grad_norm": 2.161818085815661, + "language_loss": 0.60268676, + "learning_rate": 3.7224641236734805e-06, + "loss": 0.62407881, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.7109375, + "step": 2824, + "time_per_iteration": 2.519198179244995 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.0196799, + "balance_loss_mlp": 1.02819765, + "epoch": 0.16984818878701338, + "flos": 32012944928640.0, + "grad_norm": 1.6091365944583946, + "language_loss": 0.73137844, + "learning_rate": 3.7222720791363837e-06, + "loss": 0.75276971, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.7109375, + "step": 2825, + "time_per_iteration": 2.457822561264038 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.0104685, + "balance_loss_clip": 1.02480817, + "balance_loss_mlp": 1.0285238, + "epoch": 0.16990831203968135, + "flos": 22817555550720.0, + "grad_norm": 2.0299478360072247, + "language_loss": 0.85285699, + "learning_rate": 3.7220799731358264e-06, + "loss": 0.87436152, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.75, + "step": 2826, + "time_per_iteration": 2.3784821033477783 + }, + { + "auxiliary_loss_clip": 0.01105385, + "auxiliary_loss_mlp": 0.01045185, + "balance_loss_clip": 1.02478862, + "balance_loss_mlp": 1.0288589, + "epoch": 0.1699684352923493, + "flos": 23038485834240.0, + "grad_norm": 1.7377436091686924, + "language_loss": 0.82375735, + "learning_rate": 3.721887805678665e-06, + "loss": 0.84526312, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.765625, + "step": 2827, + "time_per_iteration": 2.3947606086730957 + }, + { + "auxiliary_loss_clip": 0.01102699, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.01596928, + "balance_loss_mlp": 1.02850986, + "epoch": 0.17002855854501728, + "flos": 21433039741440.0, + "grad_norm": 1.7688978957597494, + "language_loss": 0.73898339, + "learning_rate": 3.7216955767717558e-06, + "loss": 0.76038647, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.7421875, + "step": 2828, + "time_per_iteration": 2.3846895694732666 + }, + { + "auxiliary_loss_clip": 0.0102301, + "auxiliary_loss_mlp": 0.01005238, + "balance_loss_clip": 1.00283051, + "balance_loss_mlp": 1.00294185, + "epoch": 0.17008868179768524, + "flos": 71450099585280.0, + "grad_norm": 0.7636207272108545, + "language_loss": 0.56493086, + "learning_rate": 3.721503286421961e-06, + "loss": 0.58521336, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.20117188, + "step": 2829, + "time_per_iteration": 3.060309648513794 + }, + { + "auxiliary_loss_clip": 0.01100291, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.01595736, + "balance_loss_mlp": 1.02777958, + "epoch": 0.17014880505035324, + "flos": 24899147032320.0, + "grad_norm": 1.9485488632433958, + "language_loss": 0.83049953, + "learning_rate": 3.7213109346361424e-06, + "loss": 0.85185075, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.7265625, + "step": 2830, + "time_per_iteration": 2.4133338928222656 + }, + { + "auxiliary_loss_clip": 0.01100308, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.01582432, + "balance_loss_mlp": 1.02769089, + "epoch": 0.1702089283030212, + "flos": 29861108058240.0, + "grad_norm": 1.8773353166213922, + "language_loss": 0.78348982, + "learning_rate": 3.721118521421164e-06, + "loss": 0.80485415, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7265625, + "step": 2831, + "time_per_iteration": 2.445157527923584 + }, + { + "auxiliary_loss_clip": 0.0110162, + "auxiliary_loss_mlp": 0.01043851, + "balance_loss_clip": 1.02229834, + "balance_loss_mlp": 1.02805817, + "epoch": 0.17026905155568917, + "flos": 17743348903680.0, + "grad_norm": 2.739917273717214, + "language_loss": 0.79639959, + "learning_rate": 3.7209260467838926e-06, + "loss": 0.81785429, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.734375, + "step": 2832, + "time_per_iteration": 2.355471611022949 + }, + { + "auxiliary_loss_clip": 0.01100581, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.02341509, + "balance_loss_mlp": 1.0275898, + "epoch": 0.17032917480835713, + "flos": 23147554521600.0, + "grad_norm": 1.6157854847826956, + "language_loss": 0.8847543, + "learning_rate": 3.720733510731198e-06, + "loss": 0.90619004, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.73046875, + "step": 2833, + "time_per_iteration": 2.4187119007110596 + }, + { + "auxiliary_loss_clip": 0.01099931, + "auxiliary_loss_mlp": 0.01042799, + "balance_loss_clip": 1.02253354, + "balance_loss_mlp": 1.02734184, + "epoch": 0.1703892980610251, + "flos": 39201003020160.0, + "grad_norm": 2.2330127490136915, + "language_loss": 0.71865654, + "learning_rate": 3.72054091326995e-06, + "loss": 0.74008387, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7265625, + "step": 2834, + "time_per_iteration": 2.5245070457458496 + }, + { + "auxiliary_loss_clip": 0.01104221, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.02518415, + "balance_loss_mlp": 1.03048635, + "epoch": 0.17044942131369306, + "flos": 23037997075200.0, + "grad_norm": 2.04637989842674, + "language_loss": 0.86782855, + "learning_rate": 3.7203482544070227e-06, + "loss": 0.8893193, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.73828125, + "step": 2835, + "time_per_iteration": 2.3873298168182373 + }, + { + "auxiliary_loss_clip": 0.0110199, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_clip": 1.02076173, + "balance_loss_mlp": 1.0270679, + "epoch": 0.17050954456636103, + "flos": 17054058464640.0, + "grad_norm": 2.074709036086154, + "language_loss": 0.73609614, + "learning_rate": 3.720155534149292e-06, + "loss": 0.75754571, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.75, + "step": 2836, + "time_per_iteration": 2.3513550758361816 + }, + { + "auxiliary_loss_clip": 0.01107355, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.01742601, + "balance_loss_mlp": 1.02962399, + "epoch": 0.17056966781902902, + "flos": 16836025824000.0, + "grad_norm": 2.084422244258617, + "language_loss": 0.80140126, + "learning_rate": 3.7199627525036343e-06, + "loss": 0.8228792, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.77734375, + "step": 2837, + "time_per_iteration": 2.3739218711853027 + }, + { + "auxiliary_loss_clip": 0.01098566, + "auxiliary_loss_mlp": 0.01039546, + "balance_loss_clip": 1.01877975, + "balance_loss_mlp": 1.02908397, + "epoch": 0.17062979107169698, + "flos": 17711577699840.0, + "grad_norm": 9.587808322010051, + "language_loss": 0.83528239, + "learning_rate": 3.7197699094769303e-06, + "loss": 0.85666353, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.6953125, + "step": 2838, + "time_per_iteration": 2.3722033500671387 + }, + { + "auxiliary_loss_clip": 0.01098691, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.01614571, + "balance_loss_mlp": 1.02855921, + "epoch": 0.17068991432436495, + "flos": 22524040817280.0, + "grad_norm": 1.7594720877797905, + "language_loss": 0.93555927, + "learning_rate": 3.719577005076062e-06, + "loss": 0.95689523, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.703125, + "step": 2839, + "time_per_iteration": 2.414508581161499 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.01892352, + "balance_loss_mlp": 1.02878416, + "epoch": 0.17075003757703291, + "flos": 25881812559360.0, + "grad_norm": 2.486884554266925, + "language_loss": 0.83609664, + "learning_rate": 3.719384039307914e-06, + "loss": 0.85751897, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.734375, + "step": 2840, + "time_per_iteration": 2.414292573928833 + }, + { + "auxiliary_loss_clip": 0.01102081, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.01995409, + "balance_loss_mlp": 1.02834702, + "epoch": 0.17081016082970088, + "flos": 20119677016320.0, + "grad_norm": 1.9077523811333352, + "language_loss": 0.75979531, + "learning_rate": 3.7191910121793723e-06, + "loss": 0.78122818, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.734375, + "step": 2841, + "time_per_iteration": 2.3861541748046875 + }, + { + "auxiliary_loss_clip": 0.0109873, + "auxiliary_loss_mlp": 0.0104296, + "balance_loss_clip": 1.02227688, + "balance_loss_mlp": 1.02622223, + "epoch": 0.17087028408236885, + "flos": 24935317067520.0, + "grad_norm": 1.8118609133322574, + "language_loss": 0.76893795, + "learning_rate": 3.718997923697326e-06, + "loss": 0.79035485, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.7265625, + "step": 2842, + "time_per_iteration": 2.4093101024627686 + }, + { + "auxiliary_loss_clip": 0.01098277, + "auxiliary_loss_mlp": 0.01038635, + "balance_loss_clip": 1.01902461, + "balance_loss_mlp": 1.02864337, + "epoch": 0.17093040733503684, + "flos": 19056990919680.0, + "grad_norm": 1.96747438457974, + "language_loss": 0.85524523, + "learning_rate": 3.7188047738686655e-06, + "loss": 0.87661433, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6953125, + "step": 2843, + "time_per_iteration": 2.4019691944122314 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.02007055, + "balance_loss_mlp": 1.02914369, + "epoch": 0.1709905305877048, + "flos": 13078114456320.0, + "grad_norm": 1.7886432537207648, + "language_loss": 0.68202627, + "learning_rate": 3.7186115627002837e-06, + "loss": 0.70341146, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.69140625, + "step": 2844, + "time_per_iteration": 2.374447822570801 + }, + { + "auxiliary_loss_clip": 0.01101739, + "auxiliary_loss_mlp": 0.01043683, + "balance_loss_clip": 1.02196276, + "balance_loss_mlp": 1.02908492, + "epoch": 0.17105065384037277, + "flos": 19208304218880.0, + "grad_norm": 2.0010483451506085, + "language_loss": 0.78770077, + "learning_rate": 3.718418290199076e-06, + "loss": 0.80915499, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.7265625, + "step": 2845, + "time_per_iteration": 2.3720314502716064 + }, + { + "auxiliary_loss_clip": 0.01097968, + "auxiliary_loss_mlp": 0.01039524, + "balance_loss_clip": 1.02000928, + "balance_loss_mlp": 1.02587223, + "epoch": 0.17111077709304073, + "flos": 18514196012160.0, + "grad_norm": 4.3704053337883755, + "language_loss": 0.77804375, + "learning_rate": 3.71822495637194e-06, + "loss": 0.79941869, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.72265625, + "step": 2846, + "time_per_iteration": 2.3560523986816406 + }, + { + "auxiliary_loss_clip": 0.01099426, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.02113986, + "balance_loss_mlp": 1.02861381, + "epoch": 0.1711709003457087, + "flos": 25081498396800.0, + "grad_norm": 1.711799864431428, + "language_loss": 0.79977489, + "learning_rate": 3.7180315612257748e-06, + "loss": 0.8211695, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.70703125, + "step": 2847, + "time_per_iteration": 2.428332805633545 + }, + { + "auxiliary_loss_clip": 0.01097876, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.02033615, + "balance_loss_mlp": 1.02556586, + "epoch": 0.17123102359837666, + "flos": 17565431281920.0, + "grad_norm": 3.0462412738497533, + "language_loss": 0.86679769, + "learning_rate": 3.7178381047674825e-06, + "loss": 0.88816977, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.72265625, + "step": 2848, + "time_per_iteration": 2.3382651805877686 + }, + { + "auxiliary_loss_clip": 0.01100188, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.01960135, + "balance_loss_mlp": 1.02768493, + "epoch": 0.17129114685104463, + "flos": 26172534384000.0, + "grad_norm": 2.1077203639678475, + "language_loss": 0.75360501, + "learning_rate": 3.717644587003967e-06, + "loss": 0.77500212, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.72265625, + "step": 2849, + "time_per_iteration": 2.4131977558135986 + }, + { + "auxiliary_loss_clip": 0.01026538, + "auxiliary_loss_mlp": 0.01004335, + "balance_loss_clip": 1.00199831, + "balance_loss_mlp": 1.00581324, + "epoch": 0.17135127010371262, + "flos": 69266212220160.0, + "grad_norm": 0.7842330333871769, + "language_loss": 0.57423878, + "learning_rate": 3.7174510079421347e-06, + "loss": 0.59454751, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.20703125, + "step": 2850, + "time_per_iteration": 3.004063367843628 + }, + { + "auxiliary_loss_clip": 0.01097386, + "auxiliary_loss_mlp": 0.01042264, + "balance_loss_clip": 1.02221298, + "balance_loss_mlp": 1.02785206, + "epoch": 0.1714113933563806, + "flos": 23548985867520.0, + "grad_norm": 3.1580788317294433, + "language_loss": 0.80728292, + "learning_rate": 3.7172573675888937e-06, + "loss": 0.82867938, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.6953125, + "step": 2851, + "time_per_iteration": 2.409008502960205 + }, + { + "auxiliary_loss_clip": 0.01096345, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.01668262, + "balance_loss_mlp": 1.02716529, + "epoch": 0.17147151660904855, + "flos": 21141375310080.0, + "grad_norm": 6.502278396988303, + "language_loss": 0.93110287, + "learning_rate": 3.717063665951155e-06, + "loss": 0.95242417, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.69140625, + "step": 2852, + "time_per_iteration": 2.3677895069122314 + }, + { + "auxiliary_loss_clip": 0.01099955, + "auxiliary_loss_mlp": 0.01041107, + "balance_loss_clip": 1.02143705, + "balance_loss_mlp": 1.02682829, + "epoch": 0.17153163986171652, + "flos": 18623893104000.0, + "grad_norm": 1.9697959632093773, + "language_loss": 0.68919253, + "learning_rate": 3.7168699030358305e-06, + "loss": 0.71060312, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.734375, + "step": 2853, + "time_per_iteration": 2.3650238513946533 + }, + { + "auxiliary_loss_clip": 0.01103775, + "auxiliary_loss_mlp": 0.01041317, + "balance_loss_clip": 1.02082443, + "balance_loss_mlp": 1.03048611, + "epoch": 0.17159176311438448, + "flos": 18222287201280.0, + "grad_norm": 2.3573506234697623, + "language_loss": 0.66342807, + "learning_rate": 3.7166760788498355e-06, + "loss": 0.68487895, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.734375, + "step": 2854, + "time_per_iteration": 2.346888303756714 + }, + { + "auxiliary_loss_clip": 0.01094329, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.0200839, + "balance_loss_mlp": 1.02519643, + "epoch": 0.17165188636705245, + "flos": 20737988928000.0, + "grad_norm": 1.7339636954993085, + "language_loss": 0.89137179, + "learning_rate": 3.716482193400087e-06, + "loss": 0.9127, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.69140625, + "step": 2855, + "time_per_iteration": 2.3888208866119385 + }, + { + "auxiliary_loss_clip": 0.01101268, + "auxiliary_loss_mlp": 0.0103767, + "balance_loss_clip": 1.01755893, + "balance_loss_mlp": 1.02768111, + "epoch": 0.17171200961972044, + "flos": 24898728096000.0, + "grad_norm": 1.9672459927595096, + "language_loss": 0.82613242, + "learning_rate": 3.7162882466935042e-06, + "loss": 0.84752178, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.734375, + "step": 2856, + "time_per_iteration": 2.4004223346710205 + }, + { + "auxiliary_loss_clip": 0.01098951, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_clip": 1.02188826, + "balance_loss_mlp": 1.02765584, + "epoch": 0.1717721328723884, + "flos": 20156196165120.0, + "grad_norm": 2.041226216781192, + "language_loss": 0.86407518, + "learning_rate": 3.716094238737009e-06, + "loss": 0.8854804, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.7109375, + "step": 2857, + "time_per_iteration": 2.368487596511841 + }, + { + "auxiliary_loss_clip": 0.01100271, + "auxiliary_loss_mlp": 0.01043538, + "balance_loss_clip": 1.02352262, + "balance_loss_mlp": 1.02817392, + "epoch": 0.17183225612505637, + "flos": 23360699571840.0, + "grad_norm": 2.6917011832688993, + "language_loss": 0.78101349, + "learning_rate": 3.715900169537524e-06, + "loss": 0.80245161, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.72265625, + "step": 2858, + "time_per_iteration": 2.407489776611328 + }, + { + "auxiliary_loss_clip": 0.01106981, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.02846503, + "balance_loss_mlp": 1.0280242, + "epoch": 0.17189237937772434, + "flos": 18113253425280.0, + "grad_norm": 2.327169641792259, + "language_loss": 0.76883638, + "learning_rate": 3.7157060391019767e-06, + "loss": 0.79041535, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.7890625, + "step": 2859, + "time_per_iteration": 3.7524237632751465 + }, + { + "auxiliary_loss_clip": 0.01097007, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.01747096, + "balance_loss_mlp": 1.02755082, + "epoch": 0.1719525026303923, + "flos": 23257286524800.0, + "grad_norm": 1.9563460985952137, + "language_loss": 0.76953274, + "learning_rate": 3.7155118474372936e-06, + "loss": 0.79087549, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6953125, + "step": 2860, + "time_per_iteration": 2.4103503227233887 + }, + { + "auxiliary_loss_clip": 0.01099755, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.01408386, + "balance_loss_mlp": 1.02734613, + "epoch": 0.17201262588306027, + "flos": 20809456214400.0, + "grad_norm": 2.818945073790931, + "language_loss": 0.81869853, + "learning_rate": 3.7153175945504057e-06, + "loss": 0.84003723, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.7265625, + "step": 2861, + "time_per_iteration": 3.7697300910949707 + }, + { + "auxiliary_loss_clip": 0.01098907, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.0195142, + "balance_loss_mlp": 1.02756715, + "epoch": 0.17207274913572823, + "flos": 20374822298880.0, + "grad_norm": 4.648906746694479, + "language_loss": 0.85571301, + "learning_rate": 3.7151232804482456e-06, + "loss": 0.87709284, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.71484375, + "step": 2862, + "time_per_iteration": 3.7209343910217285 + }, + { + "auxiliary_loss_clip": 0.0109487, + "auxiliary_loss_mlp": 0.01037687, + "balance_loss_clip": 1.01934016, + "balance_loss_mlp": 1.02720714, + "epoch": 0.17213287238839622, + "flos": 26796501936000.0, + "grad_norm": 3.680640271375408, + "language_loss": 0.78025091, + "learning_rate": 3.7149289051377474e-06, + "loss": 0.8015765, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.67578125, + "step": 2863, + "time_per_iteration": 3.7889277935028076 + }, + { + "auxiliary_loss_clip": 0.01097385, + "auxiliary_loss_mlp": 0.01042709, + "balance_loss_clip": 1.02197862, + "balance_loss_mlp": 1.02595067, + "epoch": 0.1721929956410642, + "flos": 26029634722560.0, + "grad_norm": 1.6838877162819452, + "language_loss": 0.72329086, + "learning_rate": 3.714734468625847e-06, + "loss": 0.74469173, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.71484375, + "step": 2864, + "time_per_iteration": 2.438499927520752 + }, + { + "auxiliary_loss_clip": 0.01101815, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.01929295, + "balance_loss_mlp": 1.02860403, + "epoch": 0.17225311889373215, + "flos": 22272002645760.0, + "grad_norm": 2.0219813649855074, + "language_loss": 0.72502583, + "learning_rate": 3.714539970919485e-06, + "loss": 0.74642563, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.734375, + "step": 2865, + "time_per_iteration": 2.3733339309692383 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.02076221, + "balance_loss_mlp": 1.02951944, + "epoch": 0.17231324214640012, + "flos": 21286718766720.0, + "grad_norm": 2.9961208754105124, + "language_loss": 0.78705955, + "learning_rate": 3.7143454120256017e-06, + "loss": 0.80846155, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.70703125, + "step": 2866, + "time_per_iteration": 2.4031829833984375 + }, + { + "auxiliary_loss_clip": 0.01097716, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.02022111, + "balance_loss_mlp": 1.02698362, + "epoch": 0.17237336539906808, + "flos": 19679771485440.0, + "grad_norm": 1.706005589775413, + "language_loss": 0.79743361, + "learning_rate": 3.71415079195114e-06, + "loss": 0.81881189, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.70703125, + "step": 2867, + "time_per_iteration": 2.3662564754486084 + }, + { + "auxiliary_loss_clip": 0.01097452, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.01467443, + "balance_loss_mlp": 1.02625537, + "epoch": 0.17243348865173605, + "flos": 17528702664960.0, + "grad_norm": 1.9585180560379911, + "language_loss": 0.79336035, + "learning_rate": 3.713956110703046e-06, + "loss": 0.81469071, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7109375, + "step": 2868, + "time_per_iteration": 2.3915653228759766 + }, + { + "auxiliary_loss_clip": 0.0110449, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01855421, + "balance_loss_mlp": 1.02958059, + "epoch": 0.17249361190440402, + "flos": 18258876172800.0, + "grad_norm": 2.445199233157961, + "language_loss": 0.83990276, + "learning_rate": 3.713761368288268e-06, + "loss": 0.86133319, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.75, + "step": 2869, + "time_per_iteration": 2.3506219387054443 + }, + { + "auxiliary_loss_clip": 0.01100771, + "auxiliary_loss_mlp": 0.01042502, + "balance_loss_clip": 1.02104473, + "balance_loss_mlp": 1.02764046, + "epoch": 0.172553735157072, + "flos": 21173425804800.0, + "grad_norm": 1.7938640814322828, + "language_loss": 0.76781571, + "learning_rate": 3.713566564713754e-06, + "loss": 0.78924841, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.73046875, + "step": 2870, + "time_per_iteration": 2.4121081829071045 + }, + { + "auxiliary_loss_clip": 0.01094729, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.0175581, + "balance_loss_mlp": 1.02771533, + "epoch": 0.17261385840973997, + "flos": 22272177202560.0, + "grad_norm": 1.7992539394948617, + "language_loss": 0.76963258, + "learning_rate": 3.7133716999864574e-06, + "loss": 0.7909261, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.671875, + "step": 2871, + "time_per_iteration": 2.3836746215820312 + }, + { + "auxiliary_loss_clip": 0.01097193, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.01587784, + "balance_loss_mlp": 1.02676845, + "epoch": 0.17267398166240794, + "flos": 27921159429120.0, + "grad_norm": 2.6200849364685523, + "language_loss": 0.7476573, + "learning_rate": 3.7131767741133327e-06, + "loss": 0.76898426, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.703125, + "step": 2872, + "time_per_iteration": 2.450266122817993 + }, + { + "auxiliary_loss_clip": 0.01095105, + "auxiliary_loss_mlp": 0.01039845, + "balance_loss_clip": 1.0204612, + "balance_loss_mlp": 1.02686155, + "epoch": 0.1727341049150759, + "flos": 21944028533760.0, + "grad_norm": 6.242095849254738, + "language_loss": 0.82088262, + "learning_rate": 3.712981787101335e-06, + "loss": 0.84223211, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6796875, + "step": 2873, + "time_per_iteration": 2.385953903198242 + }, + { + "auxiliary_loss_clip": 0.01097729, + "auxiliary_loss_mlp": 0.01035709, + "balance_loss_clip": 1.01587296, + "balance_loss_mlp": 1.0280807, + "epoch": 0.17279422816774387, + "flos": 18107074114560.0, + "grad_norm": 2.0824944934024656, + "language_loss": 0.74705172, + "learning_rate": 3.7127867389574244e-06, + "loss": 0.76838607, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6953125, + "step": 2874, + "time_per_iteration": 2.3947160243988037 + }, + { + "auxiliary_loss_clip": 0.01097361, + "auxiliary_loss_mlp": 0.01040762, + "balance_loss_clip": 1.01957846, + "balance_loss_mlp": 1.02649164, + "epoch": 0.17285435142041183, + "flos": 21834366353280.0, + "grad_norm": 2.0074995783007985, + "language_loss": 0.80613792, + "learning_rate": 3.7125916296885606e-06, + "loss": 0.82751918, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.7109375, + "step": 2875, + "time_per_iteration": 2.3952105045318604 + }, + { + "auxiliary_loss_clip": 0.01100966, + "auxiliary_loss_mlp": 0.01041017, + "balance_loss_clip": 1.01912999, + "balance_loss_mlp": 1.02759469, + "epoch": 0.17291447467307983, + "flos": 18367491012480.0, + "grad_norm": 2.8247080586996716, + "language_loss": 0.87094033, + "learning_rate": 3.7123964593017066e-06, + "loss": 0.89236015, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.734375, + "step": 2876, + "time_per_iteration": 2.377943515777588 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.01039503, + "balance_loss_clip": 1.02060854, + "balance_loss_mlp": 1.0285635, + "epoch": 0.1729745979257478, + "flos": 18623648724480.0, + "grad_norm": 1.839598799780756, + "language_loss": 0.84719235, + "learning_rate": 3.7122012278038285e-06, + "loss": 0.86856472, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.69140625, + "step": 2877, + "time_per_iteration": 2.348487138748169 + }, + { + "auxiliary_loss_clip": 0.01100964, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.01605749, + "balance_loss_mlp": 1.02966297, + "epoch": 0.17303472117841576, + "flos": 22997253651840.0, + "grad_norm": 2.2625307315465935, + "language_loss": 0.79290515, + "learning_rate": 3.7120059352018922e-06, + "loss": 0.81427491, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.7109375, + "step": 2878, + "time_per_iteration": 2.4186136722564697 + }, + { + "auxiliary_loss_clip": 0.01095456, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.01696002, + "balance_loss_mlp": 1.02593291, + "epoch": 0.17309484443108372, + "flos": 25663256248320.0, + "grad_norm": 1.7061990498952924, + "language_loss": 0.70197231, + "learning_rate": 3.7118105815028677e-06, + "loss": 0.72329903, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.6953125, + "step": 2879, + "time_per_iteration": 2.4180474281311035 + }, + { + "auxiliary_loss_clip": 0.01097694, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.02081323, + "balance_loss_mlp": 1.02637446, + "epoch": 0.1731549676837517, + "flos": 13552060429440.0, + "grad_norm": 2.0531514935022184, + "language_loss": 0.8334012, + "learning_rate": 3.7116151667137272e-06, + "loss": 0.85479665, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.71484375, + "step": 2880, + "time_per_iteration": 2.375086545944214 + }, + { + "auxiliary_loss_clip": 0.01102314, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.01543653, + "balance_loss_mlp": 1.02918601, + "epoch": 0.17321509093641965, + "flos": 22855959912960.0, + "grad_norm": 2.0841091537860934, + "language_loss": 0.80595112, + "learning_rate": 3.7114196908414444e-06, + "loss": 0.8273381, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.734375, + "step": 2881, + "time_per_iteration": 2.3861749172210693 + }, + { + "auxiliary_loss_clip": 0.01099988, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.01553822, + "balance_loss_mlp": 1.02886462, + "epoch": 0.17327521418908762, + "flos": 24351639091200.0, + "grad_norm": 2.6595247932454313, + "language_loss": 0.77453423, + "learning_rate": 3.7112241538929946e-06, + "loss": 0.7958796, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.7109375, + "step": 2882, + "time_per_iteration": 2.4411187171936035 + }, + { + "auxiliary_loss_clip": 0.01096081, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.01562929, + "balance_loss_mlp": 1.02655911, + "epoch": 0.1733353374417556, + "flos": 33104364940800.0, + "grad_norm": 1.8538684903891403, + "language_loss": 0.76314259, + "learning_rate": 3.711028555875357e-06, + "loss": 0.78445482, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6953125, + "step": 2883, + "time_per_iteration": 2.4880666732788086 + }, + { + "auxiliary_loss_clip": 0.01097012, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.01678514, + "balance_loss_mlp": 1.02765751, + "epoch": 0.17339546069442358, + "flos": 24387809126400.0, + "grad_norm": 1.8911862531878234, + "language_loss": 0.85407919, + "learning_rate": 3.7108328967955113e-06, + "loss": 0.87540758, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6953125, + "step": 2884, + "time_per_iteration": 2.4213695526123047 + }, + { + "auxiliary_loss_clip": 0.01098005, + "auxiliary_loss_mlp": 0.0104383, + "balance_loss_clip": 1.02525759, + "balance_loss_mlp": 1.02772915, + "epoch": 0.17345558394709154, + "flos": 27452938919040.0, + "grad_norm": 2.875467068049921, + "language_loss": 0.74540174, + "learning_rate": 3.7106371766604408e-06, + "loss": 0.76682007, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.703125, + "step": 2885, + "time_per_iteration": 2.422321319580078 + }, + { + "auxiliary_loss_clip": 0.01093593, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.01702213, + "balance_loss_mlp": 1.02822804, + "epoch": 0.1735157071997595, + "flos": 24680974746240.0, + "grad_norm": 1.5230780813505223, + "language_loss": 0.70776856, + "learning_rate": 3.7104413954771294e-06, + "loss": 0.72905242, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.65234375, + "step": 2886, + "time_per_iteration": 2.4529497623443604 + }, + { + "auxiliary_loss_clip": 0.0109737, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.01472795, + "balance_loss_mlp": 1.02688885, + "epoch": 0.17357583045242747, + "flos": 21687870821760.0, + "grad_norm": 2.549200029262177, + "language_loss": 0.69418108, + "learning_rate": 3.710245553252564e-06, + "loss": 0.7155003, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.703125, + "step": 2887, + "time_per_iteration": 2.4095358848571777 + }, + { + "auxiliary_loss_clip": 0.01098123, + "auxiliary_loss_mlp": 0.01041994, + "balance_loss_clip": 1.02282476, + "balance_loss_mlp": 1.02799964, + "epoch": 0.17363595370509544, + "flos": 15374875847040.0, + "grad_norm": 1.8093871820287766, + "language_loss": 0.853176, + "learning_rate": 3.7100496499937345e-06, + "loss": 0.87457716, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.703125, + "step": 2888, + "time_per_iteration": 2.3673512935638428 + }, + { + "auxiliary_loss_clip": 0.01097958, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.01965261, + "balance_loss_mlp": 1.02668357, + "epoch": 0.1736960769577634, + "flos": 23439812446080.0, + "grad_norm": 2.7465663528092237, + "language_loss": 0.78378886, + "learning_rate": 3.7098536857076315e-06, + "loss": 0.80516785, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7109375, + "step": 2889, + "time_per_iteration": 2.4165608882904053 + }, + { + "auxiliary_loss_clip": 0.01094021, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.01685858, + "balance_loss_mlp": 1.0275017, + "epoch": 0.1737562002104314, + "flos": 18586850284800.0, + "grad_norm": 2.261143993961399, + "language_loss": 0.83725846, + "learning_rate": 3.7096576604012492e-06, + "loss": 0.85855603, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6640625, + "step": 2890, + "time_per_iteration": 2.373586893081665 + }, + { + "auxiliary_loss_clip": 0.01099012, + "auxiliary_loss_mlp": 0.01044074, + "balance_loss_clip": 1.02492881, + "balance_loss_mlp": 1.02829003, + "epoch": 0.17381632346309936, + "flos": 15997132742400.0, + "grad_norm": 2.1077282277956457, + "language_loss": 0.82070744, + "learning_rate": 3.7094615740815824e-06, + "loss": 0.84213829, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.70703125, + "step": 2891, + "time_per_iteration": 2.340186595916748 + }, + { + "auxiliary_loss_clip": 0.01098339, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.01451373, + "balance_loss_mlp": 1.02644587, + "epoch": 0.17387644671576732, + "flos": 13369010837760.0, + "grad_norm": 1.9985918358305603, + "language_loss": 0.80757391, + "learning_rate": 3.709265426755629e-06, + "loss": 0.82890713, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.71875, + "step": 2892, + "time_per_iteration": 2.343317985534668 + }, + { + "auxiliary_loss_clip": 0.01100705, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_clip": 1.01998901, + "balance_loss_mlp": 1.0291729, + "epoch": 0.1739365699684353, + "flos": 26614290216960.0, + "grad_norm": 2.924729453432933, + "language_loss": 0.74330664, + "learning_rate": 3.7090692184303894e-06, + "loss": 0.76471835, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.71484375, + "step": 2893, + "time_per_iteration": 2.4177193641662598 + }, + { + "auxiliary_loss_clip": 0.01099165, + "auxiliary_loss_mlp": 0.01040929, + "balance_loss_clip": 1.02050877, + "balance_loss_mlp": 1.02689755, + "epoch": 0.17399669322110325, + "flos": 23366843971200.0, + "grad_norm": 1.9308889919567662, + "language_loss": 0.82883406, + "learning_rate": 3.7088729491128665e-06, + "loss": 0.85023499, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.72265625, + "step": 2894, + "time_per_iteration": 2.4008944034576416 + }, + { + "auxiliary_loss_clip": 0.010986, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.01236343, + "balance_loss_mlp": 1.02747357, + "epoch": 0.17405681647377122, + "flos": 22053027398400.0, + "grad_norm": 4.187679183561046, + "language_loss": 0.74383038, + "learning_rate": 3.708676618810063e-06, + "loss": 0.76515388, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.7109375, + "step": 2895, + "time_per_iteration": 2.3879427909851074 + }, + { + "auxiliary_loss_clip": 0.01029121, + "auxiliary_loss_mlp": 0.01007236, + "balance_loss_clip": 1.00432754, + "balance_loss_mlp": 1.00671434, + "epoch": 0.1741169397264392, + "flos": 61454396044800.0, + "grad_norm": 0.8721027012547796, + "language_loss": 0.62732995, + "learning_rate": 3.7084802275289866e-06, + "loss": 0.64769351, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.22460938, + "step": 2896, + "time_per_iteration": 3.1353063583374023 + }, + { + "auxiliary_loss_clip": 0.01096297, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.01472223, + "balance_loss_mlp": 1.02518332, + "epoch": 0.17417706297910718, + "flos": 27016419790080.0, + "grad_norm": 2.159513337646916, + "language_loss": 0.75981808, + "learning_rate": 3.708283775276645e-06, + "loss": 0.78111547, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.7109375, + "step": 2897, + "time_per_iteration": 2.430640935897827 + }, + { + "auxiliary_loss_clip": 0.01096242, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.01592231, + "balance_loss_mlp": 1.02759027, + "epoch": 0.17423718623177514, + "flos": 33507506943360.0, + "grad_norm": 2.226953318823505, + "language_loss": 0.69583464, + "learning_rate": 3.70808726206005e-06, + "loss": 0.71715933, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6875, + "step": 2898, + "time_per_iteration": 3.8754465579986572 + }, + { + "auxiliary_loss_clip": 0.01102547, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.01961541, + "balance_loss_mlp": 1.02899253, + "epoch": 0.1742973094844431, + "flos": 27197409611520.0, + "grad_norm": 2.4331251828178315, + "language_loss": 0.76197898, + "learning_rate": 3.7078906878862145e-06, + "loss": 0.78340936, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.734375, + "step": 2899, + "time_per_iteration": 2.4297640323638916 + }, + { + "auxiliary_loss_clip": 0.01094865, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.01890135, + "balance_loss_mlp": 1.02577353, + "epoch": 0.17435743273711107, + "flos": 22709638938240.0, + "grad_norm": 1.829918370174299, + "language_loss": 0.7232179, + "learning_rate": 3.7076940527621536e-06, + "loss": 0.74455196, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.69140625, + "step": 2900, + "time_per_iteration": 2.4094789028167725 + }, + { + "auxiliary_loss_clip": 0.01101356, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.02312541, + "balance_loss_mlp": 1.0292275, + "epoch": 0.17441755598977904, + "flos": 41644853435520.0, + "grad_norm": 1.6640230087428245, + "language_loss": 0.69881225, + "learning_rate": 3.707497356694884e-06, + "loss": 0.72025788, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.72265625, + "step": 2901, + "time_per_iteration": 5.310842990875244 + }, + { + "auxiliary_loss_clip": 0.01100747, + "auxiliary_loss_mlp": 0.01041757, + "balance_loss_clip": 1.0216223, + "balance_loss_mlp": 1.02748847, + "epoch": 0.174477679242447, + "flos": 26285862257280.0, + "grad_norm": 2.413978229389291, + "language_loss": 0.76461095, + "learning_rate": 3.707300599691427e-06, + "loss": 0.78603601, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.734375, + "step": 2902, + "time_per_iteration": 3.814647674560547 + }, + { + "auxiliary_loss_clip": 0.01099999, + "auxiliary_loss_mlp": 0.01043622, + "balance_loss_clip": 1.02456009, + "balance_loss_mlp": 1.02765155, + "epoch": 0.174537802495115, + "flos": 17857444826880.0, + "grad_norm": 2.1695329462755764, + "language_loss": 0.81537986, + "learning_rate": 3.7071037817588023e-06, + "loss": 0.83681607, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.72265625, + "step": 2903, + "time_per_iteration": 2.345242500305176 + }, + { + "auxiliary_loss_clip": 0.01098288, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.01455855, + "balance_loss_mlp": 1.02797866, + "epoch": 0.17459792574778296, + "flos": 16939927630080.0, + "grad_norm": 3.2726253911474683, + "language_loss": 0.73323435, + "learning_rate": 3.706906902904036e-06, + "loss": 0.75456655, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.703125, + "step": 2904, + "time_per_iteration": 2.3807260990142822 + }, + { + "auxiliary_loss_clip": 0.01098457, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.0170573, + "balance_loss_mlp": 1.02728105, + "epoch": 0.17465804900045093, + "flos": 25518855398400.0, + "grad_norm": 1.9188512777035645, + "language_loss": 0.64299375, + "learning_rate": 3.7067099631341517e-06, + "loss": 0.66435039, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.7109375, + "step": 2905, + "time_per_iteration": 2.408696413040161 + }, + { + "auxiliary_loss_clip": 0.01106027, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_clip": 1.02572274, + "balance_loss_mlp": 1.03002274, + "epoch": 0.1747181722531189, + "flos": 24128683948800.0, + "grad_norm": 1.7097878330468699, + "language_loss": 0.7937634, + "learning_rate": 3.70651296245618e-06, + "loss": 0.81530094, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.7578125, + "step": 2906, + "time_per_iteration": 2.4063754081726074 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01047005, + "balance_loss_clip": 1.0270853, + "balance_loss_mlp": 1.02893543, + "epoch": 0.17477829550578686, + "flos": 17747852469120.0, + "grad_norm": 1.6778656885318153, + "language_loss": 0.80657685, + "learning_rate": 3.70631590087715e-06, + "loss": 0.82804406, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.70703125, + "step": 2907, + "time_per_iteration": 2.3640167713165283 + }, + { + "auxiliary_loss_clip": 0.01098921, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.02270007, + "balance_loss_mlp": 1.02678657, + "epoch": 0.17483841875845482, + "flos": 15376446858240.0, + "grad_norm": 2.761525710535151, + "language_loss": 0.80839372, + "learning_rate": 3.706118778404095e-06, + "loss": 0.82980806, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.72265625, + "step": 2908, + "time_per_iteration": 2.372986078262329 + }, + { + "auxiliary_loss_clip": 0.0109838, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.017609, + "balance_loss_mlp": 1.02853179, + "epoch": 0.17489854201112282, + "flos": 17162359102080.0, + "grad_norm": 2.5263439339312135, + "language_loss": 0.80055851, + "learning_rate": 3.7059215950440487e-06, + "loss": 0.8219198, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.69921875, + "step": 2909, + "time_per_iteration": 2.3411989212036133 + }, + { + "auxiliary_loss_clip": 0.01097792, + "auxiliary_loss_mlp": 0.01038519, + "balance_loss_clip": 1.01776505, + "balance_loss_mlp": 1.02661288, + "epoch": 0.17495866526379078, + "flos": 19754276060160.0, + "grad_norm": 2.032621032577187, + "language_loss": 0.76930559, + "learning_rate": 3.7057243508040494e-06, + "loss": 0.79066873, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.71484375, + "step": 2910, + "time_per_iteration": 2.3876938819885254 + }, + { + "auxiliary_loss_clip": 0.01100046, + "auxiliary_loss_mlp": 0.01041668, + "balance_loss_clip": 1.02018619, + "balance_loss_mlp": 1.02748728, + "epoch": 0.17501878851645875, + "flos": 28509899552640.0, + "grad_norm": 2.720788766012272, + "language_loss": 0.87412465, + "learning_rate": 3.7055270456911354e-06, + "loss": 0.89554185, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.7265625, + "step": 2911, + "time_per_iteration": 2.4427218437194824 + }, + { + "auxiliary_loss_clip": 0.01097482, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.02215457, + "balance_loss_mlp": 1.02574348, + "epoch": 0.1750789117691267, + "flos": 17930238744960.0, + "grad_norm": 2.365782983215061, + "language_loss": 0.89540219, + "learning_rate": 3.7053296797123485e-06, + "loss": 0.91680932, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.71875, + "step": 2912, + "time_per_iteration": 2.364178419113159 + }, + { + "auxiliary_loss_clip": 0.01098745, + "auxiliary_loss_mlp": 0.01041352, + "balance_loss_clip": 1.01925063, + "balance_loss_mlp": 1.02638698, + "epoch": 0.17513903502179468, + "flos": 18258457236480.0, + "grad_norm": 1.9260236057718623, + "language_loss": 0.7252481, + "learning_rate": 3.7051322528747313e-06, + "loss": 0.74664903, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.72265625, + "step": 2913, + "time_per_iteration": 2.3827435970306396 + }, + { + "auxiliary_loss_clip": 0.01096951, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.0148052, + "balance_loss_mlp": 1.02793205, + "epoch": 0.17519915827446264, + "flos": 20703669194880.0, + "grad_norm": 1.6688855674381564, + "language_loss": 0.68798614, + "learning_rate": 3.704934765185331e-06, + "loss": 0.70930254, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6875, + "step": 2914, + "time_per_iteration": 2.417940616607666 + }, + { + "auxiliary_loss_clip": 0.01095125, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.02606404, + "epoch": 0.1752592815271306, + "flos": 20522330259840.0, + "grad_norm": 1.706189824721809, + "language_loss": 0.93502462, + "learning_rate": 3.7047372166511945e-06, + "loss": 0.95635939, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.69140625, + "step": 2915, + "time_per_iteration": 2.3981058597564697 + }, + { + "auxiliary_loss_clip": 0.01093396, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.01422966, + "balance_loss_mlp": 1.02555728, + "epoch": 0.1753194047797986, + "flos": 21798091584000.0, + "grad_norm": 1.65925447995177, + "language_loss": 0.80993646, + "learning_rate": 3.704539607279371e-06, + "loss": 0.83121622, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6796875, + "step": 2916, + "time_per_iteration": 2.411088705062866 + }, + { + "auxiliary_loss_clip": 0.01099869, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_clip": 1.02164841, + "balance_loss_mlp": 1.02748048, + "epoch": 0.17537952803246656, + "flos": 20667289691520.0, + "grad_norm": 1.5880739721979988, + "language_loss": 0.73977023, + "learning_rate": 3.704341937076914e-06, + "loss": 0.76119775, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.72265625, + "step": 2917, + "time_per_iteration": 2.3958816528320312 + }, + { + "auxiliary_loss_clip": 0.01094786, + "auxiliary_loss_mlp": 0.01036699, + "balance_loss_clip": 1.01557553, + "balance_loss_mlp": 1.02746558, + "epoch": 0.17543965128513453, + "flos": 23293945319040.0, + "grad_norm": 1.9491242793277963, + "language_loss": 0.7629177, + "learning_rate": 3.7041442060508778e-06, + "loss": 0.78423256, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.671875, + "step": 2918, + "time_per_iteration": 2.401737689971924 + }, + { + "auxiliary_loss_clip": 0.01100381, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.01521242, + "balance_loss_mlp": 1.02643442, + "epoch": 0.1754997745378025, + "flos": 29094345578880.0, + "grad_norm": 3.161543355332977, + "language_loss": 0.7428453, + "learning_rate": 3.7039464142083183e-06, + "loss": 0.76422191, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.73828125, + "step": 2919, + "time_per_iteration": 2.4384982585906982 + }, + { + "auxiliary_loss_clip": 0.01100849, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.02537584, + "epoch": 0.17555989779047046, + "flos": 30370560750720.0, + "grad_norm": 2.1375587565939926, + "language_loss": 0.74586523, + "learning_rate": 3.7037485615562936e-06, + "loss": 0.76729798, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.75390625, + "step": 2920, + "time_per_iteration": 2.4775569438934326 + }, + { + "auxiliary_loss_clip": 0.01094259, + "auxiliary_loss_mlp": 0.01037155, + "balance_loss_clip": 1.01831973, + "balance_loss_mlp": 1.02546024, + "epoch": 0.17562002104313842, + "flos": 23286823401600.0, + "grad_norm": 2.1269303189241673, + "language_loss": 0.79498994, + "learning_rate": 3.703550648101866e-06, + "loss": 0.81630409, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6875, + "step": 2921, + "time_per_iteration": 2.411048650741577 + }, + { + "auxiliary_loss_clip": 0.01102464, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_clip": 1.02088106, + "balance_loss_mlp": 1.02722764, + "epoch": 0.1756801442958064, + "flos": 24789345206400.0, + "grad_norm": 1.720611101927665, + "language_loss": 0.87780988, + "learning_rate": 3.7033526738520983e-06, + "loss": 0.89926982, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.75, + "step": 2922, + "time_per_iteration": 2.4284417629241943 + }, + { + "auxiliary_loss_clip": 0.01097197, + "auxiliary_loss_mlp": 0.01043666, + "balance_loss_clip": 1.02282798, + "balance_loss_mlp": 1.02513099, + "epoch": 0.17574026754847438, + "flos": 25770579367680.0, + "grad_norm": 2.2000436260039042, + "language_loss": 0.62409222, + "learning_rate": 3.7031546388140545e-06, + "loss": 0.64550078, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.71875, + "step": 2923, + "time_per_iteration": 2.4263389110565186 + }, + { + "auxiliary_loss_clip": 0.01102621, + "auxiliary_loss_mlp": 0.01041787, + "balance_loss_clip": 1.01973307, + "balance_loss_mlp": 1.02732992, + "epoch": 0.17580039080114235, + "flos": 17455664367360.0, + "grad_norm": 2.059515887283728, + "language_loss": 0.80213439, + "learning_rate": 3.702956542994802e-06, + "loss": 0.82357854, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.75390625, + "step": 2924, + "time_per_iteration": 2.3719711303710938 + }, + { + "auxiliary_loss_clip": 0.0110046, + "auxiliary_loss_mlp": 0.01041157, + "balance_loss_clip": 1.01857901, + "balance_loss_mlp": 1.02674377, + "epoch": 0.1758605140538103, + "flos": 14863817232000.0, + "grad_norm": 3.4252338567491845, + "language_loss": 0.7123369, + "learning_rate": 3.7027583864014123e-06, + "loss": 0.73375309, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.734375, + "step": 2925, + "time_per_iteration": 2.3547937870025635 + }, + { + "auxiliary_loss_clip": 0.010994, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.01755607, + "balance_loss_mlp": 1.02856195, + "epoch": 0.17592063730647828, + "flos": 23003118760320.0, + "grad_norm": 1.7061490438206, + "language_loss": 0.71652341, + "learning_rate": 3.7025601690409555e-06, + "loss": 0.73789358, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.70703125, + "step": 2926, + "time_per_iteration": 2.4420547485351562 + }, + { + "auxiliary_loss_clip": 0.01100969, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.01409912, + "balance_loss_mlp": 1.02715552, + "epoch": 0.17598076055914624, + "flos": 20740432723200.0, + "grad_norm": 1.8321860126161547, + "language_loss": 0.8483274, + "learning_rate": 3.702361890920505e-06, + "loss": 0.86970007, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.73828125, + "step": 2927, + "time_per_iteration": 2.3879168033599854 + }, + { + "auxiliary_loss_clip": 0.0109817, + "auxiliary_loss_mlp": 0.01040642, + "balance_loss_clip": 1.02124703, + "balance_loss_mlp": 1.02745426, + "epoch": 0.1760408838118142, + "flos": 34091080185600.0, + "grad_norm": 1.9231681298547754, + "language_loss": 0.7214148, + "learning_rate": 3.702163552047138e-06, + "loss": 0.74280298, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.70703125, + "step": 2928, + "time_per_iteration": 2.513338804244995 + }, + { + "auxiliary_loss_clip": 0.01095716, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.01797342, + "balance_loss_mlp": 1.02707195, + "epoch": 0.1761010070644822, + "flos": 24167297779200.0, + "grad_norm": 1.8471882104266197, + "language_loss": 0.83402288, + "learning_rate": 3.7019651524279326e-06, + "loss": 0.85536516, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.6875, + "step": 2929, + "time_per_iteration": 2.391378164291382 + }, + { + "auxiliary_loss_clip": 0.01098179, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.02446914, + "balance_loss_mlp": 1.02659583, + "epoch": 0.17616113031715017, + "flos": 26575536741120.0, + "grad_norm": 1.5544288993568705, + "language_loss": 0.79389054, + "learning_rate": 3.7017666920699693e-06, + "loss": 0.81531119, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.71875, + "step": 2930, + "time_per_iteration": 2.433692216873169 + }, + { + "auxiliary_loss_clip": 0.01100173, + "auxiliary_loss_mlp": 0.01036324, + "balance_loss_clip": 1.01531935, + "balance_loss_mlp": 1.02801824, + "epoch": 0.17622125356981813, + "flos": 25665490575360.0, + "grad_norm": 2.2015391258894117, + "language_loss": 0.77019572, + "learning_rate": 3.701568170980329e-06, + "loss": 0.79156071, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.72265625, + "step": 2931, + "time_per_iteration": 2.4104647636413574 + }, + { + "auxiliary_loss_clip": 0.01098125, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.0169245, + "balance_loss_mlp": 1.02678323, + "epoch": 0.1762813768224861, + "flos": 16507597864320.0, + "grad_norm": 2.7257495806613976, + "language_loss": 0.74576712, + "learning_rate": 3.7013695891660985e-06, + "loss": 0.76711518, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.71484375, + "step": 2932, + "time_per_iteration": 2.381547212600708 + }, + { + "auxiliary_loss_clip": 0.01104459, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_clip": 1.02371168, + "balance_loss_mlp": 1.02852345, + "epoch": 0.17634150007515406, + "flos": 11211239036160.0, + "grad_norm": 2.7797515402489887, + "language_loss": 0.89398766, + "learning_rate": 3.701170946634364e-06, + "loss": 0.91549587, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.7578125, + "step": 2933, + "time_per_iteration": 2.340221643447876 + }, + { + "auxiliary_loss_clip": 0.01096472, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02317154, + "balance_loss_mlp": 1.02740037, + "epoch": 0.17640162332782203, + "flos": 23658787693440.0, + "grad_norm": 1.7039241733091834, + "language_loss": 0.88141811, + "learning_rate": 3.700972243392214e-06, + "loss": 0.90280473, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.69140625, + "step": 2934, + "time_per_iteration": 2.44293212890625 + }, + { + "auxiliary_loss_clip": 0.01092239, + "auxiliary_loss_mlp": 0.01039757, + "balance_loss_clip": 1.02050495, + "balance_loss_mlp": 1.0247401, + "epoch": 0.17646174658049, + "flos": 53795012198400.0, + "grad_norm": 1.544438353879266, + "language_loss": 0.70650262, + "learning_rate": 3.70077347944674e-06, + "loss": 0.72782254, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.67578125, + "step": 2935, + "time_per_iteration": 2.655700206756592 + }, + { + "auxiliary_loss_clip": 0.01101648, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.01676512, + "balance_loss_mlp": 1.02728581, + "epoch": 0.17652186983315798, + "flos": 24242710049280.0, + "grad_norm": 2.695729821185055, + "language_loss": 0.70003366, + "learning_rate": 3.7005746548050353e-06, + "loss": 0.72142857, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.7421875, + "step": 2936, + "time_per_iteration": 2.4301464557647705 + }, + { + "auxiliary_loss_clip": 0.01101024, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.01888156, + "balance_loss_mlp": 1.03025663, + "epoch": 0.17658199308582595, + "flos": 27453043653120.0, + "grad_norm": 1.802780086834324, + "language_loss": 0.71520585, + "learning_rate": 3.7003757694741956e-06, + "loss": 0.7365973, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.70703125, + "step": 2937, + "time_per_iteration": 3.927194118499756 + }, + { + "auxiliary_loss_clip": 0.01102053, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.02105069, + "balance_loss_mlp": 1.0282346, + "epoch": 0.17664211633849392, + "flos": 22417590481920.0, + "grad_norm": 4.800873366126143, + "language_loss": 0.75749171, + "learning_rate": 3.7001768234613188e-06, + "loss": 0.77893519, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.73828125, + "step": 2938, + "time_per_iteration": 2.4505906105041504 + }, + { + "auxiliary_loss_clip": 0.01098209, + "auxiliary_loss_mlp": 0.0103539, + "balance_loss_clip": 1.01573217, + "balance_loss_mlp": 1.02652121, + "epoch": 0.17670223959116188, + "flos": 24714037670400.0, + "grad_norm": 2.297893445036306, + "language_loss": 0.71310973, + "learning_rate": 3.6999778167735043e-06, + "loss": 0.73444581, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.71875, + "step": 2939, + "time_per_iteration": 2.422348737716675 + }, + { + "auxiliary_loss_clip": 0.01099561, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.01658833, + "balance_loss_mlp": 1.02866793, + "epoch": 0.17676236284382985, + "flos": 22525995853440.0, + "grad_norm": 2.369397878630063, + "language_loss": 0.73411208, + "learning_rate": 3.699778749417855e-06, + "loss": 0.75547242, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.70703125, + "step": 2940, + "time_per_iteration": 3.8665661811828613 + }, + { + "auxiliary_loss_clip": 0.01098768, + "auxiliary_loss_mlp": 0.01037577, + "balance_loss_clip": 1.01682305, + "balance_loss_mlp": 1.0269599, + "epoch": 0.1768224860964978, + "flos": 12384355363200.0, + "grad_norm": 2.239458217562189, + "language_loss": 0.85782027, + "learning_rate": 3.699579621401474e-06, + "loss": 0.87918377, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.71875, + "step": 2941, + "time_per_iteration": 3.711132287979126 + }, + { + "auxiliary_loss_clip": 0.01097197, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.01241255, + "balance_loss_mlp": 1.02669907, + "epoch": 0.1768826093491658, + "flos": 24352197672960.0, + "grad_norm": 2.178196462890618, + "language_loss": 0.76568735, + "learning_rate": 3.699380432731468e-06, + "loss": 0.78698421, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.703125, + "step": 2942, + "time_per_iteration": 3.78407621383667 + }, + { + "auxiliary_loss_clip": 0.01098873, + "auxiliary_loss_mlp": 0.01038952, + "balance_loss_clip": 1.01731586, + "balance_loss_mlp": 1.02701664, + "epoch": 0.17694273260183377, + "flos": 23585923952640.0, + "grad_norm": 3.5547954155900796, + "language_loss": 0.79881883, + "learning_rate": 3.699181183414946e-06, + "loss": 0.82019711, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.71875, + "step": 2943, + "time_per_iteration": 2.390895366668701 + }, + { + "auxiliary_loss_clip": 0.01097381, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.01857674, + "balance_loss_mlp": 1.02504814, + "epoch": 0.17700285585450173, + "flos": 26759773319040.0, + "grad_norm": 2.4635067207059373, + "language_loss": 0.80503607, + "learning_rate": 3.698981873459018e-06, + "loss": 0.82641065, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.72265625, + "step": 2944, + "time_per_iteration": 2.428388833999634 + }, + { + "auxiliary_loss_clip": 0.01096845, + "auxiliary_loss_mlp": 0.01047621, + "balance_loss_clip": 1.02808249, + "balance_loss_mlp": 1.02602494, + "epoch": 0.1770629791071697, + "flos": 42774712721280.0, + "grad_norm": 2.0907109865020654, + "language_loss": 0.73149455, + "learning_rate": 3.6987825028707976e-06, + "loss": 0.75293922, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.70703125, + "step": 2945, + "time_per_iteration": 2.551279067993164 + }, + { + "auxiliary_loss_clip": 0.0109669, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.01960874, + "balance_loss_mlp": 1.02730441, + "epoch": 0.17712310235983766, + "flos": 17344675555200.0, + "grad_norm": 2.518853551238311, + "language_loss": 0.78102767, + "learning_rate": 3.698583071657399e-06, + "loss": 0.80239409, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6953125, + "step": 2946, + "time_per_iteration": 2.349885940551758 + }, + { + "auxiliary_loss_clip": 0.01096569, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.01380908, + "balance_loss_mlp": 1.0272876, + "epoch": 0.17718322561250563, + "flos": 23877344004480.0, + "grad_norm": 2.926468314024668, + "language_loss": 0.76134998, + "learning_rate": 3.6983835798259404e-06, + "loss": 0.78264475, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.69140625, + "step": 2947, + "time_per_iteration": 2.394010543823242 + }, + { + "auxiliary_loss_clip": 0.01094627, + "auxiliary_loss_mlp": 0.01039666, + "balance_loss_clip": 1.02019954, + "balance_loss_mlp": 1.02523696, + "epoch": 0.1772433488651736, + "flos": 36464859768960.0, + "grad_norm": 3.613830340342464, + "language_loss": 0.72849512, + "learning_rate": 3.6981840273835405e-06, + "loss": 0.74983805, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6953125, + "step": 2948, + "time_per_iteration": 2.5140676498413086 + }, + { + "auxiliary_loss_clip": 0.01096881, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.01445663, + "balance_loss_mlp": 1.02745891, + "epoch": 0.1773034721178416, + "flos": 26683592999040.0, + "grad_norm": 1.952034909545662, + "language_loss": 0.81700194, + "learning_rate": 3.6979844143373207e-06, + "loss": 0.83831561, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.6953125, + "step": 2949, + "time_per_iteration": 2.415473461151123 + }, + { + "auxiliary_loss_clip": 0.01034546, + "auxiliary_loss_mlp": 0.01004541, + "balance_loss_clip": 1.0013696, + "balance_loss_mlp": 1.01126313, + "epoch": 0.17736359537050955, + "flos": 57114377712000.0, + "grad_norm": 0.8157807708418564, + "language_loss": 0.64964092, + "learning_rate": 3.6977847406944053e-06, + "loss": 0.67003179, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.03173828, + "router_z_loss_mlp": 0.23242188, + "step": 2950, + "time_per_iteration": 3.150360345840454 + }, + { + "auxiliary_loss_clip": 0.01094283, + "auxiliary_loss_mlp": 0.01034287, + "balance_loss_clip": 1.01507056, + "balance_loss_mlp": 1.0265522, + "epoch": 0.17742371862317752, + "flos": 27196990675200.0, + "grad_norm": 1.956804133860281, + "language_loss": 0.83536267, + "learning_rate": 3.6975850064619193e-06, + "loss": 0.85664833, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.67578125, + "step": 2951, + "time_per_iteration": 2.4206326007843018 + }, + { + "auxiliary_loss_clip": 0.01097285, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.02587032, + "epoch": 0.17748384187584548, + "flos": 20958639920640.0, + "grad_norm": 3.7777157195705753, + "language_loss": 0.80479968, + "learning_rate": 3.697385211646991e-06, + "loss": 0.82615566, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.7109375, + "step": 2952, + "time_per_iteration": 2.4163801670074463 + }, + { + "auxiliary_loss_clip": 0.01095084, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.01189661, + "balance_loss_mlp": 1.02589631, + "epoch": 0.17754396512851345, + "flos": 25008809212800.0, + "grad_norm": 8.429013698081665, + "language_loss": 0.79238909, + "learning_rate": 3.697185356256751e-06, + "loss": 0.8136605, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.69140625, + "step": 2953, + "time_per_iteration": 2.409651517868042 + }, + { + "auxiliary_loss_clip": 0.01098027, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.01796246, + "balance_loss_mlp": 1.02777719, + "epoch": 0.1776040883811814, + "flos": 32050197216000.0, + "grad_norm": 1.8421486948225045, + "language_loss": 0.88229394, + "learning_rate": 3.6969854402983314e-06, + "loss": 0.90363944, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.703125, + "step": 2954, + "time_per_iteration": 2.483719825744629 + }, + { + "auxiliary_loss_clip": 0.01097547, + "auxiliary_loss_mlp": 0.01041523, + "balance_loss_clip": 1.0208286, + "balance_loss_mlp": 1.02728677, + "epoch": 0.17766421163384938, + "flos": 21573216316800.0, + "grad_norm": 2.006047180188921, + "language_loss": 0.84834766, + "learning_rate": 3.6967854637788665e-06, + "loss": 0.86973828, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.703125, + "step": 2955, + "time_per_iteration": 2.379448652267456 + }, + { + "auxiliary_loss_clip": 0.01093515, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.01706147, + "balance_loss_mlp": 1.02667737, + "epoch": 0.17772433488651737, + "flos": 22418218886400.0, + "grad_norm": 2.3926075165858425, + "language_loss": 0.70818555, + "learning_rate": 3.696585426705493e-06, + "loss": 0.72947741, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.66796875, + "step": 2956, + "time_per_iteration": 2.41111421585083 + }, + { + "auxiliary_loss_clip": 0.0109402, + "auxiliary_loss_mlp": 0.01038841, + "balance_loss_clip": 1.01956499, + "balance_loss_mlp": 1.02558947, + "epoch": 0.17778445813918534, + "flos": 25628273199360.0, + "grad_norm": 1.952670960754375, + "language_loss": 0.82171714, + "learning_rate": 3.6963853290853503e-06, + "loss": 0.84304583, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6875, + "step": 2957, + "time_per_iteration": 2.4054601192474365 + }, + { + "auxiliary_loss_clip": 0.01093597, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.01699567, + "balance_loss_mlp": 1.02670491, + "epoch": 0.1778445813918533, + "flos": 25627714617600.0, + "grad_norm": 1.7895656136965656, + "language_loss": 0.75002372, + "learning_rate": 3.6961851709255784e-06, + "loss": 0.77130711, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.66796875, + "step": 2958, + "time_per_iteration": 2.432297706604004 + }, + { + "auxiliary_loss_clip": 0.010995, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.01464653, + "balance_loss_mlp": 1.02971029, + "epoch": 0.17790470464452127, + "flos": 22344447450240.0, + "grad_norm": 2.2348646910534926, + "language_loss": 0.80148596, + "learning_rate": 3.6959849522333206e-06, + "loss": 0.82281363, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.69921875, + "step": 2959, + "time_per_iteration": 2.386784076690674 + }, + { + "auxiliary_loss_clip": 0.01096945, + "auxiliary_loss_mlp": 0.01039385, + "balance_loss_clip": 1.01919079, + "balance_loss_mlp": 1.02686608, + "epoch": 0.17796482789718923, + "flos": 18765012286080.0, + "grad_norm": 1.75423547245717, + "language_loss": 0.51365209, + "learning_rate": 3.6957846730157222e-06, + "loss": 0.53501546, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.703125, + "step": 2960, + "time_per_iteration": 2.3954639434814453 + }, + { + "auxiliary_loss_clip": 0.01100382, + "auxiliary_loss_mlp": 0.01044366, + "balance_loss_clip": 1.02450538, + "balance_loss_mlp": 1.02857542, + "epoch": 0.1780249511498572, + "flos": 23439812446080.0, + "grad_norm": 1.9753834197272402, + "language_loss": 0.88879579, + "learning_rate": 3.6955843332799317e-06, + "loss": 0.91024327, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.71875, + "step": 2961, + "time_per_iteration": 2.3947105407714844 + }, + { + "auxiliary_loss_clip": 0.01097315, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_clip": 1.02297306, + "balance_loss_mlp": 1.02615452, + "epoch": 0.1780850744025252, + "flos": 23366355212160.0, + "grad_norm": 1.6960386384330346, + "language_loss": 0.79236126, + "learning_rate": 3.6953839330330972e-06, + "loss": 0.81377268, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7109375, + "step": 2962, + "time_per_iteration": 2.4387855529785156 + }, + { + "auxiliary_loss_clip": 0.01099002, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.02032113, + "balance_loss_mlp": 1.02963376, + "epoch": 0.17814519765519315, + "flos": 13771140410880.0, + "grad_norm": 2.200560901801987, + "language_loss": 0.74530143, + "learning_rate": 3.6951834722823715e-06, + "loss": 0.76669645, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.6953125, + "step": 2963, + "time_per_iteration": 2.357468366622925 + }, + { + "auxiliary_loss_clip": 0.01098758, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.01862502, + "balance_loss_mlp": 1.0284189, + "epoch": 0.17820532090786112, + "flos": 21975450624000.0, + "grad_norm": 1.658869639699114, + "language_loss": 0.78876424, + "learning_rate": 3.6949829510349082e-06, + "loss": 0.81013888, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.703125, + "step": 2964, + "time_per_iteration": 2.448072671890259 + }, + { + "auxiliary_loss_clip": 0.01097214, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.02223301, + "balance_loss_mlp": 1.02902925, + "epoch": 0.17826544416052909, + "flos": 24789589585920.0, + "grad_norm": 2.460196336053275, + "language_loss": 0.80767095, + "learning_rate": 3.6947823692978634e-06, + "loss": 0.82904661, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6796875, + "step": 2965, + "time_per_iteration": 2.414928436279297 + }, + { + "auxiliary_loss_clip": 0.01095953, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.02256083, + "balance_loss_mlp": 1.02708244, + "epoch": 0.17832556741319705, + "flos": 13878777732480.0, + "grad_norm": 2.4380853617515474, + "language_loss": 0.94539225, + "learning_rate": 3.6945817270783955e-06, + "loss": 0.9667604, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6875, + "step": 2966, + "time_per_iteration": 2.3657639026641846 + }, + { + "auxiliary_loss_clip": 0.01095675, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.01912105, + "balance_loss_mlp": 1.02737188, + "epoch": 0.17838569066586502, + "flos": 36640403418240.0, + "grad_norm": 2.6088065706095853, + "language_loss": 0.72639889, + "learning_rate": 3.6943810243836648e-06, + "loss": 0.74773961, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.68359375, + "step": 2967, + "time_per_iteration": 2.495950222015381 + }, + { + "auxiliary_loss_clip": 0.01092959, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.01980734, + "balance_loss_mlp": 1.02761495, + "epoch": 0.17844581391853298, + "flos": 18726468278400.0, + "grad_norm": 2.019554405112788, + "language_loss": 0.77192456, + "learning_rate": 3.6941802612208334e-06, + "loss": 0.7932409, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.65625, + "step": 2968, + "time_per_iteration": 2.385192632675171 + }, + { + "auxiliary_loss_clip": 0.01096339, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.02190185, + "balance_loss_mlp": 1.02813447, + "epoch": 0.17850593717120097, + "flos": 27377107712640.0, + "grad_norm": 2.374990649800464, + "language_loss": 0.75913197, + "learning_rate": 3.6939794375970667e-06, + "loss": 0.78049338, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.68359375, + "step": 2969, + "time_per_iteration": 2.4243085384368896 + }, + { + "auxiliary_loss_clip": 0.01040425, + "auxiliary_loss_mlp": 0.01015848, + "balance_loss_clip": 1.01160467, + "balance_loss_mlp": 1.01369655, + "epoch": 0.17856606042386894, + "flos": 66992913129600.0, + "grad_norm": 0.839377654031634, + "language_loss": 0.69052625, + "learning_rate": 3.693778553519531e-06, + "loss": 0.71108902, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.04248047, + "router_z_loss_mlp": 0.26757812, + "step": 2970, + "time_per_iteration": 3.1388394832611084 + }, + { + "auxiliary_loss_clip": 0.01099696, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.01663566, + "balance_loss_mlp": 1.02782619, + "epoch": 0.1786261836765369, + "flos": 36975499447680.0, + "grad_norm": 1.832802303528834, + "language_loss": 0.67340553, + "learning_rate": 3.6935776089953956e-06, + "loss": 0.69476104, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.71875, + "step": 2971, + "time_per_iteration": 2.5004563331604004 + }, + { + "auxiliary_loss_clip": 0.01095868, + "auxiliary_loss_mlp": 0.01039161, + "balance_loss_clip": 1.01888382, + "balance_loss_mlp": 1.02606821, + "epoch": 0.17868630692920487, + "flos": 24824328255360.0, + "grad_norm": 1.695710173332243, + "language_loss": 0.89951611, + "learning_rate": 3.6933766040318323e-06, + "loss": 0.92086643, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6953125, + "step": 2972, + "time_per_iteration": 2.4530422687530518 + }, + { + "auxiliary_loss_clip": 0.01098219, + "auxiliary_loss_mlp": 0.010454, + "balance_loss_clip": 1.02532518, + "balance_loss_mlp": 1.02798152, + "epoch": 0.17874643018187283, + "flos": 16981055078400.0, + "grad_norm": 2.9073345714967735, + "language_loss": 0.87565172, + "learning_rate": 3.693175538636014e-06, + "loss": 0.89708793, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.703125, + "step": 2973, + "time_per_iteration": 2.346275806427002 + }, + { + "auxiliary_loss_clip": 0.01095117, + "auxiliary_loss_mlp": 0.01043492, + "balance_loss_clip": 1.02205861, + "balance_loss_mlp": 1.02672338, + "epoch": 0.1788065534345408, + "flos": 21031189459200.0, + "grad_norm": 2.6152529641254287, + "language_loss": 0.76249814, + "learning_rate": 3.692974412815116e-06, + "loss": 0.78388429, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.68359375, + "step": 2974, + "time_per_iteration": 2.4068634510040283 + }, + { + "auxiliary_loss_clip": 0.01097236, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.01747215, + "balance_loss_mlp": 1.02757478, + "epoch": 0.17886667668720876, + "flos": 23986587248640.0, + "grad_norm": 2.789210916353887, + "language_loss": 0.74411094, + "learning_rate": 3.692773226576315e-06, + "loss": 0.76546419, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.6953125, + "step": 2975, + "time_per_iteration": 2.387455940246582 + }, + { + "auxiliary_loss_clip": 0.01095074, + "auxiliary_loss_mlp": 0.01033124, + "balance_loss_clip": 1.01447964, + "balance_loss_mlp": 1.02747369, + "epoch": 0.17892679993987676, + "flos": 25738284493440.0, + "grad_norm": 1.5661051401773733, + "language_loss": 0.72881365, + "learning_rate": 3.692571979926793e-06, + "loss": 0.75009561, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.67578125, + "step": 2976, + "time_per_iteration": 2.446561098098755 + }, + { + "auxiliary_loss_clip": 0.01091909, + "auxiliary_loss_mlp": 0.01031705, + "balance_loss_clip": 1.0149678, + "balance_loss_mlp": 1.02750278, + "epoch": 0.17898692319254472, + "flos": 25698588410880.0, + "grad_norm": 1.5147232149525842, + "language_loss": 0.77300251, + "learning_rate": 3.69237067287373e-06, + "loss": 0.79423863, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.64453125, + "step": 2977, + "time_per_iteration": 3.884591579437256 + }, + { + "auxiliary_loss_clip": 0.01098059, + "auxiliary_loss_mlp": 0.01046172, + "balance_loss_clip": 1.0268724, + "balance_loss_mlp": 1.03055906, + "epoch": 0.1790470464452127, + "flos": 19316779413120.0, + "grad_norm": 2.1145517949177695, + "language_loss": 0.79672265, + "learning_rate": 3.6921693054243118e-06, + "loss": 0.81816506, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.67578125, + "step": 2978, + "time_per_iteration": 2.462244987487793 + }, + { + "auxiliary_loss_clip": 0.01098729, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.01608396, + "balance_loss_mlp": 1.02767551, + "epoch": 0.17910716969788065, + "flos": 30042970663680.0, + "grad_norm": 1.655474119658755, + "language_loss": 0.76386064, + "learning_rate": 3.6919678775857235e-06, + "loss": 0.78520751, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.7109375, + "step": 2979, + "time_per_iteration": 2.4736757278442383 + }, + { + "auxiliary_loss_clip": 0.01095602, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.01668274, + "balance_loss_mlp": 1.02867532, + "epoch": 0.17916729295054862, + "flos": 19426685973120.0, + "grad_norm": 1.987467858622668, + "language_loss": 0.68844259, + "learning_rate": 3.691766389365154e-06, + "loss": 0.70974636, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.671875, + "step": 2980, + "time_per_iteration": 5.1556713581085205 + }, + { + "auxiliary_loss_clip": 0.01100295, + "auxiliary_loss_mlp": 0.01039066, + "balance_loss_clip": 1.01813245, + "balance_loss_mlp": 1.02987719, + "epoch": 0.17922741620321658, + "flos": 14610661896960.0, + "grad_norm": 1.717844595830466, + "language_loss": 0.70527929, + "learning_rate": 3.6915648407697936e-06, + "loss": 0.72667289, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.703125, + "step": 2981, + "time_per_iteration": 2.354583501815796 + }, + { + "auxiliary_loss_clip": 0.01099471, + "auxiliary_loss_mlp": 0.01046799, + "balance_loss_clip": 1.02537692, + "balance_loss_mlp": 1.02893233, + "epoch": 0.17928753945588458, + "flos": 17164349049600.0, + "grad_norm": 2.3933114345657147, + "language_loss": 0.81727308, + "learning_rate": 3.691363231806836e-06, + "loss": 0.83873576, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.703125, + "step": 2982, + "time_per_iteration": 3.780759811401367 + }, + { + "auxiliary_loss_clip": 0.01095086, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.01578546, + "balance_loss_mlp": 1.02700841, + "epoch": 0.17934766270855254, + "flos": 31394248992000.0, + "grad_norm": 1.502508796775173, + "language_loss": 0.8268553, + "learning_rate": 3.691161562483474e-06, + "loss": 0.84815174, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6796875, + "step": 2983, + "time_per_iteration": 2.4457900524139404 + }, + { + "auxiliary_loss_clip": 0.01097371, + "auxiliary_loss_mlp": 0.01040516, + "balance_loss_clip": 1.01946414, + "balance_loss_mlp": 1.02622414, + "epoch": 0.1794077859612205, + "flos": 20813121907200.0, + "grad_norm": 1.9901995182426722, + "language_loss": 0.8515988, + "learning_rate": 3.690959832806907e-06, + "loss": 0.87297773, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.7109375, + "step": 2984, + "time_per_iteration": 2.4008259773254395 + }, + { + "auxiliary_loss_clip": 0.01096673, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.01822722, + "balance_loss_mlp": 1.02621925, + "epoch": 0.17946790921388847, + "flos": 28985172157440.0, + "grad_norm": 1.3482576795547023, + "language_loss": 0.89483905, + "learning_rate": 3.690758042784333e-06, + "loss": 0.9162004, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.70703125, + "step": 2985, + "time_per_iteration": 2.4310142993927 + }, + { + "auxiliary_loss_clip": 0.0109346, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.01982343, + "balance_loss_mlp": 1.02734017, + "epoch": 0.17952803246655644, + "flos": 20736452828160.0, + "grad_norm": 1.9732589629646102, + "language_loss": 0.69493186, + "learning_rate": 3.690556192422954e-06, + "loss": 0.7162416, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.66015625, + "step": 2986, + "time_per_iteration": 2.4057769775390625 + }, + { + "auxiliary_loss_clip": 0.01093622, + "auxiliary_loss_mlp": 0.0103883, + "balance_loss_clip": 1.01949382, + "balance_loss_mlp": 1.02628994, + "epoch": 0.1795881557192244, + "flos": 28254754270080.0, + "grad_norm": 2.1704732837980933, + "language_loss": 0.77198172, + "learning_rate": 3.6903542817299725e-06, + "loss": 0.79330623, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.671875, + "step": 2987, + "time_per_iteration": 2.412198066711426 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01040336, + "balance_loss_clip": 1.01951039, + "balance_loss_mlp": 1.0278964, + "epoch": 0.17964827897189237, + "flos": 18551029363200.0, + "grad_norm": 1.8687050131293323, + "language_loss": 0.77970552, + "learning_rate": 3.690152310712595e-06, + "loss": 0.80109864, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7109375, + "step": 2988, + "time_per_iteration": 2.426103353500366 + }, + { + "auxiliary_loss_clip": 0.01027175, + "auxiliary_loss_mlp": 0.01007762, + "balance_loss_clip": 1.00511575, + "balance_loss_mlp": 1.006791, + "epoch": 0.17970840222456036, + "flos": 58162261392000.0, + "grad_norm": 0.7673995150315861, + "language_loss": 0.62703419, + "learning_rate": 3.6899502793780295e-06, + "loss": 0.64738357, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.203125, + "step": 2989, + "time_per_iteration": 2.9614171981811523 + }, + { + "auxiliary_loss_clip": 0.01096796, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.01431358, + "balance_loss_mlp": 1.02748251, + "epoch": 0.17976852547722832, + "flos": 20299828965120.0, + "grad_norm": 2.5947417124638914, + "language_loss": 0.70792025, + "learning_rate": 3.689748187733485e-06, + "loss": 0.72921705, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.69140625, + "step": 2990, + "time_per_iteration": 2.399005889892578 + }, + { + "auxiliary_loss_clip": 0.01096707, + "auxiliary_loss_mlp": 0.01041279, + "balance_loss_clip": 1.02242041, + "balance_loss_mlp": 1.02798045, + "epoch": 0.1798286487298963, + "flos": 39668001632640.0, + "grad_norm": 1.794322181293968, + "language_loss": 0.68833303, + "learning_rate": 3.6895460357861743e-06, + "loss": 0.70971286, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6875, + "step": 2991, + "time_per_iteration": 2.546729803085327 + }, + { + "auxiliary_loss_clip": 0.01095533, + "auxiliary_loss_mlp": 0.01036174, + "balance_loss_clip": 1.01694548, + "balance_loss_mlp": 1.02731848, + "epoch": 0.17988877198256426, + "flos": 25519134689280.0, + "grad_norm": 1.9749019486282824, + "language_loss": 0.83044302, + "learning_rate": 3.6893438235433117e-06, + "loss": 0.85176003, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6796875, + "step": 2992, + "time_per_iteration": 2.4183197021484375 + }, + { + "auxiliary_loss_clip": 0.01093847, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.01723754, + "balance_loss_mlp": 1.02816367, + "epoch": 0.17994889523523222, + "flos": 18806488848000.0, + "grad_norm": 2.2885754724046654, + "language_loss": 0.81842172, + "learning_rate": 3.689141551012114e-06, + "loss": 0.83970839, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.65625, + "step": 2993, + "time_per_iteration": 2.3537867069244385 + }, + { + "auxiliary_loss_clip": 0.01094229, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.01268983, + "balance_loss_mlp": 1.02611601, + "epoch": 0.18000901848790019, + "flos": 21103424795520.0, + "grad_norm": 1.8544353092325712, + "language_loss": 0.78179508, + "learning_rate": 3.688939218199799e-06, + "loss": 0.80305707, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.68359375, + "step": 2994, + "time_per_iteration": 2.401881456375122 + }, + { + "auxiliary_loss_clip": 0.01097756, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.01721501, + "balance_loss_mlp": 1.02783489, + "epoch": 0.18006914174056818, + "flos": 19645416840960.0, + "grad_norm": 2.2977123948197695, + "language_loss": 0.80790877, + "learning_rate": 3.6887368251135875e-06, + "loss": 0.82924068, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.69921875, + "step": 2995, + "time_per_iteration": 2.355602741241455 + }, + { + "auxiliary_loss_clip": 0.01096986, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.01842737, + "balance_loss_mlp": 1.02779651, + "epoch": 0.18012926499323614, + "flos": 19498886398080.0, + "grad_norm": 2.004213944979025, + "language_loss": 0.84364128, + "learning_rate": 3.688534371760703e-06, + "loss": 0.86497831, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.69140625, + "step": 2996, + "time_per_iteration": 2.3671207427978516 + }, + { + "auxiliary_loss_clip": 0.01091169, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.01834655, + "balance_loss_mlp": 1.02571321, + "epoch": 0.1801893882459041, + "flos": 19463519324160.0, + "grad_norm": 1.8246987670289778, + "language_loss": 0.88096237, + "learning_rate": 3.68833185814837e-06, + "loss": 0.90223432, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.65625, + "step": 2997, + "time_per_iteration": 2.428534507751465 + }, + { + "auxiliary_loss_clip": 0.01098237, + "auxiliary_loss_mlp": 0.01043539, + "balance_loss_clip": 1.02237988, + "balance_loss_mlp": 1.0258584, + "epoch": 0.18024951149857207, + "flos": 26869365676800.0, + "grad_norm": 1.749073381435927, + "language_loss": 0.85452026, + "learning_rate": 3.688129284283816e-06, + "loss": 0.875938, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.72265625, + "step": 2998, + "time_per_iteration": 2.436452627182007 + }, + { + "auxiliary_loss_clip": 0.01097172, + "auxiliary_loss_mlp": 0.01042816, + "balance_loss_clip": 1.02375484, + "balance_loss_mlp": 1.02929878, + "epoch": 0.18030963475124004, + "flos": 30225322028160.0, + "grad_norm": 1.898293642111629, + "language_loss": 0.84303552, + "learning_rate": 3.6879266501742705e-06, + "loss": 0.86443543, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6796875, + "step": 2999, + "time_per_iteration": 2.440263032913208 + }, + { + "auxiliary_loss_clip": 0.01094612, + "auxiliary_loss_mlp": 0.01039983, + "balance_loss_clip": 1.0202781, + "balance_loss_mlp": 1.0264858, + "epoch": 0.180369758003908, + "flos": 22306462024320.0, + "grad_norm": 1.7817234409355363, + "language_loss": 0.74977803, + "learning_rate": 3.6877239558269642e-06, + "loss": 0.77112401, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6796875, + "step": 3000, + "time_per_iteration": 2.400106191635132 + }, + { + "auxiliary_loss_clip": 0.01094685, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02609098, + "balance_loss_mlp": 1.02801108, + "epoch": 0.18042988125657597, + "flos": 23730918295680.0, + "grad_norm": 1.6914341462004998, + "language_loss": 0.7138685, + "learning_rate": 3.687521201249132e-06, + "loss": 0.73526716, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.66796875, + "step": 3001, + "time_per_iteration": 2.4026381969451904 + }, + { + "auxiliary_loss_clip": 0.01097361, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.01790857, + "balance_loss_mlp": 1.02772808, + "epoch": 0.18049000450924396, + "flos": 24092548824960.0, + "grad_norm": 1.9922244182544901, + "language_loss": 0.88416296, + "learning_rate": 3.687318386448008e-06, + "loss": 0.90551209, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6953125, + "step": 3002, + "time_per_iteration": 2.4055991172790527 + }, + { + "auxiliary_loss_clip": 0.01094645, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.0210861, + "balance_loss_mlp": 1.02737689, + "epoch": 0.18055012776191193, + "flos": 22162096085760.0, + "grad_norm": 1.8975920308571603, + "language_loss": 0.80576307, + "learning_rate": 3.687115511430832e-06, + "loss": 0.82710361, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.671875, + "step": 3003, + "time_per_iteration": 2.3824965953826904 + }, + { + "auxiliary_loss_clip": 0.01095178, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.01905823, + "balance_loss_mlp": 1.02607787, + "epoch": 0.1806102510145799, + "flos": 28912238593920.0, + "grad_norm": 2.2792963212946056, + "language_loss": 0.66961324, + "learning_rate": 3.6869125762048423e-06, + "loss": 0.69095337, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.69140625, + "step": 3004, + "time_per_iteration": 2.422297239303589 + }, + { + "auxiliary_loss_clip": 0.01097773, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.02108049, + "balance_loss_mlp": 1.02752137, + "epoch": 0.18067037426724786, + "flos": 19024696045440.0, + "grad_norm": 1.6577558114338158, + "language_loss": 0.85528255, + "learning_rate": 3.6867095807772826e-06, + "loss": 0.87666535, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.703125, + "step": 3005, + "time_per_iteration": 2.3702433109283447 + }, + { + "auxiliary_loss_clip": 0.01092354, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.01965904, + "balance_loss_mlp": 1.02653646, + "epoch": 0.18073049751991582, + "flos": 27452415248640.0, + "grad_norm": 1.545663060047751, + "language_loss": 0.8309502, + "learning_rate": 3.6865065251553967e-06, + "loss": 0.85225677, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.65625, + "step": 3006, + "time_per_iteration": 2.427565574645996 + }, + { + "auxiliary_loss_clip": 0.01092904, + "auxiliary_loss_mlp": 0.01038936, + "balance_loss_clip": 1.0199697, + "balance_loss_mlp": 1.02515292, + "epoch": 0.1807906207725838, + "flos": 28727827459200.0, + "grad_norm": 1.6693494832060916, + "language_loss": 0.77110308, + "learning_rate": 3.6863034093464307e-06, + "loss": 0.79242146, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6796875, + "step": 3007, + "time_per_iteration": 2.4451706409454346 + }, + { + "auxiliary_loss_clip": 0.0102929, + "auxiliary_loss_mlp": 0.01018946, + "balance_loss_clip": 1.01641917, + "balance_loss_mlp": 1.00968313, + "epoch": 0.18085074402525175, + "flos": 64462791144960.0, + "grad_norm": 0.7976309932500728, + "language_loss": 0.56929284, + "learning_rate": 3.686100233357634e-06, + "loss": 0.5897752, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.19628906, + "step": 3008, + "time_per_iteration": 3.161513328552246 + }, + { + "auxiliary_loss_clip": 0.01097169, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02424729, + "balance_loss_mlp": 1.02932644, + "epoch": 0.18091086727791975, + "flos": 23475842835840.0, + "grad_norm": 1.9043645543070598, + "language_loss": 0.67481375, + "learning_rate": 3.6858969971962573e-06, + "loss": 0.69622803, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.6796875, + "step": 3009, + "time_per_iteration": 2.3931095600128174 + }, + { + "auxiliary_loss_clip": 0.01096304, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.01597512, + "balance_loss_mlp": 1.02767563, + "epoch": 0.1809709905305877, + "flos": 24169322638080.0, + "grad_norm": 2.562430365773123, + "language_loss": 0.75639015, + "learning_rate": 3.685693700869553e-06, + "loss": 0.77769971, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6875, + "step": 3010, + "time_per_iteration": 2.4174158573150635 + }, + { + "auxiliary_loss_clip": 0.01089004, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.0177027, + "balance_loss_mlp": 1.02533138, + "epoch": 0.18103111378325568, + "flos": 21649885395840.0, + "grad_norm": 1.5937217071721066, + "language_loss": 0.67342001, + "learning_rate": 3.6854903443847772e-06, + "loss": 0.69465506, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.63671875, + "step": 3011, + "time_per_iteration": 2.4023966789245605 + }, + { + "auxiliary_loss_clip": 0.0109164, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.01705384, + "balance_loss_mlp": 1.02585912, + "epoch": 0.18109123703592364, + "flos": 53684965992960.0, + "grad_norm": 1.7738741352484462, + "language_loss": 0.71349472, + "learning_rate": 3.6852869277491865e-06, + "loss": 0.73476702, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.66015625, + "step": 3012, + "time_per_iteration": 2.692625045776367 + }, + { + "auxiliary_loss_clip": 0.01093415, + "auxiliary_loss_mlp": 0.01038487, + "balance_loss_clip": 1.02000999, + "balance_loss_mlp": 1.02846265, + "epoch": 0.1811513602885916, + "flos": 35844104062080.0, + "grad_norm": 2.037454486184128, + "language_loss": 0.63142848, + "learning_rate": 3.68508345097004e-06, + "loss": 0.65274751, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6484375, + "step": 3013, + "time_per_iteration": 2.505274772644043 + }, + { + "auxiliary_loss_clip": 0.01097457, + "auxiliary_loss_mlp": 0.01043572, + "balance_loss_clip": 1.02592874, + "balance_loss_mlp": 1.02976298, + "epoch": 0.18121148354125957, + "flos": 23731441966080.0, + "grad_norm": 1.6507094366992145, + "language_loss": 0.76124537, + "learning_rate": 3.6848799140546e-06, + "loss": 0.78265566, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.6796875, + "step": 3014, + "time_per_iteration": 2.4147589206695557 + }, + { + "auxiliary_loss_clip": 0.01098058, + "auxiliary_loss_mlp": 0.01042203, + "balance_loss_clip": 1.0211271, + "balance_loss_mlp": 1.02832627, + "epoch": 0.18127160679392756, + "flos": 28727129232000.0, + "grad_norm": 2.1835324373493568, + "language_loss": 0.66143107, + "learning_rate": 3.6846763170101297e-06, + "loss": 0.68283367, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.69921875, + "step": 3015, + "time_per_iteration": 2.45542573928833 + }, + { + "auxiliary_loss_clip": 0.01092742, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.01674986, + "balance_loss_mlp": 1.02746844, + "epoch": 0.18133173004659553, + "flos": 20484030631680.0, + "grad_norm": 1.6975769583465765, + "language_loss": 0.7801252, + "learning_rate": 3.684472659843895e-06, + "loss": 0.80140895, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6484375, + "step": 3016, + "time_per_iteration": 2.3770716190338135 + }, + { + "auxiliary_loss_clip": 0.01098174, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.0191493, + "balance_loss_mlp": 1.02951336, + "epoch": 0.1813918532992635, + "flos": 22851107233920.0, + "grad_norm": 1.6992091223276173, + "language_loss": 0.83647573, + "learning_rate": 3.6842689425631645e-06, + "loss": 0.85783875, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6875, + "step": 3017, + "time_per_iteration": 3.90226411819458 + }, + { + "auxiliary_loss_clip": 0.01091733, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.01820517, + "balance_loss_mlp": 1.0266006, + "epoch": 0.18145197655193146, + "flos": 36063637891200.0, + "grad_norm": 4.773021443468642, + "language_loss": 0.67354894, + "learning_rate": 3.684065165175208e-06, + "loss": 0.69482458, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6484375, + "step": 3018, + "time_per_iteration": 2.5068161487579346 + }, + { + "auxiliary_loss_clip": 0.01095457, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.01630139, + "balance_loss_mlp": 1.0276463, + "epoch": 0.18151209980459942, + "flos": 24022827106560.0, + "grad_norm": 1.9264538479523363, + "language_loss": 0.75988364, + "learning_rate": 3.683861327687297e-06, + "loss": 0.78118086, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.67578125, + "step": 3019, + "time_per_iteration": 3.7964046001434326 + }, + { + "auxiliary_loss_clip": 0.01097, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.01623487, + "balance_loss_mlp": 1.02833366, + "epoch": 0.1815722230572674, + "flos": 23950487036160.0, + "grad_norm": 2.1930430614750605, + "language_loss": 0.81399328, + "learning_rate": 3.683657430106707e-06, + "loss": 0.83531857, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6875, + "step": 3020, + "time_per_iteration": 3.7735393047332764 + }, + { + "auxiliary_loss_clip": 0.0109454, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.01861608, + "balance_loss_mlp": 1.02751517, + "epoch": 0.18163234630993536, + "flos": 24385400242560.0, + "grad_norm": 1.7764036231911737, + "language_loss": 0.77370763, + "learning_rate": 3.683453472440714e-06, + "loss": 0.79502845, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.671875, + "step": 3021, + "time_per_iteration": 2.4085967540740967 + }, + { + "auxiliary_loss_clip": 0.01090358, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.01873493, + "balance_loss_mlp": 1.02425432, + "epoch": 0.18169246956260335, + "flos": 24680171784960.0, + "grad_norm": 1.6819576893612795, + "language_loss": 0.84895861, + "learning_rate": 3.6832494546965975e-06, + "loss": 0.87022913, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.66015625, + "step": 3022, + "time_per_iteration": 3.787031888961792 + }, + { + "auxiliary_loss_clip": 0.01093793, + "auxiliary_loss_mlp": 0.01037292, + "balance_loss_clip": 1.01830173, + "balance_loss_mlp": 1.02538633, + "epoch": 0.1817525928152713, + "flos": 24242151467520.0, + "grad_norm": 1.8081751746641312, + "language_loss": 0.69382024, + "learning_rate": 3.6830453768816376e-06, + "loss": 0.71513104, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.68359375, + "step": 3023, + "time_per_iteration": 2.3976800441741943 + }, + { + "auxiliary_loss_clip": 0.0109341, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.01902175, + "balance_loss_mlp": 1.02653098, + "epoch": 0.18181271606793928, + "flos": 16471148538240.0, + "grad_norm": 1.8344109458778812, + "language_loss": 0.73903406, + "learning_rate": 3.6828412390031174e-06, + "loss": 0.76033449, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.66796875, + "step": 3024, + "time_per_iteration": 2.3516170978546143 + }, + { + "auxiliary_loss_clip": 0.01094789, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.01550698, + "balance_loss_mlp": 1.02728665, + "epoch": 0.18187283932060724, + "flos": 18580252037760.0, + "grad_norm": 1.9590317436915126, + "language_loss": 0.8824296, + "learning_rate": 3.682637041068322e-06, + "loss": 0.90372384, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.67578125, + "step": 3025, + "time_per_iteration": 2.3552322387695312 + }, + { + "auxiliary_loss_clip": 0.01093535, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.01683044, + "balance_loss_mlp": 1.02792573, + "epoch": 0.1819329625732752, + "flos": 20265788522880.0, + "grad_norm": 1.6937359134732468, + "language_loss": 0.78706336, + "learning_rate": 3.6824327830845387e-06, + "loss": 0.80835557, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.65625, + "step": 3026, + "time_per_iteration": 2.3805599212646484 + }, + { + "auxiliary_loss_clip": 0.01095746, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.02073646, + "balance_loss_mlp": 1.02767563, + "epoch": 0.18199308582594317, + "flos": 25914177256320.0, + "grad_norm": 1.6651142140429498, + "language_loss": 0.75892401, + "learning_rate": 3.6822284650590576e-06, + "loss": 0.78027743, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6796875, + "step": 3027, + "time_per_iteration": 2.4328739643096924 + }, + { + "auxiliary_loss_clip": 0.01097257, + "auxiliary_loss_mlp": 0.01039279, + "balance_loss_clip": 1.01903725, + "balance_loss_mlp": 1.02633667, + "epoch": 0.18205320907861114, + "flos": 15376621415040.0, + "grad_norm": 1.9379751917848766, + "language_loss": 0.85705507, + "learning_rate": 3.68202408699917e-06, + "loss": 0.87842047, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.7109375, + "step": 3028, + "time_per_iteration": 2.361747980117798 + }, + { + "auxiliary_loss_clip": 0.01092009, + "auxiliary_loss_mlp": 0.01036923, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.02600992, + "epoch": 0.18211333233127913, + "flos": 25623280874880.0, + "grad_norm": 1.9534451950594895, + "language_loss": 0.82559109, + "learning_rate": 3.6818196489121683e-06, + "loss": 0.84688038, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.66015625, + "step": 3029, + "time_per_iteration": 2.4177732467651367 + }, + { + "auxiliary_loss_clip": 0.010948, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.02011859, + "balance_loss_mlp": 1.02715087, + "epoch": 0.1821734555839471, + "flos": 14975120246400.0, + "grad_norm": 1.9630182510346148, + "language_loss": 0.77569473, + "learning_rate": 3.68161515080535e-06, + "loss": 0.79703879, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6796875, + "step": 3030, + "time_per_iteration": 2.346942663192749 + }, + { + "auxiliary_loss_clip": 0.01093451, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.01573825, + "balance_loss_mlp": 1.02540433, + "epoch": 0.18223357883661506, + "flos": 20192959693440.0, + "grad_norm": 1.9172145309317545, + "language_loss": 0.84994686, + "learning_rate": 3.681410592686013e-06, + "loss": 0.87122458, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6796875, + "step": 3031, + "time_per_iteration": 2.3773000240325928 + }, + { + "auxiliary_loss_clip": 0.01094263, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.02630711, + "epoch": 0.18229370208928303, + "flos": 15231068490240.0, + "grad_norm": 2.3139330996384486, + "language_loss": 0.80105782, + "learning_rate": 3.681205974561457e-06, + "loss": 0.82235116, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.6796875, + "step": 3032, + "time_per_iteration": 2.3608875274658203 + }, + { + "auxiliary_loss_clip": 0.01099095, + "auxiliary_loss_mlp": 0.01039615, + "balance_loss_clip": 1.02026689, + "balance_loss_mlp": 1.0278728, + "epoch": 0.182353825341951, + "flos": 23839393489920.0, + "grad_norm": 2.2208230265790116, + "language_loss": 0.81210154, + "learning_rate": 3.6810012964389846e-06, + "loss": 0.83348858, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.7109375, + "step": 3033, + "time_per_iteration": 2.4141690731048584 + }, + { + "auxiliary_loss_clip": 0.0102659, + "auxiliary_loss_mlp": 0.01005783, + "balance_loss_clip": 1.00319636, + "balance_loss_mlp": 1.00633883, + "epoch": 0.18241394859461896, + "flos": 61188114038400.0, + "grad_norm": 0.8954202485514626, + "language_loss": 0.63418603, + "learning_rate": 3.680796558325899e-06, + "loss": 0.65450966, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.20214844, + "step": 3034, + "time_per_iteration": 2.9214541912078857 + }, + { + "auxiliary_loss_clip": 0.01093738, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.01731384, + "balance_loss_mlp": 1.02661943, + "epoch": 0.18247407184728695, + "flos": 18470904059520.0, + "grad_norm": 1.9366467761323554, + "language_loss": 0.79605818, + "learning_rate": 3.6805917602295084e-06, + "loss": 0.81735319, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.671875, + "step": 3035, + "time_per_iteration": 2.369335651397705 + }, + { + "auxiliary_loss_clip": 0.01090274, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.01418781, + "balance_loss_mlp": 1.02530837, + "epoch": 0.18253419509995492, + "flos": 21794216423040.0, + "grad_norm": 1.7195695900240333, + "language_loss": 0.84461898, + "learning_rate": 3.680386902157121e-06, + "loss": 0.8658393, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6484375, + "step": 3036, + "time_per_iteration": 2.401374101638794 + }, + { + "auxiliary_loss_clip": 0.01092218, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.01519525, + "balance_loss_mlp": 1.02690077, + "epoch": 0.18259431835262288, + "flos": 20148934602240.0, + "grad_norm": 2.0379480875904177, + "language_loss": 0.79803252, + "learning_rate": 3.680181984116047e-06, + "loss": 0.81928569, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.65234375, + "step": 3037, + "time_per_iteration": 2.3707478046417236 + }, + { + "auxiliary_loss_clip": 0.01098544, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.01751804, + "balance_loss_mlp": 1.02916551, + "epoch": 0.18265444160529085, + "flos": 16980740876160.0, + "grad_norm": 4.894582151194986, + "language_loss": 0.79521585, + "learning_rate": 3.6799770061136e-06, + "loss": 0.81658554, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.6953125, + "step": 3038, + "time_per_iteration": 2.3390767574310303 + }, + { + "auxiliary_loss_clip": 0.01093295, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.01652741, + "balance_loss_mlp": 1.02583265, + "epoch": 0.1827145648579588, + "flos": 34421812295040.0, + "grad_norm": 2.248150297807195, + "language_loss": 0.67581129, + "learning_rate": 3.6797719681570953e-06, + "loss": 0.69708991, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.671875, + "step": 3039, + "time_per_iteration": 2.5091495513916016 + }, + { + "auxiliary_loss_clip": 0.01094548, + "auxiliary_loss_mlp": 0.01036112, + "balance_loss_clip": 1.01676488, + "balance_loss_mlp": 1.02687049, + "epoch": 0.18277468811062678, + "flos": 53285035835520.0, + "grad_norm": 2.444654234344379, + "language_loss": 0.73460305, + "learning_rate": 3.6795668702538505e-06, + "loss": 0.75590956, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6796875, + "step": 3040, + "time_per_iteration": 2.6638429164886475 + }, + { + "auxiliary_loss_clip": 0.01093964, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.01376712, + "balance_loss_mlp": 1.02748919, + "epoch": 0.18283481136329474, + "flos": 31649289540480.0, + "grad_norm": 2.0313259677411803, + "language_loss": 0.60360682, + "learning_rate": 3.6793617124111836e-06, + "loss": 0.62486821, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.66796875, + "step": 3041, + "time_per_iteration": 2.4727203845977783 + }, + { + "auxiliary_loss_clip": 0.01095961, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.02248418, + "balance_loss_mlp": 1.02796614, + "epoch": 0.18289493461596273, + "flos": 53135782306560.0, + "grad_norm": 1.6755375784789484, + "language_loss": 0.60253775, + "learning_rate": 3.6791564946364176e-06, + "loss": 0.62391531, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6796875, + "step": 3042, + "time_per_iteration": 2.6615445613861084 + }, + { + "auxiliary_loss_clip": 0.01093501, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.01429451, + "balance_loss_mlp": 1.02835739, + "epoch": 0.1829550578686307, + "flos": 25588297825920.0, + "grad_norm": 1.6013928454406494, + "language_loss": 0.87596387, + "learning_rate": 3.678951216936875e-06, + "loss": 0.89722371, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.65234375, + "step": 3043, + "time_per_iteration": 2.425600051879883 + }, + { + "auxiliary_loss_clip": 0.01096452, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.01756108, + "balance_loss_mlp": 1.02765286, + "epoch": 0.18301518112129866, + "flos": 22600325871360.0, + "grad_norm": 2.1711627912841824, + "language_loss": 0.70740992, + "learning_rate": 3.6787458793198825e-06, + "loss": 0.72875857, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.6875, + "step": 3044, + "time_per_iteration": 2.37648606300354 + }, + { + "auxiliary_loss_clip": 0.01098498, + "auxiliary_loss_mlp": 0.01040058, + "balance_loss_clip": 1.01912498, + "balance_loss_mlp": 1.02705944, + "epoch": 0.18307530437396663, + "flos": 34019403431040.0, + "grad_norm": 2.1079227102188396, + "language_loss": 0.64306909, + "learning_rate": 3.678540481792768e-06, + "loss": 0.66445458, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.71484375, + "step": 3045, + "time_per_iteration": 2.4975714683532715 + }, + { + "auxiliary_loss_clip": 0.01092421, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.01928782, + "balance_loss_mlp": 1.02710378, + "epoch": 0.1831354276266346, + "flos": 21278933533440.0, + "grad_norm": 2.2035152419714414, + "language_loss": 0.79463446, + "learning_rate": 3.6783350243628613e-06, + "loss": 0.81593835, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.65625, + "step": 3046, + "time_per_iteration": 2.3456878662109375 + }, + { + "auxiliary_loss_clip": 0.01093784, + "auxiliary_loss_mlp": 0.01036512, + "balance_loss_clip": 1.01663971, + "balance_loss_mlp": 1.02519798, + "epoch": 0.18319555087930256, + "flos": 21031887686400.0, + "grad_norm": 3.27295768454744, + "language_loss": 0.78758115, + "learning_rate": 3.678129507037495e-06, + "loss": 0.80888414, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6875, + "step": 3047, + "time_per_iteration": 2.380859136581421 + }, + { + "auxiliary_loss_clip": 0.01095002, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.01493669, + "balance_loss_mlp": 1.02816248, + "epoch": 0.18325567413197055, + "flos": 34381627453440.0, + "grad_norm": 1.5145269717417007, + "language_loss": 0.80488312, + "learning_rate": 3.6779239298240032e-06, + "loss": 0.82617152, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.66796875, + "step": 3048, + "time_per_iteration": 2.49118971824646 + }, + { + "auxiliary_loss_clip": 0.01096412, + "auxiliary_loss_mlp": 0.01041874, + "balance_loss_clip": 1.02191842, + "balance_loss_mlp": 1.02703547, + "epoch": 0.18331579738463852, + "flos": 20557418042880.0, + "grad_norm": 2.404031913488755, + "language_loss": 0.8656354, + "learning_rate": 3.6777182927297225e-06, + "loss": 0.88701832, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.69140625, + "step": 3049, + "time_per_iteration": 2.37909197807312 + }, + { + "auxiliary_loss_clip": 0.0110299, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.01970398, + "balance_loss_mlp": 1.02914858, + "epoch": 0.18337592063730648, + "flos": 19606907744640.0, + "grad_norm": 2.3591333444806923, + "language_loss": 0.76766431, + "learning_rate": 3.6775125957619913e-06, + "loss": 0.78909552, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.73828125, + "step": 3050, + "time_per_iteration": 2.3622822761535645 + }, + { + "auxiliary_loss_clip": 0.01091689, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.01391912, + "balance_loss_mlp": 1.02520013, + "epoch": 0.18343604388997445, + "flos": 20849815612800.0, + "grad_norm": 2.0644922428537096, + "language_loss": 0.99320161, + "learning_rate": 3.6773068389281507e-06, + "loss": 1.01444721, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6640625, + "step": 3051, + "time_per_iteration": 2.3741469383239746 + }, + { + "auxiliary_loss_clip": 0.01093216, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.01641965, + "balance_loss_mlp": 1.02763176, + "epoch": 0.1834961671426424, + "flos": 24393080741760.0, + "grad_norm": 2.272240720087646, + "language_loss": 0.86265355, + "learning_rate": 3.6771010222355434e-06, + "loss": 0.88394737, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.65625, + "step": 3052, + "time_per_iteration": 2.44637131690979 + }, + { + "auxiliary_loss_clip": 0.01093433, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.01694083, + "balance_loss_mlp": 1.02554989, + "epoch": 0.18355629039531038, + "flos": 21250548731520.0, + "grad_norm": 2.0104145983990582, + "language_loss": 0.73043442, + "learning_rate": 3.6768951456915147e-06, + "loss": 0.75172675, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6796875, + "step": 3053, + "time_per_iteration": 2.391200542449951 + }, + { + "auxiliary_loss_clip": 0.01098646, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.01797879, + "balance_loss_mlp": 1.02836823, + "epoch": 0.18361641364797834, + "flos": 28655277920640.0, + "grad_norm": 1.8701847575316863, + "language_loss": 0.61304927, + "learning_rate": 3.6766892093034123e-06, + "loss": 0.63441837, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.703125, + "step": 3054, + "time_per_iteration": 2.433223247528076 + }, + { + "auxiliary_loss_clip": 0.01095956, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.01764071, + "balance_loss_mlp": 1.02743411, + "epoch": 0.18367653690064634, + "flos": 20917896497280.0, + "grad_norm": 1.9018163121145335, + "language_loss": 0.78297484, + "learning_rate": 3.6764832130785846e-06, + "loss": 0.80428982, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6875, + "step": 3055, + "time_per_iteration": 2.4101922512054443 + }, + { + "auxiliary_loss_clip": 0.01093526, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.0211978, + "balance_loss_mlp": 1.02704954, + "epoch": 0.1837366601533143, + "flos": 28764381519360.0, + "grad_norm": 14.423993688140268, + "language_loss": 0.70290178, + "learning_rate": 3.6762771570243834e-06, + "loss": 0.72423035, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6640625, + "step": 3056, + "time_per_iteration": 3.8228213787078857 + }, + { + "auxiliary_loss_clip": 0.01095429, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.01788759, + "balance_loss_mlp": 1.02727616, + "epoch": 0.18379678340598227, + "flos": 21250374174720.0, + "grad_norm": 1.7355973902289035, + "language_loss": 0.80511397, + "learning_rate": 3.6760710411481623e-06, + "loss": 0.82643723, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6796875, + "step": 3057, + "time_per_iteration": 2.4140725135803223 + }, + { + "auxiliary_loss_clip": 0.01100634, + "auxiliary_loss_mlp": 0.01040652, + "balance_loss_clip": 1.0181576, + "balance_loss_mlp": 1.0272398, + "epoch": 0.18385690665865023, + "flos": 20448558823680.0, + "grad_norm": 2.0946795049792666, + "language_loss": 0.82550985, + "learning_rate": 3.675864865457277e-06, + "loss": 0.84692276, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.734375, + "step": 3058, + "time_per_iteration": 2.379794120788574 + }, + { + "auxiliary_loss_clip": 0.01096393, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.0208447, + "balance_loss_mlp": 1.02711689, + "epoch": 0.1839170299113182, + "flos": 26139366725760.0, + "grad_norm": 2.0980446596863476, + "language_loss": 0.85711503, + "learning_rate": 3.675658629959086e-06, + "loss": 0.87847984, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.69140625, + "step": 3059, + "time_per_iteration": 3.891240119934082 + }, + { + "auxiliary_loss_clip": 0.01093734, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.01762605, + "balance_loss_mlp": 1.02620101, + "epoch": 0.18397715316398616, + "flos": 31756717393920.0, + "grad_norm": 1.6536118193911227, + "language_loss": 0.72956884, + "learning_rate": 3.6754523346609486e-06, + "loss": 0.75086659, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.671875, + "step": 3060, + "time_per_iteration": 2.4973304271698 + }, + { + "auxiliary_loss_clip": 0.01097536, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_clip": 1.02152312, + "balance_loss_mlp": 1.02810681, + "epoch": 0.18403727641665413, + "flos": 24610729357440.0, + "grad_norm": 1.7066628802824622, + "language_loss": 0.72872066, + "learning_rate": 3.675245979570227e-06, + "loss": 0.75011235, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.6953125, + "step": 3061, + "time_per_iteration": 3.771973133087158 + }, + { + "auxiliary_loss_clip": 0.01096596, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_clip": 1.02271307, + "balance_loss_mlp": 1.02863383, + "epoch": 0.18409739966932212, + "flos": 23438800016640.0, + "grad_norm": 1.8269389315323057, + "language_loss": 0.81693745, + "learning_rate": 3.6750395646942857e-06, + "loss": 0.83833361, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6796875, + "step": 3062, + "time_per_iteration": 2.428438425064087 + }, + { + "auxiliary_loss_clip": 0.01100455, + "auxiliary_loss_mlp": 0.01040849, + "balance_loss_clip": 1.02059579, + "balance_loss_mlp": 1.02837312, + "epoch": 0.18415752292199009, + "flos": 21871025147520.0, + "grad_norm": 2.02936284940346, + "language_loss": 0.73024154, + "learning_rate": 3.674833090040491e-06, + "loss": 0.75165462, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.71875, + "step": 3063, + "time_per_iteration": 2.3711354732513428 + }, + { + "auxiliary_loss_clip": 0.01092559, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.0182246, + "balance_loss_mlp": 1.0251112, + "epoch": 0.18421764617465805, + "flos": 25409507420160.0, + "grad_norm": 1.735898876993406, + "language_loss": 0.65679663, + "learning_rate": 3.6746265556162116e-06, + "loss": 0.67808282, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.67578125, + "step": 3064, + "time_per_iteration": 2.438000440597534 + }, + { + "auxiliary_loss_clip": 0.01095055, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.01711094, + "balance_loss_mlp": 1.02754521, + "epoch": 0.18427776942732602, + "flos": 27196920852480.0, + "grad_norm": 2.484580111970709, + "language_loss": 0.69019604, + "learning_rate": 3.6744199614288174e-06, + "loss": 0.71151215, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.671875, + "step": 3065, + "time_per_iteration": 2.418018102645874 + }, + { + "auxiliary_loss_clip": 0.01098874, + "auxiliary_loss_mlp": 0.01041382, + "balance_loss_clip": 1.01999593, + "balance_loss_mlp": 1.02820754, + "epoch": 0.18433789267999398, + "flos": 27851193331200.0, + "grad_norm": 2.250389640607876, + "language_loss": 0.72799128, + "learning_rate": 3.6742133074856828e-06, + "loss": 0.74939388, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.70703125, + "step": 3066, + "time_per_iteration": 2.4412014484405518 + }, + { + "auxiliary_loss_clip": 0.01094326, + "auxiliary_loss_mlp": 0.01037958, + "balance_loss_clip": 1.01918304, + "balance_loss_mlp": 1.02577353, + "epoch": 0.18439801593266195, + "flos": 17856013461120.0, + "grad_norm": 2.4021419349828457, + "language_loss": 0.81847805, + "learning_rate": 3.6740065937941815e-06, + "loss": 0.83980089, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6875, + "step": 3067, + "time_per_iteration": 2.353297233581543 + }, + { + "auxiliary_loss_clip": 0.01027585, + "auxiliary_loss_mlp": 0.01006485, + "balance_loss_clip": 1.00370753, + "balance_loss_mlp": 1.00713754, + "epoch": 0.18445813918532994, + "flos": 56386403619840.0, + "grad_norm": 0.9837619196764028, + "language_loss": 0.5968374, + "learning_rate": 3.673799820361691e-06, + "loss": 0.61717808, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.20507812, + "step": 3068, + "time_per_iteration": 2.925403594970703 + }, + { + "auxiliary_loss_clip": 0.01094959, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.01705205, + "balance_loss_mlp": 1.0279814, + "epoch": 0.1845182624379979, + "flos": 20956196125440.0, + "grad_norm": 1.7097395201758374, + "language_loss": 0.76456642, + "learning_rate": 3.67359298719559e-06, + "loss": 0.78587306, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.671875, + "step": 3069, + "time_per_iteration": 2.3808977603912354 + }, + { + "auxiliary_loss_clip": 0.01095065, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.01588392, + "balance_loss_mlp": 1.02618957, + "epoch": 0.18457838569066587, + "flos": 20484135365760.0, + "grad_norm": 1.818544207956705, + "language_loss": 0.84722435, + "learning_rate": 3.6733860943032607e-06, + "loss": 0.86853123, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6875, + "step": 3070, + "time_per_iteration": 2.394941568374634 + }, + { + "auxiliary_loss_clip": 0.01095165, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.0171473, + "balance_loss_mlp": 1.02638769, + "epoch": 0.18463850894333383, + "flos": 25008844124160.0, + "grad_norm": 1.9230594787216562, + "language_loss": 0.77398825, + "learning_rate": 3.6731791416920863e-06, + "loss": 0.79530984, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.6875, + "step": 3071, + "time_per_iteration": 2.402108907699585 + }, + { + "auxiliary_loss_clip": 0.01099213, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.02627218, + "balance_loss_mlp": 1.02819109, + "epoch": 0.1846986321960018, + "flos": 16799681232000.0, + "grad_norm": 2.5374344837361407, + "language_loss": 0.77136636, + "learning_rate": 3.6729721293694523e-06, + "loss": 0.79281867, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.7109375, + "step": 3072, + "time_per_iteration": 2.424731969833374 + }, + { + "auxiliary_loss_clip": 0.01097982, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.01547766, + "balance_loss_mlp": 1.02694249, + "epoch": 0.18475875544866976, + "flos": 20813261552640.0, + "grad_norm": 1.872650812020611, + "language_loss": 0.86287987, + "learning_rate": 3.6727650573427464e-06, + "loss": 0.88421261, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.7109375, + "step": 3073, + "time_per_iteration": 2.382587432861328 + }, + { + "auxiliary_loss_clip": 0.01097807, + "auxiliary_loss_mlp": 0.01038702, + "balance_loss_clip": 1.02026057, + "balance_loss_mlp": 1.0290978, + "epoch": 0.18481887870133773, + "flos": 22600325871360.0, + "grad_norm": 2.581206154861645, + "language_loss": 0.91659003, + "learning_rate": 3.672557925619358e-06, + "loss": 0.93795508, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6875, + "step": 3074, + "time_per_iteration": 2.416276216506958 + }, + { + "auxiliary_loss_clip": 0.01095132, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.0198257, + "balance_loss_mlp": 1.02679682, + "epoch": 0.18487900195400572, + "flos": 29457582030720.0, + "grad_norm": 1.9061680747010519, + "language_loss": 0.64877582, + "learning_rate": 3.67235073420668e-06, + "loss": 0.67013657, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.68359375, + "step": 3075, + "time_per_iteration": 2.4442203044891357 + }, + { + "auxiliary_loss_clip": 0.01096984, + "auxiliary_loss_mlp": 0.01037603, + "balance_loss_clip": 1.01770711, + "balance_loss_mlp": 1.02914739, + "epoch": 0.1849391252066737, + "flos": 20627803077120.0, + "grad_norm": 1.8140366071750742, + "language_loss": 0.72486526, + "learning_rate": 3.672143483112106e-06, + "loss": 0.74621117, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6796875, + "step": 3076, + "time_per_iteration": 2.3927721977233887 + }, + { + "auxiliary_loss_clip": 0.0109696, + "auxiliary_loss_mlp": 0.01038754, + "balance_loss_clip": 1.01946545, + "balance_loss_mlp": 1.02690089, + "epoch": 0.18499924845934165, + "flos": 14427682128000.0, + "grad_norm": 2.272661651313579, + "language_loss": 0.81143332, + "learning_rate": 3.6719361723430325e-06, + "loss": 0.83279043, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.703125, + "step": 3077, + "time_per_iteration": 2.3459293842315674 + }, + { + "auxiliary_loss_clip": 0.01093051, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.01968122, + "balance_loss_mlp": 1.02585387, + "epoch": 0.18505937171200962, + "flos": 23726659109760.0, + "grad_norm": 1.8577127837084841, + "language_loss": 0.78537548, + "learning_rate": 3.671728801906857e-06, + "loss": 0.8066808, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.671875, + "step": 3078, + "time_per_iteration": 2.426886558532715 + }, + { + "auxiliary_loss_clip": 0.01095366, + "auxiliary_loss_mlp": 0.01039953, + "balance_loss_clip": 1.02030742, + "balance_loss_mlp": 1.02709413, + "epoch": 0.18511949496467758, + "flos": 25956317134080.0, + "grad_norm": 1.8974727382618128, + "language_loss": 0.77608848, + "learning_rate": 3.6715213718109816e-06, + "loss": 0.79744172, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.68359375, + "step": 3079, + "time_per_iteration": 2.4180920124053955 + }, + { + "auxiliary_loss_clip": 0.01090707, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.02035069, + "balance_loss_mlp": 1.02345252, + "epoch": 0.18517961821734555, + "flos": 42411895205760.0, + "grad_norm": 1.751473176273842, + "language_loss": 0.81666404, + "learning_rate": 3.671313882062808e-06, + "loss": 0.8379547, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.671875, + "step": 3080, + "time_per_iteration": 2.6055829524993896 + }, + { + "auxiliary_loss_clip": 0.01096193, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.01905835, + "balance_loss_mlp": 1.02513731, + "epoch": 0.18523974147001354, + "flos": 24096423985920.0, + "grad_norm": 1.8256720833864581, + "language_loss": 0.73711753, + "learning_rate": 3.6711063326697405e-06, + "loss": 0.758479, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.7109375, + "step": 3081, + "time_per_iteration": 2.4011452198028564 + }, + { + "auxiliary_loss_clip": 0.01097376, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.01956987, + "balance_loss_mlp": 1.0292356, + "epoch": 0.1852998647226815, + "flos": 27374210069760.0, + "grad_norm": 2.017635067332807, + "language_loss": 0.71629858, + "learning_rate": 3.6708987236391867e-06, + "loss": 0.73766237, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6796875, + "step": 3082, + "time_per_iteration": 2.4442226886749268 + }, + { + "auxiliary_loss_clip": 0.01095924, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.01576734, + "balance_loss_mlp": 1.0284586, + "epoch": 0.18535998797534947, + "flos": 18331774824960.0, + "grad_norm": 2.59461557446974, + "language_loss": 0.69121969, + "learning_rate": 3.6706910549785562e-06, + "loss": 0.71254414, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.67578125, + "step": 3083, + "time_per_iteration": 2.348402261734009 + }, + { + "auxiliary_loss_clip": 0.01095571, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.01739383, + "balance_loss_mlp": 1.02865887, + "epoch": 0.18542011122801744, + "flos": 37844522899200.0, + "grad_norm": 2.0029760019679537, + "language_loss": 0.68881965, + "learning_rate": 3.670483326695259e-06, + "loss": 0.71013224, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.671875, + "step": 3084, + "time_per_iteration": 2.5439836978912354 + }, + { + "auxiliary_loss_clip": 0.01093441, + "auxiliary_loss_mlp": 0.01035632, + "balance_loss_clip": 1.01701152, + "balance_loss_mlp": 1.02686095, + "epoch": 0.1854802344806854, + "flos": 25185120912000.0, + "grad_norm": 1.7511023900322003, + "language_loss": 0.77998507, + "learning_rate": 3.6702755387967097e-06, + "loss": 0.80127585, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6640625, + "step": 3085, + "time_per_iteration": 2.4212591648101807 + }, + { + "auxiliary_loss_clip": 0.01093055, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.01842999, + "balance_loss_mlp": 1.02593207, + "epoch": 0.18554035773335337, + "flos": 26683662821760.0, + "grad_norm": 2.121942337652293, + "language_loss": 0.73581004, + "learning_rate": 3.6700676912903214e-06, + "loss": 0.75711393, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.671875, + "step": 3086, + "time_per_iteration": 2.4404194355010986 + }, + { + "auxiliary_loss_clip": 0.010925, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.01865935, + "balance_loss_mlp": 1.02658355, + "epoch": 0.18560048098602133, + "flos": 22345774081920.0, + "grad_norm": 2.215072680838077, + "language_loss": 0.69519728, + "learning_rate": 3.6698597841835144e-06, + "loss": 0.71651012, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.65625, + "step": 3087, + "time_per_iteration": 2.3740296363830566 + }, + { + "auxiliary_loss_clip": 0.01092804, + "auxiliary_loss_mlp": 0.0103977, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.02611017, + "epoch": 0.18566060423868933, + "flos": 17747573178240.0, + "grad_norm": 2.3717575823996118, + "language_loss": 0.73237813, + "learning_rate": 3.6696518174837064e-06, + "loss": 0.75370395, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.66796875, + "step": 3088, + "time_per_iteration": 2.3804128170013428 + }, + { + "auxiliary_loss_clip": 0.01092503, + "auxiliary_loss_mlp": 0.01036027, + "balance_loss_clip": 1.01862299, + "balance_loss_mlp": 1.0264163, + "epoch": 0.1857207274913573, + "flos": 24676226801280.0, + "grad_norm": 1.8183974767075333, + "language_loss": 0.91748768, + "learning_rate": 3.6694437911983197e-06, + "loss": 0.93877304, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.66015625, + "step": 3089, + "time_per_iteration": 2.4014928340911865 + }, + { + "auxiliary_loss_clip": 0.01090311, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.01645148, + "balance_loss_mlp": 1.02585053, + "epoch": 0.18578085074402526, + "flos": 28146558366720.0, + "grad_norm": 4.054582797978431, + "language_loss": 0.57891083, + "learning_rate": 3.669235705334779e-06, + "loss": 0.60016382, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.64453125, + "step": 3090, + "time_per_iteration": 2.441551446914673 + }, + { + "auxiliary_loss_clip": 0.01089237, + "auxiliary_loss_mlp": 0.01036836, + "balance_loss_clip": 1.01866841, + "balance_loss_mlp": 1.02500856, + "epoch": 0.18584097399669322, + "flos": 23950731415680.0, + "grad_norm": 1.991318415116826, + "language_loss": 0.81947285, + "learning_rate": 3.669027559900509e-06, + "loss": 0.84073359, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.640625, + "step": 3091, + "time_per_iteration": 2.428163528442383 + }, + { + "auxiliary_loss_clip": 0.01094214, + "auxiliary_loss_mlp": 0.01042256, + "balance_loss_clip": 1.02330172, + "balance_loss_mlp": 1.02634609, + "epoch": 0.18590109724936119, + "flos": 17200728552960.0, + "grad_norm": 5.040958102622603, + "language_loss": 0.84161019, + "learning_rate": 3.6688193549029397e-06, + "loss": 0.86297488, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6796875, + "step": 3092, + "time_per_iteration": 2.395855188369751 + }, + { + "auxiliary_loss_clip": 0.01096124, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.02018285, + "balance_loss_mlp": 1.0266664, + "epoch": 0.18596122050202915, + "flos": 17233791477120.0, + "grad_norm": 2.3730589433562215, + "language_loss": 0.93141162, + "learning_rate": 3.6686110903494995e-06, + "loss": 0.95277476, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.6953125, + "step": 3093, + "time_per_iteration": 2.3353850841522217 + }, + { + "auxiliary_loss_clip": 0.01094786, + "auxiliary_loss_mlp": 0.01040432, + "balance_loss_clip": 1.02232397, + "balance_loss_mlp": 1.02821648, + "epoch": 0.18602134375469712, + "flos": 19019878277760.0, + "grad_norm": 1.8480169426734527, + "language_loss": 0.77004647, + "learning_rate": 3.668402766247622e-06, + "loss": 0.7913987, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6640625, + "step": 3094, + "time_per_iteration": 2.387707233428955 + }, + { + "auxiliary_loss_clip": 0.01094195, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.01876283, + "balance_loss_mlp": 1.02692091, + "epoch": 0.1860814670073651, + "flos": 50948229248640.0, + "grad_norm": 1.6034813027980024, + "language_loss": 0.69743431, + "learning_rate": 3.6681943826047413e-06, + "loss": 0.71874893, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.671875, + "step": 3095, + "time_per_iteration": 2.652736186981201 + }, + { + "auxiliary_loss_clip": 0.01094172, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.01911986, + "balance_loss_mlp": 1.02603316, + "epoch": 0.18614159026003307, + "flos": 19389957356160.0, + "grad_norm": 2.022373330053034, + "language_loss": 0.80696297, + "learning_rate": 3.6679859394282944e-06, + "loss": 0.82829237, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6796875, + "step": 3096, + "time_per_iteration": 3.8538777828216553 + }, + { + "auxiliary_loss_clip": 0.01092448, + "auxiliary_loss_mlp": 0.010374, + "balance_loss_clip": 1.01905358, + "balance_loss_mlp": 1.02626419, + "epoch": 0.18620171351270104, + "flos": 21797707559040.0, + "grad_norm": 2.027790195257226, + "language_loss": 0.74775016, + "learning_rate": 3.6677774367257194e-06, + "loss": 0.76904869, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6640625, + "step": 3097, + "time_per_iteration": 2.3681282997131348 + }, + { + "auxiliary_loss_clip": 0.01092863, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.0169971, + "balance_loss_mlp": 1.02733111, + "epoch": 0.186261836765369, + "flos": 16361940205440.0, + "grad_norm": 2.02847078941279, + "language_loss": 0.65580666, + "learning_rate": 3.6675688745044583e-06, + "loss": 0.67709458, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.65625, + "step": 3098, + "time_per_iteration": 2.395278215408325 + }, + { + "auxiliary_loss_clip": 0.01092862, + "auxiliary_loss_mlp": 0.01038471, + "balance_loss_clip": 1.01805055, + "balance_loss_mlp": 1.02456856, + "epoch": 0.18632196001803697, + "flos": 23368868830080.0, + "grad_norm": 1.7684825156704067, + "language_loss": 0.6959098, + "learning_rate": 3.6673602527719533e-06, + "loss": 0.71722305, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.68359375, + "step": 3099, + "time_per_iteration": 5.235820531845093 + }, + { + "auxiliary_loss_clip": 0.01095068, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02186322, + "balance_loss_mlp": 1.02683389, + "epoch": 0.18638208327070493, + "flos": 22490908070400.0, + "grad_norm": 1.54436392584293, + "language_loss": 0.71356487, + "learning_rate": 3.66715157153565e-06, + "loss": 0.73493195, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.6796875, + "step": 3100, + "time_per_iteration": 2.3982720375061035 + }, + { + "auxiliary_loss_clip": 0.01094381, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_clip": 1.02631426, + "balance_loss_mlp": 1.02628279, + "epoch": 0.18644220652337293, + "flos": 29164067297280.0, + "grad_norm": 1.9548442918114333, + "language_loss": 0.80125928, + "learning_rate": 3.666942830802996e-06, + "loss": 0.82266021, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6796875, + "step": 3101, + "time_per_iteration": 3.767545223236084 + }, + { + "auxiliary_loss_clip": 0.01090047, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.01759875, + "balance_loss_mlp": 1.02579045, + "epoch": 0.1865023297760409, + "flos": 24242640226560.0, + "grad_norm": 1.8817863431958264, + "language_loss": 0.76819777, + "learning_rate": 3.6667340305814394e-06, + "loss": 0.78945494, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.640625, + "step": 3102, + "time_per_iteration": 2.4271433353424072 + }, + { + "auxiliary_loss_clip": 0.01095077, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.01529002, + "balance_loss_mlp": 1.02549887, + "epoch": 0.18656245302870886, + "flos": 19127899624320.0, + "grad_norm": 2.3303646098086364, + "language_loss": 0.76854289, + "learning_rate": 3.6665251708784325e-06, + "loss": 0.78982925, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.6953125, + "step": 3103, + "time_per_iteration": 2.3667640686035156 + }, + { + "auxiliary_loss_clip": 0.01094081, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.0222609, + "balance_loss_mlp": 1.0266794, + "epoch": 0.18662257628137682, + "flos": 17785104756480.0, + "grad_norm": 1.6592553548614029, + "language_loss": 0.79195917, + "learning_rate": 3.6663162517014294e-06, + "loss": 0.81330061, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.67578125, + "step": 3104, + "time_per_iteration": 2.387533187866211 + }, + { + "auxiliary_loss_clip": 0.01093203, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.01515698, + "balance_loss_mlp": 1.0283078, + "epoch": 0.1866826995340448, + "flos": 24023246042880.0, + "grad_norm": 2.181170976821608, + "language_loss": 0.85263824, + "learning_rate": 3.6661072730578858e-06, + "loss": 0.87389648, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.6484375, + "step": 3105, + "time_per_iteration": 2.407792568206787 + }, + { + "auxiliary_loss_clip": 0.01095813, + "auxiliary_loss_mlp": 0.01037537, + "balance_loss_clip": 1.01649654, + "balance_loss_mlp": 1.02444923, + "epoch": 0.18674282278671275, + "flos": 26140030041600.0, + "grad_norm": 2.084470877677171, + "language_loss": 0.86739075, + "learning_rate": 3.665898234955259e-06, + "loss": 0.88872427, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.71484375, + "step": 3106, + "time_per_iteration": 2.4420242309570312 + }, + { + "auxiliary_loss_clip": 0.01094503, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.01956582, + "balance_loss_mlp": 1.02600205, + "epoch": 0.18680294603938072, + "flos": 19201112478720.0, + "grad_norm": 1.9136123432418746, + "language_loss": 0.78331274, + "learning_rate": 3.6656891374010097e-06, + "loss": 0.80464196, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6875, + "step": 3107, + "time_per_iteration": 2.356750726699829 + }, + { + "auxiliary_loss_clip": 0.01094176, + "auxiliary_loss_mlp": 0.0104153, + "balance_loss_clip": 1.02033496, + "balance_loss_mlp": 1.02441061, + "epoch": 0.1868630692920487, + "flos": 28543730526720.0, + "grad_norm": 2.008652235562938, + "language_loss": 0.73930967, + "learning_rate": 3.665479980402599e-06, + "loss": 0.76066679, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.6953125, + "step": 3108, + "time_per_iteration": 2.44694185256958 + }, + { + "auxiliary_loss_clip": 0.01094503, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.01715183, + "balance_loss_mlp": 1.02770066, + "epoch": 0.18692319254471668, + "flos": 17237073144960.0, + "grad_norm": 1.7324864186310476, + "language_loss": 0.74367827, + "learning_rate": 3.665270763967493e-06, + "loss": 0.76498055, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.66796875, + "step": 3109, + "time_per_iteration": 2.361837387084961 + }, + { + "auxiliary_loss_clip": 0.01091546, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.01597452, + "balance_loss_mlp": 1.02526867, + "epoch": 0.18698331579738464, + "flos": 23184073670400.0, + "grad_norm": 1.6227875736907937, + "language_loss": 0.84263664, + "learning_rate": 3.6650614881031567e-06, + "loss": 0.86389971, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6640625, + "step": 3110, + "time_per_iteration": 2.40545916557312 + }, + { + "auxiliary_loss_clip": 0.01095176, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.02092266, + "balance_loss_mlp": 1.02745783, + "epoch": 0.1870434390500526, + "flos": 25515643553280.0, + "grad_norm": 2.1454489632159643, + "language_loss": 0.84406185, + "learning_rate": 3.664852152817059e-06, + "loss": 0.86541569, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.67578125, + "step": 3111, + "time_per_iteration": 2.4096744060516357 + }, + { + "auxiliary_loss_clip": 0.01091842, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.01865137, + "balance_loss_mlp": 1.0261302, + "epoch": 0.18710356230272057, + "flos": 19499794093440.0, + "grad_norm": 1.9619282300149858, + "language_loss": 0.76870215, + "learning_rate": 3.6646427581166702e-06, + "loss": 0.78998768, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.65625, + "step": 3112, + "time_per_iteration": 2.378523111343384 + }, + { + "auxiliary_loss_clip": 0.0109347, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.01946771, + "balance_loss_mlp": 1.02579355, + "epoch": 0.18716368555538854, + "flos": 26759633673600.0, + "grad_norm": 2.0331328031005156, + "language_loss": 0.64472282, + "learning_rate": 3.6644333040094636e-06, + "loss": 0.66603267, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.67578125, + "step": 3113, + "time_per_iteration": 2.4664793014526367 + }, + { + "auxiliary_loss_clip": 0.0109792, + "auxiliary_loss_mlp": 0.01032934, + "balance_loss_clip": 1.01378942, + "balance_loss_mlp": 1.02778101, + "epoch": 0.1872238088080565, + "flos": 25188716782080.0, + "grad_norm": 3.8465490341367548, + "language_loss": 0.81099665, + "learning_rate": 3.6642237905029132e-06, + "loss": 0.83230519, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.703125, + "step": 3114, + "time_per_iteration": 2.439657688140869 + }, + { + "auxiliary_loss_clip": 0.01094659, + "auxiliary_loss_mlp": 0.01041677, + "balance_loss_clip": 1.02135181, + "balance_loss_mlp": 1.02711296, + "epoch": 0.1872839320607245, + "flos": 24133152602880.0, + "grad_norm": 1.890025172784249, + "language_loss": 0.57458973, + "learning_rate": 3.664014217604497e-06, + "loss": 0.59595311, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.67578125, + "step": 3115, + "time_per_iteration": 2.4208052158355713 + }, + { + "auxiliary_loss_clip": 0.01091956, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.01615715, + "balance_loss_mlp": 1.02803063, + "epoch": 0.18734405531339246, + "flos": 21172867223040.0, + "grad_norm": 2.0238768270140497, + "language_loss": 0.71198618, + "learning_rate": 3.6638045853216938e-06, + "loss": 0.733248, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.640625, + "step": 3116, + "time_per_iteration": 2.40444016456604 + }, + { + "auxiliary_loss_clip": 0.01089161, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.01394904, + "balance_loss_mlp": 1.02528214, + "epoch": 0.18740417856606043, + "flos": 17236758942720.0, + "grad_norm": 1.9591007837706198, + "language_loss": 0.78899264, + "learning_rate": 3.663594893661985e-06, + "loss": 0.81019044, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.640625, + "step": 3117, + "time_per_iteration": 2.3701653480529785 + }, + { + "auxiliary_loss_clip": 0.01092906, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.01637793, + "balance_loss_mlp": 1.02760911, + "epoch": 0.1874643018187284, + "flos": 32556787176960.0, + "grad_norm": 1.7784577775703345, + "language_loss": 0.84191912, + "learning_rate": 3.663385142632853e-06, + "loss": 0.86318696, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.65234375, + "step": 3118, + "time_per_iteration": 2.495004177093506 + }, + { + "auxiliary_loss_clip": 0.01092904, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01405048, + "balance_loss_mlp": 1.0258255, + "epoch": 0.18752442507139636, + "flos": 23257042145280.0, + "grad_norm": 10.896413965969675, + "language_loss": 0.76097798, + "learning_rate": 3.663175332241785e-06, + "loss": 0.78222704, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.66796875, + "step": 3119, + "time_per_iteration": 2.3944449424743652 + }, + { + "auxiliary_loss_clip": 0.01094838, + "auxiliary_loss_mlp": 0.01039663, + "balance_loss_clip": 1.02112603, + "balance_loss_mlp": 1.02713871, + "epoch": 0.18758454832406432, + "flos": 21759896689920.0, + "grad_norm": 1.9147952443760252, + "language_loss": 0.82168788, + "learning_rate": 3.6629654624962666e-06, + "loss": 0.84303284, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6796875, + "step": 3120, + "time_per_iteration": 2.40030574798584 + }, + { + "auxiliary_loss_clip": 0.01089715, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.01741982, + "balance_loss_mlp": 1.02592897, + "epoch": 0.1876446715767323, + "flos": 29568919956480.0, + "grad_norm": 2.0956589745081087, + "language_loss": 0.85304511, + "learning_rate": 3.6627555334037893e-06, + "loss": 0.87428761, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.63671875, + "step": 3121, + "time_per_iteration": 2.442066192626953 + }, + { + "auxiliary_loss_clip": 0.01092619, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.0192734, + "balance_loss_mlp": 1.0265708, + "epoch": 0.18770479482940028, + "flos": 30338580078720.0, + "grad_norm": 1.771444275332751, + "language_loss": 0.70667934, + "learning_rate": 3.662545544971844e-06, + "loss": 0.72797626, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.66015625, + "step": 3122, + "time_per_iteration": 2.4791295528411865 + }, + { + "auxiliary_loss_clip": 0.01088481, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.01690674, + "balance_loss_mlp": 1.02441263, + "epoch": 0.18776491808206824, + "flos": 14464480567680.0, + "grad_norm": 2.354608206212646, + "language_loss": 0.70926332, + "learning_rate": 3.662335497207924e-06, + "loss": 0.73050606, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.640625, + "step": 3123, + "time_per_iteration": 2.3511626720428467 + }, + { + "auxiliary_loss_clip": 0.01090677, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.01949656, + "balance_loss_mlp": 1.02599788, + "epoch": 0.1878250413347362, + "flos": 24497401484160.0, + "grad_norm": 1.9326721085903336, + "language_loss": 0.74157596, + "learning_rate": 3.662125390119527e-06, + "loss": 0.76283926, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.6484375, + "step": 3124, + "time_per_iteration": 2.408418655395508 + }, + { + "auxiliary_loss_clip": 0.01092443, + "auxiliary_loss_mlp": 0.01036397, + "balance_loss_clip": 1.01840878, + "balance_loss_mlp": 1.02593827, + "epoch": 0.18788516458740417, + "flos": 39784611173760.0, + "grad_norm": 1.6700523984093973, + "language_loss": 0.8071084, + "learning_rate": 3.66191522371415e-06, + "loss": 0.8283968, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6640625, + "step": 3125, + "time_per_iteration": 2.548044443130493 + }, + { + "auxiliary_loss_clip": 0.01028515, + "auxiliary_loss_mlp": 0.01011871, + "balance_loss_clip": 1.00909352, + "balance_loss_mlp": 1.00887442, + "epoch": 0.18794528784007214, + "flos": 64696151738880.0, + "grad_norm": 0.9630832536964263, + "language_loss": 0.63649619, + "learning_rate": 3.6617049979992937e-06, + "loss": 0.65690005, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.19628906, + "step": 3126, + "time_per_iteration": 3.0950701236724854 + }, + { + "auxiliary_loss_clip": 0.01090555, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.01673269, + "balance_loss_mlp": 1.02696335, + "epoch": 0.1880054110927401, + "flos": 28620783630720.0, + "grad_norm": 1.6531561448667726, + "language_loss": 0.81115246, + "learning_rate": 3.6614947129824603e-06, + "loss": 0.83240718, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.63671875, + "step": 3127, + "time_per_iteration": 2.4451937675476074 + }, + { + "auxiliary_loss_clip": 0.01026604, + "auxiliary_loss_mlp": 0.01004844, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00730681, + "epoch": 0.1880655343454081, + "flos": 64485625040640.0, + "grad_norm": 0.7640370653681977, + "language_loss": 0.61857343, + "learning_rate": 3.6612843686711542e-06, + "loss": 0.63888794, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.02941895, + "router_z_loss_mlp": 0.19335938, + "step": 3128, + "time_per_iteration": 3.1438512802124023 + }, + { + "auxiliary_loss_clip": 0.01095201, + "auxiliary_loss_mlp": 0.01029671, + "balance_loss_clip": 1.0115037, + "balance_loss_mlp": 1.02658224, + "epoch": 0.18812565759807606, + "flos": 32123095868160.0, + "grad_norm": 2.1786778699681593, + "language_loss": 0.70593059, + "learning_rate": 3.661073965072883e-06, + "loss": 0.72717929, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6875, + "step": 3129, + "time_per_iteration": 2.520672559738159 + }, + { + "auxiliary_loss_clip": 0.01094378, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.02124667, + "balance_loss_mlp": 1.02671683, + "epoch": 0.18818578085074403, + "flos": 20623683536640.0, + "grad_norm": 2.4633057488059817, + "language_loss": 0.85206509, + "learning_rate": 3.6608635021951546e-06, + "loss": 0.87341321, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.67578125, + "step": 3130, + "time_per_iteration": 2.4170432090759277 + }, + { + "auxiliary_loss_clip": 0.01093009, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.01569831, + "balance_loss_mlp": 1.025419, + "epoch": 0.188245904103412, + "flos": 28839235207680.0, + "grad_norm": 2.0859429617400593, + "language_loss": 0.77010924, + "learning_rate": 3.6606529800454794e-06, + "loss": 0.7913928, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.67578125, + "step": 3131, + "time_per_iteration": 2.4445645809173584 + }, + { + "auxiliary_loss_clip": 0.01091879, + "auxiliary_loss_mlp": 0.01037633, + "balance_loss_clip": 1.0196805, + "balance_loss_mlp": 1.02741194, + "epoch": 0.18830602735607996, + "flos": 29419142757120.0, + "grad_norm": 2.053265037194725, + "language_loss": 0.81552517, + "learning_rate": 3.660442398631372e-06, + "loss": 0.83682024, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.64453125, + "step": 3132, + "time_per_iteration": 2.4377148151397705 + }, + { + "auxiliary_loss_clip": 0.01094441, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.02169347, + "balance_loss_mlp": 1.02709687, + "epoch": 0.18836615060874792, + "flos": 28871774461440.0, + "grad_norm": 2.180847825789763, + "language_loss": 0.79780543, + "learning_rate": 3.660231757960346e-06, + "loss": 0.81915224, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.671875, + "step": 3133, + "time_per_iteration": 2.4449267387390137 + }, + { + "auxiliary_loss_clip": 0.01093615, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.02441084, + "balance_loss_mlp": 1.02740383, + "epoch": 0.18842627386141592, + "flos": 22600570250880.0, + "grad_norm": 11.22870173067583, + "language_loss": 0.82609212, + "learning_rate": 3.660021058039919e-06, + "loss": 0.84746218, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.66015625, + "step": 3134, + "time_per_iteration": 2.400921583175659 + }, + { + "auxiliary_loss_clip": 0.01092727, + "auxiliary_loss_mlp": 0.01038447, + "balance_loss_clip": 1.01982653, + "balance_loss_mlp": 1.02743077, + "epoch": 0.18848639711408388, + "flos": 24572394817920.0, + "grad_norm": 1.5148271158508548, + "language_loss": 0.8306362, + "learning_rate": 3.659810298877611e-06, + "loss": 0.8519479, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.65234375, + "step": 3135, + "time_per_iteration": 3.787660598754883 + }, + { + "auxiliary_loss_clip": 0.01098167, + "auxiliary_loss_mlp": 0.010379, + "balance_loss_clip": 1.01861191, + "balance_loss_mlp": 1.02853096, + "epoch": 0.18854652036675185, + "flos": 34165514937600.0, + "grad_norm": 2.004547343841207, + "language_loss": 0.78512704, + "learning_rate": 3.659599480480943e-06, + "loss": 0.80648768, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6953125, + "step": 3136, + "time_per_iteration": 2.479468822479248 + }, + { + "auxiliary_loss_clip": 0.01093813, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.02251983, + "balance_loss_mlp": 1.02774096, + "epoch": 0.1886066436194198, + "flos": 24199278451200.0, + "grad_norm": 2.0835368106190146, + "language_loss": 0.81216836, + "learning_rate": 3.659388602857438e-06, + "loss": 0.83352721, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.66015625, + "step": 3137, + "time_per_iteration": 2.4111809730529785 + }, + { + "auxiliary_loss_clip": 0.01095639, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.01908875, + "balance_loss_mlp": 1.02895725, + "epoch": 0.18866676687208778, + "flos": 21250059972480.0, + "grad_norm": 1.5100388369519946, + "language_loss": 0.80736995, + "learning_rate": 3.6591776660146225e-06, + "loss": 0.82869452, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.66796875, + "step": 3138, + "time_per_iteration": 3.7891883850097656 + }, + { + "auxiliary_loss_clip": 0.01095721, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.02066553, + "balance_loss_mlp": 1.02673495, + "epoch": 0.18872689012475574, + "flos": 37307069429760.0, + "grad_norm": 2.0378947896863555, + "language_loss": 0.63375771, + "learning_rate": 3.6589666699600247e-06, + "loss": 0.65510619, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.69140625, + "step": 3139, + "time_per_iteration": 3.908792495727539 + }, + { + "auxiliary_loss_clip": 0.01092315, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.017326, + "balance_loss_mlp": 1.02600133, + "epoch": 0.1887870133774237, + "flos": 21651246938880.0, + "grad_norm": 2.4065564903787893, + "language_loss": 0.71284431, + "learning_rate": 3.6587556147011728e-06, + "loss": 0.73414028, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6640625, + "step": 3140, + "time_per_iteration": 3.7559385299682617 + }, + { + "auxiliary_loss_clip": 0.01094735, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.01778328, + "balance_loss_mlp": 1.02658761, + "epoch": 0.1888471366300917, + "flos": 15923745331200.0, + "grad_norm": 2.4677283334453546, + "language_loss": 0.87063736, + "learning_rate": 3.6585445002456004e-06, + "loss": 0.89196122, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6796875, + "step": 3141, + "time_per_iteration": 2.3590939044952393 + }, + { + "auxiliary_loss_clip": 0.01096879, + "auxiliary_loss_mlp": 0.01036378, + "balance_loss_clip": 1.01556396, + "balance_loss_mlp": 1.0269196, + "epoch": 0.18890725988275966, + "flos": 18550959540480.0, + "grad_norm": 1.8405154147025118, + "language_loss": 0.7696079, + "learning_rate": 3.6583333266008404e-06, + "loss": 0.79094052, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.69921875, + "step": 3142, + "time_per_iteration": 2.342622756958008 + }, + { + "auxiliary_loss_clip": 0.01091897, + "auxiliary_loss_mlp": 0.01035272, + "balance_loss_clip": 1.01705718, + "balance_loss_mlp": 1.02641535, + "epoch": 0.18896738313542763, + "flos": 28839584321280.0, + "grad_norm": 1.7804266465807372, + "language_loss": 0.78882277, + "learning_rate": 3.6581220937744305e-06, + "loss": 0.81009448, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.65234375, + "step": 3143, + "time_per_iteration": 2.4450275897979736 + }, + { + "auxiliary_loss_clip": 0.01094285, + "auxiliary_loss_mlp": 0.01040665, + "balance_loss_clip": 1.02175856, + "balance_loss_mlp": 1.02750754, + "epoch": 0.1890275063880956, + "flos": 22411830107520.0, + "grad_norm": 2.287618186149079, + "language_loss": 0.71571839, + "learning_rate": 3.6579108017739076e-06, + "loss": 0.73706782, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.66796875, + "step": 3144, + "time_per_iteration": 2.3720335960388184 + }, + { + "auxiliary_loss_clip": 0.01094998, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.01803827, + "balance_loss_mlp": 1.02687371, + "epoch": 0.18908762964076356, + "flos": 24242744960640.0, + "grad_norm": 2.6075958756325393, + "language_loss": 0.73052001, + "learning_rate": 3.6576994506068136e-06, + "loss": 0.75184077, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6796875, + "step": 3145, + "time_per_iteration": 2.4356400966644287 + }, + { + "auxiliary_loss_clip": 0.0109119, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.01741934, + "balance_loss_mlp": 1.02506208, + "epoch": 0.18914775289343153, + "flos": 16981962773760.0, + "grad_norm": 2.683121301152449, + "language_loss": 0.76936823, + "learning_rate": 3.6574880402806897e-06, + "loss": 0.79062837, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.6640625, + "step": 3146, + "time_per_iteration": 2.351532220840454 + }, + { + "auxiliary_loss_clip": 0.01093525, + "auxiliary_loss_mlp": 0.01039422, + "balance_loss_clip": 1.02119517, + "balance_loss_mlp": 1.02663589, + "epoch": 0.1892078761460995, + "flos": 21542701921920.0, + "grad_norm": 2.1745879156237082, + "language_loss": 0.78983533, + "learning_rate": 3.6572765708030813e-06, + "loss": 0.81116486, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.66796875, + "step": 3147, + "time_per_iteration": 2.396304130554199 + }, + { + "auxiliary_loss_clip": 0.01090724, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.01547456, + "balance_loss_mlp": 1.02646947, + "epoch": 0.18926799939876748, + "flos": 23000465496960.0, + "grad_norm": 2.8520065875250187, + "language_loss": 0.66726327, + "learning_rate": 3.657065042181536e-06, + "loss": 0.68849766, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.640625, + "step": 3148, + "time_per_iteration": 2.395343065261841 + }, + { + "auxiliary_loss_clip": 0.01091736, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.01361406, + "balance_loss_mlp": 1.02645969, + "epoch": 0.18932812265143545, + "flos": 22271932823040.0, + "grad_norm": 2.445524490717879, + "language_loss": 0.76157504, + "learning_rate": 3.6568534544236008e-06, + "loss": 0.78279638, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.65234375, + "step": 3149, + "time_per_iteration": 2.3933093547821045 + }, + { + "auxiliary_loss_clip": 0.01090073, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.02015185, + "balance_loss_mlp": 1.02690399, + "epoch": 0.1893882459041034, + "flos": 18623439256320.0, + "grad_norm": 3.363665441741508, + "language_loss": 0.81101823, + "learning_rate": 3.656641807536828e-06, + "loss": 0.83228737, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.6328125, + "step": 3150, + "time_per_iteration": 2.3639278411865234 + }, + { + "auxiliary_loss_clip": 0.01095356, + "auxiliary_loss_mlp": 0.01041358, + "balance_loss_clip": 1.022928, + "balance_loss_mlp": 1.02777815, + "epoch": 0.18944836915677138, + "flos": 22891885568640.0, + "grad_norm": 2.0949354009812304, + "language_loss": 0.84872854, + "learning_rate": 3.6564301015287706e-06, + "loss": 0.87009573, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.67578125, + "step": 3151, + "time_per_iteration": 2.3968522548675537 + }, + { + "auxiliary_loss_clip": 0.01095155, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.0226388, + "balance_loss_mlp": 1.02819836, + "epoch": 0.18950849240943934, + "flos": 26795349861120.0, + "grad_norm": 1.9176161239989238, + "language_loss": 0.74011457, + "learning_rate": 3.6562183364069835e-06, + "loss": 0.76146793, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.66796875, + "step": 3152, + "time_per_iteration": 2.4383933544158936 + }, + { + "auxiliary_loss_clip": 0.01091615, + "auxiliary_loss_mlp": 0.01039334, + "balance_loss_clip": 1.02085638, + "balance_loss_mlp": 1.02577949, + "epoch": 0.1895686156621073, + "flos": 24970125559680.0, + "grad_norm": 1.8800007116162436, + "language_loss": 0.75120592, + "learning_rate": 3.6560065121790244e-06, + "loss": 0.77251536, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.65625, + "step": 3153, + "time_per_iteration": 2.4134364128112793 + }, + { + "auxiliary_loss_clip": 0.01093964, + "auxiliary_loss_mlp": 0.01037649, + "balance_loss_clip": 1.01952863, + "balance_loss_mlp": 1.0262568, + "epoch": 0.1896287389147753, + "flos": 21943469952000.0, + "grad_norm": 5.572109106942339, + "language_loss": 0.79413539, + "learning_rate": 3.655794628852453e-06, + "loss": 0.8154515, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6796875, + "step": 3154, + "time_per_iteration": 2.3982646465301514 + }, + { + "auxiliary_loss_clip": 0.01094087, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.01841521, + "balance_loss_mlp": 1.02587223, + "epoch": 0.18968886216744327, + "flos": 18178297021440.0, + "grad_norm": 2.80094461298542, + "language_loss": 0.72725987, + "learning_rate": 3.6555826864348297e-06, + "loss": 0.74857867, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6796875, + "step": 3155, + "time_per_iteration": 2.3434884548187256 + }, + { + "auxiliary_loss_clip": 0.01089547, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.01737821, + "balance_loss_mlp": 1.02359009, + "epoch": 0.18974898542011123, + "flos": 20411446181760.0, + "grad_norm": 2.2744931621323725, + "language_loss": 0.7401787, + "learning_rate": 3.6553706849337197e-06, + "loss": 0.76142752, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.65625, + "step": 3156, + "time_per_iteration": 2.408158540725708 + }, + { + "auxiliary_loss_clip": 0.01093336, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.01494694, + "balance_loss_mlp": 1.02672505, + "epoch": 0.1898091086727792, + "flos": 23983968896640.0, + "grad_norm": 1.781444490073804, + "language_loss": 0.67989981, + "learning_rate": 3.6551586243566877e-06, + "loss": 0.7011584, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6640625, + "step": 3157, + "time_per_iteration": 2.396543502807617 + }, + { + "auxiliary_loss_clip": 0.01091825, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.01412976, + "balance_loss_mlp": 1.02482057, + "epoch": 0.18986923192544716, + "flos": 27635813953920.0, + "grad_norm": 1.7151154497333212, + "language_loss": 0.79707837, + "learning_rate": 3.654946504711302e-06, + "loss": 0.81831336, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.671875, + "step": 3158, + "time_per_iteration": 2.4283323287963867 + }, + { + "auxiliary_loss_clip": 0.01096707, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.01951361, + "balance_loss_mlp": 1.02673757, + "epoch": 0.18992935517811513, + "flos": 25482964654080.0, + "grad_norm": 2.665136737825096, + "language_loss": 0.72027659, + "learning_rate": 3.6547343260051323e-06, + "loss": 0.74164271, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.69921875, + "step": 3159, + "time_per_iteration": 2.4149372577667236 + }, + { + "auxiliary_loss_clip": 0.01093615, + "auxiliary_loss_mlp": 0.0104252, + "balance_loss_clip": 1.02317274, + "balance_loss_mlp": 1.02667046, + "epoch": 0.1899894784307831, + "flos": 17419843445760.0, + "grad_norm": 2.4817875191132286, + "language_loss": 0.85185206, + "learning_rate": 3.6545220882457518e-06, + "loss": 0.87321341, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.671875, + "step": 3160, + "time_per_iteration": 2.370260238647461 + }, + { + "auxiliary_loss_clip": 0.01088359, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.02783227, + "balance_loss_mlp": 1.02554953, + "epoch": 0.19004960168345109, + "flos": 27490959256320.0, + "grad_norm": 1.8624981820899977, + "language_loss": 0.73385042, + "learning_rate": 3.6543097914407336e-06, + "loss": 0.75518227, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.625, + "step": 3161, + "time_per_iteration": 2.4460105895996094 + }, + { + "auxiliary_loss_clip": 0.01090908, + "auxiliary_loss_mlp": 0.01041006, + "balance_loss_clip": 1.0234704, + "balance_loss_mlp": 1.02615905, + "epoch": 0.19010972493611905, + "flos": 38653145965440.0, + "grad_norm": 1.8150751487237726, + "language_loss": 0.80446106, + "learning_rate": 3.6540974355976537e-06, + "loss": 0.82578015, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6484375, + "step": 3162, + "time_per_iteration": 2.5258138179779053 + }, + { + "auxiliary_loss_clip": 0.01092994, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01316249, + "balance_loss_mlp": 1.02582717, + "epoch": 0.19016984818878702, + "flos": 19243741115520.0, + "grad_norm": 3.1610870978860692, + "language_loss": 0.75388765, + "learning_rate": 3.653885020724092e-06, + "loss": 0.77513468, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.671875, + "step": 3163, + "time_per_iteration": 2.3539695739746094 + }, + { + "auxiliary_loss_clip": 0.01091513, + "auxiliary_loss_mlp": 0.01038314, + "balance_loss_clip": 1.02019417, + "balance_loss_mlp": 1.02672601, + "epoch": 0.19022997144145498, + "flos": 37595382370560.0, + "grad_norm": 2.5051610458785984, + "language_loss": 0.74053907, + "learning_rate": 3.653672546827628e-06, + "loss": 0.76183736, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6484375, + "step": 3164, + "time_per_iteration": 2.5292487144470215 + }, + { + "auxiliary_loss_clip": 0.01093023, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.01123857, + "balance_loss_mlp": 1.02741444, + "epoch": 0.19029009469412295, + "flos": 61528762840320.0, + "grad_norm": 1.4450338276097412, + "language_loss": 0.66605741, + "learning_rate": 3.653460013915844e-06, + "loss": 0.68727708, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.65625, + "step": 3165, + "time_per_iteration": 2.7367405891418457 + }, + { + "auxiliary_loss_clip": 0.01095137, + "auxiliary_loss_mlp": 0.01038974, + "balance_loss_clip": 1.02067578, + "balance_loss_mlp": 1.02840436, + "epoch": 0.1903502179467909, + "flos": 13953980534400.0, + "grad_norm": 2.4984910791317807, + "language_loss": 0.73748457, + "learning_rate": 3.653247421996326e-06, + "loss": 0.75882566, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6640625, + "step": 3166, + "time_per_iteration": 2.3404548168182373 + }, + { + "auxiliary_loss_clip": 0.01027155, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.02271545, + "balance_loss_mlp": 1.0086807, + "epoch": 0.1904103411994589, + "flos": 66896168152320.0, + "grad_norm": 0.7936556718366062, + "language_loss": 0.50340271, + "learning_rate": 3.65303477107666e-06, + "loss": 0.52393174, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.03027344, + "router_z_loss_mlp": 0.18457031, + "step": 3167, + "time_per_iteration": 3.0473318099975586 + }, + { + "auxiliary_loss_clip": 0.01092101, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.01715732, + "balance_loss_mlp": 1.02729797, + "epoch": 0.19047046445212687, + "flos": 21907649030400.0, + "grad_norm": 1.9924075328792246, + "language_loss": 0.7409988, + "learning_rate": 3.6528220611644356e-06, + "loss": 0.7622633, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6484375, + "step": 3168, + "time_per_iteration": 2.402815103530884 + }, + { + "auxiliary_loss_clip": 0.01024807, + "auxiliary_loss_mlp": 0.0101354, + "balance_loss_clip": 1.01065552, + "balance_loss_mlp": 1.00624621, + "epoch": 0.19053058770479483, + "flos": 59252424595200.0, + "grad_norm": 0.8669423342558235, + "language_loss": 0.6008268, + "learning_rate": 3.652609292267242e-06, + "loss": 0.62121028, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.02880859, + "router_z_loss_mlp": 0.18554688, + "step": 3169, + "time_per_iteration": 2.9471940994262695 + }, + { + "auxiliary_loss_clip": 0.01095435, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.02373552, + "balance_loss_mlp": 1.0265286, + "epoch": 0.1905907109574628, + "flos": 23950172833920.0, + "grad_norm": 1.6654320331704824, + "language_loss": 0.78398848, + "learning_rate": 3.6523964643926754e-06, + "loss": 0.8053571, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.6875, + "step": 3170, + "time_per_iteration": 2.4139318466186523 + }, + { + "auxiliary_loss_clip": 0.01089523, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.01806617, + "balance_loss_mlp": 1.02465212, + "epoch": 0.19065083421013077, + "flos": 20811306516480.0, + "grad_norm": 1.71655659388284, + "language_loss": 0.78177553, + "learning_rate": 3.6521835775483285e-06, + "loss": 0.80303192, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6484375, + "step": 3171, + "time_per_iteration": 2.3902087211608887 + }, + { + "auxiliary_loss_clip": 0.01093806, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.01995897, + "balance_loss_mlp": 1.02565539, + "epoch": 0.19071095746279873, + "flos": 31283644204800.0, + "grad_norm": 2.0296539216697793, + "language_loss": 0.77943277, + "learning_rate": 3.6519706317417995e-06, + "loss": 0.8007623, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6796875, + "step": 3172, + "time_per_iteration": 2.4620447158813477 + }, + { + "auxiliary_loss_clip": 0.01093869, + "auxiliary_loss_mlp": 0.0104006, + "balance_loss_clip": 1.02184463, + "balance_loss_mlp": 1.02689338, + "epoch": 0.1907710807154667, + "flos": 14355237323520.0, + "grad_norm": 7.969753133433176, + "language_loss": 0.80303502, + "learning_rate": 3.6517576269806885e-06, + "loss": 0.82437432, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.66796875, + "step": 3173, + "time_per_iteration": 2.364328384399414 + }, + { + "auxiliary_loss_clip": 0.01093834, + "auxiliary_loss_mlp": 0.01049215, + "balance_loss_clip": 1.03058267, + "balance_loss_mlp": 1.02608895, + "epoch": 0.1908312039681347, + "flos": 26905815002880.0, + "grad_norm": 1.5726743213791063, + "language_loss": 0.78732854, + "learning_rate": 3.651544563272597e-06, + "loss": 0.80875897, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.6796875, + "step": 3174, + "time_per_iteration": 3.873528242111206 + }, + { + "auxiliary_loss_clip": 0.0109644, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02575302, + "balance_loss_mlp": 1.02894258, + "epoch": 0.19089132722080265, + "flos": 14494017444480.0, + "grad_norm": 2.560932419383946, + "language_loss": 0.81298071, + "learning_rate": 3.651331440625127e-06, + "loss": 0.83438522, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.671875, + "step": 3175, + "time_per_iteration": 2.3709824085235596 + }, + { + "auxiliary_loss_clip": 0.01095595, + "auxiliary_loss_mlp": 0.01047697, + "balance_loss_clip": 1.02912462, + "balance_loss_mlp": 1.02780724, + "epoch": 0.19095145047347062, + "flos": 13952060409600.0, + "grad_norm": 2.1343172854609658, + "language_loss": 0.85423797, + "learning_rate": 3.651118259045887e-06, + "loss": 0.87567091, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.67578125, + "step": 3176, + "time_per_iteration": 2.352125883102417 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01050253, + "balance_loss_clip": 1.0299871, + "balance_loss_mlp": 1.02877474, + "epoch": 0.19101157372613858, + "flos": 25300648200960.0, + "grad_norm": 1.9692737698191998, + "language_loss": 0.81437957, + "learning_rate": 3.650905018542483e-06, + "loss": 0.8358618, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.69140625, + "step": 3177, + "time_per_iteration": 3.792844533920288 + }, + { + "auxiliary_loss_clip": 0.01092264, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.01786613, + "balance_loss_mlp": 1.02616823, + "epoch": 0.19107169697880655, + "flos": 20557173663360.0, + "grad_norm": 2.7706199197200676, + "language_loss": 0.74712181, + "learning_rate": 3.650691719122525e-06, + "loss": 0.76840878, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6640625, + "step": 3178, + "time_per_iteration": 3.7311582565307617 + }, + { + "auxiliary_loss_clip": 0.01094936, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.01819444, + "balance_loss_mlp": 1.02852631, + "epoch": 0.19113182023147451, + "flos": 22162130997120.0, + "grad_norm": 1.6771813715315846, + "language_loss": 0.81381947, + "learning_rate": 3.6504783607936266e-06, + "loss": 0.83513093, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6640625, + "step": 3179, + "time_per_iteration": 2.387749671936035 + }, + { + "auxiliary_loss_clip": 0.01095876, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.01747799, + "balance_loss_mlp": 1.02783322, + "epoch": 0.19119194348414248, + "flos": 18580985176320.0, + "grad_norm": 3.571695730743239, + "language_loss": 0.80022579, + "learning_rate": 3.6502649435634006e-06, + "loss": 0.82154197, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.6796875, + "step": 3180, + "time_per_iteration": 3.7456233501434326 + }, + { + "auxiliary_loss_clip": 0.01092087, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.02328157, + "balance_loss_mlp": 1.02579308, + "epoch": 0.19125206673681047, + "flos": 19025603740800.0, + "grad_norm": 2.4491028994390365, + "language_loss": 0.79080421, + "learning_rate": 3.6500514674394634e-06, + "loss": 0.81214631, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6640625, + "step": 3181, + "time_per_iteration": 2.3571255207061768 + }, + { + "auxiliary_loss_clip": 0.01094004, + "auxiliary_loss_mlp": 0.01037111, + "balance_loss_clip": 1.01886034, + "balance_loss_mlp": 1.02600908, + "epoch": 0.19131218998947844, + "flos": 21689057808000.0, + "grad_norm": 1.8849302621669406, + "language_loss": 0.73793995, + "learning_rate": 3.649837932429434e-06, + "loss": 0.75925112, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.6796875, + "step": 3182, + "time_per_iteration": 2.392728090286255 + }, + { + "auxiliary_loss_clip": 0.01094247, + "auxiliary_loss_mlp": 0.01040668, + "balance_loss_clip": 1.02185655, + "balance_loss_mlp": 1.02753794, + "epoch": 0.1913723132421464, + "flos": 18441506828160.0, + "grad_norm": 1.7585453217167473, + "language_loss": 0.64951855, + "learning_rate": 3.649624338540933e-06, + "loss": 0.67086768, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6640625, + "step": 3183, + "time_per_iteration": 2.3653316497802734 + }, + { + "auxiliary_loss_clip": 0.01092735, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_clip": 1.02306771, + "balance_loss_mlp": 1.0258553, + "epoch": 0.19143243649481437, + "flos": 27158935426560.0, + "grad_norm": 1.5167240814876268, + "language_loss": 0.73595703, + "learning_rate": 3.649410685781582e-06, + "loss": 0.75731504, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.66796875, + "step": 3184, + "time_per_iteration": 2.432704210281372 + }, + { + "auxiliary_loss_clip": 0.01092212, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.01520491, + "balance_loss_mlp": 1.02478993, + "epoch": 0.19149255974748233, + "flos": 21718071014400.0, + "grad_norm": 1.9895216461375365, + "language_loss": 0.88315654, + "learning_rate": 3.6491969741590075e-06, + "loss": 0.90443182, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.671875, + "step": 3185, + "time_per_iteration": 2.3759915828704834 + }, + { + "auxiliary_loss_clip": 0.01092097, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.0140481, + "balance_loss_mlp": 1.02576983, + "epoch": 0.1915526830001503, + "flos": 22962270602880.0, + "grad_norm": 2.0860743889738442, + "language_loss": 0.72633183, + "learning_rate": 3.648983203680834e-06, + "loss": 0.74758548, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6640625, + "step": 3186, + "time_per_iteration": 2.4003496170043945 + }, + { + "auxiliary_loss_clip": 0.01096054, + "auxiliary_loss_mlp": 0.0103823, + "balance_loss_clip": 1.01659393, + "balance_loss_mlp": 1.02652895, + "epoch": 0.1916128062528183, + "flos": 26139541282560.0, + "grad_norm": 1.784433829999298, + "language_loss": 0.83411252, + "learning_rate": 3.6487693743546927e-06, + "loss": 0.8554554, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.6953125, + "step": 3187, + "time_per_iteration": 2.4265408515930176 + }, + { + "auxiliary_loss_clip": 0.01027159, + "auxiliary_loss_mlp": 0.01014839, + "balance_loss_clip": 1.01260972, + "balance_loss_mlp": 1.00835872, + "epoch": 0.19167292950548626, + "flos": 54922809847680.0, + "grad_norm": 0.853340881450663, + "language_loss": 0.55857521, + "learning_rate": 3.648555486188213e-06, + "loss": 0.57899523, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.1875, + "step": 3188, + "time_per_iteration": 3.071138858795166 + }, + { + "auxiliary_loss_clip": 0.01093198, + "auxiliary_loss_mlp": 0.01037988, + "balance_loss_clip": 1.01933169, + "balance_loss_mlp": 1.02743053, + "epoch": 0.19173305275815422, + "flos": 29934286001280.0, + "grad_norm": 1.6054637380264414, + "language_loss": 0.70125937, + "learning_rate": 3.648341539189029e-06, + "loss": 0.72257119, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.65625, + "step": 3189, + "time_per_iteration": 2.4534246921539307 + }, + { + "auxiliary_loss_clip": 0.01088219, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.01462686, + "balance_loss_mlp": 1.02538657, + "epoch": 0.1917931760108222, + "flos": 24751359780480.0, + "grad_norm": 3.424436763277206, + "language_loss": 0.74134934, + "learning_rate": 3.648127533364775e-06, + "loss": 0.76254797, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.62890625, + "step": 3190, + "time_per_iteration": 2.4190168380737305 + }, + { + "auxiliary_loss_clip": 0.01092916, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_clip": 1.02938151, + "balance_loss_mlp": 1.02776313, + "epoch": 0.19185329926349015, + "flos": 18842554149120.0, + "grad_norm": 3.6163422025086005, + "language_loss": 0.8435185, + "learning_rate": 3.6479134687230887e-06, + "loss": 0.86493206, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6484375, + "step": 3191, + "time_per_iteration": 2.377803087234497 + }, + { + "auxiliary_loss_clip": 0.01090419, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.01468277, + "balance_loss_mlp": 1.0269171, + "epoch": 0.19191342251615812, + "flos": 22085880854400.0, + "grad_norm": 1.8367942314446566, + "language_loss": 0.89690745, + "learning_rate": 3.64769934527161e-06, + "loss": 0.91813433, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6328125, + "step": 3192, + "time_per_iteration": 2.38694167137146 + }, + { + "auxiliary_loss_clip": 0.01095632, + "auxiliary_loss_mlp": 0.01041773, + "balance_loss_clip": 1.02150786, + "balance_loss_mlp": 1.02851701, + "epoch": 0.19197354576882608, + "flos": 22198056652800.0, + "grad_norm": 1.7812500685174586, + "language_loss": 0.74489391, + "learning_rate": 3.64748516301798e-06, + "loss": 0.7662679, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.671875, + "step": 3193, + "time_per_iteration": 2.389866352081299 + }, + { + "auxiliary_loss_clip": 0.01096063, + "auxiliary_loss_mlp": 0.0103819, + "balance_loss_clip": 1.01852024, + "balance_loss_mlp": 1.02679372, + "epoch": 0.19203366902149407, + "flos": 24895132225920.0, + "grad_norm": 1.7237964102702663, + "language_loss": 0.7463479, + "learning_rate": 3.6472709219698422e-06, + "loss": 0.76769042, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.69140625, + "step": 3194, + "time_per_iteration": 2.419905185699463 + }, + { + "auxiliary_loss_clip": 0.01022737, + "auxiliary_loss_mlp": 0.01002894, + "balance_loss_clip": 1.00068891, + "balance_loss_mlp": 1.00399566, + "epoch": 0.19209379227416204, + "flos": 68413633885440.0, + "grad_norm": 0.7852784648745245, + "language_loss": 0.68454325, + "learning_rate": 3.647056622134843e-06, + "loss": 0.70479953, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.1875, + "step": 3195, + "time_per_iteration": 2.9521589279174805 + }, + { + "auxiliary_loss_clip": 0.0109407, + "auxiliary_loss_mlp": 0.01042562, + "balance_loss_clip": 1.02328563, + "balance_loss_mlp": 1.02698195, + "epoch": 0.19215391552683, + "flos": 22054074739200.0, + "grad_norm": 2.5447857934723115, + "language_loss": 0.72515213, + "learning_rate": 3.6468422635206297e-06, + "loss": 0.74651849, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.671875, + "step": 3196, + "time_per_iteration": 2.3983092308044434 + }, + { + "auxiliary_loss_clip": 0.01096925, + "auxiliary_loss_mlp": 0.01042756, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.03062785, + "epoch": 0.19221403877949797, + "flos": 20301923646720.0, + "grad_norm": 2.1716943994036444, + "language_loss": 0.83250105, + "learning_rate": 3.6466278461348514e-06, + "loss": 0.85389781, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6640625, + "step": 3197, + "time_per_iteration": 2.3770110607147217 + }, + { + "auxiliary_loss_clip": 0.01092871, + "auxiliary_loss_mlp": 0.01035991, + "balance_loss_clip": 1.0170486, + "balance_loss_mlp": 1.02595544, + "epoch": 0.19227416203216594, + "flos": 23184213315840.0, + "grad_norm": 2.105888060625776, + "language_loss": 0.80370164, + "learning_rate": 3.646413369985161e-06, + "loss": 0.82499027, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.66796875, + "step": 3198, + "time_per_iteration": 2.3984158039093018 + }, + { + "auxiliary_loss_clip": 0.01094667, + "auxiliary_loss_mlp": 0.01039524, + "balance_loss_clip": 1.01867473, + "balance_loss_mlp": 1.02617788, + "epoch": 0.1923342852848339, + "flos": 25775397135360.0, + "grad_norm": 2.109748742392438, + "language_loss": 0.78280067, + "learning_rate": 3.6461988350792137e-06, + "loss": 0.80414265, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.6875, + "step": 3199, + "time_per_iteration": 2.41072940826416 + }, + { + "auxiliary_loss_clip": 0.01094885, + "auxiliary_loss_mlp": 0.01036543, + "balance_loss_clip": 1.0186615, + "balance_loss_mlp": 1.02991319, + "epoch": 0.19239440853750187, + "flos": 17127410964480.0, + "grad_norm": 2.4234828034627993, + "language_loss": 0.83533007, + "learning_rate": 3.6459842414246636e-06, + "loss": 0.85664433, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6484375, + "step": 3200, + "time_per_iteration": 2.351614475250244 + }, + { + "auxiliary_loss_clip": 0.01095715, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.02640176, + "balance_loss_mlp": 1.02879262, + "epoch": 0.19245453179016986, + "flos": 16434175541760.0, + "grad_norm": 2.052054101014935, + "language_loss": 0.79116702, + "learning_rate": 3.6457695890291697e-06, + "loss": 0.8125726, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.66796875, + "step": 3201, + "time_per_iteration": 2.348501682281494 + }, + { + "auxiliary_loss_clip": 0.01095271, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.01831102, + "balance_loss_mlp": 1.02722049, + "epoch": 0.19251465504283782, + "flos": 20229234462720.0, + "grad_norm": 2.270429026092637, + "language_loss": 0.69541204, + "learning_rate": 3.645554877900393e-06, + "loss": 0.71674228, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6796875, + "step": 3202, + "time_per_iteration": 2.384735584259033 + }, + { + "auxiliary_loss_clip": 0.01092863, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.01412892, + "balance_loss_mlp": 1.02728343, + "epoch": 0.1925747782955058, + "flos": 19463344767360.0, + "grad_norm": 2.5595947348008443, + "language_loss": 0.91117144, + "learning_rate": 3.645340108045995e-06, + "loss": 0.93242729, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.65625, + "step": 3203, + "time_per_iteration": 2.3753819465637207 + }, + { + "auxiliary_loss_clip": 0.01094895, + "auxiliary_loss_mlp": 0.01042478, + "balance_loss_clip": 1.02277279, + "balance_loss_mlp": 1.02658033, + "epoch": 0.19263490154817375, + "flos": 17784615997440.0, + "grad_norm": 1.9669752936168026, + "language_loss": 0.81680238, + "learning_rate": 3.6451252794736417e-06, + "loss": 0.83817607, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6796875, + "step": 3204, + "time_per_iteration": 2.368269920349121 + }, + { + "auxiliary_loss_clip": 0.01092571, + "auxiliary_loss_mlp": 0.01039142, + "balance_loss_clip": 1.02041459, + "balance_loss_mlp": 1.02673435, + "epoch": 0.19269502480084172, + "flos": 17456118215040.0, + "grad_norm": 1.988349642468062, + "language_loss": 0.75792122, + "learning_rate": 3.6449103921909983e-06, + "loss": 0.77923828, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.66015625, + "step": 3205, + "time_per_iteration": 2.3562424182891846 + }, + { + "auxiliary_loss_clip": 0.01096497, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.02030611, + "balance_loss_mlp": 1.02881098, + "epoch": 0.19275514805350968, + "flos": 21505833659520.0, + "grad_norm": 2.9249157336018707, + "language_loss": 0.82801032, + "learning_rate": 3.644695446205735e-06, + "loss": 0.84936833, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.67578125, + "step": 3206, + "time_per_iteration": 2.388993740081787 + }, + { + "auxiliary_loss_clip": 0.01024912, + "auxiliary_loss_mlp": 0.01007991, + "balance_loss_clip": 1.00558305, + "balance_loss_mlp": 1.00593722, + "epoch": 0.19281527130617768, + "flos": 47693379928320.0, + "grad_norm": 0.8365671296608214, + "language_loss": 0.60553396, + "learning_rate": 3.644480441525521e-06, + "loss": 0.62586296, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.18945312, + "step": 3207, + "time_per_iteration": 2.855283260345459 + }, + { + "auxiliary_loss_clip": 0.01092823, + "auxiliary_loss_mlp": 0.0103696, + "balance_loss_clip": 1.01725471, + "balance_loss_mlp": 1.02535605, + "epoch": 0.19287539455884564, + "flos": 11800467918720.0, + "grad_norm": 5.028053307957577, + "language_loss": 0.74671447, + "learning_rate": 3.6442653781580305e-06, + "loss": 0.76801234, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.671875, + "step": 3208, + "time_per_iteration": 2.3509304523468018 + }, + { + "auxiliary_loss_clip": 0.01094353, + "auxiliary_loss_mlp": 0.01039326, + "balance_loss_clip": 1.01995444, + "balance_loss_mlp": 1.02553701, + "epoch": 0.1929355178115136, + "flos": 20630386517760.0, + "grad_norm": 2.0997662987306325, + "language_loss": 0.60876942, + "learning_rate": 3.6440502561109384e-06, + "loss": 0.63010621, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6875, + "step": 3209, + "time_per_iteration": 2.3633580207824707 + }, + { + "auxiliary_loss_clip": 0.01096845, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.02256715, + "balance_loss_mlp": 1.02674937, + "epoch": 0.19299564106418157, + "flos": 40806309467520.0, + "grad_norm": 1.9447056824487978, + "language_loss": 0.7999202, + "learning_rate": 3.6438350753919213e-06, + "loss": 0.82132316, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.69921875, + "step": 3210, + "time_per_iteration": 2.5475172996520996 + }, + { + "auxiliary_loss_clip": 0.01088278, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.01960135, + "balance_loss_mlp": 1.02380741, + "epoch": 0.19305576431684954, + "flos": 11360702033280.0, + "grad_norm": 2.2666304209789923, + "language_loss": 0.82905734, + "learning_rate": 3.643619836008659e-06, + "loss": 0.85031164, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.64453125, + "step": 3211, + "time_per_iteration": 2.3530471324920654 + }, + { + "auxiliary_loss_clip": 0.01021493, + "auxiliary_loss_mlp": 0.01002706, + "balance_loss_clip": 1.00033367, + "balance_loss_mlp": 1.00280476, + "epoch": 0.1931158875695175, + "flos": 54509299171200.0, + "grad_norm": 0.9651194982063522, + "language_loss": 0.63612223, + "learning_rate": 3.6434045379688324e-06, + "loss": 0.6563642, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.18652344, + "step": 3212, + "time_per_iteration": 2.9029247760772705 + }, + { + "auxiliary_loss_clip": 0.01093502, + "auxiliary_loss_mlp": 0.01041722, + "balance_loss_clip": 1.02342355, + "balance_loss_mlp": 1.02697301, + "epoch": 0.19317601082218547, + "flos": 19827419091840.0, + "grad_norm": 1.7286706440858817, + "language_loss": 0.76006323, + "learning_rate": 3.6431891812801254e-06, + "loss": 0.78141546, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6640625, + "step": 3213, + "time_per_iteration": 2.38496732711792 + }, + { + "auxiliary_loss_clip": 0.01096553, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_clip": 1.02217579, + "balance_loss_mlp": 1.02772832, + "epoch": 0.19323613407485346, + "flos": 13151222576640.0, + "grad_norm": 2.0194626720413957, + "language_loss": 0.71029568, + "learning_rate": 3.6429737659502237e-06, + "loss": 0.73167491, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6875, + "step": 3214, + "time_per_iteration": 3.7430498600006104 + }, + { + "auxiliary_loss_clip": 0.01092736, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.01599288, + "balance_loss_mlp": 1.02590609, + "epoch": 0.19329625732752143, + "flos": 14026390427520.0, + "grad_norm": 2.04791721415141, + "language_loss": 0.74819297, + "learning_rate": 3.642758291986814e-06, + "loss": 0.76947683, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.66796875, + "step": 3215, + "time_per_iteration": 2.345534086227417 + }, + { + "auxiliary_loss_clip": 0.01089553, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.02125573, + "balance_loss_mlp": 1.02407169, + "epoch": 0.1933563805801894, + "flos": 23440580496000.0, + "grad_norm": 3.8363575180347804, + "language_loss": 0.88652748, + "learning_rate": 3.642542759397587e-06, + "loss": 0.90782398, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.65625, + "step": 3216, + "time_per_iteration": 2.4154319763183594 + }, + { + "auxiliary_loss_clip": 0.01092795, + "auxiliary_loss_mlp": 0.01040408, + "balance_loss_clip": 1.02185881, + "balance_loss_mlp": 1.0269376, + "epoch": 0.19341650383285736, + "flos": 20484275011200.0, + "grad_norm": 1.7398186088472865, + "language_loss": 0.80092424, + "learning_rate": 3.6423271681902336e-06, + "loss": 0.82225633, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.65625, + "step": 3217, + "time_per_iteration": 3.7727136611938477 + }, + { + "auxiliary_loss_clip": 0.01096014, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.01750958, + "balance_loss_mlp": 1.02631855, + "epoch": 0.19347662708552532, + "flos": 17857514649600.0, + "grad_norm": 2.3762710878227398, + "language_loss": 0.61644971, + "learning_rate": 3.642111518372448e-06, + "loss": 0.63779199, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.6953125, + "step": 3218, + "time_per_iteration": 3.738217830657959 + }, + { + "auxiliary_loss_clip": 0.01093309, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.0221107, + "balance_loss_mlp": 1.02671599, + "epoch": 0.1935367503381933, + "flos": 18186256811520.0, + "grad_norm": 1.9905440332339441, + "language_loss": 0.80267423, + "learning_rate": 3.6418958099519267e-06, + "loss": 0.82401913, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6640625, + "step": 3219, + "time_per_iteration": 3.819187641143799 + }, + { + "auxiliary_loss_clip": 0.01094091, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_clip": 1.02382195, + "balance_loss_mlp": 1.02746654, + "epoch": 0.19359687359086128, + "flos": 15956319496320.0, + "grad_norm": 2.4496600917349647, + "language_loss": 0.85869569, + "learning_rate": 3.6416800429363674e-06, + "loss": 0.88006806, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.66796875, + "step": 3220, + "time_per_iteration": 2.361116886138916 + }, + { + "auxiliary_loss_clip": 0.01088917, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.01898813, + "balance_loss_mlp": 1.02643895, + "epoch": 0.19365699684352924, + "flos": 21214134316800.0, + "grad_norm": 3.4607468513524915, + "language_loss": 0.84419346, + "learning_rate": 3.6414642173334704e-06, + "loss": 0.86543494, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.625, + "step": 3221, + "time_per_iteration": 2.382223129272461 + }, + { + "auxiliary_loss_clip": 0.01092355, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.01993644, + "balance_loss_mlp": 1.0285064, + "epoch": 0.1937171200961972, + "flos": 17310146353920.0, + "grad_norm": 2.191763726116518, + "language_loss": 0.86122036, + "learning_rate": 3.6412483331509373e-06, + "loss": 0.88251173, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.640625, + "step": 3222, + "time_per_iteration": 2.412787437438965 + }, + { + "auxiliary_loss_clip": 0.0109185, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.01404715, + "balance_loss_mlp": 1.02545762, + "epoch": 0.19377724334886517, + "flos": 22634924895360.0, + "grad_norm": 2.525734028275212, + "language_loss": 0.78428602, + "learning_rate": 3.641032390396473e-06, + "loss": 0.80552936, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6640625, + "step": 3223, + "time_per_iteration": 2.4569711685180664 + }, + { + "auxiliary_loss_clip": 0.01090868, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.02666807, + "epoch": 0.19383736660153314, + "flos": 15077136839040.0, + "grad_norm": 2.1340963987027926, + "language_loss": 0.75416589, + "learning_rate": 3.6408163890777843e-06, + "loss": 0.77542287, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.640625, + "step": 3224, + "time_per_iteration": 2.358529567718506 + }, + { + "auxiliary_loss_clip": 0.01090426, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.01259124, + "balance_loss_mlp": 1.02646494, + "epoch": 0.1938974898542011, + "flos": 47118152367360.0, + "grad_norm": 2.2078840050057473, + "language_loss": 0.70660877, + "learning_rate": 3.640600329202579e-06, + "loss": 0.72782701, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.640625, + "step": 3225, + "time_per_iteration": 2.617093563079834 + }, + { + "auxiliary_loss_clip": 0.01090086, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.01528764, + "balance_loss_mlp": 1.02510476, + "epoch": 0.19395761310686907, + "flos": 25811357702400.0, + "grad_norm": 2.4472171369842837, + "language_loss": 0.69760823, + "learning_rate": 3.6403842107785686e-06, + "loss": 0.71883965, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6484375, + "step": 3226, + "time_per_iteration": 2.4120709896087646 + }, + { + "auxiliary_loss_clip": 0.01091227, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.02029812, + "balance_loss_mlp": 1.02752805, + "epoch": 0.19401773635953706, + "flos": 23038485834240.0, + "grad_norm": 1.6591788837545542, + "language_loss": 0.76039732, + "learning_rate": 3.6401680338134653e-06, + "loss": 0.78170109, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.63671875, + "step": 3227, + "time_per_iteration": 2.3980982303619385 + }, + { + "auxiliary_loss_clip": 0.01090895, + "auxiliary_loss_mlp": 0.01040711, + "balance_loss_clip": 1.02276993, + "balance_loss_mlp": 1.02430868, + "epoch": 0.19407785961220503, + "flos": 15919974904320.0, + "grad_norm": 1.8649498984843145, + "language_loss": 0.69280744, + "learning_rate": 3.6399517983149838e-06, + "loss": 0.71412349, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.66796875, + "step": 3228, + "time_per_iteration": 2.3586676120758057 + }, + { + "auxiliary_loss_clip": 0.01091473, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.02537704, + "balance_loss_mlp": 1.02686977, + "epoch": 0.194137982864873, + "flos": 25920531123840.0, + "grad_norm": 2.220200115653601, + "language_loss": 0.74391913, + "learning_rate": 3.6397355042908407e-06, + "loss": 0.76526976, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.64453125, + "step": 3229, + "time_per_iteration": 2.4335381984710693 + }, + { + "auxiliary_loss_clip": 0.01091645, + "auxiliary_loss_mlp": 0.01033087, + "balance_loss_clip": 1.01598048, + "balance_loss_mlp": 1.02630305, + "epoch": 0.19419810611754096, + "flos": 13260500732160.0, + "grad_norm": 2.4094008360195143, + "language_loss": 0.65313721, + "learning_rate": 3.6395191517487557e-06, + "loss": 0.67438447, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.65234375, + "step": 3230, + "time_per_iteration": 2.3724405765533447 + }, + { + "auxiliary_loss_clip": 0.01089027, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.01831055, + "balance_loss_mlp": 1.02513218, + "epoch": 0.19425822937020892, + "flos": 15704665349760.0, + "grad_norm": 1.9126845229967357, + "language_loss": 0.82243401, + "learning_rate": 3.6393027406964494e-06, + "loss": 0.84368706, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.640625, + "step": 3231, + "time_per_iteration": 2.372706174850464 + }, + { + "auxiliary_loss_clip": 0.01093246, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.01459241, + "balance_loss_mlp": 1.02733052, + "epoch": 0.1943183526228769, + "flos": 23104472037120.0, + "grad_norm": 1.882908253566389, + "language_loss": 0.87074304, + "learning_rate": 3.639086271141645e-06, + "loss": 0.89202118, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.66015625, + "step": 3232, + "time_per_iteration": 2.3734681606292725 + }, + { + "auxiliary_loss_clip": 0.01092436, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.01937103, + "balance_loss_mlp": 1.02718019, + "epoch": 0.19437847587554485, + "flos": 24711593875200.0, + "grad_norm": 1.7653542276973573, + "language_loss": 0.85239351, + "learning_rate": 3.6388697430920674e-06, + "loss": 0.8736937, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.65234375, + "step": 3233, + "time_per_iteration": 2.4153127670288086 + }, + { + "auxiliary_loss_clip": 0.01093505, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.02377868, + "balance_loss_mlp": 1.02482581, + "epoch": 0.19443859912821285, + "flos": 23114910533760.0, + "grad_norm": 1.7319351285692142, + "language_loss": 0.88985711, + "learning_rate": 3.638653156555445e-06, + "loss": 0.91121447, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6875, + "step": 3234, + "time_per_iteration": 2.4098269939422607 + }, + { + "auxiliary_loss_clip": 0.01091998, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.01485085, + "balance_loss_mlp": 1.02412546, + "epoch": 0.1944987223808808, + "flos": 15083525617920.0, + "grad_norm": 5.7409567116605515, + "language_loss": 0.78201854, + "learning_rate": 3.638436511539507e-06, + "loss": 0.80327791, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6796875, + "step": 3235, + "time_per_iteration": 2.35680890083313 + }, + { + "auxiliary_loss_clip": 0.01092563, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.01667261, + "balance_loss_mlp": 1.02676952, + "epoch": 0.19455884563354878, + "flos": 17125979598720.0, + "grad_norm": 1.953177095274907, + "language_loss": 0.79242563, + "learning_rate": 3.6382198080519833e-06, + "loss": 0.81369209, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.66015625, + "step": 3236, + "time_per_iteration": 2.345677614212036 + }, + { + "auxiliary_loss_clip": 0.01090901, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.01946533, + "balance_loss_mlp": 1.02475131, + "epoch": 0.19461896888621674, + "flos": 20192366200320.0, + "grad_norm": 1.503568861209779, + "language_loss": 0.7555871, + "learning_rate": 3.6380030461006093e-06, + "loss": 0.77687538, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6640625, + "step": 3237, + "time_per_iteration": 2.3915979862213135 + }, + { + "auxiliary_loss_clip": 0.01092935, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.02026165, + "balance_loss_mlp": 1.02521873, + "epoch": 0.1946790921388847, + "flos": 25300194353280.0, + "grad_norm": 1.5232704083913822, + "language_loss": 0.75017565, + "learning_rate": 3.6377862256931203e-06, + "loss": 0.77149314, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.67578125, + "step": 3238, + "time_per_iteration": 2.416321039199829 + }, + { + "auxiliary_loss_clip": 0.01094228, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.02069783, + "balance_loss_mlp": 1.02699256, + "epoch": 0.19473921539155267, + "flos": 20192366200320.0, + "grad_norm": 1.9690252501932926, + "language_loss": 0.73038596, + "learning_rate": 3.637569346837253e-06, + "loss": 0.75173736, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.671875, + "step": 3239, + "time_per_iteration": 2.3719887733459473 + }, + { + "auxiliary_loss_clip": 0.01091792, + "auxiliary_loss_mlp": 0.01038523, + "balance_loss_clip": 1.02036738, + "balance_loss_mlp": 1.02522206, + "epoch": 0.19479933864422067, + "flos": 20886474407040.0, + "grad_norm": 1.7556989267650014, + "language_loss": 0.7344541, + "learning_rate": 3.6373524095407485e-06, + "loss": 0.75575721, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.66796875, + "step": 3240, + "time_per_iteration": 2.3777613639831543 + }, + { + "auxiliary_loss_clip": 0.01091297, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01701427, + "balance_loss_mlp": 1.02552462, + "epoch": 0.19485946189688863, + "flos": 23293945319040.0, + "grad_norm": 1.9485447126523352, + "language_loss": 0.66402727, + "learning_rate": 3.637135413811348e-06, + "loss": 0.68528414, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.65625, + "step": 3241, + "time_per_iteration": 2.399484395980835 + }, + { + "auxiliary_loss_clip": 0.0109156, + "auxiliary_loss_mlp": 0.01037781, + "balance_loss_clip": 1.01962543, + "balance_loss_mlp": 1.02629566, + "epoch": 0.1949195851495566, + "flos": 23293910407680.0, + "grad_norm": 1.9349221812557778, + "language_loss": 0.8284806, + "learning_rate": 3.636918359656796e-06, + "loss": 0.84977406, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.65234375, + "step": 3242, + "time_per_iteration": 2.4018092155456543 + }, + { + "auxiliary_loss_clip": 0.01023619, + "auxiliary_loss_mlp": 0.01003701, + "balance_loss_clip": 1.00105453, + "balance_loss_mlp": 1.00448895, + "epoch": 0.19497970840222456, + "flos": 64959536102400.0, + "grad_norm": 0.8196101855354372, + "language_loss": 0.57456034, + "learning_rate": 3.636701247084839e-06, + "loss": 0.59483355, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.19140625, + "step": 3243, + "time_per_iteration": 3.0555503368377686 + }, + { + "auxiliary_loss_clip": 0.01095224, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.0204618, + "balance_loss_mlp": 1.0275383, + "epoch": 0.19503983165489253, + "flos": 19643741095680.0, + "grad_norm": 2.016429930690371, + "language_loss": 0.83302236, + "learning_rate": 3.6364840761032238e-06, + "loss": 0.85436511, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.67578125, + "step": 3244, + "time_per_iteration": 2.3813095092773438 + }, + { + "auxiliary_loss_clip": 0.01094253, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.01697576, + "balance_loss_mlp": 1.02831161, + "epoch": 0.1950999549075605, + "flos": 21140921462400.0, + "grad_norm": 1.6515982101360513, + "language_loss": 0.7687943, + "learning_rate": 3.6362668467197015e-06, + "loss": 0.7900939, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65625, + "step": 3245, + "time_per_iteration": 2.3906078338623047 + }, + { + "auxiliary_loss_clip": 0.01093757, + "auxiliary_loss_mlp": 0.01038342, + "balance_loss_clip": 1.01860094, + "balance_loss_mlp": 1.02652812, + "epoch": 0.19516007816022846, + "flos": 20883821143680.0, + "grad_norm": 1.9215117662172279, + "language_loss": 0.84480739, + "learning_rate": 3.6360495589420247e-06, + "loss": 0.86612833, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.671875, + "step": 3246, + "time_per_iteration": 2.376635789871216 + }, + { + "auxiliary_loss_clip": 0.0109496, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.01931024, + "balance_loss_mlp": 1.02649212, + "epoch": 0.19522020141289645, + "flos": 16909552880640.0, + "grad_norm": 2.0241418147502372, + "language_loss": 0.75633973, + "learning_rate": 3.6358322127779476e-06, + "loss": 0.77766943, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.68359375, + "step": 3247, + "time_per_iteration": 2.3652443885803223 + }, + { + "auxiliary_loss_clip": 0.01097248, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.01579106, + "balance_loss_mlp": 1.02903318, + "epoch": 0.19528032466556441, + "flos": 26723603283840.0, + "grad_norm": 1.8681365272471933, + "language_loss": 0.75390351, + "learning_rate": 3.6356148082352265e-06, + "loss": 0.77523059, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6796875, + "step": 3248, + "time_per_iteration": 2.444396495819092 + }, + { + "auxiliary_loss_clip": 0.01093291, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.018417, + "balance_loss_mlp": 1.0265578, + "epoch": 0.19534044791823238, + "flos": 21031748040960.0, + "grad_norm": 2.045327479393052, + "language_loss": 0.78500307, + "learning_rate": 3.63539734532162e-06, + "loss": 0.80631506, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6640625, + "step": 3249, + "time_per_iteration": 2.3759329319000244 + }, + { + "auxiliary_loss_clip": 0.0109475, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.01817822, + "balance_loss_mlp": 1.02709186, + "epoch": 0.19540057117090034, + "flos": 22343016084480.0, + "grad_norm": 1.5481371088008538, + "language_loss": 0.72917652, + "learning_rate": 3.6351798240448894e-06, + "loss": 0.75049222, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.67578125, + "step": 3250, + "time_per_iteration": 2.3821792602539062 + }, + { + "auxiliary_loss_clip": 0.01091139, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.01860952, + "balance_loss_mlp": 1.02606761, + "epoch": 0.1954606944235683, + "flos": 20300631926400.0, + "grad_norm": 2.077856386507019, + "language_loss": 0.79381561, + "learning_rate": 3.634962244412797e-06, + "loss": 0.8150928, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.65234375, + "step": 3251, + "time_per_iteration": 2.3681230545043945 + }, + { + "auxiliary_loss_clip": 0.01094269, + "auxiliary_loss_mlp": 0.01043514, + "balance_loss_clip": 1.02563214, + "balance_loss_mlp": 1.02753401, + "epoch": 0.19552081767623627, + "flos": 17345932364160.0, + "grad_norm": 4.307334667439784, + "language_loss": 0.83700019, + "learning_rate": 3.6347446064331074e-06, + "loss": 0.85837805, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.66796875, + "step": 3252, + "time_per_iteration": 2.3527212142944336 + }, + { + "auxiliary_loss_clip": 0.01096735, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.02541399, + "balance_loss_mlp": 1.02695727, + "epoch": 0.19558094092890424, + "flos": 31976286134400.0, + "grad_norm": 1.8757880103444917, + "language_loss": 0.73458648, + "learning_rate": 3.6345269101135885e-06, + "loss": 0.75602067, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.69921875, + "step": 3253, + "time_per_iteration": 2.4764764308929443 + }, + { + "auxiliary_loss_clip": 0.01094574, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.01700842, + "balance_loss_mlp": 1.02562308, + "epoch": 0.19564106418157223, + "flos": 22267918016640.0, + "grad_norm": 1.850358038220547, + "language_loss": 0.76417327, + "learning_rate": 3.634309155462008e-06, + "loss": 0.78549385, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.6875, + "step": 3254, + "time_per_iteration": 3.743779182434082 + }, + { + "auxiliary_loss_clip": 0.01020729, + "auxiliary_loss_mlp": 0.01004913, + "balance_loss_clip": 1.00242162, + "balance_loss_mlp": 1.00204873, + "epoch": 0.1957011874342402, + "flos": 54362000678400.0, + "grad_norm": 0.7573270102413824, + "language_loss": 0.55256647, + "learning_rate": 3.6340913424861383e-06, + "loss": 0.57282287, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.18652344, + "step": 3255, + "time_per_iteration": 2.994016408920288 + }, + { + "auxiliary_loss_clip": 0.01096999, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.01614356, + "balance_loss_mlp": 1.02813172, + "epoch": 0.19576131068690816, + "flos": 16505817384960.0, + "grad_norm": 2.7410325008516776, + "language_loss": 0.70526785, + "learning_rate": 3.6338734711937512e-06, + "loss": 0.72660476, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.6875, + "step": 3256, + "time_per_iteration": 3.76116943359375 + }, + { + "auxiliary_loss_clip": 0.01091378, + "auxiliary_loss_mlp": 0.01036129, + "balance_loss_clip": 1.01678145, + "balance_loss_mlp": 1.02586532, + "epoch": 0.19582143393957613, + "flos": 14718822888960.0, + "grad_norm": 3.4934437635727464, + "language_loss": 0.80128163, + "learning_rate": 3.6336555415926232e-06, + "loss": 0.82255673, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.65625, + "step": 3257, + "time_per_iteration": 3.6858999729156494 + }, + { + "auxiliary_loss_clip": 0.01093835, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.01727438, + "balance_loss_mlp": 1.02637625, + "epoch": 0.1958815571922441, + "flos": 24424363186560.0, + "grad_norm": 1.9926092582446306, + "language_loss": 0.74545258, + "learning_rate": 3.6334375536905313e-06, + "loss": 0.76675749, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.67578125, + "step": 3258, + "time_per_iteration": 2.413703203201294 + }, + { + "auxiliary_loss_clip": 0.01095093, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.01610398, + "balance_loss_mlp": 1.02671051, + "epoch": 0.19594168044491206, + "flos": 24899112120960.0, + "grad_norm": 1.9569839831859468, + "language_loss": 0.74006474, + "learning_rate": 3.633219507495255e-06, + "loss": 0.76137137, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.68359375, + "step": 3259, + "time_per_iteration": 3.779968500137329 + }, + { + "auxiliary_loss_clip": 0.01097543, + "auxiliary_loss_mlp": 0.01043504, + "balance_loss_clip": 1.0222733, + "balance_loss_mlp": 1.02805924, + "epoch": 0.19600180369758005, + "flos": 12056206694400.0, + "grad_norm": 2.537177333310713, + "language_loss": 0.76501352, + "learning_rate": 3.633001403014575e-06, + "loss": 0.78642392, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.6953125, + "step": 3260, + "time_per_iteration": 2.352241039276123 + }, + { + "auxiliary_loss_clip": 0.01094873, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.01876605, + "balance_loss_mlp": 1.02674258, + "epoch": 0.19606192695024802, + "flos": 20849152296960.0, + "grad_norm": 2.0974309113083542, + "language_loss": 0.82169342, + "learning_rate": 3.632783240256276e-06, + "loss": 0.84303439, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.6796875, + "step": 3261, + "time_per_iteration": 2.386594295501709 + }, + { + "auxiliary_loss_clip": 0.0109362, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.01932275, + "balance_loss_mlp": 1.02704477, + "epoch": 0.19612205020291598, + "flos": 28474253187840.0, + "grad_norm": 2.2523880615555, + "language_loss": 0.76350236, + "learning_rate": 3.632565019228143e-06, + "loss": 0.78483337, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.6640625, + "step": 3262, + "time_per_iteration": 2.4290218353271484 + }, + { + "auxiliary_loss_clip": 0.01097967, + "auxiliary_loss_mlp": 0.01042585, + "balance_loss_clip": 1.02332032, + "balance_loss_mlp": 1.02915668, + "epoch": 0.19618217345558395, + "flos": 25555444369920.0, + "grad_norm": 1.6467314770588994, + "language_loss": 0.86481088, + "learning_rate": 3.6323467399379634e-06, + "loss": 0.8862164, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6875, + "step": 3263, + "time_per_iteration": 2.4395463466644287 + }, + { + "auxiliary_loss_clip": 0.01092486, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.01756024, + "balance_loss_mlp": 1.02606213, + "epoch": 0.1962422967082519, + "flos": 25263256268160.0, + "grad_norm": 1.672456435112228, + "language_loss": 0.73797274, + "learning_rate": 3.6321284023935284e-06, + "loss": 0.75925559, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.6640625, + "step": 3264, + "time_per_iteration": 2.4126996994018555 + }, + { + "auxiliary_loss_clip": 0.01093631, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.01721966, + "balance_loss_mlp": 1.02846122, + "epoch": 0.19630241996091988, + "flos": 18806349202560.0, + "grad_norm": 1.8398535179686513, + "language_loss": 0.78879499, + "learning_rate": 3.6319100066026284e-06, + "loss": 0.81009054, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65234375, + "step": 3265, + "time_per_iteration": 2.3726553916931152 + }, + { + "auxiliary_loss_clip": 0.01020868, + "auxiliary_loss_mlp": 0.01002098, + "balance_loss_clip": 0.99966645, + "balance_loss_mlp": 1.00293803, + "epoch": 0.19636254321358784, + "flos": 62318287526400.0, + "grad_norm": 0.7818136536489693, + "language_loss": 0.53380704, + "learning_rate": 3.6316915525730586e-06, + "loss": 0.55403674, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.1796875, + "step": 3266, + "time_per_iteration": 3.045663833618164 + }, + { + "auxiliary_loss_clip": 0.01097612, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.01870728, + "balance_loss_mlp": 1.02730501, + "epoch": 0.19642266646625584, + "flos": 21068267189760.0, + "grad_norm": 1.995501135005288, + "language_loss": 0.80237895, + "learning_rate": 3.631473040312614e-06, + "loss": 0.82375205, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.703125, + "step": 3267, + "time_per_iteration": 2.3809406757354736 + }, + { + "auxiliary_loss_clip": 0.010922, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.01773524, + "balance_loss_mlp": 1.02631319, + "epoch": 0.1964827897189238, + "flos": 14537763244800.0, + "grad_norm": 9.820342443007977, + "language_loss": 0.86726725, + "learning_rate": 3.631254469829094e-06, + "loss": 0.88855445, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.66015625, + "step": 3268, + "time_per_iteration": 2.3497135639190674 + }, + { + "auxiliary_loss_clip": 0.01094062, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.01679969, + "balance_loss_mlp": 1.02803731, + "epoch": 0.19654291297159177, + "flos": 19243636381440.0, + "grad_norm": 2.55200700436689, + "language_loss": 0.69314349, + "learning_rate": 3.631035841130297e-06, + "loss": 0.71443152, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.66015625, + "step": 3269, + "time_per_iteration": 2.382796049118042 + }, + { + "auxiliary_loss_clip": 0.01098343, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.019189, + "balance_loss_mlp": 1.02958822, + "epoch": 0.19660303622425973, + "flos": 25774524351360.0, + "grad_norm": 2.125134277645011, + "language_loss": 0.80778444, + "learning_rate": 3.6308171542240273e-06, + "loss": 0.82915473, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6875, + "step": 3270, + "time_per_iteration": 2.418290376663208 + }, + { + "auxiliary_loss_clip": 0.01090844, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.01775169, + "balance_loss_mlp": 1.02518535, + "epoch": 0.1966631594769277, + "flos": 20594041925760.0, + "grad_norm": 2.3224232715776933, + "language_loss": 0.83714098, + "learning_rate": 3.6305984091180875e-06, + "loss": 0.8584125, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.65625, + "step": 3271, + "time_per_iteration": 2.3715765476226807 + }, + { + "auxiliary_loss_clip": 0.01090027, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.01740742, + "balance_loss_mlp": 1.026191, + "epoch": 0.19672328272959566, + "flos": 23622059076480.0, + "grad_norm": 1.9779680712166663, + "language_loss": 0.8020243, + "learning_rate": 3.630379605820286e-06, + "loss": 0.82328403, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.640625, + "step": 3272, + "time_per_iteration": 2.3996176719665527 + }, + { + "auxiliary_loss_clip": 0.01094305, + "auxiliary_loss_mlp": 0.0104103, + "balance_loss_clip": 1.02173042, + "balance_loss_mlp": 1.0272423, + "epoch": 0.19678340598226365, + "flos": 23109848386560.0, + "grad_norm": 1.9417672352006365, + "language_loss": 0.80638385, + "learning_rate": 3.630160744338429e-06, + "loss": 0.82773721, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.671875, + "step": 3273, + "time_per_iteration": 2.4034245014190674 + }, + { + "auxiliary_loss_clip": 0.01093358, + "auxiliary_loss_mlp": 0.01042803, + "balance_loss_clip": 1.02361071, + "balance_loss_mlp": 1.02661026, + "epoch": 0.19684352923493162, + "flos": 24533711164800.0, + "grad_norm": 1.685327645217058, + "language_loss": 0.77463973, + "learning_rate": 3.6299418246803287e-06, + "loss": 0.79600132, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.66796875, + "step": 3274, + "time_per_iteration": 2.4098117351531982 + }, + { + "auxiliary_loss_clip": 0.01093006, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.01897693, + "balance_loss_mlp": 1.02611113, + "epoch": 0.19690365248759958, + "flos": 21795438320640.0, + "grad_norm": 3.613819678474458, + "language_loss": 0.81330287, + "learning_rate": 3.6297228468537976e-06, + "loss": 0.83462143, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.66796875, + "step": 3275, + "time_per_iteration": 2.3586437702178955 + }, + { + "auxiliary_loss_clip": 0.01093968, + "auxiliary_loss_mlp": 0.01039103, + "balance_loss_clip": 1.01931393, + "balance_loss_mlp": 1.02676809, + "epoch": 0.19696377574026755, + "flos": 19055803933440.0, + "grad_norm": 1.8388194076600255, + "language_loss": 0.81185746, + "learning_rate": 3.6295038108666504e-06, + "loss": 0.83318818, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.671875, + "step": 3276, + "time_per_iteration": 2.3627398014068604 + }, + { + "auxiliary_loss_clip": 0.01093263, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.01329339, + "balance_loss_mlp": 1.02681422, + "epoch": 0.19702389899293551, + "flos": 22819545498240.0, + "grad_norm": 3.913293022738735, + "language_loss": 0.89339715, + "learning_rate": 3.629284716726703e-06, + "loss": 0.91465741, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6640625, + "step": 3277, + "time_per_iteration": 2.3836357593536377 + }, + { + "auxiliary_loss_clip": 0.01096947, + "auxiliary_loss_mlp": 0.0104411, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.02706742, + "epoch": 0.19708402224560348, + "flos": 22893107466240.0, + "grad_norm": 2.121847641070351, + "language_loss": 0.62500441, + "learning_rate": 3.6290655644417757e-06, + "loss": 0.646415, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.69921875, + "step": 3278, + "time_per_iteration": 2.398437738418579 + }, + { + "auxiliary_loss_clip": 0.01094952, + "auxiliary_loss_mlp": 0.01041973, + "balance_loss_clip": 1.02227986, + "balance_loss_mlp": 1.02898657, + "epoch": 0.19714414549827144, + "flos": 25661440857600.0, + "grad_norm": 2.9479425926662484, + "language_loss": 0.73167086, + "learning_rate": 3.6288463540196894e-06, + "loss": 0.75304008, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.66015625, + "step": 3279, + "time_per_iteration": 2.409766912460327 + }, + { + "auxiliary_loss_clip": 0.01094849, + "auxiliary_loss_mlp": 0.01036963, + "balance_loss_clip": 1.01818752, + "balance_loss_mlp": 1.02610898, + "epoch": 0.19720426875093944, + "flos": 23914666114560.0, + "grad_norm": 1.6378735440973151, + "language_loss": 0.8245886, + "learning_rate": 3.6286270854682654e-06, + "loss": 0.84590667, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6875, + "step": 3280, + "time_per_iteration": 2.402646780014038 + }, + { + "auxiliary_loss_clip": 0.01096637, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.02074862, + "balance_loss_mlp": 1.02819932, + "epoch": 0.1972643920036074, + "flos": 13881081882240.0, + "grad_norm": 1.9159719125239376, + "language_loss": 0.77710402, + "learning_rate": 3.6284077587953307e-06, + "loss": 0.79846239, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.68359375, + "step": 3281, + "time_per_iteration": 2.359015703201294 + }, + { + "auxiliary_loss_clip": 0.01091432, + "auxiliary_loss_mlp": 0.0104029, + "balance_loss_clip": 1.02223015, + "balance_loss_mlp": 1.02704263, + "epoch": 0.19732451525627537, + "flos": 19862611608960.0, + "grad_norm": 1.7886278430035771, + "language_loss": 0.79787135, + "learning_rate": 3.628188374008712e-06, + "loss": 0.81918859, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.64453125, + "step": 3282, + "time_per_iteration": 2.4334022998809814 + }, + { + "auxiliary_loss_clip": 0.01097752, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.01765406, + "balance_loss_mlp": 1.02917266, + "epoch": 0.19738463850894333, + "flos": 24972255152640.0, + "grad_norm": 2.004615061425067, + "language_loss": 0.71514744, + "learning_rate": 3.6279689311162382e-06, + "loss": 0.73649353, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.68359375, + "step": 3283, + "time_per_iteration": 2.411259412765503 + }, + { + "auxiliary_loss_clip": 0.01093562, + "auxiliary_loss_mlp": 0.010486, + "balance_loss_clip": 1.02915692, + "balance_loss_mlp": 1.02674568, + "epoch": 0.1974447617616113, + "flos": 18367909948800.0, + "grad_norm": 2.0901111004947532, + "language_loss": 0.78843147, + "learning_rate": 3.6277494301257407e-06, + "loss": 0.80985308, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.66796875, + "step": 3284, + "time_per_iteration": 2.363900661468506 + }, + { + "auxiliary_loss_clip": 0.01095873, + "auxiliary_loss_mlp": 0.01040929, + "balance_loss_clip": 1.01948333, + "balance_loss_mlp": 1.02663827, + "epoch": 0.19750488501427926, + "flos": 22891850657280.0, + "grad_norm": 2.1448640694948136, + "language_loss": 0.86014587, + "learning_rate": 3.6275298710450533e-06, + "loss": 0.88151383, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.69140625, + "step": 3285, + "time_per_iteration": 2.3824238777160645 + }, + { + "auxiliary_loss_clip": 0.01092925, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.01686954, + "balance_loss_mlp": 1.0281477, + "epoch": 0.19756500826694723, + "flos": 21870431654400.0, + "grad_norm": 2.307979665706718, + "language_loss": 0.88518846, + "learning_rate": 3.627310253882012e-06, + "loss": 0.90646577, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6484375, + "step": 3286, + "time_per_iteration": 2.3906936645507812 + }, + { + "auxiliary_loss_clip": 0.01095962, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.02093983, + "balance_loss_mlp": 1.02850592, + "epoch": 0.19762513151961522, + "flos": 15158065104000.0, + "grad_norm": 2.3708893620062006, + "language_loss": 0.78196716, + "learning_rate": 3.627090578644452e-06, + "loss": 0.80334568, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.671875, + "step": 3287, + "time_per_iteration": 2.351134777069092 + }, + { + "auxiliary_loss_clip": 0.01095035, + "auxiliary_loss_mlp": 0.0103711, + "balance_loss_clip": 1.01710689, + "balance_loss_mlp": 1.02704477, + "epoch": 0.1976852547722832, + "flos": 16978331992320.0, + "grad_norm": 6.362389513168747, + "language_loss": 0.81321955, + "learning_rate": 3.6268708453402163e-06, + "loss": 0.83454096, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6796875, + "step": 3288, + "time_per_iteration": 2.357706308364868 + }, + { + "auxiliary_loss_clip": 0.01091467, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.02087522, + "balance_loss_mlp": 1.02781177, + "epoch": 0.19774537802495115, + "flos": 20301888735360.0, + "grad_norm": 1.8304637570278746, + "language_loss": 0.72829801, + "learning_rate": 3.626651053977144e-06, + "loss": 0.74960506, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.63671875, + "step": 3289, + "time_per_iteration": 2.3803505897521973 + }, + { + "auxiliary_loss_clip": 0.01093329, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.02309, + "balance_loss_mlp": 1.02678442, + "epoch": 0.19780550127761912, + "flos": 27234242962560.0, + "grad_norm": 2.524989848506488, + "language_loss": 0.7940982, + "learning_rate": 3.6264312045630802e-06, + "loss": 0.81544745, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6640625, + "step": 3290, + "time_per_iteration": 2.4267165660858154 + }, + { + "auxiliary_loss_clip": 0.01023181, + "auxiliary_loss_mlp": 0.01002361, + "balance_loss_clip": 0.9999764, + "balance_loss_mlp": 1.00474858, + "epoch": 0.19786562453028708, + "flos": 63547368629760.0, + "grad_norm": 0.8859536292225495, + "language_loss": 0.60287488, + "learning_rate": 3.62621129710587e-06, + "loss": 0.6231302, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.18457031, + "step": 3291, + "time_per_iteration": 3.0441205501556396 + }, + { + "auxiliary_loss_clip": 0.01095104, + "auxiliary_loss_mlp": 0.01036338, + "balance_loss_clip": 1.01521397, + "balance_loss_mlp": 1.02619529, + "epoch": 0.19792574778295505, + "flos": 26285443320960.0, + "grad_norm": 1.710016469752574, + "language_loss": 0.81068504, + "learning_rate": 3.6259913316133625e-06, + "loss": 0.83199942, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.6875, + "step": 3292, + "time_per_iteration": 2.4184789657592773 + }, + { + "auxiliary_loss_clip": 0.01088314, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.01852489, + "balance_loss_mlp": 1.02581763, + "epoch": 0.19798587103562304, + "flos": 19937081272320.0, + "grad_norm": 2.164269928396593, + "language_loss": 0.81474257, + "learning_rate": 3.625771308093406e-06, + "loss": 0.8359915, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 3293, + "time_per_iteration": 3.7663729190826416 + }, + { + "auxiliary_loss_clip": 0.011012, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.02060866, + "balance_loss_mlp": 1.03096867, + "epoch": 0.198045994288291, + "flos": 20119258080000.0, + "grad_norm": 1.8060343049106946, + "language_loss": 0.61091065, + "learning_rate": 3.625551226553854e-06, + "loss": 0.63233852, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.703125, + "step": 3294, + "time_per_iteration": 2.3722171783447266 + }, + { + "auxiliary_loss_clip": 0.01092659, + "auxiliary_loss_mlp": 0.01038661, + "balance_loss_clip": 1.01956379, + "balance_loss_mlp": 1.02670622, + "epoch": 0.19810611754095897, + "flos": 17966688071040.0, + "grad_norm": 1.9379544072355641, + "language_loss": 0.87146139, + "learning_rate": 3.6253310870025598e-06, + "loss": 0.89277458, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.66015625, + "step": 3295, + "time_per_iteration": 2.3429510593414307 + }, + { + "auxiliary_loss_clip": 0.01092325, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.01948702, + "balance_loss_mlp": 1.02713811, + "epoch": 0.19816624079362694, + "flos": 15084119111040.0, + "grad_norm": 2.476934992282589, + "language_loss": 0.80659974, + "learning_rate": 3.6251108894473806e-06, + "loss": 0.82789588, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.65234375, + "step": 3296, + "time_per_iteration": 5.0992112159729 + }, + { + "auxiliary_loss_clip": 0.01095761, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.01356006, + "balance_loss_mlp": 1.0261941, + "epoch": 0.1982263640462949, + "flos": 24899147032320.0, + "grad_norm": 1.9064652416671366, + "language_loss": 0.68187982, + "learning_rate": 3.624890633896173e-06, + "loss": 0.70318681, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.6953125, + "step": 3297, + "time_per_iteration": 2.414424180984497 + }, + { + "auxiliary_loss_clip": 0.01090671, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.01708508, + "balance_loss_mlp": 1.02733803, + "epoch": 0.19828648729896287, + "flos": 20375136501120.0, + "grad_norm": 1.7080929907446294, + "language_loss": 0.81423819, + "learning_rate": 3.6246703203567996e-06, + "loss": 0.83548945, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.6328125, + "step": 3298, + "time_per_iteration": 2.3621015548706055 + }, + { + "auxiliary_loss_clip": 0.01096755, + "auxiliary_loss_mlp": 0.01040343, + "balance_loss_clip": 1.01938558, + "balance_loss_mlp": 1.02652717, + "epoch": 0.19834661055163083, + "flos": 18879038386560.0, + "grad_norm": 1.8118896067170283, + "language_loss": 0.8499251, + "learning_rate": 3.624449948837121e-06, + "loss": 0.87129605, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.703125, + "step": 3299, + "time_per_iteration": 3.7855286598205566 + }, + { + "auxiliary_loss_clip": 0.0102453, + "auxiliary_loss_mlp": 0.01005913, + "balance_loss_clip": 1.00327826, + "balance_loss_mlp": 1.00592458, + "epoch": 0.19840673380429882, + "flos": 60255897292800.0, + "grad_norm": 0.7660662056871961, + "language_loss": 0.53273189, + "learning_rate": 3.6242295193450024e-06, + "loss": 0.55303633, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.18554688, + "step": 3300, + "time_per_iteration": 2.9814531803131104 + }, + { + "auxiliary_loss_clip": 0.01093189, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_clip": 1.02426958, + "balance_loss_mlp": 1.02672458, + "epoch": 0.1984668570569668, + "flos": 19900981059840.0, + "grad_norm": 2.006506243334892, + "language_loss": 0.72906816, + "learning_rate": 3.6240090318883103e-06, + "loss": 0.75043869, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6640625, + "step": 3301, + "time_per_iteration": 2.390085458755493 + }, + { + "auxiliary_loss_clip": 0.01093821, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.01660538, + "balance_loss_mlp": 1.0276103, + "epoch": 0.19852698030963475, + "flos": 15629916395520.0, + "grad_norm": 2.3942277196780233, + "language_loss": 0.88054079, + "learning_rate": 3.623788486474913e-06, + "loss": 0.90182722, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.66015625, + "step": 3302, + "time_per_iteration": 2.3620595932006836 + }, + { + "auxiliary_loss_clip": 0.01093732, + "auxiliary_loss_mlp": 0.01034571, + "balance_loss_clip": 1.01491392, + "balance_loss_mlp": 1.02640676, + "epoch": 0.19858710356230272, + "flos": 43141335575040.0, + "grad_norm": 1.8277191047193035, + "language_loss": 0.73320621, + "learning_rate": 3.623567883112682e-06, + "loss": 0.7544893, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.671875, + "step": 3303, + "time_per_iteration": 2.573152780532837 + }, + { + "auxiliary_loss_clip": 0.01092105, + "auxiliary_loss_mlp": 0.01041146, + "balance_loss_clip": 1.02151227, + "balance_loss_mlp": 1.02686501, + "epoch": 0.19864722681497068, + "flos": 35142873937920.0, + "grad_norm": 1.8424346043034194, + "language_loss": 0.74658036, + "learning_rate": 3.6233472218094897e-06, + "loss": 0.76791286, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.65234375, + "step": 3304, + "time_per_iteration": 2.49125599861145 + }, + { + "auxiliary_loss_clip": 0.01091011, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.01579833, + "balance_loss_mlp": 1.02630901, + "epoch": 0.19870735006763865, + "flos": 19425219696000.0, + "grad_norm": 2.7354527523926517, + "language_loss": 0.85860914, + "learning_rate": 3.62312650257321e-06, + "loss": 0.8798508, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.6484375, + "step": 3305, + "time_per_iteration": 2.3646278381347656 + }, + { + "auxiliary_loss_clip": 0.01093773, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.01421475, + "balance_loss_mlp": 1.02594137, + "epoch": 0.19876747332030664, + "flos": 23546332604160.0, + "grad_norm": 1.5847495036578148, + "language_loss": 0.68523008, + "learning_rate": 3.622905725411721e-06, + "loss": 0.70650458, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6796875, + "step": 3306, + "time_per_iteration": 2.4286417961120605 + }, + { + "auxiliary_loss_clip": 0.01089723, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.01242733, + "balance_loss_mlp": 1.02498138, + "epoch": 0.1988275965729746, + "flos": 19828361698560.0, + "grad_norm": 1.9311524076025604, + "language_loss": 0.76461613, + "learning_rate": 3.622684890332901e-06, + "loss": 0.78582305, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6484375, + "step": 3307, + "time_per_iteration": 2.4046173095703125 + }, + { + "auxiliary_loss_clip": 0.01092585, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.01755738, + "balance_loss_mlp": 1.02870154, + "epoch": 0.19888771982564257, + "flos": 23512501630080.0, + "grad_norm": 2.169181339009595, + "language_loss": 0.7575652, + "learning_rate": 3.622463997344632e-06, + "loss": 0.77883744, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.63671875, + "step": 3308, + "time_per_iteration": 2.406249761581421 + }, + { + "auxiliary_loss_clip": 0.01092841, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.01551628, + "balance_loss_mlp": 1.02665699, + "epoch": 0.19894784307831054, + "flos": 18149528194560.0, + "grad_norm": 3.2640222401270114, + "language_loss": 0.75881577, + "learning_rate": 3.622243046454796e-06, + "loss": 0.78009021, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.66015625, + "step": 3309, + "time_per_iteration": 2.353468894958496 + }, + { + "auxiliary_loss_clip": 0.01090891, + "auxiliary_loss_mlp": 0.01042274, + "balance_loss_clip": 1.02351093, + "balance_loss_mlp": 1.02639914, + "epoch": 0.1990079663309785, + "flos": 24275004923520.0, + "grad_norm": 2.1937839839446713, + "language_loss": 0.87566149, + "learning_rate": 3.6220220376712787e-06, + "loss": 0.89699316, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.64453125, + "step": 3310, + "time_per_iteration": 2.4153454303741455 + }, + { + "auxiliary_loss_clip": 0.01090099, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.01319122, + "balance_loss_mlp": 1.02569306, + "epoch": 0.19906808958364647, + "flos": 34896212115840.0, + "grad_norm": 2.7435653817024566, + "language_loss": 0.64026791, + "learning_rate": 3.621800971001967e-06, + "loss": 0.66149318, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.64453125, + "step": 3311, + "time_per_iteration": 2.482752561569214 + }, + { + "auxiliary_loss_clip": 0.01094648, + "auxiliary_loss_mlp": 0.01034431, + "balance_loss_clip": 1.01598966, + "balance_loss_mlp": 1.02681947, + "epoch": 0.19912821283631443, + "flos": 24023734801920.0, + "grad_norm": 2.2046276982202686, + "language_loss": 0.73585874, + "learning_rate": 3.6215798464547505e-06, + "loss": 0.75714952, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.67578125, + "step": 3312, + "time_per_iteration": 2.39709734916687 + }, + { + "auxiliary_loss_clip": 0.01089779, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.01886082, + "balance_loss_mlp": 1.0253818, + "epoch": 0.19918833608898243, + "flos": 19858177866240.0, + "grad_norm": 2.214729870251275, + "language_loss": 0.83765405, + "learning_rate": 3.6213586640375207e-06, + "loss": 0.85892415, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.64453125, + "step": 3313, + "time_per_iteration": 2.3777294158935547 + }, + { + "auxiliary_loss_clip": 0.01094265, + "auxiliary_loss_mlp": 0.01039953, + "balance_loss_clip": 1.02130866, + "balance_loss_mlp": 1.02869534, + "epoch": 0.1992484593416504, + "flos": 29094520135680.0, + "grad_norm": 4.863015940835911, + "language_loss": 0.80661523, + "learning_rate": 3.6211374237581706e-06, + "loss": 0.82795733, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.65625, + "step": 3314, + "time_per_iteration": 2.4357516765594482 + }, + { + "auxiliary_loss_clip": 0.01088398, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.01609409, + "balance_loss_mlp": 1.02526283, + "epoch": 0.19930858259431836, + "flos": 23293875496320.0, + "grad_norm": 1.444588805498063, + "language_loss": 0.8718859, + "learning_rate": 3.620916125624596e-06, + "loss": 0.89310575, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.6328125, + "step": 3315, + "time_per_iteration": 2.4119343757629395 + }, + { + "auxiliary_loss_clip": 0.01093967, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.0160712, + "balance_loss_mlp": 1.02733731, + "epoch": 0.19936870584698632, + "flos": 25377526748160.0, + "grad_norm": 1.5396054078786452, + "language_loss": 0.7089622, + "learning_rate": 3.620694769644694e-06, + "loss": 0.73024619, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.66796875, + "step": 3316, + "time_per_iteration": 2.441154956817627 + }, + { + "auxiliary_loss_clip": 0.01023534, + "auxiliary_loss_mlp": 0.01004258, + "balance_loss_clip": 1.00170732, + "balance_loss_mlp": 1.00516033, + "epoch": 0.1994288290996543, + "flos": 62164388920320.0, + "grad_norm": 0.8381400898957825, + "language_loss": 0.66274536, + "learning_rate": 3.6204733558263653e-06, + "loss": 0.68302333, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.18359375, + "step": 3317, + "time_per_iteration": 3.0949625968933105 + }, + { + "auxiliary_loss_clip": 0.01095287, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.02044487, + "balance_loss_mlp": 1.02687836, + "epoch": 0.19948895235232225, + "flos": 19024835690880.0, + "grad_norm": 3.3600161071849532, + "language_loss": 0.74098063, + "learning_rate": 3.6202518841775104e-06, + "loss": 0.76232529, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.68359375, + "step": 3318, + "time_per_iteration": 2.3706955909729004 + }, + { + "auxiliary_loss_clip": 0.01088429, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.02025127, + "balance_loss_mlp": 1.02606761, + "epoch": 0.19954907560499022, + "flos": 37814287795200.0, + "grad_norm": 1.9049686259150524, + "language_loss": 0.78945422, + "learning_rate": 3.6200303547060336e-06, + "loss": 0.81071913, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.625, + "step": 3319, + "time_per_iteration": 2.5089056491851807 + }, + { + "auxiliary_loss_clip": 0.01091702, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.01383972, + "balance_loss_mlp": 1.02566719, + "epoch": 0.1996091988576582, + "flos": 49563329414400.0, + "grad_norm": 2.4920940963466145, + "language_loss": 0.76352167, + "learning_rate": 3.61980876741984e-06, + "loss": 0.78478074, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.66015625, + "step": 3320, + "time_per_iteration": 2.6354074478149414 + }, + { + "auxiliary_loss_clip": 0.01090883, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.02159953, + "balance_loss_mlp": 1.02575707, + "epoch": 0.19966932211032618, + "flos": 22634750338560.0, + "grad_norm": 2.207245808725498, + "language_loss": 0.80172241, + "learning_rate": 3.6195871223268392e-06, + "loss": 0.82302225, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.65234375, + "step": 3321, + "time_per_iteration": 2.3941807746887207 + }, + { + "auxiliary_loss_clip": 0.0102174, + "auxiliary_loss_mlp": 0.01001946, + "balance_loss_clip": 0.99951375, + "balance_loss_mlp": 1.00350666, + "epoch": 0.19972944536299414, + "flos": 54079308466560.0, + "grad_norm": 0.8696088293687311, + "language_loss": 0.65128511, + "learning_rate": 3.61936541943494e-06, + "loss": 0.67152202, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.18261719, + "step": 3322, + "time_per_iteration": 2.8282928466796875 + }, + { + "auxiliary_loss_clip": 0.01021052, + "auxiliary_loss_mlp": 0.01002368, + "balance_loss_clip": 0.99996036, + "balance_loss_mlp": 1.00306439, + "epoch": 0.1997895686156621, + "flos": 69352204498560.0, + "grad_norm": 0.7836553028040963, + "language_loss": 0.56965047, + "learning_rate": 3.619143658752054e-06, + "loss": 0.5898847, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.1796875, + "step": 3323, + "time_per_iteration": 3.154853105545044 + }, + { + "auxiliary_loss_clip": 0.01092334, + "auxiliary_loss_mlp": 0.01036843, + "balance_loss_clip": 1.01741147, + "balance_loss_mlp": 1.02751148, + "epoch": 0.19984969186833007, + "flos": 18551064274560.0, + "grad_norm": 2.64619446476262, + "language_loss": 0.80301172, + "learning_rate": 3.6189218402860958e-06, + "loss": 0.82430351, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6484375, + "step": 3324, + "time_per_iteration": 2.364062786102295 + }, + { + "auxiliary_loss_clip": 0.01089631, + "auxiliary_loss_mlp": 0.0104084, + "balance_loss_clip": 1.02077699, + "balance_loss_mlp": 1.02509081, + "epoch": 0.19990981512099804, + "flos": 26428552450560.0, + "grad_norm": 1.755109624568066, + "language_loss": 0.54017216, + "learning_rate": 3.6186999640449817e-06, + "loss": 0.56147689, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.64453125, + "step": 3325, + "time_per_iteration": 2.447310447692871 + }, + { + "auxiliary_loss_clip": 0.01091465, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.01624262, + "balance_loss_mlp": 1.02723396, + "epoch": 0.19996993837366603, + "flos": 16325071943040.0, + "grad_norm": 2.354431826748387, + "language_loss": 0.85965687, + "learning_rate": 3.6184780300366294e-06, + "loss": 0.88091457, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.64453125, + "step": 3326, + "time_per_iteration": 2.361738443374634 + }, + { + "auxiliary_loss_clip": 0.01090149, + "auxiliary_loss_mlp": 0.01036799, + "balance_loss_clip": 1.01886964, + "balance_loss_mlp": 1.02702761, + "epoch": 0.200030061626334, + "flos": 20843287188480.0, + "grad_norm": 1.8797726577910963, + "language_loss": 0.71729505, + "learning_rate": 3.6182560382689598e-06, + "loss": 0.73856449, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6328125, + "step": 3327, + "time_per_iteration": 2.370218515396118 + }, + { + "auxiliary_loss_clip": 0.01093333, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.0185287, + "balance_loss_mlp": 1.02680278, + "epoch": 0.20009018487900196, + "flos": 23761677070080.0, + "grad_norm": 1.981411592710895, + "language_loss": 0.76705289, + "learning_rate": 3.6180339887498948e-06, + "loss": 0.78836781, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6640625, + "step": 3328, + "time_per_iteration": 2.3940131664276123 + }, + { + "auxiliary_loss_clip": 0.01087467, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.01604056, + "balance_loss_mlp": 1.02578974, + "epoch": 0.20015030813166992, + "flos": 28110283597440.0, + "grad_norm": 1.7915055564184035, + "language_loss": 0.6896466, + "learning_rate": 3.6178118814873587e-06, + "loss": 0.71084785, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.6171875, + "step": 3329, + "time_per_iteration": 2.536358594894409 + }, + { + "auxiliary_loss_clip": 0.010967, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.01779008, + "balance_loss_mlp": 1.02862477, + "epoch": 0.2002104313843379, + "flos": 26065979314560.0, + "grad_norm": 1.5752217212677344, + "language_loss": 0.81327724, + "learning_rate": 3.6175897164892783e-06, + "loss": 0.83463192, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.6796875, + "step": 3330, + "time_per_iteration": 2.4372637271881104 + }, + { + "auxiliary_loss_clip": 0.01092833, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.01161528, + "balance_loss_mlp": 1.02617478, + "epoch": 0.20027055463700585, + "flos": 22965517359360.0, + "grad_norm": 4.3997616609776005, + "language_loss": 0.76216048, + "learning_rate": 3.617367493763581e-06, + "loss": 0.78339738, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.66796875, + "step": 3331, + "time_per_iteration": 2.3871850967407227 + }, + { + "auxiliary_loss_clip": 0.01093251, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.0170908, + "balance_loss_mlp": 1.026245, + "epoch": 0.20033067788967382, + "flos": 17164698163200.0, + "grad_norm": 1.8408896854764203, + "language_loss": 0.79128915, + "learning_rate": 3.6171452133181994e-06, + "loss": 0.81259203, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.671875, + "step": 3332, + "time_per_iteration": 3.806399345397949 + }, + { + "auxiliary_loss_clip": 0.01021104, + "auxiliary_loss_mlp": 0.01010393, + "balance_loss_clip": 1.00805676, + "balance_loss_mlp": 1.00349379, + "epoch": 0.2003908011423418, + "flos": 60822747993600.0, + "grad_norm": 0.935613534119169, + "language_loss": 0.61915773, + "learning_rate": 3.6169228751610643e-06, + "loss": 0.63947272, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.17578125, + "step": 3333, + "time_per_iteration": 2.9001359939575195 + }, + { + "auxiliary_loss_clip": 0.01091975, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.02338839, + "balance_loss_mlp": 1.02532458, + "epoch": 0.20045092439500978, + "flos": 24205108648320.0, + "grad_norm": 2.1869137175770432, + "language_loss": 0.80889475, + "learning_rate": 3.6167004793001107e-06, + "loss": 0.83024263, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6640625, + "step": 3334, + "time_per_iteration": 2.3922924995422363 + }, + { + "auxiliary_loss_clip": 0.01098087, + "auxiliary_loss_mlp": 0.01040421, + "balance_loss_clip": 1.01921344, + "balance_loss_mlp": 1.02878976, + "epoch": 0.20051104764767774, + "flos": 29386324212480.0, + "grad_norm": 1.845156825894402, + "language_loss": 0.73663443, + "learning_rate": 3.616478025743276e-06, + "loss": 0.75801957, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.6953125, + "step": 3335, + "time_per_iteration": 3.7880725860595703 + }, + { + "auxiliary_loss_clip": 0.01098442, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_clip": 1.02708125, + "balance_loss_mlp": 1.03009892, + "epoch": 0.2005711709003457, + "flos": 23512676186880.0, + "grad_norm": 2.0307575494379106, + "language_loss": 0.80261171, + "learning_rate": 3.6162555144984986e-06, + "loss": 0.82406187, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.68359375, + "step": 3336, + "time_per_iteration": 3.768510341644287 + }, + { + "auxiliary_loss_clip": 0.0109644, + "auxiliary_loss_mlp": 0.01039487, + "balance_loss_clip": 1.01849413, + "balance_loss_mlp": 1.02602303, + "epoch": 0.20063129415301367, + "flos": 22522434894720.0, + "grad_norm": 2.4752090450551316, + "language_loss": 0.77787721, + "learning_rate": 3.6160329455737193e-06, + "loss": 0.79923654, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.703125, + "step": 3337, + "time_per_iteration": 2.377169132232666 + }, + { + "auxiliary_loss_clip": 0.01097078, + "auxiliary_loss_mlp": 0.01045507, + "balance_loss_clip": 1.02416873, + "balance_loss_mlp": 1.02941728, + "epoch": 0.20069141740568164, + "flos": 25957050272640.0, + "grad_norm": 1.908908843564825, + "language_loss": 0.68563581, + "learning_rate": 3.6158103189768815e-06, + "loss": 0.70706165, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.67578125, + "step": 3338, + "time_per_iteration": 2.4194326400756836 + }, + { + "auxiliary_loss_clip": 0.01093844, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.0220685, + "balance_loss_mlp": 1.02799261, + "epoch": 0.2007515406583496, + "flos": 24789449940480.0, + "grad_norm": 1.9056828358542846, + "language_loss": 0.68690825, + "learning_rate": 3.6155876347159296e-06, + "loss": 0.70825994, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.66015625, + "step": 3339, + "time_per_iteration": 3.78161883354187 + }, + { + "auxiliary_loss_clip": 0.01098606, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.01578593, + "balance_loss_mlp": 1.02914751, + "epoch": 0.2008116639110176, + "flos": 37924054709760.0, + "grad_norm": 2.1736325201628337, + "language_loss": 0.66454792, + "learning_rate": 3.6153648927988104e-06, + "loss": 0.68588972, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.6953125, + "step": 3340, + "time_per_iteration": 2.5375027656555176 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01037098, + "balance_loss_clip": 1.01603413, + "balance_loss_mlp": 1.0282203, + "epoch": 0.20087178716368556, + "flos": 20739490116480.0, + "grad_norm": 2.124235432886837, + "language_loss": 0.73798639, + "learning_rate": 3.6151420932334737e-06, + "loss": 0.75933397, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.6953125, + "step": 3341, + "time_per_iteration": 2.3896074295043945 + }, + { + "auxiliary_loss_clip": 0.01092513, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.01921797, + "balance_loss_mlp": 1.02799368, + "epoch": 0.20093191041635353, + "flos": 23841139057920.0, + "grad_norm": 2.0304712392790076, + "language_loss": 0.71718943, + "learning_rate": 3.6149192360278706e-06, + "loss": 0.73849332, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.64453125, + "step": 3342, + "time_per_iteration": 2.3963634967803955 + }, + { + "auxiliary_loss_clip": 0.01094734, + "auxiliary_loss_mlp": 0.01041169, + "balance_loss_clip": 1.02151191, + "balance_loss_mlp": 1.02824581, + "epoch": 0.2009920336690215, + "flos": 21791179134720.0, + "grad_norm": 2.003400364739572, + "language_loss": 0.72176361, + "learning_rate": 3.614696321189954e-06, + "loss": 0.74312264, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6640625, + "step": 3343, + "time_per_iteration": 2.389746904373169 + }, + { + "auxiliary_loss_clip": 0.01096642, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.01825643, + "balance_loss_mlp": 1.02871799, + "epoch": 0.20105215692168946, + "flos": 26358027770880.0, + "grad_norm": 2.104630117063403, + "language_loss": 0.80623066, + "learning_rate": 3.614473348727679e-06, + "loss": 0.82759929, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.6796875, + "step": 3344, + "time_per_iteration": 2.4078948497772217 + }, + { + "auxiliary_loss_clip": 0.01094111, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.01802564, + "balance_loss_mlp": 1.0274086, + "epoch": 0.20111228017435742, + "flos": 18806279379840.0, + "grad_norm": 1.9719152022919537, + "language_loss": 0.83162439, + "learning_rate": 3.614250318649003e-06, + "loss": 0.85293734, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.66796875, + "step": 3345, + "time_per_iteration": 2.371562957763672 + }, + { + "auxiliary_loss_clip": 0.01089602, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.01720166, + "balance_loss_mlp": 1.02813196, + "epoch": 0.20117240342702541, + "flos": 19974019357440.0, + "grad_norm": 3.4927529554161354, + "language_loss": 0.67638123, + "learning_rate": 3.614027230961885e-06, + "loss": 0.69762951, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.61328125, + "step": 3346, + "time_per_iteration": 2.3736603260040283 + }, + { + "auxiliary_loss_clip": 0.01094478, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.02589989, + "balance_loss_mlp": 1.02834606, + "epoch": 0.20123252667969338, + "flos": 23141759235840.0, + "grad_norm": 2.1909215971763474, + "language_loss": 0.7360484, + "learning_rate": 3.613804085674288e-06, + "loss": 0.75744176, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6640625, + "step": 3347, + "time_per_iteration": 2.379194974899292 + }, + { + "auxiliary_loss_clip": 0.01094469, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_clip": 1.02266407, + "balance_loss_mlp": 1.02856326, + "epoch": 0.20129264993236134, + "flos": 23220557907840.0, + "grad_norm": 1.6814276601146283, + "language_loss": 0.86456525, + "learning_rate": 3.6135808827941733e-06, + "loss": 0.88592887, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.65625, + "step": 3348, + "time_per_iteration": 2.419203519821167 + }, + { + "auxiliary_loss_clip": 0.01089155, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.01655841, + "balance_loss_mlp": 1.02553391, + "epoch": 0.2013527731850293, + "flos": 21870396743040.0, + "grad_norm": 1.560812432538016, + "language_loss": 0.70809293, + "learning_rate": 3.6133576223295083e-06, + "loss": 0.72935629, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.63671875, + "step": 3349, + "time_per_iteration": 2.3877413272857666 + }, + { + "auxiliary_loss_clip": 0.01095388, + "auxiliary_loss_mlp": 0.01038328, + "balance_loss_clip": 1.01774049, + "balance_loss_mlp": 1.02822638, + "epoch": 0.20141289643769728, + "flos": 18039831102720.0, + "grad_norm": 2.5211327439086255, + "language_loss": 0.71299899, + "learning_rate": 3.61313430428826e-06, + "loss": 0.73433614, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.671875, + "step": 3350, + "time_per_iteration": 2.377131700515747 + }, + { + "auxiliary_loss_clip": 0.01095739, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.01918685, + "balance_loss_mlp": 1.02833414, + "epoch": 0.20147301969036524, + "flos": 23950277568000.0, + "grad_norm": 2.330382344330002, + "language_loss": 0.76556957, + "learning_rate": 3.612910928678397e-06, + "loss": 0.78692788, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.671875, + "step": 3351, + "time_per_iteration": 2.3786630630493164 + }, + { + "auxiliary_loss_clip": 0.01095691, + "auxiliary_loss_mlp": 0.01040055, + "balance_loss_clip": 1.01907384, + "balance_loss_mlp": 1.02656364, + "epoch": 0.2015331429430332, + "flos": 25587425041920.0, + "grad_norm": 2.4628884995422418, + "language_loss": 0.80423838, + "learning_rate": 3.6126874955078926e-06, + "loss": 0.82559586, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.69140625, + "step": 3352, + "time_per_iteration": 2.4034831523895264 + }, + { + "auxiliary_loss_clip": 0.01096008, + "auxiliary_loss_mlp": 0.01035413, + "balance_loss_clip": 1.01550508, + "balance_loss_mlp": 1.02854121, + "epoch": 0.2015932661957012, + "flos": 26723742929280.0, + "grad_norm": 2.4913601598434134, + "language_loss": 0.80097902, + "learning_rate": 3.6124640047847193e-06, + "loss": 0.82229328, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.67578125, + "step": 3353, + "time_per_iteration": 2.4384377002716064 + }, + { + "auxiliary_loss_clip": 0.01088909, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.0159328, + "balance_loss_mlp": 1.02580047, + "epoch": 0.20165338944836916, + "flos": 15632220545280.0, + "grad_norm": 1.8571748381955588, + "language_loss": 0.89597869, + "learning_rate": 3.6122404565168533e-06, + "loss": 0.91721344, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6328125, + "step": 3354, + "time_per_iteration": 2.3681228160858154 + }, + { + "auxiliary_loss_clip": 0.01022829, + "auxiliary_loss_mlp": 0.01010027, + "balance_loss_clip": 1.00754726, + "balance_loss_mlp": 1.00564718, + "epoch": 0.20171351270103713, + "flos": 57909629727360.0, + "grad_norm": 0.832773605290989, + "language_loss": 0.55907285, + "learning_rate": 3.612016850712273e-06, + "loss": 0.57940137, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.171875, + "step": 3355, + "time_per_iteration": 2.9575581550598145 + }, + { + "auxiliary_loss_clip": 0.01090334, + "auxiliary_loss_mlp": 0.01040644, + "balance_loss_clip": 1.02155828, + "balance_loss_mlp": 1.02739358, + "epoch": 0.2017736359537051, + "flos": 20813296464000.0, + "grad_norm": 3.6784021252170906, + "language_loss": 0.84033597, + "learning_rate": 3.611793187378958e-06, + "loss": 0.8616457, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.62890625, + "step": 3356, + "time_per_iteration": 2.3754444122314453 + }, + { + "auxiliary_loss_clip": 0.01099126, + "auxiliary_loss_mlp": 0.01044522, + "balance_loss_clip": 1.02122903, + "balance_loss_mlp": 1.02733374, + "epoch": 0.20183375920637306, + "flos": 17091101283840.0, + "grad_norm": 3.0902134296115134, + "language_loss": 0.69024551, + "learning_rate": 3.61156946652489e-06, + "loss": 0.7116819, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.71875, + "step": 3357, + "time_per_iteration": 2.348914861679077 + }, + { + "auxiliary_loss_clip": 0.01094203, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.01940107, + "balance_loss_mlp": 1.02652955, + "epoch": 0.20189388245904102, + "flos": 18660342430080.0, + "grad_norm": 2.08794261793352, + "language_loss": 0.71407759, + "learning_rate": 3.611345688158053e-06, + "loss": 0.73541784, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.6796875, + "step": 3358, + "time_per_iteration": 2.3747360706329346 + }, + { + "auxiliary_loss_clip": 0.01089285, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.01908135, + "balance_loss_mlp": 1.02508187, + "epoch": 0.20195400571170902, + "flos": 16796678855040.0, + "grad_norm": 1.6969373474453948, + "language_loss": 0.82704282, + "learning_rate": 3.6111218522864336e-06, + "loss": 0.84830093, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.640625, + "step": 3359, + "time_per_iteration": 2.3471667766571045 + }, + { + "auxiliary_loss_clip": 0.01022401, + "auxiliary_loss_mlp": 0.010032, + "balance_loss_clip": 1.00067246, + "balance_loss_mlp": 1.00500762, + "epoch": 0.20201412896437698, + "flos": 67171703535360.0, + "grad_norm": 0.797525516278317, + "language_loss": 0.58966005, + "learning_rate": 3.610897958918019e-06, + "loss": 0.60991603, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.17382812, + "step": 3360, + "time_per_iteration": 2.9902760982513428 + }, + { + "auxiliary_loss_clip": 0.01094829, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.01514673, + "balance_loss_mlp": 1.02801824, + "epoch": 0.20207425221704495, + "flos": 21323936142720.0, + "grad_norm": 2.754692304580016, + "language_loss": 0.62511683, + "learning_rate": 3.6106740080608e-06, + "loss": 0.64641917, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.66796875, + "step": 3361, + "time_per_iteration": 2.374682903289795 + }, + { + "auxiliary_loss_clip": 0.01092233, + "auxiliary_loss_mlp": 0.01042311, + "balance_loss_clip": 1.02411985, + "balance_loss_mlp": 1.02760386, + "epoch": 0.2021343754697129, + "flos": 22526100587520.0, + "grad_norm": 2.081613946848037, + "language_loss": 0.75787866, + "learning_rate": 3.61044999972277e-06, + "loss": 0.77922404, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.64453125, + "step": 3362, + "time_per_iteration": 2.442420482635498 + }, + { + "auxiliary_loss_clip": 0.01092394, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.02045846, + "balance_loss_mlp": 1.02810884, + "epoch": 0.20219449872238088, + "flos": 19061773776000.0, + "grad_norm": 2.1816263421883795, + "language_loss": 0.7692908, + "learning_rate": 3.610225933911921e-06, + "loss": 0.79060346, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.640625, + "step": 3363, + "time_per_iteration": 2.364548921585083 + }, + { + "auxiliary_loss_clip": 0.01093417, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.01863885, + "balance_loss_mlp": 1.02827358, + "epoch": 0.20225462197504884, + "flos": 24715887972480.0, + "grad_norm": 1.7871976773933484, + "language_loss": 0.74927402, + "learning_rate": 3.6100018106362507e-06, + "loss": 0.7705723, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6484375, + "step": 3364, + "time_per_iteration": 2.4184486865997314 + }, + { + "auxiliary_loss_clip": 0.01094641, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.0238291, + "balance_loss_mlp": 1.02821565, + "epoch": 0.2023147452277168, + "flos": 22017206476800.0, + "grad_norm": 3.0410914034095593, + "language_loss": 0.71063465, + "learning_rate": 3.6097776299037573e-06, + "loss": 0.73201829, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6640625, + "step": 3365, + "time_per_iteration": 2.387681007385254 + }, + { + "auxiliary_loss_clip": 0.01093729, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.02231598, + "balance_loss_mlp": 1.02873588, + "epoch": 0.2023748684803848, + "flos": 17744500978560.0, + "grad_norm": 1.986425390483882, + "language_loss": 0.8576386, + "learning_rate": 3.609553391722441e-06, + "loss": 0.87899375, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.65234375, + "step": 3366, + "time_per_iteration": 2.3731088638305664 + }, + { + "auxiliary_loss_clip": 0.01091239, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.01612079, + "balance_loss_mlp": 1.02765083, + "epoch": 0.20243499173305277, + "flos": 31137602520960.0, + "grad_norm": 1.6715117426982948, + "language_loss": 0.69499671, + "learning_rate": 3.6093290961003044e-06, + "loss": 0.71624815, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6328125, + "step": 3367, + "time_per_iteration": 2.4572179317474365 + }, + { + "auxiliary_loss_clip": 0.01096051, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.01739931, + "balance_loss_mlp": 1.02729523, + "epoch": 0.20249511498572073, + "flos": 33837820116480.0, + "grad_norm": 1.7574362353739208, + "language_loss": 0.68120944, + "learning_rate": 3.6091047430453517e-06, + "loss": 0.70256829, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.6875, + "step": 3368, + "time_per_iteration": 2.5089542865753174 + }, + { + "auxiliary_loss_clip": 0.01093486, + "auxiliary_loss_mlp": 0.01039775, + "balance_loss_clip": 1.0203675, + "balance_loss_mlp": 1.02808619, + "epoch": 0.2025552382383887, + "flos": 21214553253120.0, + "grad_norm": 1.6910014215550044, + "language_loss": 0.771227, + "learning_rate": 3.6088803325655907e-06, + "loss": 0.79255962, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.65625, + "step": 3369, + "time_per_iteration": 2.3900160789489746 + }, + { + "auxiliary_loss_clip": 0.01094711, + "auxiliary_loss_mlp": 0.01046861, + "balance_loss_clip": 1.02626193, + "balance_loss_mlp": 1.02703071, + "epoch": 0.20261536149105666, + "flos": 14646517729920.0, + "grad_norm": 3.3090264824915923, + "language_loss": 0.65069675, + "learning_rate": 3.6086558646690284e-06, + "loss": 0.67211252, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.6796875, + "step": 3370, + "time_per_iteration": 2.368041515350342 + }, + { + "auxiliary_loss_clip": 0.01019647, + "auxiliary_loss_mlp": 0.01002803, + "balance_loss_clip": 1.00054967, + "balance_loss_mlp": 1.00291896, + "epoch": 0.20267548474372463, + "flos": 66780361572480.0, + "grad_norm": 0.6786419455732046, + "language_loss": 0.58097756, + "learning_rate": 3.608431339363677e-06, + "loss": 0.60120201, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.16796875, + "step": 3371, + "time_per_iteration": 3.1006276607513428 + }, + { + "auxiliary_loss_clip": 0.0109262, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.01752126, + "balance_loss_mlp": 1.0270896, + "epoch": 0.2027356079963926, + "flos": 24679648114560.0, + "grad_norm": 2.482154933139551, + "language_loss": 0.91939795, + "learning_rate": 3.6082067566575474e-06, + "loss": 0.94069338, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.65625, + "step": 3372, + "time_per_iteration": 3.8045270442962646 + }, + { + "auxiliary_loss_clip": 0.01094782, + "auxiliary_loss_mlp": 0.01046025, + "balance_loss_clip": 1.02521157, + "balance_loss_mlp": 1.02801609, + "epoch": 0.20279573124906058, + "flos": 26391649276800.0, + "grad_norm": 1.5398457785339825, + "language_loss": 0.78556454, + "learning_rate": 3.6079821165586563e-06, + "loss": 0.80697268, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.66796875, + "step": 3373, + "time_per_iteration": 2.4265899658203125 + }, + { + "auxiliary_loss_clip": 0.01090916, + "auxiliary_loss_mlp": 0.01037119, + "balance_loss_clip": 1.01793838, + "balance_loss_mlp": 1.02722061, + "epoch": 0.20285585450172855, + "flos": 33798647704320.0, + "grad_norm": 1.9416745036578418, + "language_loss": 0.71174419, + "learning_rate": 3.6077574190750194e-06, + "loss": 0.73302448, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.63671875, + "step": 3374, + "time_per_iteration": 3.846038579940796 + }, + { + "auxiliary_loss_clip": 0.010204, + "auxiliary_loss_mlp": 0.01003981, + "balance_loss_clip": 1.00163269, + "balance_loss_mlp": 1.00372291, + "epoch": 0.20291597775439651, + "flos": 71161332796800.0, + "grad_norm": 0.9958992296052362, + "language_loss": 0.62452167, + "learning_rate": 3.607532664214656e-06, + "loss": 0.6447655, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.16699219, + "step": 3375, + "time_per_iteration": 4.376126289367676 + }, + { + "auxiliary_loss_clip": 0.01090797, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_clip": 1.02424145, + "balance_loss_mlp": 1.02583313, + "epoch": 0.20297610100706448, + "flos": 19493440225920.0, + "grad_norm": 1.558875313038995, + "language_loss": 0.82283205, + "learning_rate": 3.6073078519855863e-06, + "loss": 0.84417474, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6484375, + "step": 3376, + "time_per_iteration": 2.366511583328247 + }, + { + "auxiliary_loss_clip": 0.01093643, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.01220286, + "balance_loss_mlp": 1.02646852, + "epoch": 0.20303622425973245, + "flos": 25043128945920.0, + "grad_norm": 2.0001322716041026, + "language_loss": 0.81369841, + "learning_rate": 3.607082982395835e-06, + "loss": 0.83495879, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.671875, + "step": 3377, + "time_per_iteration": 2.394261121749878 + }, + { + "auxiliary_loss_clip": 0.01090901, + "auxiliary_loss_mlp": 0.01045261, + "balance_loss_clip": 1.02623534, + "balance_loss_mlp": 1.02719855, + "epoch": 0.2030963475124004, + "flos": 21978941760000.0, + "grad_norm": 2.1762421344037968, + "language_loss": 0.7660687, + "learning_rate": 3.6068580554534245e-06, + "loss": 0.78743029, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.63671875, + "step": 3378, + "time_per_iteration": 3.735699415206909 + }, + { + "auxiliary_loss_clip": 0.01092641, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.01862407, + "balance_loss_mlp": 1.02671015, + "epoch": 0.2031564707650684, + "flos": 19499375157120.0, + "grad_norm": 1.7978053934395233, + "language_loss": 0.79383403, + "learning_rate": 3.6066330711663845e-06, + "loss": 0.81515092, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.66015625, + "step": 3379, + "time_per_iteration": 2.3794503211975098 + }, + { + "auxiliary_loss_clip": 0.01088602, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.01787734, + "balance_loss_mlp": 1.02662444, + "epoch": 0.20321659401773637, + "flos": 22745983530240.0, + "grad_norm": 1.6450798738094692, + "language_loss": 0.86684787, + "learning_rate": 3.606408029542743e-06, + "loss": 0.88809049, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.62109375, + "step": 3380, + "time_per_iteration": 2.4105381965637207 + }, + { + "auxiliary_loss_clip": 0.01094082, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.02209008, + "balance_loss_mlp": 1.02926743, + "epoch": 0.20327671727040433, + "flos": 22454738035200.0, + "grad_norm": 1.8939562049784857, + "language_loss": 0.74317086, + "learning_rate": 3.60618293059053e-06, + "loss": 0.76453364, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.6484375, + "step": 3381, + "time_per_iteration": 2.4410035610198975 + }, + { + "auxiliary_loss_clip": 0.01094867, + "auxiliary_loss_mlp": 0.01047308, + "balance_loss_clip": 1.02769852, + "balance_loss_mlp": 1.02803874, + "epoch": 0.2033368405230723, + "flos": 19534044003840.0, + "grad_norm": 2.1740730139041373, + "language_loss": 0.79045564, + "learning_rate": 3.6059577743177803e-06, + "loss": 0.81187737, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.66796875, + "step": 3382, + "time_per_iteration": 2.3881824016571045 + }, + { + "auxiliary_loss_clip": 0.01095466, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.01440144, + "balance_loss_mlp": 1.02753496, + "epoch": 0.20339696377574026, + "flos": 13808357786880.0, + "grad_norm": 2.669638909555472, + "language_loss": 0.79990542, + "learning_rate": 3.6057325607325293e-06, + "loss": 0.82121116, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.6796875, + "step": 3383, + "time_per_iteration": 2.354276418685913 + }, + { + "auxiliary_loss_clip": 0.01093858, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.01661801, + "balance_loss_mlp": 1.02673268, + "epoch": 0.20345708702840823, + "flos": 20338372972800.0, + "grad_norm": 1.8328700479000588, + "language_loss": 0.74354744, + "learning_rate": 3.605507289842813e-06, + "loss": 0.76485312, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.671875, + "step": 3384, + "time_per_iteration": 2.384086847305298 + }, + { + "auxiliary_loss_clip": 0.01097517, + "auxiliary_loss_mlp": 0.01039666, + "balance_loss_clip": 1.01760077, + "balance_loss_mlp": 1.02787459, + "epoch": 0.2035172102810762, + "flos": 20333066446080.0, + "grad_norm": 2.6285400968744397, + "language_loss": 0.76375276, + "learning_rate": 3.6052819616566717e-06, + "loss": 0.78512466, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.6953125, + "step": 3385, + "time_per_iteration": 2.371457815170288 + }, + { + "auxiliary_loss_clip": 0.01092619, + "auxiliary_loss_mlp": 0.01052824, + "balance_loss_clip": 1.03224897, + "balance_loss_mlp": 1.0266149, + "epoch": 0.2035773335337442, + "flos": 23329870974720.0, + "grad_norm": 1.5632193165782053, + "language_loss": 0.68682873, + "learning_rate": 3.6050565761821464e-06, + "loss": 0.70828313, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.66015625, + "step": 3386, + "time_per_iteration": 2.4078545570373535 + }, + { + "auxiliary_loss_clip": 0.01092784, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_clip": 1.03370607, + "balance_loss_mlp": 1.02673006, + "epoch": 0.20363745678641215, + "flos": 28329014465280.0, + "grad_norm": 1.3779751012375796, + "language_loss": 0.81030715, + "learning_rate": 3.6048311334272806e-06, + "loss": 0.83177626, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.66015625, + "step": 3387, + "time_per_iteration": 2.436722993850708 + }, + { + "auxiliary_loss_clip": 0.01088018, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.01711559, + "balance_loss_mlp": 1.02645564, + "epoch": 0.20369758003908012, + "flos": 18914684751360.0, + "grad_norm": 2.268566145035477, + "language_loss": 0.79522198, + "learning_rate": 3.6046056334001195e-06, + "loss": 0.81646937, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6171875, + "step": 3388, + "time_per_iteration": 2.3675179481506348 + }, + { + "auxiliary_loss_clip": 0.01092747, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.02213335, + "balance_loss_mlp": 1.02644062, + "epoch": 0.20375770329174808, + "flos": 19205546221440.0, + "grad_norm": 2.317221942652294, + "language_loss": 0.71999937, + "learning_rate": 3.604380076108711e-06, + "loss": 0.74135441, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.6640625, + "step": 3389, + "time_per_iteration": 2.349846363067627 + }, + { + "auxiliary_loss_clip": 0.01090671, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.01960146, + "balance_loss_mlp": 1.02674413, + "epoch": 0.20381782654441605, + "flos": 19389992267520.0, + "grad_norm": 1.988578197047473, + "language_loss": 0.87115598, + "learning_rate": 3.6041544615611047e-06, + "loss": 0.89245689, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.640625, + "step": 3390, + "time_per_iteration": 2.3579490184783936 + }, + { + "auxiliary_loss_clip": 0.01092361, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.01826298, + "balance_loss_mlp": 1.02729249, + "epoch": 0.203877949797084, + "flos": 24826527671040.0, + "grad_norm": 1.7992059945882672, + "language_loss": 0.7063992, + "learning_rate": 3.6039287897653523e-06, + "loss": 0.72768903, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6484375, + "step": 3391, + "time_per_iteration": 2.4349241256713867 + }, + { + "auxiliary_loss_clip": 0.01091526, + "auxiliary_loss_mlp": 0.01039392, + "balance_loss_clip": 1.02031267, + "balance_loss_mlp": 1.02667701, + "epoch": 0.20393807304975198, + "flos": 18002753372160.0, + "grad_norm": 2.4840311671601554, + "language_loss": 0.86306632, + "learning_rate": 3.6037030607295063e-06, + "loss": 0.88437545, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6484375, + "step": 3392, + "time_per_iteration": 2.3522307872772217 + }, + { + "auxiliary_loss_clip": 0.01094558, + "auxiliary_loss_mlp": 0.01040172, + "balance_loss_clip": 1.02049112, + "balance_loss_mlp": 1.0278697, + "epoch": 0.20399819630241997, + "flos": 24205841786880.0, + "grad_norm": 1.6081761552515894, + "language_loss": 0.8031919, + "learning_rate": 3.603477274461624e-06, + "loss": 0.82453918, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.66796875, + "step": 3393, + "time_per_iteration": 2.431004285812378 + }, + { + "auxiliary_loss_clip": 0.01089731, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.01573205, + "balance_loss_mlp": 1.02630556, + "epoch": 0.20405831955508794, + "flos": 20776079088000.0, + "grad_norm": 1.8735140560235117, + "language_loss": 0.85764956, + "learning_rate": 3.603251430969762e-06, + "loss": 0.87888443, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6328125, + "step": 3394, + "time_per_iteration": 2.3881540298461914 + }, + { + "auxiliary_loss_clip": 0.01090659, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.01533568, + "balance_loss_mlp": 1.02663696, + "epoch": 0.2041184428077559, + "flos": 15486004304640.0, + "grad_norm": 2.743829622847949, + "language_loss": 0.84085333, + "learning_rate": 3.603025530261981e-06, + "loss": 0.86209691, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.640625, + "step": 3395, + "time_per_iteration": 2.3404524326324463 + }, + { + "auxiliary_loss_clip": 0.01092742, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.01531529, + "balance_loss_mlp": 1.0249157, + "epoch": 0.20417856606042387, + "flos": 15587776517760.0, + "grad_norm": 2.31580434959557, + "language_loss": 0.83367699, + "learning_rate": 3.602799572346342e-06, + "loss": 0.85495389, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6796875, + "step": 3396, + "time_per_iteration": 2.343005895614624 + }, + { + "auxiliary_loss_clip": 0.01091129, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.01504493, + "balance_loss_mlp": 1.02700043, + "epoch": 0.20423868931309183, + "flos": 24278216768640.0, + "grad_norm": 2.5392266252188356, + "language_loss": 0.770015, + "learning_rate": 3.602573557230909e-06, + "loss": 0.7912674, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.640625, + "step": 3397, + "time_per_iteration": 2.4021718502044678 + }, + { + "auxiliary_loss_clip": 0.01091382, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.01880515, + "balance_loss_mlp": 1.02668989, + "epoch": 0.2042988125657598, + "flos": 18614152834560.0, + "grad_norm": 2.4693531404220836, + "language_loss": 0.87320244, + "learning_rate": 3.602347484923748e-06, + "loss": 0.8944878, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6484375, + "step": 3398, + "time_per_iteration": 2.338831901550293 + }, + { + "auxiliary_loss_clip": 0.0109228, + "auxiliary_loss_mlp": 0.01039025, + "balance_loss_clip": 1.02011812, + "balance_loss_mlp": 1.02819514, + "epoch": 0.2043589358184278, + "flos": 17850462554880.0, + "grad_norm": 2.014866073013619, + "language_loss": 0.7797541, + "learning_rate": 3.6021213554329277e-06, + "loss": 0.80106717, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.640625, + "step": 3399, + "time_per_iteration": 2.3305556774139404 + }, + { + "auxiliary_loss_clip": 0.01090105, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.0142591, + "balance_loss_mlp": 1.02567756, + "epoch": 0.20441905907109575, + "flos": 21434121993600.0, + "grad_norm": 1.9548736773764572, + "language_loss": 0.76317549, + "learning_rate": 3.601895168766517e-06, + "loss": 0.78441119, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.64453125, + "step": 3400, + "time_per_iteration": 2.3623459339141846 + }, + { + "auxiliary_loss_clip": 0.01091122, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.01837873, + "balance_loss_mlp": 1.02779388, + "epoch": 0.20447918232376372, + "flos": 27706513190400.0, + "grad_norm": 1.695005337189241, + "language_loss": 0.79227334, + "learning_rate": 3.601668924932588e-06, + "loss": 0.81353664, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.6328125, + "step": 3401, + "time_per_iteration": 2.439919948577881 + }, + { + "auxiliary_loss_clip": 0.01091997, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.01454926, + "balance_loss_mlp": 1.02668262, + "epoch": 0.20453930557643168, + "flos": 30522746833920.0, + "grad_norm": 2.1437863458260242, + "language_loss": 0.6956296, + "learning_rate": 3.601442623939215e-06, + "loss": 0.71689069, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.65234375, + "step": 3402, + "time_per_iteration": 2.5202667713165283 + }, + { + "auxiliary_loss_clip": 0.0109146, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.01727784, + "balance_loss_mlp": 1.02655721, + "epoch": 0.20459942882909965, + "flos": 18986815353600.0, + "grad_norm": 2.422100239714057, + "language_loss": 0.80654657, + "learning_rate": 3.6012162657944745e-06, + "loss": 0.82781738, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6484375, + "step": 3403, + "time_per_iteration": 2.396238088607788 + }, + { + "auxiliary_loss_clip": 0.01093829, + "auxiliary_loss_mlp": 0.0103324, + "balance_loss_clip": 1.01473844, + "balance_loss_mlp": 1.02883208, + "epoch": 0.20465955208176762, + "flos": 20338023859200.0, + "grad_norm": 1.8919767613263943, + "language_loss": 0.82822037, + "learning_rate": 3.600989850506444e-06, + "loss": 0.84949106, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6484375, + "step": 3404, + "time_per_iteration": 2.3653664588928223 + }, + { + "auxiliary_loss_clip": 0.01090939, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.02270603, + "balance_loss_mlp": 1.02520812, + "epoch": 0.20471967533443558, + "flos": 21250234529280.0, + "grad_norm": 1.863448038932841, + "language_loss": 0.85795009, + "learning_rate": 3.6007633780832043e-06, + "loss": 0.87927622, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.65625, + "step": 3405, + "time_per_iteration": 2.3978824615478516 + }, + { + "auxiliary_loss_clip": 0.01089667, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.01919866, + "balance_loss_mlp": 1.02511287, + "epoch": 0.20477979858710357, + "flos": 14500685514240.0, + "grad_norm": 2.7106561337924093, + "language_loss": 0.84639657, + "learning_rate": 3.600536848532837e-06, + "loss": 0.86767137, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6484375, + "step": 3406, + "time_per_iteration": 2.3337669372558594 + }, + { + "auxiliary_loss_clip": 0.01089747, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.0159061, + "balance_loss_mlp": 1.02772033, + "epoch": 0.20483992183977154, + "flos": 11399525331840.0, + "grad_norm": 2.074774039957221, + "language_loss": 0.67743528, + "learning_rate": 3.600310261863427e-06, + "loss": 0.69866359, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.62109375, + "step": 3407, + "time_per_iteration": 2.3922691345214844 + }, + { + "auxiliary_loss_clip": 0.0108831, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.01623893, + "balance_loss_mlp": 1.02531064, + "epoch": 0.2049000450924395, + "flos": 19059329980800.0, + "grad_norm": 2.0119760705765226, + "language_loss": 0.84033918, + "learning_rate": 3.6000836180830598e-06, + "loss": 0.86156738, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.6328125, + "step": 3408, + "time_per_iteration": 2.3717234134674072 + }, + { + "auxiliary_loss_clip": 0.01091751, + "auxiliary_loss_mlp": 0.0103593, + "balance_loss_clip": 1.01751184, + "balance_loss_mlp": 1.02712226, + "epoch": 0.20496016834510747, + "flos": 14573688900480.0, + "grad_norm": 2.145279569849743, + "language_loss": 0.63664538, + "learning_rate": 3.5998569171998247e-06, + "loss": 0.65792221, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6484375, + "step": 3409, + "time_per_iteration": 2.3818070888519287 + }, + { + "auxiliary_loss_clip": 0.0108886, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.01383233, + "balance_loss_mlp": 1.02462614, + "epoch": 0.20502029159777543, + "flos": 22125576936960.0, + "grad_norm": 1.3643240352330017, + "language_loss": 0.80532646, + "learning_rate": 3.599630159221811e-06, + "loss": 0.82653105, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.640625, + "step": 3410, + "time_per_iteration": 2.3842222690582275 + }, + { + "auxiliary_loss_clip": 0.01089651, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.01829982, + "balance_loss_mlp": 1.02750182, + "epoch": 0.2050804148504434, + "flos": 25366913694720.0, + "grad_norm": 3.1972839530835313, + "language_loss": 0.75666493, + "learning_rate": 3.599403344157112e-06, + "loss": 0.77792358, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.62109375, + "step": 3411, + "time_per_iteration": 3.8647632598876953 + }, + { + "auxiliary_loss_clip": 0.0109096, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.0248549, + "balance_loss_mlp": 1.0274868, + "epoch": 0.2051405381031114, + "flos": 23619126522240.0, + "grad_norm": 1.7979701633073433, + "language_loss": 0.73625255, + "learning_rate": 3.5991764720138214e-06, + "loss": 0.7575826, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6328125, + "step": 3412, + "time_per_iteration": 2.38908314704895 + }, + { + "auxiliary_loss_clip": 0.01093034, + "auxiliary_loss_mlp": 0.01045989, + "balance_loss_clip": 1.02583075, + "balance_loss_mlp": 1.02652621, + "epoch": 0.20520066135577936, + "flos": 19564732955520.0, + "grad_norm": 2.359456746502695, + "language_loss": 0.77695239, + "learning_rate": 3.598949542800037e-06, + "loss": 0.79834253, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.6640625, + "step": 3413, + "time_per_iteration": 2.4109857082366943 + }, + { + "auxiliary_loss_clip": 0.01088034, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.01910043, + "balance_loss_mlp": 1.02752459, + "epoch": 0.20526078460844732, + "flos": 17675372753280.0, + "grad_norm": 1.9421409032084236, + "language_loss": 0.85001194, + "learning_rate": 3.5987225565238556e-06, + "loss": 0.87126493, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.60546875, + "step": 3414, + "time_per_iteration": 3.822932481765747 + }, + { + "auxiliary_loss_clip": 0.01089411, + "auxiliary_loss_mlp": 0.0103071, + "balance_loss_clip": 1.01269698, + "balance_loss_mlp": 1.02587461, + "epoch": 0.2053209078611153, + "flos": 21499444880640.0, + "grad_norm": 2.0015277424090763, + "language_loss": 0.80799913, + "learning_rate": 3.598495513193379e-06, + "loss": 0.82920033, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6328125, + "step": 3415, + "time_per_iteration": 3.8386893272399902 + }, + { + "auxiliary_loss_clip": 0.01088169, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.01270878, + "balance_loss_mlp": 1.02676189, + "epoch": 0.20538103111378325, + "flos": 25662418375680.0, + "grad_norm": 1.8200864009169986, + "language_loss": 0.72592711, + "learning_rate": 3.5982684128167093e-06, + "loss": 0.74709719, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.61328125, + "step": 3416, + "time_per_iteration": 2.4197278022766113 + }, + { + "auxiliary_loss_clip": 0.0109024, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.01505804, + "balance_loss_mlp": 1.02519274, + "epoch": 0.20544115436645122, + "flos": 23147833812480.0, + "grad_norm": 1.9038989953330967, + "language_loss": 0.78435564, + "learning_rate": 3.598041255401951e-06, + "loss": 0.80558705, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.65234375, + "step": 3417, + "time_per_iteration": 3.752944231033325 + }, + { + "auxiliary_loss_clip": 0.01091466, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.0164535, + "balance_loss_mlp": 1.02768421, + "epoch": 0.20550127761911918, + "flos": 19389433685760.0, + "grad_norm": 2.9929824267351792, + "language_loss": 0.87379462, + "learning_rate": 3.5978140409572105e-06, + "loss": 0.89506263, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.63671875, + "step": 3418, + "time_per_iteration": 2.3542520999908447 + }, + { + "auxiliary_loss_clip": 0.01089245, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.01657367, + "balance_loss_mlp": 1.02650845, + "epoch": 0.20556140087178718, + "flos": 22892025214080.0, + "grad_norm": 2.0387380567678037, + "language_loss": 0.62095773, + "learning_rate": 3.597586769490598e-06, + "loss": 0.6421988, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.62890625, + "step": 3419, + "time_per_iteration": 2.4001595973968506 + }, + { + "auxiliary_loss_clip": 0.01095027, + "auxiliary_loss_mlp": 0.01039858, + "balance_loss_clip": 1.01910341, + "balance_loss_mlp": 1.0285635, + "epoch": 0.20562152412445514, + "flos": 19788700527360.0, + "grad_norm": 1.7780668301721039, + "language_loss": 0.86269796, + "learning_rate": 3.5973594410102218e-06, + "loss": 0.88404679, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.6640625, + "step": 3420, + "time_per_iteration": 2.371304750442505 + }, + { + "auxiliary_loss_clip": 0.01088989, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.01493001, + "balance_loss_mlp": 1.0254879, + "epoch": 0.2056816473771231, + "flos": 31500699327360.0, + "grad_norm": 3.131106011252814, + "language_loss": 0.71428061, + "learning_rate": 3.5971320555241967e-06, + "loss": 0.73549777, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.63671875, + "step": 3421, + "time_per_iteration": 2.532630681991577 + }, + { + "auxiliary_loss_clip": 0.01088822, + "auxiliary_loss_mlp": 0.01034526, + "balance_loss_clip": 1.01706159, + "balance_loss_mlp": 1.02615345, + "epoch": 0.20574177062979107, + "flos": 23257251613440.0, + "grad_norm": 2.2017221536752407, + "language_loss": 0.70898926, + "learning_rate": 3.5969046130406376e-06, + "loss": 0.7302227, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.625, + "step": 3422, + "time_per_iteration": 2.376978874206543 + }, + { + "auxiliary_loss_clip": 0.01023299, + "auxiliary_loss_mlp": 0.01000564, + "balance_loss_clip": 0.9985975, + "balance_loss_mlp": 1.00740194, + "epoch": 0.20580189388245904, + "flos": 70309417777920.0, + "grad_norm": 0.745883669525793, + "language_loss": 0.55542767, + "learning_rate": 3.5966771135676596e-06, + "loss": 0.57566631, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.15820312, + "step": 3423, + "time_per_iteration": 3.106325387954712 + }, + { + "auxiliary_loss_clip": 0.0109086, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.01633012, + "balance_loss_mlp": 1.02702034, + "epoch": 0.205862017135127, + "flos": 30736520288640.0, + "grad_norm": 1.8105349906243213, + "language_loss": 0.71785295, + "learning_rate": 3.5964495571133835e-06, + "loss": 0.73911256, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.640625, + "step": 3424, + "time_per_iteration": 2.444450855255127 + }, + { + "auxiliary_loss_clip": 0.01089295, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.02225924, + "balance_loss_mlp": 1.02796054, + "epoch": 0.20592214038779497, + "flos": 21323482295040.0, + "grad_norm": 1.562763777237167, + "language_loss": 0.75264859, + "learning_rate": 3.596221943685928e-06, + "loss": 0.7739296, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.61328125, + "step": 3425, + "time_per_iteration": 2.4433488845825195 + }, + { + "auxiliary_loss_clip": 0.01093009, + "auxiliary_loss_mlp": 0.010466, + "balance_loss_clip": 1.02914762, + "balance_loss_mlp": 1.02995443, + "epoch": 0.20598226364046296, + "flos": 22890593848320.0, + "grad_norm": 1.7784879169232835, + "language_loss": 0.87846279, + "learning_rate": 3.5959942732934184e-06, + "loss": 0.89985889, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.62890625, + "step": 3426, + "time_per_iteration": 2.3792903423309326 + }, + { + "auxiliary_loss_clip": 0.01090758, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.01623249, + "balance_loss_mlp": 1.02898359, + "epoch": 0.20604238689313092, + "flos": 23877413827200.0, + "grad_norm": 1.4953182586608793, + "language_loss": 0.76311988, + "learning_rate": 3.595766545943978e-06, + "loss": 0.78437138, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6171875, + "step": 3427, + "time_per_iteration": 2.434981346130371 + }, + { + "auxiliary_loss_clip": 0.01090663, + "auxiliary_loss_mlp": 0.01040024, + "balance_loss_clip": 1.02062881, + "balance_loss_mlp": 1.02673054, + "epoch": 0.2061025101457989, + "flos": 22490454222720.0, + "grad_norm": 1.7734894470627613, + "language_loss": 0.73887622, + "learning_rate": 3.5955387616457347e-06, + "loss": 0.7601831, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.640625, + "step": 3428, + "time_per_iteration": 2.378920078277588 + }, + { + "auxiliary_loss_clip": 0.01089111, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.01951647, + "balance_loss_mlp": 1.02474248, + "epoch": 0.20616263339846685, + "flos": 22777964202240.0, + "grad_norm": 1.6950960930414565, + "language_loss": 0.73476946, + "learning_rate": 3.5953109204068167e-06, + "loss": 0.75604856, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.64453125, + "step": 3429, + "time_per_iteration": 2.4017746448516846 + }, + { + "auxiliary_loss_clip": 0.01096581, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.01843596, + "balance_loss_mlp": 1.03064489, + "epoch": 0.20622275665113482, + "flos": 20881272614400.0, + "grad_norm": 2.21737742663155, + "language_loss": 0.84927869, + "learning_rate": 3.5950830222353563e-06, + "loss": 0.87061667, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65625, + "step": 3430, + "time_per_iteration": 2.3771278858184814 + }, + { + "auxiliary_loss_clip": 0.01092067, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.01673174, + "balance_loss_mlp": 1.02689767, + "epoch": 0.20628287990380278, + "flos": 19353403296000.0, + "grad_norm": 4.316906960372921, + "language_loss": 0.68970323, + "learning_rate": 3.594855067139486e-06, + "loss": 0.7109893, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.65234375, + "step": 3431, + "time_per_iteration": 2.4158811569213867 + }, + { + "auxiliary_loss_clip": 0.01021422, + "auxiliary_loss_mlp": 0.01005794, + "balance_loss_clip": 1.00362396, + "balance_loss_mlp": 1.00544739, + "epoch": 0.20634300315647078, + "flos": 71514759156480.0, + "grad_norm": 0.8058109401968928, + "language_loss": 0.60227937, + "learning_rate": 3.59462705512734e-06, + "loss": 0.6225515, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.16015625, + "step": 3432, + "time_per_iteration": 3.1592984199523926 + }, + { + "auxiliary_loss_clip": 0.0109354, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.02429545, + "balance_loss_mlp": 1.02901411, + "epoch": 0.20640312640913874, + "flos": 21722923693440.0, + "grad_norm": 1.5470741195994464, + "language_loss": 0.72117704, + "learning_rate": 3.594398986207056e-06, + "loss": 0.74253732, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.64453125, + "step": 3433, + "time_per_iteration": 2.407956600189209 + }, + { + "auxiliary_loss_clip": 0.01092175, + "auxiliary_loss_mlp": 0.01037297, + "balance_loss_clip": 1.01973784, + "balance_loss_mlp": 1.02718782, + "epoch": 0.2064632496618067, + "flos": 20553682527360.0, + "grad_norm": 1.724537070049624, + "language_loss": 0.86899883, + "learning_rate": 3.5941708603867747e-06, + "loss": 0.8902936, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6484375, + "step": 3434, + "time_per_iteration": 2.3818016052246094 + }, + { + "auxiliary_loss_clip": 0.0109335, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.02058661, + "balance_loss_mlp": 1.02697265, + "epoch": 0.20652337291447467, + "flos": 29822040380160.0, + "grad_norm": 1.6386270787125943, + "language_loss": 0.69367266, + "learning_rate": 3.5939426776746356e-06, + "loss": 0.71500558, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6640625, + "step": 3435, + "time_per_iteration": 2.4560468196868896 + }, + { + "auxiliary_loss_clip": 0.01090102, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.01921082, + "balance_loss_mlp": 1.02793837, + "epoch": 0.20658349616714264, + "flos": 26212439934720.0, + "grad_norm": 2.479641386098983, + "language_loss": 0.89462423, + "learning_rate": 3.593714438078782e-06, + "loss": 0.91590559, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.62109375, + "step": 3436, + "time_per_iteration": 2.416600227355957 + }, + { + "auxiliary_loss_clip": 0.01092537, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.01438689, + "balance_loss_mlp": 1.02791631, + "epoch": 0.2066436194198106, + "flos": 25993185396480.0, + "grad_norm": 1.8382826318204257, + "language_loss": 0.76266444, + "learning_rate": 3.59348614160736e-06, + "loss": 0.78391868, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6484375, + "step": 3437, + "time_per_iteration": 2.427842378616333 + }, + { + "auxiliary_loss_clip": 0.01090912, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.01736999, + "balance_loss_mlp": 1.02654123, + "epoch": 0.20670374267247857, + "flos": 21360001443840.0, + "grad_norm": 2.126482635828829, + "language_loss": 0.77968448, + "learning_rate": 3.5932577882685164e-06, + "loss": 0.80094171, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.64453125, + "step": 3438, + "time_per_iteration": 2.369384765625 + }, + { + "auxiliary_loss_clip": 0.01020046, + "auxiliary_loss_mlp": 0.01010954, + "balance_loss_clip": 1.00886786, + "balance_loss_mlp": 1.00374675, + "epoch": 0.20676386592514656, + "flos": 66379977567360.0, + "grad_norm": 0.8477758193078978, + "language_loss": 0.67162991, + "learning_rate": 3.593029378070401e-06, + "loss": 0.69193995, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.16308594, + "step": 3439, + "time_per_iteration": 2.9917290210723877 + }, + { + "auxiliary_loss_clip": 0.01091483, + "auxiliary_loss_mlp": 0.01031657, + "balance_loss_clip": 1.01343036, + "balance_loss_mlp": 1.02587008, + "epoch": 0.20682398917781453, + "flos": 17273627205120.0, + "grad_norm": 2.236879316241981, + "language_loss": 0.84296453, + "learning_rate": 3.5928009110211646e-06, + "loss": 0.86419594, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.65625, + "step": 3440, + "time_per_iteration": 2.3410487174987793 + }, + { + "auxiliary_loss_clip": 0.01092394, + "auxiliary_loss_mlp": 0.01042595, + "balance_loss_clip": 1.02478552, + "balance_loss_mlp": 1.02785289, + "epoch": 0.2068841124304825, + "flos": 18076315340160.0, + "grad_norm": 2.077953152278859, + "language_loss": 0.84483582, + "learning_rate": 3.592572387128961e-06, + "loss": 0.86618572, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6484375, + "step": 3441, + "time_per_iteration": 2.353750705718994 + }, + { + "auxiliary_loss_clip": 0.01091054, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.01772404, + "balance_loss_mlp": 1.02873576, + "epoch": 0.20694423568315046, + "flos": 27345720533760.0, + "grad_norm": 1.7365703515053852, + "language_loss": 0.85869229, + "learning_rate": 3.5923438064019457e-06, + "loss": 0.87995303, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.625, + "step": 3442, + "time_per_iteration": 2.439638614654541 + }, + { + "auxiliary_loss_clip": 0.01096404, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02298689, + "balance_loss_mlp": 1.0300343, + "epoch": 0.20700435893581842, + "flos": 20228815526400.0, + "grad_norm": 1.9196280044402672, + "language_loss": 0.73328567, + "learning_rate": 3.5921151688482754e-06, + "loss": 0.75467443, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6640625, + "step": 3443, + "time_per_iteration": 2.372800827026367 + }, + { + "auxiliary_loss_clip": 0.01091153, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.01932955, + "balance_loss_mlp": 1.02771688, + "epoch": 0.2070644821884864, + "flos": 20630072315520.0, + "grad_norm": 1.8510691069900311, + "language_loss": 0.85541952, + "learning_rate": 3.5918864744761106e-06, + "loss": 0.87669635, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.63671875, + "step": 3444, + "time_per_iteration": 2.381258249282837 + }, + { + "auxiliary_loss_clip": 0.01018973, + "auxiliary_loss_mlp": 0.0100074, + "balance_loss_clip": 0.99864203, + "balance_loss_mlp": 1.00311446, + "epoch": 0.20712460544115438, + "flos": 65937907532160.0, + "grad_norm": 0.6898303371855926, + "language_loss": 0.57121408, + "learning_rate": 3.5916577232936116e-06, + "loss": 0.59141123, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.15820312, + "step": 3445, + "time_per_iteration": 2.9848201274871826 + }, + { + "auxiliary_loss_clip": 0.01090997, + "auxiliary_loss_mlp": 0.01038907, + "balance_loss_clip": 1.0204649, + "balance_loss_mlp": 1.02755046, + "epoch": 0.20718472869382235, + "flos": 19424765848320.0, + "grad_norm": 1.4452724688645875, + "language_loss": 0.78202057, + "learning_rate": 3.591428915308944e-06, + "loss": 0.80331963, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6328125, + "step": 3446, + "time_per_iteration": 2.362121105194092 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.01996422, + "balance_loss_mlp": 1.02874351, + "epoch": 0.2072448519464903, + "flos": 24497890243200.0, + "grad_norm": 2.1383938563221787, + "language_loss": 0.62538004, + "learning_rate": 3.5912000505302706e-06, + "loss": 0.64676166, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.6796875, + "step": 3447, + "time_per_iteration": 2.4085800647735596 + }, + { + "auxiliary_loss_clip": 0.01094221, + "auxiliary_loss_mlp": 0.010403, + "balance_loss_clip": 1.02170324, + "balance_loss_mlp": 1.02841926, + "epoch": 0.20730497519915828, + "flos": 23074586046720.0, + "grad_norm": 1.8454981068552492, + "language_loss": 0.85958946, + "learning_rate": 3.590971128965761e-06, + "loss": 0.88093472, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.65625, + "step": 3448, + "time_per_iteration": 2.3759377002716064 + }, + { + "auxiliary_loss_clip": 0.01092135, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.02066863, + "balance_loss_mlp": 1.02737164, + "epoch": 0.20736509845182624, + "flos": 21067987898880.0, + "grad_norm": 2.0909084611212125, + "language_loss": 0.75326729, + "learning_rate": 3.5907421506235844e-06, + "loss": 0.77460241, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.6484375, + "step": 3449, + "time_per_iteration": 2.3815219402313232 + }, + { + "auxiliary_loss_clip": 0.01094984, + "auxiliary_loss_mlp": 0.01040994, + "balance_loss_clip": 1.02082419, + "balance_loss_mlp": 1.0273211, + "epoch": 0.2074252217044942, + "flos": 17632499736960.0, + "grad_norm": 1.9843231891096198, + "language_loss": 0.76733154, + "learning_rate": 3.5905131155119124e-06, + "loss": 0.78869134, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.67578125, + "step": 3450, + "time_per_iteration": 2.3549654483795166 + }, + { + "auxiliary_loss_clip": 0.01095435, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.01689434, + "balance_loss_mlp": 1.02833676, + "epoch": 0.20748534495716217, + "flos": 23545948579200.0, + "grad_norm": 1.6702905135861918, + "language_loss": 0.82409501, + "learning_rate": 3.590284023638918e-06, + "loss": 0.84540582, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.671875, + "step": 3451, + "time_per_iteration": 3.779995918273926 + }, + { + "auxiliary_loss_clip": 0.01019154, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 0.99953437, + "balance_loss_mlp": 1.0033164, + "epoch": 0.20754546820983016, + "flos": 52250313738240.0, + "grad_norm": 1.2365319613705386, + "language_loss": 0.56629205, + "learning_rate": 3.5900548750127784e-06, + "loss": 0.58649886, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.15820312, + "step": 3452, + "time_per_iteration": 2.9152235984802246 + }, + { + "auxiliary_loss_clip": 0.01096062, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_clip": 1.02307582, + "balance_loss_mlp": 1.02723455, + "epoch": 0.20760559146249813, + "flos": 20411341447680.0, + "grad_norm": 1.8644726447354187, + "language_loss": 0.87885737, + "learning_rate": 3.5898256696416704e-06, + "loss": 0.90025091, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.6875, + "step": 3453, + "time_per_iteration": 2.374065637588501 + }, + { + "auxiliary_loss_clip": 0.01093974, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.01370907, + "balance_loss_mlp": 1.02928376, + "epoch": 0.2076657147151661, + "flos": 23184876631680.0, + "grad_norm": 4.938075755184813, + "language_loss": 0.80229759, + "learning_rate": 3.589596407533773e-06, + "loss": 0.82358384, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.6484375, + "step": 3454, + "time_per_iteration": 5.1431028842926025 + }, + { + "auxiliary_loss_clip": 0.01093993, + "auxiliary_loss_mlp": 0.01042325, + "balance_loss_clip": 1.0225246, + "balance_loss_mlp": 1.02817988, + "epoch": 0.20772583796783406, + "flos": 18292323121920.0, + "grad_norm": 2.81641100334554, + "language_loss": 0.76740152, + "learning_rate": 3.589367088697269e-06, + "loss": 0.78876472, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.65625, + "step": 3455, + "time_per_iteration": 2.3422646522521973 + }, + { + "auxiliary_loss_clip": 0.01091646, + "auxiliary_loss_mlp": 0.01035814, + "balance_loss_clip": 1.017205, + "balance_loss_mlp": 1.02829957, + "epoch": 0.20778596122050202, + "flos": 17601845696640.0, + "grad_norm": 2.3858522831270363, + "language_loss": 0.80792063, + "learning_rate": 3.5891377131403423e-06, + "loss": 0.8291952, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6328125, + "step": 3456, + "time_per_iteration": 2.3777241706848145 + }, + { + "auxiliary_loss_clip": 0.01096006, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.01687002, + "balance_loss_mlp": 1.02885818, + "epoch": 0.20784608447317, + "flos": 23804445352320.0, + "grad_norm": 1.4983478884464172, + "language_loss": 0.77782631, + "learning_rate": 3.5889082808711776e-06, + "loss": 0.79914713, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.671875, + "step": 3457, + "time_per_iteration": 3.7937073707580566 + }, + { + "auxiliary_loss_clip": 0.01095138, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.0195601, + "balance_loss_mlp": 1.02762175, + "epoch": 0.20790620772583795, + "flos": 17638329934080.0, + "grad_norm": 1.7504821464077382, + "language_loss": 0.83592123, + "learning_rate": 3.5886787918979645e-06, + "loss": 0.85727262, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.67578125, + "step": 3458, + "time_per_iteration": 2.3449764251708984 + }, + { + "auxiliary_loss_clip": 0.01090537, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.01442719, + "balance_loss_mlp": 1.02720952, + "epoch": 0.20796633097850595, + "flos": 27672228368640.0, + "grad_norm": 1.650509758721497, + "language_loss": 0.76041085, + "learning_rate": 3.588449246228891e-06, + "loss": 0.78164959, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6328125, + "step": 3459, + "time_per_iteration": 2.4398462772369385 + }, + { + "auxiliary_loss_clip": 0.01087872, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.0155623, + "balance_loss_mlp": 1.02618527, + "epoch": 0.2080264542311739, + "flos": 19244578988160.0, + "grad_norm": 2.161015419600236, + "language_loss": 0.76956034, + "learning_rate": 3.5882196438721504e-06, + "loss": 0.79077077, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6171875, + "step": 3460, + "time_per_iteration": 2.3799078464508057 + }, + { + "auxiliary_loss_clip": 0.01095987, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.02234113, + "balance_loss_mlp": 1.02936792, + "epoch": 0.20808657748384188, + "flos": 27524720407680.0, + "grad_norm": 1.7015066210378609, + "language_loss": 0.75361431, + "learning_rate": 3.5879899848359367e-06, + "loss": 0.7749868, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.66796875, + "step": 3461, + "time_per_iteration": 2.442997694015503 + }, + { + "auxiliary_loss_clip": 0.01093002, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.01685953, + "balance_loss_mlp": 1.02591705, + "epoch": 0.20814670073650984, + "flos": 26905710268800.0, + "grad_norm": 3.551495866186673, + "language_loss": 0.88157642, + "learning_rate": 3.587760269128444e-06, + "loss": 0.90287745, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.671875, + "step": 3462, + "time_per_iteration": 2.41481614112854 + }, + { + "auxiliary_loss_clip": 0.01017315, + "auxiliary_loss_mlp": 0.01001502, + "balance_loss_clip": 0.99936807, + "balance_loss_mlp": 1.00202131, + "epoch": 0.2082068239891778, + "flos": 70172383224960.0, + "grad_norm": 0.7527823553943578, + "language_loss": 0.58998442, + "learning_rate": 3.587530496757872e-06, + "loss": 0.61017263, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.15234375, + "step": 3463, + "time_per_iteration": 3.123060941696167 + }, + { + "auxiliary_loss_clip": 0.01092986, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_clip": 1.02433991, + "balance_loss_mlp": 1.02627039, + "epoch": 0.20826694724184577, + "flos": 24606924019200.0, + "grad_norm": 2.303712602197063, + "language_loss": 0.66127217, + "learning_rate": 3.5873006677324204e-06, + "loss": 0.68264711, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.66796875, + "step": 3464, + "time_per_iteration": 2.4263315200805664 + }, + { + "auxiliary_loss_clip": 0.01096491, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.01911533, + "balance_loss_mlp": 1.02857804, + "epoch": 0.20832707049451377, + "flos": 12892097399040.0, + "grad_norm": 1.8989058898156979, + "language_loss": 0.83964658, + "learning_rate": 3.587070782060291e-06, + "loss": 0.86100036, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6796875, + "step": 3465, + "time_per_iteration": 2.366279125213623 + }, + { + "auxiliary_loss_clip": 0.01094326, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.018327, + "balance_loss_mlp": 1.02799881, + "epoch": 0.20838719374718173, + "flos": 22197777361920.0, + "grad_norm": 2.5378212427357774, + "language_loss": 0.81736517, + "learning_rate": 3.5868408397496874e-06, + "loss": 0.83868092, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6640625, + "step": 3466, + "time_per_iteration": 2.37129807472229 + }, + { + "auxiliary_loss_clip": 0.01092957, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.01684952, + "balance_loss_mlp": 1.02911043, + "epoch": 0.2084473169998497, + "flos": 15157750901760.0, + "grad_norm": 1.750566358385547, + "language_loss": 0.81937397, + "learning_rate": 3.5866108408088166e-06, + "loss": 0.84064704, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.640625, + "step": 3467, + "time_per_iteration": 2.3644158840179443 + }, + { + "auxiliary_loss_clip": 0.01089734, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.0203985, + "balance_loss_mlp": 1.02880931, + "epoch": 0.20850744025251766, + "flos": 17455838924160.0, + "grad_norm": 2.1968935159291356, + "language_loss": 0.81883782, + "learning_rate": 3.5863807852458858e-06, + "loss": 0.8401081, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.609375, + "step": 3468, + "time_per_iteration": 2.355436325073242 + }, + { + "auxiliary_loss_clip": 0.01095281, + "auxiliary_loss_mlp": 0.01041346, + "balance_loss_clip": 1.01943529, + "balance_loss_mlp": 1.02690184, + "epoch": 0.20856756350518563, + "flos": 25697890183680.0, + "grad_norm": 1.9899682280664843, + "language_loss": 0.89986277, + "learning_rate": 3.5861506730691054e-06, + "loss": 0.921229, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.68359375, + "step": 3469, + "time_per_iteration": 2.412328004837036 + }, + { + "auxiliary_loss_clip": 0.01095278, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.01392412, + "balance_loss_mlp": 1.03051162, + "epoch": 0.2086276867578536, + "flos": 37887535560960.0, + "grad_norm": 1.9551924636038538, + "language_loss": 0.76771545, + "learning_rate": 3.5859205042866877e-06, + "loss": 0.78898388, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.6484375, + "step": 3470, + "time_per_iteration": 2.532867193222046 + }, + { + "auxiliary_loss_clip": 0.01091899, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.0141381, + "balance_loss_mlp": 1.02850223, + "epoch": 0.20868781001052156, + "flos": 25555863306240.0, + "grad_norm": 4.5420928354475, + "language_loss": 0.7589848, + "learning_rate": 3.5856902789068465e-06, + "loss": 0.78023756, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.63671875, + "step": 3471, + "time_per_iteration": 2.42753005027771 + }, + { + "auxiliary_loss_clip": 0.01096855, + "auxiliary_loss_mlp": 0.01040165, + "balance_loss_clip": 1.0192678, + "balance_loss_mlp": 1.02707171, + "epoch": 0.20874793326318955, + "flos": 27527897341440.0, + "grad_norm": 1.698707599750462, + "language_loss": 0.75870049, + "learning_rate": 3.585459996937798e-06, + "loss": 0.78007072, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.6953125, + "step": 3472, + "time_per_iteration": 2.454162836074829 + }, + { + "auxiliary_loss_clip": 0.01093067, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01401162, + "balance_loss_mlp": 1.02869606, + "epoch": 0.20880805651585752, + "flos": 18547887340800.0, + "grad_norm": 2.029897599746445, + "language_loss": 0.84425724, + "learning_rate": 3.585229658387761e-06, + "loss": 0.86550981, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.640625, + "step": 3473, + "time_per_iteration": 2.374516487121582 + }, + { + "auxiliary_loss_clip": 0.01022572, + "auxiliary_loss_mlp": 0.01001798, + "balance_loss_clip": 0.99985516, + "balance_loss_mlp": 1.00716949, + "epoch": 0.20886817976852548, + "flos": 65943318792960.0, + "grad_norm": 0.891368913184291, + "language_loss": 0.63656253, + "learning_rate": 3.5849992632649552e-06, + "loss": 0.65680623, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.15429688, + "step": 3474, + "time_per_iteration": 2.9225313663482666 + }, + { + "auxiliary_loss_clip": 0.01096738, + "auxiliary_loss_mlp": 0.01040183, + "balance_loss_clip": 1.02053738, + "balance_loss_mlp": 1.03014934, + "epoch": 0.20892830302119345, + "flos": 36537688598400.0, + "grad_norm": 2.0460801325355913, + "language_loss": 0.69748187, + "learning_rate": 3.5847688115776024e-06, + "loss": 0.71885109, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6640625, + "step": 3475, + "time_per_iteration": 2.5197010040283203 + }, + { + "auxiliary_loss_clip": 0.01094102, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.0175643, + "balance_loss_mlp": 1.02916694, + "epoch": 0.2089884262738614, + "flos": 20955777189120.0, + "grad_norm": 1.5004689408877798, + "language_loss": 0.70123768, + "learning_rate": 3.5845383033339274e-06, + "loss": 0.72254401, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6484375, + "step": 3476, + "time_per_iteration": 2.383960723876953 + }, + { + "auxiliary_loss_clip": 0.01092594, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.01690972, + "balance_loss_mlp": 1.02826881, + "epoch": 0.20904854952652938, + "flos": 22782921615360.0, + "grad_norm": 2.0210394947761574, + "language_loss": 0.89149666, + "learning_rate": 3.584307738542156e-06, + "loss": 0.91276556, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.640625, + "step": 3477, + "time_per_iteration": 2.394998550415039 + }, + { + "auxiliary_loss_clip": 0.01092902, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.01601338, + "balance_loss_mlp": 1.02812278, + "epoch": 0.20910867277919734, + "flos": 27302184201600.0, + "grad_norm": 2.0467141777077686, + "language_loss": 0.67719901, + "learning_rate": 3.5840771172105174e-06, + "loss": 0.6984756, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6484375, + "step": 3478, + "time_per_iteration": 2.425455331802368 + }, + { + "auxiliary_loss_clip": 0.01093302, + "auxiliary_loss_mlp": 0.01039866, + "balance_loss_clip": 1.02109003, + "balance_loss_mlp": 1.02950048, + "epoch": 0.20916879603186533, + "flos": 14318369061120.0, + "grad_norm": 2.151720096034742, + "language_loss": 0.84817231, + "learning_rate": 3.5838464393472406e-06, + "loss": 0.86950397, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.63671875, + "step": 3479, + "time_per_iteration": 2.3746531009674072 + }, + { + "auxiliary_loss_clip": 0.01094081, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.01621294, + "balance_loss_mlp": 1.0284301, + "epoch": 0.2092289192845333, + "flos": 22271932823040.0, + "grad_norm": 2.8047810504700625, + "language_loss": 0.73754495, + "learning_rate": 3.5836157049605587e-06, + "loss": 0.75883412, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.65625, + "step": 3480, + "time_per_iteration": 2.388402223587036 + }, + { + "auxiliary_loss_clip": 0.01091454, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.01849866, + "balance_loss_mlp": 1.02858162, + "epoch": 0.20928904253720126, + "flos": 14829811701120.0, + "grad_norm": 2.1618652065751394, + "language_loss": 0.75717729, + "learning_rate": 3.5833849140587057e-06, + "loss": 0.77844608, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.62890625, + "step": 3481, + "time_per_iteration": 2.3657357692718506 + }, + { + "auxiliary_loss_clip": 0.01093397, + "auxiliary_loss_mlp": 0.01040802, + "balance_loss_clip": 1.02238441, + "balance_loss_mlp": 1.02931952, + "epoch": 0.20934916578986923, + "flos": 23258019663360.0, + "grad_norm": 2.4154735349596668, + "language_loss": 0.85249126, + "learning_rate": 3.583154066649918e-06, + "loss": 0.8738333, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.640625, + "step": 3482, + "time_per_iteration": 2.4067273139953613 + }, + { + "auxiliary_loss_clip": 0.01094505, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.016289, + "balance_loss_mlp": 1.02950621, + "epoch": 0.2094092890425372, + "flos": 32013049662720.0, + "grad_norm": 5.260844561923305, + "language_loss": 0.71030521, + "learning_rate": 3.5829231627424345e-06, + "loss": 0.73160881, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6484375, + "step": 3483, + "time_per_iteration": 2.4645891189575195 + }, + { + "auxiliary_loss_clip": 0.01093589, + "auxiliary_loss_mlp": 0.01041059, + "balance_loss_clip": 1.02218843, + "balance_loss_mlp": 1.02667093, + "epoch": 0.20946941229520516, + "flos": 20009630810880.0, + "grad_norm": 1.5227581604886158, + "language_loss": 0.75268054, + "learning_rate": 3.5826922023444945e-06, + "loss": 0.77402705, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.66796875, + "step": 3484, + "time_per_iteration": 2.4082491397857666 + }, + { + "auxiliary_loss_clip": 0.01093195, + "auxiliary_loss_mlp": 0.01035365, + "balance_loss_clip": 1.01635146, + "balance_loss_mlp": 1.02883124, + "epoch": 0.20952953554787315, + "flos": 30738684792960.0, + "grad_norm": 1.5972834679782852, + "language_loss": 0.70754176, + "learning_rate": 3.582461185464342e-06, + "loss": 0.72882736, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.64453125, + "step": 3485, + "time_per_iteration": 2.4705119132995605 + }, + { + "auxiliary_loss_clip": 0.01095458, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.01784933, + "balance_loss_mlp": 1.03004837, + "epoch": 0.20958965880054112, + "flos": 27048086259840.0, + "grad_norm": 2.164433217911864, + "language_loss": 0.83064806, + "learning_rate": 3.5822301121102195e-06, + "loss": 0.8519631, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.65625, + "step": 3486, + "time_per_iteration": 2.4376449584960938 + }, + { + "auxiliary_loss_clip": 0.01094113, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.02024603, + "balance_loss_mlp": 1.02828133, + "epoch": 0.20964978205320908, + "flos": 34202697402240.0, + "grad_norm": 1.6871712925684774, + "language_loss": 0.8739146, + "learning_rate": 3.5819989822903744e-06, + "loss": 0.89524567, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65625, + "step": 3487, + "time_per_iteration": 2.474391460418701 + }, + { + "auxiliary_loss_clip": 0.01092124, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02004361, + "balance_loss_mlp": 1.02856433, + "epoch": 0.20970990530587705, + "flos": 23476261772160.0, + "grad_norm": 2.695113070981032, + "language_loss": 0.72291046, + "learning_rate": 3.5817677960130547e-06, + "loss": 0.74422193, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.63671875, + "step": 3488, + "time_per_iteration": 2.3911118507385254 + }, + { + "auxiliary_loss_clip": 0.01092648, + "auxiliary_loss_mlp": 0.01034636, + "balance_loss_clip": 1.0165875, + "balance_loss_mlp": 1.02789044, + "epoch": 0.209770028558545, + "flos": 18550470781440.0, + "grad_norm": 2.7849653661817775, + "language_loss": 0.81004465, + "learning_rate": 3.5815365532865113e-06, + "loss": 0.83131742, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6484375, + "step": 3489, + "time_per_iteration": 2.354342460632324 + }, + { + "auxiliary_loss_clip": 0.01091426, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.0183357, + "balance_loss_mlp": 1.02779531, + "epoch": 0.20983015181121298, + "flos": 21615914776320.0, + "grad_norm": 1.74949974030656, + "language_loss": 0.73232079, + "learning_rate": 3.5813052541189972e-06, + "loss": 0.75360084, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.63671875, + "step": 3490, + "time_per_iteration": 3.758084535598755 + }, + { + "auxiliary_loss_clip": 0.01088927, + "auxiliary_loss_mlp": 0.01039904, + "balance_loss_clip": 1.02260721, + "balance_loss_mlp": 1.02791572, + "epoch": 0.20989027506388094, + "flos": 16613873642880.0, + "grad_norm": 1.7351063103210318, + "language_loss": 0.70122451, + "learning_rate": 3.581073898518766e-06, + "loss": 0.72251278, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.609375, + "step": 3491, + "time_per_iteration": 2.357231378555298 + }, + { + "auxiliary_loss_clip": 0.01091895, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.02015138, + "balance_loss_mlp": 1.0268507, + "epoch": 0.20995039831654894, + "flos": 23215844874240.0, + "grad_norm": 2.4201866668159187, + "language_loss": 0.7964893, + "learning_rate": 3.5808424864940737e-06, + "loss": 0.81780875, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.65234375, + "step": 3492, + "time_per_iteration": 2.3604140281677246 + }, + { + "auxiliary_loss_clip": 0.01092678, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.01931262, + "balance_loss_mlp": 1.02888894, + "epoch": 0.2100105215692169, + "flos": 18146595640320.0, + "grad_norm": 2.5281521179956346, + "language_loss": 0.86871386, + "learning_rate": 3.5806110180531797e-06, + "loss": 0.89001751, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.63671875, + "step": 3493, + "time_per_iteration": 3.7610621452331543 + }, + { + "auxiliary_loss_clip": 0.0108737, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.01510119, + "balance_loss_mlp": 1.02624381, + "epoch": 0.21007064482188487, + "flos": 15960683416320.0, + "grad_norm": 1.8347118331738057, + "language_loss": 0.69150156, + "learning_rate": 3.5803794932043447e-06, + "loss": 0.71270388, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.609375, + "step": 3494, + "time_per_iteration": 3.748657703399658 + }, + { + "auxiliary_loss_clip": 0.01096434, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.0126884, + "balance_loss_mlp": 1.02939677, + "epoch": 0.21013076807455283, + "flos": 32232932605440.0, + "grad_norm": 1.7145772598145161, + "language_loss": 0.78903693, + "learning_rate": 3.58014791195583e-06, + "loss": 0.81030554, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.671875, + "step": 3495, + "time_per_iteration": 2.453927993774414 + }, + { + "auxiliary_loss_clip": 0.01090338, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.01542044, + "balance_loss_mlp": 1.02628589, + "epoch": 0.2101908913272208, + "flos": 23695481399040.0, + "grad_norm": 2.2656349365308595, + "language_loss": 0.77215421, + "learning_rate": 3.579916274315902e-06, + "loss": 0.79338288, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.640625, + "step": 3496, + "time_per_iteration": 3.774770975112915 + }, + { + "auxiliary_loss_clip": 0.01092697, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_clip": 1.02656555, + "balance_loss_mlp": 1.02746975, + "epoch": 0.21025101457988876, + "flos": 20374752476160.0, + "grad_norm": 2.149596692604933, + "language_loss": 0.82383239, + "learning_rate": 3.5796845802928254e-06, + "loss": 0.84522194, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.65234375, + "step": 3497, + "time_per_iteration": 2.397838830947876 + }, + { + "auxiliary_loss_clip": 0.01093279, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.01709604, + "balance_loss_mlp": 1.02829242, + "epoch": 0.21031113783255675, + "flos": 25774454528640.0, + "grad_norm": 1.9210431419349947, + "language_loss": 0.67437673, + "learning_rate": 3.5794528298948696e-06, + "loss": 0.69567335, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6484375, + "step": 3498, + "time_per_iteration": 2.430166721343994 + }, + { + "auxiliary_loss_clip": 0.01092432, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.01723123, + "balance_loss_mlp": 1.0269568, + "epoch": 0.21037126108522472, + "flos": 22017101742720.0, + "grad_norm": 2.40011080587237, + "language_loss": 0.80467963, + "learning_rate": 3.579221023130306e-06, + "loss": 0.82596362, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65234375, + "step": 3499, + "time_per_iteration": 2.386989116668701 + }, + { + "auxiliary_loss_clip": 0.01091396, + "auxiliary_loss_mlp": 0.01036061, + "balance_loss_clip": 1.01844192, + "balance_loss_mlp": 1.02774572, + "epoch": 0.21043138433789269, + "flos": 25333327100160.0, + "grad_norm": 2.0378850646674143, + "language_loss": 0.78460246, + "learning_rate": 3.578989160007405e-06, + "loss": 0.80587709, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.63671875, + "step": 3500, + "time_per_iteration": 2.434589385986328 + }, + { + "auxiliary_loss_clip": 0.01091267, + "auxiliary_loss_mlp": 0.01037002, + "balance_loss_clip": 1.01852453, + "balance_loss_mlp": 1.02694094, + "epoch": 0.21049150759056065, + "flos": 25555479281280.0, + "grad_norm": 3.590639146678211, + "language_loss": 0.73696578, + "learning_rate": 3.5787572405344437e-06, + "loss": 0.75824845, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.640625, + "step": 3501, + "time_per_iteration": 2.405604124069214 + }, + { + "auxiliary_loss_clip": 0.01088548, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.02068686, + "balance_loss_mlp": 1.02602112, + "epoch": 0.21055163084322862, + "flos": 24494538752640.0, + "grad_norm": 1.4629088850255327, + "language_loss": 0.75636578, + "learning_rate": 3.578525264719697e-06, + "loss": 0.77764094, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.625, + "step": 3502, + "time_per_iteration": 2.4369428157806396 + }, + { + "auxiliary_loss_clip": 0.01091219, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.01767862, + "balance_loss_mlp": 1.02838397, + "epoch": 0.21061175409589658, + "flos": 25737865557120.0, + "grad_norm": 1.859324534822765, + "language_loss": 0.77522284, + "learning_rate": 3.578293232571444e-06, + "loss": 0.79648918, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.625, + "step": 3503, + "time_per_iteration": 2.4071285724639893 + }, + { + "auxiliary_loss_clip": 0.01095094, + "auxiliary_loss_mlp": 0.01042293, + "balance_loss_clip": 1.02020359, + "balance_loss_mlp": 1.02660537, + "epoch": 0.21067187734856455, + "flos": 18988176896640.0, + "grad_norm": 2.2934284972015995, + "language_loss": 0.7852217, + "learning_rate": 3.5780611440979655e-06, + "loss": 0.80659562, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.68359375, + "step": 3504, + "time_per_iteration": 2.367966413497925 + }, + { + "auxiliary_loss_clip": 0.01096176, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.01759577, + "balance_loss_mlp": 1.02878189, + "epoch": 0.21073200060123254, + "flos": 24680206696320.0, + "grad_norm": 1.8603292502709545, + "language_loss": 0.7668277, + "learning_rate": 3.5778289993075442e-06, + "loss": 0.78817046, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.671875, + "step": 3505, + "time_per_iteration": 2.4021263122558594 + }, + { + "auxiliary_loss_clip": 0.01089215, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.02707958, + "balance_loss_mlp": 1.02682805, + "epoch": 0.2107921238539005, + "flos": 28548059535360.0, + "grad_norm": 1.901215270424609, + "language_loss": 0.63815582, + "learning_rate": 3.5775967982084644e-06, + "loss": 0.65950441, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.625, + "step": 3506, + "time_per_iteration": 2.438072443008423 + }, + { + "auxiliary_loss_clip": 0.01091591, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.01754928, + "balance_loss_mlp": 1.02711058, + "epoch": 0.21085224710656847, + "flos": 25884640379520.0, + "grad_norm": 1.602578378639749, + "language_loss": 0.82088757, + "learning_rate": 3.5773645408090126e-06, + "loss": 0.84217298, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.640625, + "step": 3507, + "time_per_iteration": 2.4251205921173096 + }, + { + "auxiliary_loss_clip": 0.01092175, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.01497543, + "balance_loss_mlp": 1.02826834, + "epoch": 0.21091237035923643, + "flos": 14975399537280.0, + "grad_norm": 1.8195482504015763, + "language_loss": 0.75703776, + "learning_rate": 3.577132227117478e-06, + "loss": 0.77829552, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.640625, + "step": 3508, + "time_per_iteration": 2.3597803115844727 + }, + { + "auxiliary_loss_clip": 0.01095042, + "auxiliary_loss_mlp": 0.01036924, + "balance_loss_clip": 1.01726651, + "balance_loss_mlp": 1.02855814, + "epoch": 0.2109724936119044, + "flos": 16361591091840.0, + "grad_norm": 2.685959813157059, + "language_loss": 0.85451281, + "learning_rate": 3.576899857142152e-06, + "loss": 0.87583244, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.6640625, + "step": 3509, + "time_per_iteration": 2.334444999694824 + }, + { + "auxiliary_loss_clip": 0.01095768, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.01792216, + "balance_loss_mlp": 1.02884912, + "epoch": 0.21103261686457236, + "flos": 31501188086400.0, + "grad_norm": 1.882526021084428, + "language_loss": 0.68351012, + "learning_rate": 3.5766674308913254e-06, + "loss": 0.70483989, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.671875, + "step": 3510, + "time_per_iteration": 2.4673638343811035 + }, + { + "auxiliary_loss_clip": 0.01091588, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.01431036, + "balance_loss_mlp": 1.02609229, + "epoch": 0.21109274011724033, + "flos": 27342857802240.0, + "grad_norm": 1.6090744940540962, + "language_loss": 0.71818495, + "learning_rate": 3.5764349483732937e-06, + "loss": 0.73942745, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.65625, + "step": 3511, + "time_per_iteration": 2.44651460647583 + }, + { + "auxiliary_loss_clip": 0.01095289, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.01693511, + "balance_loss_mlp": 1.02731109, + "epoch": 0.21115286336990832, + "flos": 17819459400960.0, + "grad_norm": 3.0959236714669123, + "language_loss": 0.69297749, + "learning_rate": 3.5762024095963543e-06, + "loss": 0.71430719, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.6796875, + "step": 3512, + "time_per_iteration": 2.3455917835235596 + }, + { + "auxiliary_loss_clip": 0.01091973, + "auxiliary_loss_mlp": 0.01039771, + "balance_loss_clip": 1.01988673, + "balance_loss_mlp": 1.02665246, + "epoch": 0.2112129866225763, + "flos": 27196781207040.0, + "grad_norm": 1.918463867823304, + "language_loss": 0.73346496, + "learning_rate": 3.575969814568805e-06, + "loss": 0.75478244, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.65234375, + "step": 3513, + "time_per_iteration": 2.4369938373565674 + }, + { + "auxiliary_loss_clip": 0.01089965, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.0124774, + "balance_loss_mlp": 1.02838898, + "epoch": 0.21127310987524425, + "flos": 23730185157120.0, + "grad_norm": 1.6947620457212758, + "language_loss": 0.74049127, + "learning_rate": 3.5757371632989477e-06, + "loss": 0.76168793, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.6171875, + "step": 3514, + "time_per_iteration": 2.3905675411224365 + }, + { + "auxiliary_loss_clip": 0.01093489, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.01751673, + "balance_loss_mlp": 1.0279429, + "epoch": 0.21133323312791222, + "flos": 18331530445440.0, + "grad_norm": 2.141505591336286, + "language_loss": 0.76685333, + "learning_rate": 3.5755044557950832e-06, + "loss": 0.7881543, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.65625, + "step": 3515, + "time_per_iteration": 2.3575501441955566 + }, + { + "auxiliary_loss_clip": 0.0109182, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.01941621, + "balance_loss_mlp": 1.02857447, + "epoch": 0.21139335638058018, + "flos": 17930238744960.0, + "grad_norm": 1.8311208917963162, + "language_loss": 0.68553162, + "learning_rate": 3.575271692065518e-06, + "loss": 0.70682395, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6328125, + "step": 3516, + "time_per_iteration": 2.3643317222595215 + }, + { + "auxiliary_loss_clip": 0.01095717, + "auxiliary_loss_mlp": 0.01041849, + "balance_loss_clip": 1.02270365, + "balance_loss_mlp": 1.02930379, + "epoch": 0.21145347963324815, + "flos": 24570928540800.0, + "grad_norm": 1.742982541473488, + "language_loss": 0.85129178, + "learning_rate": 3.575038872118558e-06, + "loss": 0.87266737, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6640625, + "step": 3517, + "time_per_iteration": 2.4110751152038574 + }, + { + "auxiliary_loss_clip": 0.01090777, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.01811337, + "balance_loss_mlp": 1.02686214, + "epoch": 0.21151360288591614, + "flos": 35844488087040.0, + "grad_norm": 1.8859616750495563, + "language_loss": 0.62656885, + "learning_rate": 3.5748059959625122e-06, + "loss": 0.64784431, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.640625, + "step": 3518, + "time_per_iteration": 2.496013641357422 + }, + { + "auxiliary_loss_clip": 0.01091664, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_clip": 1.02456641, + "balance_loss_mlp": 1.02831721, + "epoch": 0.2115737261385841, + "flos": 24640510613760.0, + "grad_norm": 1.8366046838889278, + "language_loss": 0.88661051, + "learning_rate": 3.574573063605691e-06, + "loss": 0.90796006, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6328125, + "step": 3519, + "time_per_iteration": 2.4008982181549072 + }, + { + "auxiliary_loss_clip": 0.01095851, + "auxiliary_loss_mlp": 0.01036825, + "balance_loss_clip": 1.01750147, + "balance_loss_mlp": 1.02927542, + "epoch": 0.21163384939125207, + "flos": 25225759601280.0, + "grad_norm": 1.6634088581540472, + "language_loss": 0.81199706, + "learning_rate": 3.574340075056408e-06, + "loss": 0.83332372, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6640625, + "step": 3520, + "time_per_iteration": 2.4060888290405273 + }, + { + "auxiliary_loss_clip": 0.01088627, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.02148867, + "balance_loss_mlp": 1.02626753, + "epoch": 0.21169397264392004, + "flos": 26066328428160.0, + "grad_norm": 1.655151660929098, + "language_loss": 0.75676954, + "learning_rate": 3.5741070303229776e-06, + "loss": 0.77804863, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.625, + "step": 3521, + "time_per_iteration": 2.4211723804473877 + }, + { + "auxiliary_loss_clip": 0.01094089, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.02080548, + "balance_loss_mlp": 1.02887058, + "epoch": 0.211754095896588, + "flos": 23107264945920.0, + "grad_norm": 3.0514413758967063, + "language_loss": 0.75513554, + "learning_rate": 3.5738739294137154e-06, + "loss": 0.77644765, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.65234375, + "step": 3522, + "time_per_iteration": 2.3824939727783203 + }, + { + "auxiliary_loss_clip": 0.01090576, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_clip": 1.02953196, + "balance_loss_mlp": 1.02632892, + "epoch": 0.21181421914925597, + "flos": 27921264163200.0, + "grad_norm": 1.8193334486733563, + "language_loss": 0.69856447, + "learning_rate": 3.573640772336942e-06, + "loss": 0.71995652, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.64453125, + "step": 3523, + "time_per_iteration": 2.4629838466644287 + }, + { + "auxiliary_loss_clip": 0.0109289, + "auxiliary_loss_mlp": 0.01044684, + "balance_loss_clip": 1.0258131, + "balance_loss_mlp": 1.02810442, + "epoch": 0.21187434240192393, + "flos": 17127690255360.0, + "grad_norm": 3.4908955838004045, + "language_loss": 0.77000618, + "learning_rate": 3.573407559100977e-06, + "loss": 0.79138196, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6484375, + "step": 3524, + "time_per_iteration": 2.3608884811401367 + }, + { + "auxiliary_loss_clip": 0.01089836, + "auxiliary_loss_mlp": 0.01037284, + "balance_loss_clip": 1.01856828, + "balance_loss_mlp": 1.02484906, + "epoch": 0.21193446565459192, + "flos": 22346193018240.0, + "grad_norm": 1.9693963647455388, + "language_loss": 0.81461942, + "learning_rate": 3.573174289714143e-06, + "loss": 0.83589065, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6484375, + "step": 3525, + "time_per_iteration": 2.3843276500701904 + }, + { + "auxiliary_loss_clip": 0.01092364, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.01537633, + "balance_loss_mlp": 1.02833056, + "epoch": 0.2119945889072599, + "flos": 27198072927360.0, + "grad_norm": 1.6840448047188865, + "language_loss": 0.74895763, + "learning_rate": 3.572940964184766e-06, + "loss": 0.77022207, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.640625, + "step": 3526, + "time_per_iteration": 2.4151835441589355 + }, + { + "auxiliary_loss_clip": 0.01092074, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.01324344, + "balance_loss_mlp": 1.02757716, + "epoch": 0.21205471215992786, + "flos": 20990934794880.0, + "grad_norm": 1.6340175886380324, + "language_loss": 0.77082324, + "learning_rate": 3.572707582521172e-06, + "loss": 0.79206413, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.64453125, + "step": 3527, + "time_per_iteration": 2.389474630355835 + }, + { + "auxiliary_loss_clip": 0.01090427, + "auxiliary_loss_mlp": 0.01046113, + "balance_loss_clip": 1.02573991, + "balance_loss_mlp": 1.02565289, + "epoch": 0.21211483541259582, + "flos": 20776602758400.0, + "grad_norm": 1.8858213112158622, + "language_loss": 0.7840848, + "learning_rate": 3.5724741447316894e-06, + "loss": 0.8054502, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6484375, + "step": 3528, + "time_per_iteration": 2.3695602416992188 + }, + { + "auxiliary_loss_clip": 0.01091783, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.01918912, + "balance_loss_mlp": 1.02790213, + "epoch": 0.21217495866526379, + "flos": 18988979857920.0, + "grad_norm": 1.857439679164292, + "language_loss": 0.8140527, + "learning_rate": 3.57224065082465e-06, + "loss": 0.8353436, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.640625, + "step": 3529, + "time_per_iteration": 2.3696351051330566 + }, + { + "auxiliary_loss_clip": 0.01094116, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.02585042, + "balance_loss_mlp": 1.02790391, + "epoch": 0.21223508191793175, + "flos": 20666277262080.0, + "grad_norm": 2.0687359976770687, + "language_loss": 0.7687794, + "learning_rate": 3.572007100808386e-06, + "loss": 0.7901758, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.66015625, + "step": 3530, + "time_per_iteration": 3.769836187362671 + }, + { + "auxiliary_loss_clip": 0.01089458, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.01879537, + "balance_loss_mlp": 1.02676487, + "epoch": 0.21229520517059972, + "flos": 21615391105920.0, + "grad_norm": 2.4408438047408825, + "language_loss": 0.83416092, + "learning_rate": 3.5717734946912323e-06, + "loss": 0.85541785, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.625, + "step": 3531, + "time_per_iteration": 2.370994806289673 + }, + { + "auxiliary_loss_clip": 0.01096942, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.01590848, + "balance_loss_mlp": 1.03025162, + "epoch": 0.2123553284232677, + "flos": 13990185480960.0, + "grad_norm": 2.124867427867221, + "language_loss": 0.73241019, + "learning_rate": 3.5715398324815248e-06, + "loss": 0.75374401, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.66796875, + "step": 3532, + "time_per_iteration": 2.333064556121826 + }, + { + "auxiliary_loss_clip": 0.01091105, + "auxiliary_loss_mlp": 0.01040545, + "balance_loss_clip": 1.02134037, + "balance_loss_mlp": 1.02644968, + "epoch": 0.21241545167593567, + "flos": 18295779346560.0, + "grad_norm": 1.5600115485066246, + "language_loss": 0.73706496, + "learning_rate": 3.5713061141876038e-06, + "loss": 0.75838149, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6484375, + "step": 3533, + "time_per_iteration": 3.7267649173736572 + }, + { + "auxiliary_loss_clip": 0.01092779, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.0176512, + "balance_loss_mlp": 1.0267204, + "epoch": 0.21247557492860364, + "flos": 34711731158400.0, + "grad_norm": 1.816658117639494, + "language_loss": 0.71616459, + "learning_rate": 3.57107233981781e-06, + "loss": 0.73745942, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.66015625, + "step": 3534, + "time_per_iteration": 3.8768603801727295 + }, + { + "auxiliary_loss_clip": 0.01092431, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.01673508, + "balance_loss_mlp": 1.02860451, + "epoch": 0.2125356981812716, + "flos": 22052748107520.0, + "grad_norm": 1.7443495829552542, + "language_loss": 0.71470159, + "learning_rate": 3.570838509380485e-06, + "loss": 0.7359848, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.63671875, + "step": 3535, + "time_per_iteration": 2.3606069087982178 + }, + { + "auxiliary_loss_clip": 0.01088114, + "auxiliary_loss_mlp": 0.0104386, + "balance_loss_clip": 1.02588296, + "balance_loss_mlp": 1.02686036, + "epoch": 0.21259582143393957, + "flos": 28547082017280.0, + "grad_norm": 2.676692349575711, + "language_loss": 0.70630693, + "learning_rate": 3.5706046228839744e-06, + "loss": 0.72762668, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.61328125, + "step": 3536, + "time_per_iteration": 3.8124794960021973 + }, + { + "auxiliary_loss_clip": 0.01093179, + "auxiliary_loss_mlp": 0.01040017, + "balance_loss_clip": 1.0213486, + "balance_loss_mlp": 1.02764964, + "epoch": 0.21265594468660753, + "flos": 20119851573120.0, + "grad_norm": 1.8183940484573426, + "language_loss": 0.88206851, + "learning_rate": 3.5703706803366245e-06, + "loss": 0.90340042, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65625, + "step": 3537, + "time_per_iteration": 2.3919172286987305 + }, + { + "auxiliary_loss_clip": 0.01087442, + "auxiliary_loss_mlp": 0.01034226, + "balance_loss_clip": 1.01663041, + "balance_loss_mlp": 1.02580369, + "epoch": 0.21271606793927553, + "flos": 23075039894400.0, + "grad_norm": 1.7987711909369988, + "language_loss": 0.80270451, + "learning_rate": 3.5701366817467852e-06, + "loss": 0.8239212, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6171875, + "step": 3538, + "time_per_iteration": 2.3892593383789062 + }, + { + "auxiliary_loss_clip": 0.01089552, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.01600981, + "balance_loss_mlp": 1.02698207, + "epoch": 0.2127761911919435, + "flos": 26387215534080.0, + "grad_norm": 1.5432817295161552, + "language_loss": 0.87025869, + "learning_rate": 3.569902627122807e-06, + "loss": 0.89148492, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.625, + "step": 3539, + "time_per_iteration": 2.413520097732544 + }, + { + "auxiliary_loss_clip": 0.01092517, + "auxiliary_loss_mlp": 0.01035804, + "balance_loss_clip": 1.01732671, + "balance_loss_mlp": 1.02834845, + "epoch": 0.21283631444461146, + "flos": 20227279426560.0, + "grad_norm": 1.9640439737483033, + "language_loss": 0.58144236, + "learning_rate": 3.5696685164730413e-06, + "loss": 0.60272551, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.640625, + "step": 3540, + "time_per_iteration": 2.396209478378296 + }, + { + "auxiliary_loss_clip": 0.0109066, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.01773214, + "balance_loss_mlp": 1.02585626, + "epoch": 0.21289643769727942, + "flos": 13516134773760.0, + "grad_norm": 2.8270848855133885, + "language_loss": 0.76063997, + "learning_rate": 3.569434349805844e-06, + "loss": 0.7819224, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.6484375, + "step": 3541, + "time_per_iteration": 2.3696482181549072 + }, + { + "auxiliary_loss_clip": 0.01088552, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.01646209, + "balance_loss_mlp": 1.02636743, + "epoch": 0.2129565609499474, + "flos": 24825864355200.0, + "grad_norm": 1.8189748043941079, + "language_loss": 0.69292998, + "learning_rate": 3.569200127129572e-06, + "loss": 0.71415377, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.62109375, + "step": 3542, + "time_per_iteration": 2.408999443054199 + }, + { + "auxiliary_loss_clip": 0.01088057, + "auxiliary_loss_mlp": 0.01037721, + "balance_loss_clip": 1.02069831, + "balance_loss_mlp": 1.02653039, + "epoch": 0.21301668420261535, + "flos": 23658124377600.0, + "grad_norm": 1.9558757392083184, + "language_loss": 0.76540411, + "learning_rate": 3.568965848452584e-06, + "loss": 0.78666192, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.61328125, + "step": 3543, + "time_per_iteration": 2.38474178314209 + }, + { + "auxiliary_loss_clip": 0.01090817, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.01883984, + "balance_loss_mlp": 1.02940273, + "epoch": 0.21307680745528332, + "flos": 16361870382720.0, + "grad_norm": 1.787852595170984, + "language_loss": 0.66598499, + "learning_rate": 3.568731513783241e-06, + "loss": 0.68725681, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.6171875, + "step": 3544, + "time_per_iteration": 2.3578274250030518 + }, + { + "auxiliary_loss_clip": 0.01090795, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.01752782, + "balance_loss_mlp": 1.02702022, + "epoch": 0.2131369307079513, + "flos": 19098048545280.0, + "grad_norm": 1.7353996480229472, + "language_loss": 0.71131104, + "learning_rate": 3.568497123129905e-06, + "loss": 0.73257023, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.63671875, + "step": 3545, + "time_per_iteration": 2.364079713821411 + }, + { + "auxiliary_loss_clip": 0.01092134, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.01905966, + "balance_loss_mlp": 1.02641511, + "epoch": 0.21319705396061928, + "flos": 30370979687040.0, + "grad_norm": 3.304056499257508, + "language_loss": 0.72142816, + "learning_rate": 3.568262676500942e-06, + "loss": 0.74273175, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.65625, + "step": 3546, + "time_per_iteration": 2.44181489944458 + }, + { + "auxiliary_loss_clip": 0.01090506, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.02145648, + "balance_loss_mlp": 1.02731371, + "epoch": 0.21325717721328724, + "flos": 21755288390400.0, + "grad_norm": 2.2793165837832254, + "language_loss": 0.80409074, + "learning_rate": 3.568028173904717e-06, + "loss": 0.82538652, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6328125, + "step": 3547, + "time_per_iteration": 2.367537260055542 + }, + { + "auxiliary_loss_clip": 0.01092303, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.02100635, + "balance_loss_mlp": 1.02711093, + "epoch": 0.2133173004659552, + "flos": 28729607938560.0, + "grad_norm": 2.235681769906824, + "language_loss": 0.74547142, + "learning_rate": 3.567793615349601e-06, + "loss": 0.76679802, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.65234375, + "step": 3548, + "time_per_iteration": 2.429685354232788 + }, + { + "auxiliary_loss_clip": 0.0109612, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.019665, + "balance_loss_mlp": 1.02880847, + "epoch": 0.21337742371862317, + "flos": 16836130558080.0, + "grad_norm": 1.8959408548935859, + "language_loss": 0.74469119, + "learning_rate": 3.567559000843963e-06, + "loss": 0.76605028, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.671875, + "step": 3549, + "time_per_iteration": 2.352445363998413 + }, + { + "auxiliary_loss_clip": 0.0109448, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.01723397, + "balance_loss_mlp": 1.02977777, + "epoch": 0.21343754697129114, + "flos": 24423804604800.0, + "grad_norm": 1.6763507306574892, + "language_loss": 0.8074863, + "learning_rate": 3.567324330396177e-06, + "loss": 0.82877523, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6484375, + "step": 3550, + "time_per_iteration": 2.4111640453338623 + }, + { + "auxiliary_loss_clip": 0.01090743, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.01688302, + "balance_loss_mlp": 1.02898681, + "epoch": 0.21349767022395913, + "flos": 19276908773760.0, + "grad_norm": 1.6263183111974162, + "language_loss": 0.87823749, + "learning_rate": 3.5670896040146173e-06, + "loss": 0.89948064, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.6171875, + "step": 3551, + "time_per_iteration": 2.378389358520508 + }, + { + "auxiliary_loss_clip": 0.01091534, + "auxiliary_loss_mlp": 0.01032952, + "balance_loss_clip": 1.01468873, + "balance_loss_mlp": 1.02740884, + "epoch": 0.2135577934766271, + "flos": 17346595680000.0, + "grad_norm": 1.9473713258331604, + "language_loss": 0.83161962, + "learning_rate": 3.5668548217076605e-06, + "loss": 0.85286444, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.640625, + "step": 3552, + "time_per_iteration": 2.3684539794921875 + }, + { + "auxiliary_loss_clip": 0.0108993, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.02004933, + "balance_loss_mlp": 1.02729726, + "epoch": 0.21361791672929506, + "flos": 24056169321600.0, + "grad_norm": 1.6071835531330094, + "language_loss": 0.76515716, + "learning_rate": 3.5666199834836855e-06, + "loss": 0.78643763, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.625, + "step": 3553, + "time_per_iteration": 2.418617010116577 + }, + { + "auxiliary_loss_clip": 0.01089905, + "auxiliary_loss_mlp": 0.01038041, + "balance_loss_clip": 1.02044606, + "balance_loss_mlp": 1.02761078, + "epoch": 0.21367803998196302, + "flos": 22161258213120.0, + "grad_norm": 1.560754988037051, + "language_loss": 0.72691786, + "learning_rate": 3.5663850893510734e-06, + "loss": 0.74819732, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.625, + "step": 3554, + "time_per_iteration": 2.400346517562866 + }, + { + "auxiliary_loss_clip": 0.01090047, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.01655304, + "balance_loss_mlp": 1.02673614, + "epoch": 0.213738163234631, + "flos": 20885811091200.0, + "grad_norm": 2.035509144294839, + "language_loss": 0.67685765, + "learning_rate": 3.566150139318206e-06, + "loss": 0.69809878, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.6328125, + "step": 3555, + "time_per_iteration": 2.361581802368164 + }, + { + "auxiliary_loss_clip": 0.01091967, + "auxiliary_loss_mlp": 0.01036928, + "balance_loss_clip": 1.01806927, + "balance_loss_mlp": 1.0273869, + "epoch": 0.21379828648729896, + "flos": 28401843294720.0, + "grad_norm": 1.8029032971913221, + "language_loss": 0.64442456, + "learning_rate": 3.56591513339347e-06, + "loss": 0.66571343, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.64453125, + "step": 3556, + "time_per_iteration": 2.4424357414245605 + }, + { + "auxiliary_loss_clip": 0.01093633, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02211511, + "balance_loss_mlp": 1.02866197, + "epoch": 0.21385840973996692, + "flos": 25478600734080.0, + "grad_norm": 1.699493702630759, + "language_loss": 0.7273261, + "learning_rate": 3.56568007158525e-06, + "loss": 0.74866474, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6484375, + "step": 3557, + "time_per_iteration": 2.4168779850006104 + }, + { + "auxiliary_loss_clip": 0.01093852, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.01659131, + "balance_loss_mlp": 1.02738237, + "epoch": 0.2139185329926349, + "flos": 28073031310080.0, + "grad_norm": 1.6753315684596148, + "language_loss": 0.67222565, + "learning_rate": 3.565444953901935e-06, + "loss": 0.69352198, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6640625, + "step": 3558, + "time_per_iteration": 2.455333709716797 + }, + { + "auxiliary_loss_clip": 0.01094301, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.02034104, + "balance_loss_mlp": 1.0276885, + "epoch": 0.21397865624530288, + "flos": 19607710705920.0, + "grad_norm": 1.8618981889895183, + "language_loss": 0.79920673, + "learning_rate": 3.5652097803519173e-06, + "loss": 0.82054073, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6640625, + "step": 3559, + "time_per_iteration": 2.3532660007476807 + }, + { + "auxiliary_loss_clip": 0.01088294, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.01770151, + "balance_loss_mlp": 1.02630925, + "epoch": 0.21403877949797084, + "flos": 24680311430400.0, + "grad_norm": 1.5941598822264722, + "language_loss": 0.67690969, + "learning_rate": 3.5649745509435887e-06, + "loss": 0.69814801, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6171875, + "step": 3560, + "time_per_iteration": 2.426039695739746 + }, + { + "auxiliary_loss_clip": 0.01093116, + "auxiliary_loss_mlp": 0.01038342, + "balance_loss_clip": 1.02019823, + "balance_loss_mlp": 1.02826059, + "epoch": 0.2140989027506388, + "flos": 19860237636480.0, + "grad_norm": 1.9546038125169252, + "language_loss": 0.72802126, + "learning_rate": 3.564739265685344e-06, + "loss": 0.74933589, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6484375, + "step": 3561, + "time_per_iteration": 2.373037576675415 + }, + { + "auxiliary_loss_clip": 0.01092301, + "auxiliary_loss_mlp": 0.01037435, + "balance_loss_clip": 1.01895809, + "balance_loss_mlp": 1.02813137, + "epoch": 0.21415902600330677, + "flos": 19134323314560.0, + "grad_norm": 2.091336605978353, + "language_loss": 0.77274024, + "learning_rate": 3.56450392458558e-06, + "loss": 0.79403764, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.640625, + "step": 3562, + "time_per_iteration": 2.371242046356201 + }, + { + "auxiliary_loss_clip": 0.01092097, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.01640487, + "balance_loss_mlp": 1.02860069, + "epoch": 0.21421914925597474, + "flos": 22271548798080.0, + "grad_norm": 2.146029902485723, + "language_loss": 0.84829164, + "learning_rate": 3.564268527652695e-06, + "loss": 0.86955887, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.63671875, + "step": 3563, + "time_per_iteration": 2.3777878284454346 + }, + { + "auxiliary_loss_clip": 0.01090524, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.01405644, + "balance_loss_mlp": 1.02772045, + "epoch": 0.2142792725086427, + "flos": 33873710860800.0, + "grad_norm": 1.4489453244027943, + "language_loss": 0.76527488, + "learning_rate": 3.5640330748950902e-06, + "loss": 0.78648818, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.62890625, + "step": 3564, + "time_per_iteration": 2.523754835128784 + }, + { + "auxiliary_loss_clip": 0.01089924, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.01730728, + "balance_loss_mlp": 1.02771103, + "epoch": 0.2143393957613107, + "flos": 19859329941120.0, + "grad_norm": 1.750469628884493, + "language_loss": 0.88693774, + "learning_rate": 3.5637975663211677e-06, + "loss": 0.9081853, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.625, + "step": 3565, + "time_per_iteration": 2.399671792984009 + }, + { + "auxiliary_loss_clip": 0.01021939, + "auxiliary_loss_mlp": 0.01002098, + "balance_loss_clip": 1.0002147, + "balance_loss_mlp": 1.00620675, + "epoch": 0.21439951901397866, + "flos": 68526891936000.0, + "grad_norm": 0.8405105559125486, + "language_loss": 0.52237988, + "learning_rate": 3.563562001939333e-06, + "loss": 0.54262018, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.15722656, + "step": 3566, + "time_per_iteration": 2.9677493572235107 + }, + { + "auxiliary_loss_clip": 0.01087065, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.01798034, + "balance_loss_mlp": 1.02784598, + "epoch": 0.21445964226664663, + "flos": 19681970901120.0, + "grad_norm": 10.254505098981035, + "language_loss": 0.66883928, + "learning_rate": 3.563326381757993e-06, + "loss": 0.69005579, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59375, + "step": 3567, + "time_per_iteration": 2.350949287414551 + }, + { + "auxiliary_loss_clip": 0.01088321, + "auxiliary_loss_mlp": 0.01035676, + "balance_loss_clip": 1.01858127, + "balance_loss_mlp": 1.027457, + "epoch": 0.2145197655193146, + "flos": 31105796405760.0, + "grad_norm": 1.6680941296164373, + "language_loss": 0.74234676, + "learning_rate": 3.563090705785555e-06, + "loss": 0.76358676, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.609375, + "step": 3568, + "time_per_iteration": 2.4811275005340576 + }, + { + "auxiliary_loss_clip": 0.01092114, + "auxiliary_loss_mlp": 0.01039167, + "balance_loss_clip": 1.02103531, + "balance_loss_mlp": 1.0285691, + "epoch": 0.21457988877198256, + "flos": 20119746839040.0, + "grad_norm": 1.5262703727022713, + "language_loss": 0.77591181, + "learning_rate": 3.5628549740304307e-06, + "loss": 0.79722464, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.63671875, + "step": 3569, + "time_per_iteration": 2.3640496730804443 + }, + { + "auxiliary_loss_clip": 0.01097877, + "auxiliary_loss_mlp": 0.01042975, + "balance_loss_clip": 1.02281713, + "balance_loss_mlp": 1.02996242, + "epoch": 0.21464001202465052, + "flos": 18587059752960.0, + "grad_norm": 2.594278766033549, + "language_loss": 0.70476753, + "learning_rate": 3.562619186501032e-06, + "loss": 0.72617602, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.6796875, + "step": 3570, + "time_per_iteration": 3.727426767349243 + }, + { + "auxiliary_loss_clip": 0.01093941, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.01766574, + "balance_loss_mlp": 1.02872419, + "epoch": 0.21470013527731852, + "flos": 21834087062400.0, + "grad_norm": 2.066701985883769, + "language_loss": 0.78749084, + "learning_rate": 3.562383343205774e-06, + "loss": 0.80879158, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.65234375, + "step": 3571, + "time_per_iteration": 2.376452922821045 + }, + { + "auxiliary_loss_clip": 0.01093271, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.01385117, + "balance_loss_mlp": 1.02877557, + "epoch": 0.21476025852998648, + "flos": 17602229721600.0, + "grad_norm": 2.0248957534966694, + "language_loss": 0.82518691, + "learning_rate": 3.5621474441530744e-06, + "loss": 0.84645993, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.64453125, + "step": 3572, + "time_per_iteration": 3.739330530166626 + }, + { + "auxiliary_loss_clip": 0.01095537, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.01826668, + "balance_loss_mlp": 1.02786791, + "epoch": 0.21482038178265445, + "flos": 24826946607360.0, + "grad_norm": 4.239586664123104, + "language_loss": 0.64667124, + "learning_rate": 3.5619114893513508e-06, + "loss": 0.66800553, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.67578125, + "step": 3573, + "time_per_iteration": 3.7828338146209717 + }, + { + "auxiliary_loss_clip": 0.01086014, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.01881957, + "balance_loss_mlp": 1.02638626, + "epoch": 0.2148805050353224, + "flos": 23257111968000.0, + "grad_norm": 1.954719978928461, + "language_loss": 0.75047588, + "learning_rate": 3.5616754788090235e-06, + "loss": 0.77169669, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.59375, + "step": 3574, + "time_per_iteration": 2.391507148742676 + }, + { + "auxiliary_loss_clip": 0.01088668, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.01467037, + "balance_loss_mlp": 1.0269568, + "epoch": 0.21494062828799038, + "flos": 21320130804480.0, + "grad_norm": 1.8067005748642064, + "language_loss": 0.71893322, + "learning_rate": 3.561439412534515e-06, + "loss": 0.74015439, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6171875, + "step": 3575, + "time_per_iteration": 2.3854336738586426 + }, + { + "auxiliary_loss_clip": 0.01090527, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.01526237, + "balance_loss_mlp": 1.02761424, + "epoch": 0.21500075154065834, + "flos": 18842344680960.0, + "grad_norm": 1.7983627799214463, + "language_loss": 0.68403685, + "learning_rate": 3.561203290536251e-06, + "loss": 0.70527697, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.62890625, + "step": 3576, + "time_per_iteration": 3.8433103561401367 + }, + { + "auxiliary_loss_clip": 0.01095384, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.01510596, + "balance_loss_mlp": 1.02919865, + "epoch": 0.2150608747933263, + "flos": 18441018069120.0, + "grad_norm": 1.767014805779442, + "language_loss": 0.88839924, + "learning_rate": 3.560967112822657e-06, + "loss": 0.90969938, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.6640625, + "step": 3577, + "time_per_iteration": 2.347475528717041 + }, + { + "auxiliary_loss_clip": 0.01020887, + "auxiliary_loss_mlp": 0.01001967, + "balance_loss_clip": 0.99982095, + "balance_loss_mlp": 1.00577354, + "epoch": 0.2151209980459943, + "flos": 66595042742400.0, + "grad_norm": 0.8061405269424026, + "language_loss": 0.56194675, + "learning_rate": 3.5607308794021623e-06, + "loss": 0.58217531, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.15136719, + "step": 3578, + "time_per_iteration": 2.9712166786193848 + }, + { + "auxiliary_loss_clip": 0.01091219, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.01766121, + "balance_loss_mlp": 1.02836454, + "epoch": 0.21518112129866226, + "flos": 21574926973440.0, + "grad_norm": 1.6810222968183248, + "language_loss": 0.88131016, + "learning_rate": 3.5604945902831975e-06, + "loss": 0.90258706, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.62890625, + "step": 3579, + "time_per_iteration": 2.3581838607788086 + }, + { + "auxiliary_loss_clip": 0.01093006, + "auxiliary_loss_mlp": 0.01041605, + "balance_loss_clip": 1.0219475, + "balance_loss_mlp": 1.02872515, + "epoch": 0.21524124455133023, + "flos": 20046603807360.0, + "grad_norm": 1.8056512339471995, + "language_loss": 0.78003907, + "learning_rate": 3.560258245474194e-06, + "loss": 0.80138516, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.640625, + "step": 3580, + "time_per_iteration": 2.4127047061920166 + }, + { + "auxiliary_loss_clip": 0.01090745, + "auxiliary_loss_mlp": 0.01038444, + "balance_loss_clip": 1.02080107, + "balance_loss_mlp": 1.02885795, + "epoch": 0.2153013678039982, + "flos": 23950661592960.0, + "grad_norm": 1.9748070906757218, + "language_loss": 0.7300638, + "learning_rate": 3.5600218449835876e-06, + "loss": 0.75135565, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6171875, + "step": 3581, + "time_per_iteration": 2.3917815685272217 + }, + { + "auxiliary_loss_clip": 0.01090551, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.02257752, + "balance_loss_mlp": 1.02849686, + "epoch": 0.21536149105666616, + "flos": 20593797546240.0, + "grad_norm": 5.8618237838227305, + "language_loss": 0.70694757, + "learning_rate": 3.559785388819815e-06, + "loss": 0.72827828, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.62109375, + "step": 3582, + "time_per_iteration": 2.394432306289673 + }, + { + "auxiliary_loss_clip": 0.01091108, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.01861525, + "balance_loss_mlp": 1.02831674, + "epoch": 0.21542161430933413, + "flos": 12859209031680.0, + "grad_norm": 2.2836442038875817, + "language_loss": 0.83861291, + "learning_rate": 3.5595488769913134e-06, + "loss": 0.8599034, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.62890625, + "step": 3583, + "time_per_iteration": 2.3503217697143555 + }, + { + "auxiliary_loss_clip": 0.01095956, + "auxiliary_loss_mlp": 0.01041814, + "balance_loss_clip": 1.02240705, + "balance_loss_mlp": 1.03027546, + "epoch": 0.21548173756200212, + "flos": 26102742842880.0, + "grad_norm": 2.252417622498085, + "language_loss": 0.83012831, + "learning_rate": 3.5593123095065245e-06, + "loss": 0.85150599, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.65625, + "step": 3584, + "time_per_iteration": 2.413504123687744 + }, + { + "auxiliary_loss_clip": 0.0109106, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.02002573, + "balance_loss_mlp": 1.02791607, + "epoch": 0.21554186081467008, + "flos": 22162689578880.0, + "grad_norm": 2.2177269774423825, + "language_loss": 0.89685279, + "learning_rate": 3.55907568637389e-06, + "loss": 0.91814023, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6328125, + "step": 3585, + "time_per_iteration": 2.3835370540618896 + }, + { + "auxiliary_loss_clip": 0.01092262, + "auxiliary_loss_mlp": 0.01042329, + "balance_loss_clip": 1.02404249, + "balance_loss_mlp": 1.02910924, + "epoch": 0.21560198406733805, + "flos": 22965622093440.0, + "grad_norm": 1.9904693913663805, + "language_loss": 0.74803352, + "learning_rate": 3.558839007601855e-06, + "loss": 0.76937938, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.62890625, + "step": 3586, + "time_per_iteration": 2.3789827823638916 + }, + { + "auxiliary_loss_clip": 0.01091172, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02122235, + "balance_loss_mlp": 1.02772009, + "epoch": 0.215662107320006, + "flos": 22782956526720.0, + "grad_norm": 1.892175981842573, + "language_loss": 0.82819235, + "learning_rate": 3.558602273198865e-06, + "loss": 0.84949052, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.6328125, + "step": 3587, + "time_per_iteration": 2.4005024433135986 + }, + { + "auxiliary_loss_clip": 0.01092139, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.01403999, + "balance_loss_mlp": 1.02778912, + "epoch": 0.21572223057267398, + "flos": 30882527061120.0, + "grad_norm": 1.9281663194590999, + "language_loss": 0.75907916, + "learning_rate": 3.558365483173369e-06, + "loss": 0.78033262, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.64453125, + "step": 3588, + "time_per_iteration": 2.4512038230895996 + }, + { + "auxiliary_loss_clip": 0.01089851, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01693416, + "balance_loss_mlp": 1.02671766, + "epoch": 0.21578235382534194, + "flos": 26909166493440.0, + "grad_norm": 1.7400579268636616, + "language_loss": 0.80721128, + "learning_rate": 3.5581286375338183e-06, + "loss": 0.8284539, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.6328125, + "step": 3589, + "time_per_iteration": 2.464797019958496 + }, + { + "auxiliary_loss_clip": 0.01092967, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.01548052, + "balance_loss_mlp": 1.02881718, + "epoch": 0.2158424770780099, + "flos": 24424572654720.0, + "grad_norm": 1.796896618802849, + "language_loss": 0.72772634, + "learning_rate": 3.557891736288664e-06, + "loss": 0.74899232, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.640625, + "step": 3590, + "time_per_iteration": 2.4086365699768066 + }, + { + "auxiliary_loss_clip": 0.01094064, + "auxiliary_loss_mlp": 0.01040865, + "balance_loss_clip": 1.02068269, + "balance_loss_mlp": 1.02712011, + "epoch": 0.2159026003306779, + "flos": 23948881113600.0, + "grad_norm": 1.8203562472573895, + "language_loss": 0.84164977, + "learning_rate": 3.5576547794463608e-06, + "loss": 0.86299908, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.671875, + "step": 3591, + "time_per_iteration": 2.4037511348724365 + }, + { + "auxiliary_loss_clip": 0.01097554, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.01684117, + "balance_loss_mlp": 1.02815509, + "epoch": 0.21596272358334587, + "flos": 30039758818560.0, + "grad_norm": 1.9655070881946446, + "language_loss": 0.69344044, + "learning_rate": 3.557417767015366e-06, + "loss": 0.71479869, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.6953125, + "step": 3592, + "time_per_iteration": 2.4410736560821533 + }, + { + "auxiliary_loss_clip": 0.01094848, + "auxiliary_loss_mlp": 0.01041074, + "balance_loss_clip": 1.02173829, + "balance_loss_mlp": 1.02889371, + "epoch": 0.21602284683601383, + "flos": 20375171412480.0, + "grad_norm": 2.5800130780775246, + "language_loss": 0.83571708, + "learning_rate": 3.557180699004137e-06, + "loss": 0.85707629, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.66015625, + "step": 3593, + "time_per_iteration": 2.3666434288024902 + }, + { + "auxiliary_loss_clip": 0.01093772, + "auxiliary_loss_mlp": 0.01046233, + "balance_loss_clip": 1.02636051, + "balance_loss_mlp": 1.02722239, + "epoch": 0.2160829700886818, + "flos": 20776288556160.0, + "grad_norm": 3.305035296664814, + "language_loss": 0.75017703, + "learning_rate": 3.556943575421134e-06, + "loss": 0.77157712, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6640625, + "step": 3594, + "time_per_iteration": 2.364056348800659 + }, + { + "auxiliary_loss_clip": 0.01090224, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.01548111, + "balance_loss_mlp": 1.02737951, + "epoch": 0.21614309334134976, + "flos": 22308661440000.0, + "grad_norm": 1.470275823256459, + "language_loss": 0.80304974, + "learning_rate": 3.55670639627482e-06, + "loss": 0.82428944, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.62890625, + "step": 3595, + "time_per_iteration": 2.4001059532165527 + }, + { + "auxiliary_loss_clip": 0.01093998, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.01549327, + "balance_loss_mlp": 1.02793932, + "epoch": 0.21620321659401773, + "flos": 19608513667200.0, + "grad_norm": 1.8896780145521013, + "language_loss": 0.78621411, + "learning_rate": 3.556469161573659e-06, + "loss": 0.80750501, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.66015625, + "step": 3596, + "time_per_iteration": 2.349175214767456 + }, + { + "auxiliary_loss_clip": 0.01088236, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.01828587, + "balance_loss_mlp": 1.02725124, + "epoch": 0.2162633398466857, + "flos": 18843531667200.0, + "grad_norm": 2.4123382452893813, + "language_loss": 0.83061266, + "learning_rate": 3.556231871326118e-06, + "loss": 0.85185635, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.609375, + "step": 3597, + "time_per_iteration": 2.3800954818725586 + }, + { + "auxiliary_loss_clip": 0.01091466, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.01570058, + "balance_loss_mlp": 1.02618265, + "epoch": 0.21632346309935369, + "flos": 18767875017600.0, + "grad_norm": 1.568841190746847, + "language_loss": 0.8058297, + "learning_rate": 3.5559945255406635e-06, + "loss": 0.82709467, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.65625, + "step": 3598, + "time_per_iteration": 2.363445520401001 + }, + { + "auxiliary_loss_clip": 0.01093682, + "auxiliary_loss_mlp": 0.01040875, + "balance_loss_clip": 1.02007306, + "balance_loss_mlp": 1.02620852, + "epoch": 0.21638358635202165, + "flos": 26322939987840.0, + "grad_norm": 1.781455726806936, + "language_loss": 0.82309818, + "learning_rate": 3.555757124225767e-06, + "loss": 0.8444438, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.67578125, + "step": 3599, + "time_per_iteration": 2.4402010440826416 + }, + { + "auxiliary_loss_clip": 0.01088121, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.01438618, + "balance_loss_mlp": 1.02644491, + "epoch": 0.21644370960468962, + "flos": 20739804318720.0, + "grad_norm": 1.7064381165347018, + "language_loss": 0.76435298, + "learning_rate": 3.5555196673899015e-06, + "loss": 0.7855674, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6171875, + "step": 3600, + "time_per_iteration": 2.372791290283203 + }, + { + "auxiliary_loss_clip": 0.01091173, + "auxiliary_loss_mlp": 0.01037773, + "balance_loss_clip": 1.01993895, + "balance_loss_mlp": 1.02560043, + "epoch": 0.21650383285735758, + "flos": 23951080529280.0, + "grad_norm": 1.6802841912055213, + "language_loss": 0.79665279, + "learning_rate": 3.5552821550415396e-06, + "loss": 0.81794226, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.65625, + "step": 3601, + "time_per_iteration": 2.4099018573760986 + }, + { + "auxiliary_loss_clip": 0.01091825, + "auxiliary_loss_mlp": 0.01036843, + "balance_loss_clip": 1.01902688, + "balance_loss_mlp": 1.02843213, + "epoch": 0.21656395611002555, + "flos": 23694957728640.0, + "grad_norm": 1.801246734435365, + "language_loss": 0.83378541, + "learning_rate": 3.5550445871891585e-06, + "loss": 0.85507202, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6328125, + "step": 3602, + "time_per_iteration": 2.38606858253479 + }, + { + "auxiliary_loss_clip": 0.0109169, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.01948524, + "balance_loss_mlp": 1.02627182, + "epoch": 0.2166240793626935, + "flos": 20665055364480.0, + "grad_norm": 2.1522657692338827, + "language_loss": 0.7434755, + "learning_rate": 3.554806963841236e-06, + "loss": 0.76477373, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.65625, + "step": 3603, + "time_per_iteration": 2.4077882766723633 + }, + { + "auxiliary_loss_clip": 0.01088508, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.01874018, + "balance_loss_mlp": 1.02596533, + "epoch": 0.2166842026153615, + "flos": 21579325804800.0, + "grad_norm": 1.6006828308954126, + "language_loss": 0.74189007, + "learning_rate": 3.554569285006253e-06, + "loss": 0.76313949, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.625, + "step": 3604, + "time_per_iteration": 2.368547201156616 + }, + { + "auxiliary_loss_clip": 0.01087886, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.0141654, + "balance_loss_mlp": 1.02642488, + "epoch": 0.21674432586802947, + "flos": 25628761958400.0, + "grad_norm": 1.6650597996806924, + "language_loss": 0.79789186, + "learning_rate": 3.5543315506926903e-06, + "loss": 0.8190918, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6171875, + "step": 3605, + "time_per_iteration": 2.428480625152588 + }, + { + "auxiliary_loss_clip": 0.01022334, + "auxiliary_loss_mlp": 0.01005826, + "balance_loss_clip": 1.00375164, + "balance_loss_mlp": 1.00654793, + "epoch": 0.21680444912069743, + "flos": 56414893155840.0, + "grad_norm": 0.6891858059380648, + "language_loss": 0.5835098, + "learning_rate": 3.5540937609090334e-06, + "loss": 0.60379136, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.15820312, + "step": 3606, + "time_per_iteration": 3.1059229373931885 + }, + { + "auxiliary_loss_clip": 0.01092309, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.02065825, + "balance_loss_mlp": 1.02667499, + "epoch": 0.2168645723733654, + "flos": 23877797852160.0, + "grad_norm": 2.1719810960899184, + "language_loss": 0.73038226, + "learning_rate": 3.5538559156637675e-06, + "loss": 0.75170934, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.65625, + "step": 3607, + "time_per_iteration": 2.390773057937622 + }, + { + "auxiliary_loss_clip": 0.01094745, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.01831567, + "balance_loss_mlp": 1.02730608, + "epoch": 0.21692469562603336, + "flos": 16945234156800.0, + "grad_norm": 1.8680270875352738, + "language_loss": 0.76685357, + "learning_rate": 3.5536180149653805e-06, + "loss": 0.78817213, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.67578125, + "step": 3608, + "time_per_iteration": 2.34770131111145 + }, + { + "auxiliary_loss_clip": 0.01093642, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.01961255, + "balance_loss_mlp": 1.02752662, + "epoch": 0.21698481887870133, + "flos": 25117877900160.0, + "grad_norm": 1.8803327586037324, + "language_loss": 0.7461046, + "learning_rate": 3.5533800588223636e-06, + "loss": 0.76742268, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.66015625, + "step": 3609, + "time_per_iteration": 3.786299467086792 + }, + { + "auxiliary_loss_clip": 0.01094841, + "auxiliary_loss_mlp": 0.01040215, + "balance_loss_clip": 1.0212965, + "balance_loss_mlp": 1.02876544, + "epoch": 0.2170449421313693, + "flos": 17893719596160.0, + "grad_norm": 1.7049591031448108, + "language_loss": 0.88620341, + "learning_rate": 3.553142047243208e-06, + "loss": 0.90755397, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6640625, + "step": 3610, + "time_per_iteration": 2.3434276580810547 + }, + { + "auxiliary_loss_clip": 0.01091445, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.013201, + "balance_loss_mlp": 1.02796221, + "epoch": 0.2171050653840373, + "flos": 22637333779200.0, + "grad_norm": 1.7517505191453933, + "language_loss": 0.7782172, + "learning_rate": 3.5529039802364077e-06, + "loss": 0.79945368, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6328125, + "step": 3611, + "time_per_iteration": 3.74212908744812 + }, + { + "auxiliary_loss_clip": 0.01087637, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.01925349, + "balance_loss_mlp": 1.0257417, + "epoch": 0.21716518863670525, + "flos": 19498991132160.0, + "grad_norm": 3.32508520275354, + "language_loss": 0.63251287, + "learning_rate": 3.552665857810459e-06, + "loss": 0.65375859, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.6171875, + "step": 3612, + "time_per_iteration": 2.364633798599243 + }, + { + "auxiliary_loss_clip": 0.01090932, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.01723361, + "balance_loss_mlp": 1.02709413, + "epoch": 0.21722531188937322, + "flos": 19791004677120.0, + "grad_norm": 2.148770153023427, + "language_loss": 0.75538373, + "learning_rate": 3.5524276799738594e-06, + "loss": 0.77664566, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.63671875, + "step": 3613, + "time_per_iteration": 3.8147830963134766 + }, + { + "auxiliary_loss_clip": 0.010892, + "auxiliary_loss_mlp": 0.01043653, + "balance_loss_clip": 1.02454424, + "balance_loss_mlp": 1.02751851, + "epoch": 0.21728543514204118, + "flos": 13333539029760.0, + "grad_norm": 2.2336364813128204, + "language_loss": 0.70556039, + "learning_rate": 3.5521894467351095e-06, + "loss": 0.7268889, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6171875, + "step": 3614, + "time_per_iteration": 2.370621919631958 + }, + { + "auxiliary_loss_clip": 0.01091621, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.01693392, + "balance_loss_mlp": 1.02753961, + "epoch": 0.21734555839470915, + "flos": 15230963756160.0, + "grad_norm": 2.8856236911761113, + "language_loss": 0.73475075, + "learning_rate": 3.551951158102711e-06, + "loss": 0.75601697, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.640625, + "step": 3615, + "time_per_iteration": 2.3954524993896484 + }, + { + "auxiliary_loss_clip": 0.01095758, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.01796234, + "balance_loss_mlp": 1.02757883, + "epoch": 0.2174056816473771, + "flos": 19972972016640.0, + "grad_norm": 2.0453382039960673, + "language_loss": 0.89979649, + "learning_rate": 3.5517128140851682e-06, + "loss": 0.92113632, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.68359375, + "step": 3616, + "time_per_iteration": 3.74100399017334 + }, + { + "auxiliary_loss_clip": 0.01093413, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.01679385, + "balance_loss_mlp": 1.02681398, + "epoch": 0.21746580490004508, + "flos": 16686458092800.0, + "grad_norm": 2.8210824049104897, + "language_loss": 0.81051397, + "learning_rate": 3.551474414690986e-06, + "loss": 0.83181214, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.66796875, + "step": 3617, + "time_per_iteration": 2.3564059734344482 + }, + { + "auxiliary_loss_clip": 0.01094452, + "auxiliary_loss_mlp": 0.01040592, + "balance_loss_clip": 1.02130365, + "balance_loss_mlp": 1.0282445, + "epoch": 0.21752592815271307, + "flos": 25771207772160.0, + "grad_norm": 1.8723426473039015, + "language_loss": 0.75308621, + "learning_rate": 3.551235959928673e-06, + "loss": 0.77443665, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6640625, + "step": 3618, + "time_per_iteration": 2.418596029281616 + }, + { + "auxiliary_loss_clip": 0.01090999, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.01904535, + "balance_loss_mlp": 1.0256393, + "epoch": 0.21758605140538104, + "flos": 11253902584320.0, + "grad_norm": 1.7939286684639173, + "language_loss": 0.69565201, + "learning_rate": 3.550997449806739e-06, + "loss": 0.71694398, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.65234375, + "step": 3619, + "time_per_iteration": 2.3391482830047607 + }, + { + "auxiliary_loss_clip": 0.01096509, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.01752806, + "balance_loss_mlp": 1.02994764, + "epoch": 0.217646174658049, + "flos": 19241681345280.0, + "grad_norm": 2.7936134290692864, + "language_loss": 0.78751981, + "learning_rate": 3.5507588843336953e-06, + "loss": 0.80885983, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6640625, + "step": 3620, + "time_per_iteration": 2.3868601322174072 + }, + { + "auxiliary_loss_clip": 0.01087877, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.01588476, + "balance_loss_mlp": 1.02686977, + "epoch": 0.21770629791071697, + "flos": 21943993622400.0, + "grad_norm": 1.4737300606646127, + "language_loss": 0.80021012, + "learning_rate": 3.5505202635180556e-06, + "loss": 0.82143313, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.609375, + "step": 3621, + "time_per_iteration": 2.3791918754577637 + }, + { + "auxiliary_loss_clip": 0.01088616, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.01675606, + "balance_loss_mlp": 1.02592874, + "epoch": 0.21776642116338493, + "flos": 24935596358400.0, + "grad_norm": 1.5861360912655296, + "language_loss": 0.87613547, + "learning_rate": 3.550281587368337e-06, + "loss": 0.89736146, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.625, + "step": 3622, + "time_per_iteration": 2.414642333984375 + }, + { + "auxiliary_loss_clip": 0.01093598, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.01670909, + "balance_loss_mlp": 1.02774429, + "epoch": 0.2178265444160529, + "flos": 17820367096320.0, + "grad_norm": 2.1261771538098895, + "language_loss": 0.77160025, + "learning_rate": 3.550042855893056e-06, + "loss": 0.79290915, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.66015625, + "step": 3623, + "time_per_iteration": 2.3404550552368164 + }, + { + "auxiliary_loss_clip": 0.01095072, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_clip": 1.0246253, + "balance_loss_mlp": 1.02859235, + "epoch": 0.2178866676687209, + "flos": 17711926813440.0, + "grad_norm": 1.852630681807572, + "language_loss": 0.8442961, + "learning_rate": 3.549804069100733e-06, + "loss": 0.86569786, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.6640625, + "step": 3624, + "time_per_iteration": 2.369171142578125 + }, + { + "auxiliary_loss_clip": 0.01097251, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.02087736, + "balance_loss_mlp": 1.03009987, + "epoch": 0.21794679092138886, + "flos": 16944919954560.0, + "grad_norm": 2.420979482155318, + "language_loss": 0.77038062, + "learning_rate": 3.5495652269998887e-06, + "loss": 0.79175287, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.671875, + "step": 3625, + "time_per_iteration": 2.350097417831421 + }, + { + "auxiliary_loss_clip": 0.01019424, + "auxiliary_loss_mlp": 0.01003061, + "balance_loss_clip": 1.00118947, + "balance_loss_mlp": 1.00478375, + "epoch": 0.21800691417405682, + "flos": 63715371425280.0, + "grad_norm": 0.8079825791042722, + "language_loss": 0.60650551, + "learning_rate": 3.549326329599048e-06, + "loss": 0.62673038, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.14648438, + "step": 3626, + "time_per_iteration": 3.112251043319702 + }, + { + "auxiliary_loss_clip": 0.01094413, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.01894259, + "balance_loss_mlp": 1.02658427, + "epoch": 0.21806703742672479, + "flos": 21615321283200.0, + "grad_norm": 1.8550901630318493, + "language_loss": 0.90389788, + "learning_rate": 3.549087376906736e-06, + "loss": 0.92523026, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6796875, + "step": 3627, + "time_per_iteration": 2.389331102371216 + }, + { + "auxiliary_loss_clip": 0.01091615, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.01701105, + "balance_loss_mlp": 1.02693403, + "epoch": 0.21812716067939275, + "flos": 19353857143680.0, + "grad_norm": 1.645331613507943, + "language_loss": 0.73366785, + "learning_rate": 3.5488483689314795e-06, + "loss": 0.75494438, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6484375, + "step": 3628, + "time_per_iteration": 2.3655855655670166 + }, + { + "auxiliary_loss_clip": 0.01088537, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.02062225, + "balance_loss_mlp": 1.0253588, + "epoch": 0.21818728393206072, + "flos": 23546995920000.0, + "grad_norm": 2.1342552613748897, + "language_loss": 0.77395225, + "learning_rate": 3.5486093056818094e-06, + "loss": 0.79523122, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6328125, + "step": 3629, + "time_per_iteration": 2.394785165786743 + }, + { + "auxiliary_loss_clip": 0.01092003, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.01708627, + "balance_loss_mlp": 1.0276897, + "epoch": 0.21824740718472868, + "flos": 30224379421440.0, + "grad_norm": 1.687506484159848, + "language_loss": 0.7134552, + "learning_rate": 3.5483701871662566e-06, + "loss": 0.73471785, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.640625, + "step": 3630, + "time_per_iteration": 2.464221477508545 + }, + { + "auxiliary_loss_clip": 0.01085825, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.01738513, + "balance_loss_mlp": 1.02591598, + "epoch": 0.21830753043739667, + "flos": 26133641262720.0, + "grad_norm": 1.6603689984438528, + "language_loss": 0.76497871, + "learning_rate": 3.5481310133933546e-06, + "loss": 0.78618515, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59765625, + "step": 3631, + "time_per_iteration": 2.4137797355651855 + }, + { + "auxiliary_loss_clip": 0.01090745, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01465559, + "balance_loss_mlp": 1.02812052, + "epoch": 0.21836765369006464, + "flos": 21719781671040.0, + "grad_norm": 2.53121131401874, + "language_loss": 0.75330448, + "learning_rate": 3.547891784371639e-06, + "loss": 0.77453762, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 3632, + "time_per_iteration": 2.3723511695861816 + }, + { + "auxiliary_loss_clip": 0.01088971, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.0143615, + "balance_loss_mlp": 1.02587771, + "epoch": 0.2184277769427326, + "flos": 19936592513280.0, + "grad_norm": 3.415645181158411, + "language_loss": 0.84082043, + "learning_rate": 3.547652500109647e-06, + "loss": 0.8620224, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.6328125, + "step": 3633, + "time_per_iteration": 2.364938259124756 + }, + { + "auxiliary_loss_clip": 0.01089758, + "auxiliary_loss_mlp": 0.01043776, + "balance_loss_clip": 1.0248574, + "balance_loss_mlp": 1.02721274, + "epoch": 0.21848790019540057, + "flos": 20339175934080.0, + "grad_norm": 1.5651391173505917, + "language_loss": 0.81594479, + "learning_rate": 3.547413160615919e-06, + "loss": 0.83728015, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.625, + "step": 3634, + "time_per_iteration": 2.4022185802459717 + }, + { + "auxiliary_loss_clip": 0.01093251, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.01472163, + "balance_loss_mlp": 1.02807343, + "epoch": 0.21854802344806853, + "flos": 15449904092160.0, + "grad_norm": 1.8847199864380413, + "language_loss": 0.74939668, + "learning_rate": 3.5471737658989956e-06, + "loss": 0.77065599, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.65234375, + "step": 3635, + "time_per_iteration": 2.3405630588531494 + }, + { + "auxiliary_loss_clip": 0.01090406, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.02033162, + "balance_loss_mlp": 1.02717638, + "epoch": 0.2186081467007365, + "flos": 16319939973120.0, + "grad_norm": 1.8491380683041085, + "language_loss": 0.87294209, + "learning_rate": 3.54693431596742e-06, + "loss": 0.89422971, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6328125, + "step": 3636, + "time_per_iteration": 2.358074188232422 + }, + { + "auxiliary_loss_clip": 0.01090906, + "auxiliary_loss_mlp": 0.01039013, + "balance_loss_clip": 1.02036333, + "balance_loss_mlp": 1.02711272, + "epoch": 0.2186682699534045, + "flos": 21688185024000.0, + "grad_norm": 2.011595711849557, + "language_loss": 0.75966811, + "learning_rate": 3.5466948108297377e-06, + "loss": 0.78096724, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.63671875, + "step": 3637, + "time_per_iteration": 2.3807408809661865 + }, + { + "auxiliary_loss_clip": 0.01093942, + "auxiliary_loss_mlp": 0.01035223, + "balance_loss_clip": 1.01469493, + "balance_loss_mlp": 1.02733397, + "epoch": 0.21872839320607246, + "flos": 17738566047360.0, + "grad_norm": 2.6371077640017346, + "language_loss": 0.89282644, + "learning_rate": 3.5464552504944965e-06, + "loss": 0.91411805, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.6640625, + "step": 3638, + "time_per_iteration": 2.3330798149108887 + }, + { + "auxiliary_loss_clip": 0.01091267, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_clip": 1.02266312, + "balance_loss_mlp": 1.02771962, + "epoch": 0.21878851645874042, + "flos": 18651544767360.0, + "grad_norm": 2.216887211390042, + "language_loss": 0.89628458, + "learning_rate": 3.546215634970245e-06, + "loss": 0.91762209, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.6328125, + "step": 3639, + "time_per_iteration": 2.320844888687134 + }, + { + "auxiliary_loss_clip": 0.01089655, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.01769292, + "balance_loss_mlp": 1.02573586, + "epoch": 0.2188486397114084, + "flos": 25556107685760.0, + "grad_norm": 1.9554778272301963, + "language_loss": 0.79085922, + "learning_rate": 3.545975964265535e-06, + "loss": 0.81210792, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.640625, + "step": 3640, + "time_per_iteration": 2.4163002967834473 + }, + { + "auxiliary_loss_clip": 0.01094824, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_clip": 1.02319622, + "balance_loss_mlp": 1.03164196, + "epoch": 0.21890876296407635, + "flos": 17891171066880.0, + "grad_norm": 2.3250227569417903, + "language_loss": 0.72241938, + "learning_rate": 3.5457362383889196e-06, + "loss": 0.74380779, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.6328125, + "step": 3641, + "time_per_iteration": 2.3454127311706543 + }, + { + "auxiliary_loss_clip": 0.01092003, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.02020335, + "balance_loss_mlp": 1.02923465, + "epoch": 0.21896888621674432, + "flos": 17748131760000.0, + "grad_norm": 1.9020619069361877, + "language_loss": 0.81124008, + "learning_rate": 3.5454964573489542e-06, + "loss": 0.83254683, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.62890625, + "step": 3642, + "time_per_iteration": 2.3891549110412598 + }, + { + "auxiliary_loss_clip": 0.01094523, + "auxiliary_loss_mlp": 0.01041683, + "balance_loss_clip": 1.02157211, + "balance_loss_mlp": 1.02773678, + "epoch": 0.21902900946941228, + "flos": 23075039894400.0, + "grad_norm": 1.67525366925596, + "language_loss": 0.71565598, + "learning_rate": 3.545256621154196e-06, + "loss": 0.73701805, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.66796875, + "step": 3643, + "time_per_iteration": 2.394054651260376 + }, + { + "auxiliary_loss_clip": 0.01095848, + "auxiliary_loss_mlp": 0.01042595, + "balance_loss_clip": 1.02365232, + "balance_loss_mlp": 1.0290705, + "epoch": 0.21908913272208028, + "flos": 48176718923520.0, + "grad_norm": 2.473822385752144, + "language_loss": 0.68169093, + "learning_rate": 3.545016729813203e-06, + "loss": 0.70307541, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.66796875, + "step": 3644, + "time_per_iteration": 2.6351137161254883 + }, + { + "auxiliary_loss_clip": 0.01094533, + "auxiliary_loss_mlp": 0.01035764, + "balance_loss_clip": 1.01649952, + "balance_loss_mlp": 1.02735949, + "epoch": 0.21914925597474824, + "flos": 22235658053760.0, + "grad_norm": 2.4144990113309537, + "language_loss": 0.77770472, + "learning_rate": 3.544776783334538e-06, + "loss": 0.79900765, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.671875, + "step": 3645, + "time_per_iteration": 2.394505262374878 + }, + { + "auxiliary_loss_clip": 0.01092502, + "auxiliary_loss_mlp": 0.01037682, + "balance_loss_clip": 1.01994359, + "balance_loss_mlp": 1.02877426, + "epoch": 0.2192093792274162, + "flos": 22124564507520.0, + "grad_norm": 1.5917323717771417, + "language_loss": 0.82426739, + "learning_rate": 3.5445367817267623e-06, + "loss": 0.84556925, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.63671875, + "step": 3646, + "time_per_iteration": 2.38145112991333 + }, + { + "auxiliary_loss_clip": 0.01089747, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.01435649, + "balance_loss_mlp": 1.02789617, + "epoch": 0.21926950248008417, + "flos": 15668530225920.0, + "grad_norm": 1.707471653178866, + "language_loss": 0.82878518, + "learning_rate": 3.5442967249984427e-06, + "loss": 0.84999806, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6171875, + "step": 3647, + "time_per_iteration": 2.3597939014434814 + }, + { + "auxiliary_loss_clip": 0.0108968, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.02485752, + "balance_loss_mlp": 1.02583981, + "epoch": 0.21932962573275214, + "flos": 30261212772480.0, + "grad_norm": 1.6655901629453167, + "language_loss": 0.72428632, + "learning_rate": 3.544056613158145e-06, + "loss": 0.74561208, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.63671875, + "step": 3648, + "time_per_iteration": 3.8688979148864746 + }, + { + "auxiliary_loss_clip": 0.01092046, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.01795888, + "balance_loss_mlp": 1.02543092, + "epoch": 0.2193897489854201, + "flos": 10779363118080.0, + "grad_norm": 2.509063598410687, + "language_loss": 0.74706012, + "learning_rate": 3.5438164462144383e-06, + "loss": 0.768354, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.66796875, + "step": 3649, + "time_per_iteration": 2.347658157348633 + }, + { + "auxiliary_loss_clip": 0.01085887, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.01688766, + "balance_loss_mlp": 1.02621293, + "epoch": 0.21944987223808807, + "flos": 19132368278400.0, + "grad_norm": 3.118793264486232, + "language_loss": 0.86427206, + "learning_rate": 3.5435762241758944e-06, + "loss": 0.88546658, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.59765625, + "step": 3650, + "time_per_iteration": 2.349252700805664 + }, + { + "auxiliary_loss_clip": 0.01089468, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.01769543, + "balance_loss_mlp": 1.02543175, + "epoch": 0.21950999549075606, + "flos": 22709988051840.0, + "grad_norm": 2.10465932582765, + "language_loss": 0.83604038, + "learning_rate": 3.5433359470510855e-06, + "loss": 0.85729772, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.640625, + "step": 3651, + "time_per_iteration": 3.7655768394470215 + }, + { + "auxiliary_loss_clip": 0.01088492, + "auxiliary_loss_mlp": 0.01040807, + "balance_loss_clip": 1.02285457, + "balance_loss_mlp": 1.0253855, + "epoch": 0.21957011874342403, + "flos": 10560562427520.0, + "grad_norm": 1.7186149489782925, + "language_loss": 0.74512058, + "learning_rate": 3.5430956148485864e-06, + "loss": 0.76641357, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.62890625, + "step": 3652, + "time_per_iteration": 3.7480454444885254 + }, + { + "auxiliary_loss_clip": 0.01020375, + "auxiliary_loss_mlp": 0.0100853, + "balance_loss_clip": 1.00630081, + "balance_loss_mlp": 1.00518227, + "epoch": 0.219630241996092, + "flos": 65745047848320.0, + "grad_norm": 0.7414611093971468, + "language_loss": 0.51583755, + "learning_rate": 3.542855227576974e-06, + "loss": 0.53612655, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.15234375, + "step": 3653, + "time_per_iteration": 3.0350093841552734 + }, + { + "auxiliary_loss_clip": 0.01093367, + "auxiliary_loss_mlp": 0.01038173, + "balance_loss_clip": 1.01992249, + "balance_loss_mlp": 1.02969611, + "epoch": 0.21969036524875996, + "flos": 23695376664960.0, + "grad_norm": 1.9980404981099662, + "language_loss": 0.75457841, + "learning_rate": 3.5426147852448276e-06, + "loss": 0.77589381, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.63671875, + "step": 3654, + "time_per_iteration": 2.3962314128875732 + }, + { + "auxiliary_loss_clip": 0.01095718, + "auxiliary_loss_mlp": 0.01043015, + "balance_loss_clip": 1.02336919, + "balance_loss_mlp": 1.02950132, + "epoch": 0.21975048850142792, + "flos": 19640040491520.0, + "grad_norm": 1.88640288741463, + "language_loss": 0.724374, + "learning_rate": 3.542374287860727e-06, + "loss": 0.74576128, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.66015625, + "step": 3655, + "time_per_iteration": 3.762683391571045 + }, + { + "auxiliary_loss_clip": 0.01093002, + "auxiliary_loss_mlp": 0.0104213, + "balance_loss_clip": 1.02427268, + "balance_loss_mlp": 1.02878046, + "epoch": 0.21981061175409589, + "flos": 22447651029120.0, + "grad_norm": 1.5176434959899086, + "language_loss": 0.80999374, + "learning_rate": 3.542133735433256e-06, + "loss": 0.83134508, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.640625, + "step": 3656, + "time_per_iteration": 2.3991000652313232 + }, + { + "auxiliary_loss_clip": 0.01093215, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.0165143, + "balance_loss_mlp": 1.02960467, + "epoch": 0.21987073500676388, + "flos": 18150051864960.0, + "grad_norm": 2.1322825570433572, + "language_loss": 0.84672594, + "learning_rate": 3.541893127970999e-06, + "loss": 0.8680166, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.63671875, + "step": 3657, + "time_per_iteration": 2.3575732707977295 + }, + { + "auxiliary_loss_clip": 0.01091715, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.01344705, + "balance_loss_mlp": 1.02620268, + "epoch": 0.21993085825943184, + "flos": 25625096265600.0, + "grad_norm": 1.613644961719573, + "language_loss": 0.8030948, + "learning_rate": 3.541652465482542e-06, + "loss": 0.82433337, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65625, + "step": 3658, + "time_per_iteration": 2.4104864597320557 + }, + { + "auxiliary_loss_clip": 0.0101832, + "auxiliary_loss_mlp": 0.01001977, + "balance_loss_clip": 0.99959266, + "balance_loss_mlp": 1.00357008, + "epoch": 0.2199909815120998, + "flos": 70919349096960.0, + "grad_norm": 0.7926029416663449, + "language_loss": 0.58215219, + "learning_rate": 3.5414117479764744e-06, + "loss": 0.60235518, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.14746094, + "step": 3659, + "time_per_iteration": 3.1130309104919434 + }, + { + "auxiliary_loss_clip": 0.01092748, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.0131036, + "balance_loss_mlp": 1.02801776, + "epoch": 0.22005110476476777, + "flos": 21542457542400.0, + "grad_norm": 4.135589391861042, + "language_loss": 0.74065894, + "learning_rate": 3.5411709754613864e-06, + "loss": 0.76190639, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6484375, + "step": 3660, + "time_per_iteration": 2.3905675411224365 + }, + { + "auxiliary_loss_clip": 0.01092462, + "auxiliary_loss_mlp": 0.0103411, + "balance_loss_clip": 1.01503634, + "balance_loss_mlp": 1.02795982, + "epoch": 0.22011122801743574, + "flos": 22053411423360.0, + "grad_norm": 1.6393205509940065, + "language_loss": 0.81110561, + "learning_rate": 3.5409301479458707e-06, + "loss": 0.83237135, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.64453125, + "step": 3661, + "time_per_iteration": 2.3919174671173096 + }, + { + "auxiliary_loss_clip": 0.01093048, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.02069509, + "balance_loss_mlp": 1.02945328, + "epoch": 0.2201713512701037, + "flos": 26686385907840.0, + "grad_norm": 1.6930319666421592, + "language_loss": 0.73479098, + "learning_rate": 3.5406892654385223e-06, + "loss": 0.75610673, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.63671875, + "step": 3662, + "time_per_iteration": 2.4425196647644043 + }, + { + "auxiliary_loss_clip": 0.01091396, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.02082038, + "balance_loss_mlp": 1.02982497, + "epoch": 0.22023147452277167, + "flos": 22161153479040.0, + "grad_norm": 1.4448901819100592, + "language_loss": 0.78305918, + "learning_rate": 3.540448327947936e-06, + "loss": 0.80435061, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.61328125, + "step": 3663, + "time_per_iteration": 2.3859312534332275 + }, + { + "auxiliary_loss_clip": 0.0109561, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.01720226, + "balance_loss_mlp": 1.02975941, + "epoch": 0.22029159777543966, + "flos": 22522330160640.0, + "grad_norm": 2.53099107067621, + "language_loss": 0.80450189, + "learning_rate": 3.5402073354827123e-06, + "loss": 0.82582432, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.66015625, + "step": 3664, + "time_per_iteration": 2.3736634254455566 + }, + { + "auxiliary_loss_clip": 0.01095337, + "auxiliary_loss_mlp": 0.01041624, + "balance_loss_clip": 1.02133501, + "balance_loss_mlp": 1.02853584, + "epoch": 0.22035172102810763, + "flos": 13041630218880.0, + "grad_norm": 2.928973983608562, + "language_loss": 0.76736879, + "learning_rate": 3.5399662880514497e-06, + "loss": 0.78873837, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.66796875, + "step": 3665, + "time_per_iteration": 2.37835693359375 + }, + { + "auxiliary_loss_clip": 0.01090654, + "auxiliary_loss_mlp": 0.01042395, + "balance_loss_clip": 1.02353609, + "balance_loss_mlp": 1.02657104, + "epoch": 0.2204118442807756, + "flos": 12165031002240.0, + "grad_norm": 2.601770157104923, + "language_loss": 0.98128355, + "learning_rate": 3.5397251856627524e-06, + "loss": 1.00261414, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.640625, + "step": 3666, + "time_per_iteration": 2.326894521713257 + }, + { + "auxiliary_loss_clip": 0.01092503, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.02005911, + "balance_loss_mlp": 1.02742922, + "epoch": 0.22047196753344356, + "flos": 40107383493120.0, + "grad_norm": 1.8165924049231157, + "language_loss": 0.80317688, + "learning_rate": 3.5394840283252236e-06, + "loss": 0.8244862, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6484375, + "step": 3667, + "time_per_iteration": 2.5395076274871826 + }, + { + "auxiliary_loss_clip": 0.01094245, + "auxiliary_loss_mlp": 0.01040842, + "balance_loss_clip": 1.02099347, + "balance_loss_mlp": 1.0274899, + "epoch": 0.22053209078611152, + "flos": 20700178058880.0, + "grad_norm": 1.7430325284434647, + "language_loss": 0.77049088, + "learning_rate": 3.53924281604747e-06, + "loss": 0.79184175, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.66796875, + "step": 3668, + "time_per_iteration": 2.359900951385498 + }, + { + "auxiliary_loss_clip": 0.010929, + "auxiliary_loss_mlp": 0.01040206, + "balance_loss_clip": 1.02078676, + "balance_loss_mlp": 1.02938235, + "epoch": 0.2205922140387795, + "flos": 24715189745280.0, + "grad_norm": 1.595331213977969, + "language_loss": 0.71167451, + "learning_rate": 3.5390015488381e-06, + "loss": 0.73300552, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.63671875, + "step": 3669, + "time_per_iteration": 2.455019235610962 + }, + { + "auxiliary_loss_clip": 0.01091524, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.01417136, + "balance_loss_mlp": 1.02698052, + "epoch": 0.22065233729144745, + "flos": 23476122126720.0, + "grad_norm": 2.3214837709153855, + "language_loss": 0.85482693, + "learning_rate": 3.5387602267057227e-06, + "loss": 0.87606883, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.64453125, + "step": 3670, + "time_per_iteration": 2.3846724033355713 + }, + { + "auxiliary_loss_clip": 0.01097451, + "auxiliary_loss_mlp": 0.01037446, + "balance_loss_clip": 1.018718, + "balance_loss_mlp": 1.03035593, + "epoch": 0.22071246054411545, + "flos": 35224116405120.0, + "grad_norm": 1.8583426685725373, + "language_loss": 0.76785362, + "learning_rate": 3.5385188496589516e-06, + "loss": 0.78920257, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.671875, + "step": 3671, + "time_per_iteration": 2.4917216300964355 + }, + { + "auxiliary_loss_clip": 0.01094413, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.01946259, + "balance_loss_mlp": 1.02923059, + "epoch": 0.2207725837967834, + "flos": 18149318726400.0, + "grad_norm": 1.928649615301234, + "language_loss": 0.81345391, + "learning_rate": 3.5382774177064007e-06, + "loss": 0.83477604, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.65234375, + "step": 3672, + "time_per_iteration": 2.3476321697235107 + }, + { + "auxiliary_loss_clip": 0.01095329, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_clip": 1.02569973, + "balance_loss_mlp": 1.02908397, + "epoch": 0.22083270704945138, + "flos": 20478793927680.0, + "grad_norm": 1.8496931939472288, + "language_loss": 0.73409235, + "learning_rate": 3.538035930856685e-06, + "loss": 0.75548315, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6640625, + "step": 3673, + "time_per_iteration": 2.371300220489502 + }, + { + "auxiliary_loss_clip": 0.01097022, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.01691508, + "balance_loss_mlp": 1.03150582, + "epoch": 0.22089283030211934, + "flos": 34124527134720.0, + "grad_norm": 1.8457946037171171, + "language_loss": 0.76104242, + "learning_rate": 3.5377943891184234e-06, + "loss": 0.7823779, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.65234375, + "step": 3674, + "time_per_iteration": 2.4701383113861084 + }, + { + "auxiliary_loss_clip": 0.0109407, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.01868737, + "balance_loss_mlp": 1.02909803, + "epoch": 0.2209529535547873, + "flos": 18076245517440.0, + "grad_norm": 1.890786765170995, + "language_loss": 0.7416544, + "learning_rate": 3.5375527925002357e-06, + "loss": 0.76298511, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6484375, + "step": 3675, + "time_per_iteration": 2.3533267974853516 + }, + { + "auxiliary_loss_clip": 0.01093822, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.02355337, + "balance_loss_mlp": 1.02760625, + "epoch": 0.22101307680745527, + "flos": 27234103317120.0, + "grad_norm": 1.6694224754185738, + "language_loss": 0.80026603, + "learning_rate": 3.537311141010744e-06, + "loss": 0.82161874, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6640625, + "step": 3676, + "time_per_iteration": 2.4297144412994385 + }, + { + "auxiliary_loss_clip": 0.0109421, + "auxiliary_loss_mlp": 0.0104276, + "balance_loss_clip": 1.02320921, + "balance_loss_mlp": 1.02742362, + "epoch": 0.22107320006012326, + "flos": 16542371445120.0, + "grad_norm": 2.0785573910341615, + "language_loss": 0.75704879, + "learning_rate": 3.5370694346585718e-06, + "loss": 0.77841848, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.66796875, + "step": 3677, + "time_per_iteration": 2.4076225757598877 + }, + { + "auxiliary_loss_clip": 0.01089111, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02018619, + "balance_loss_mlp": 1.02509761, + "epoch": 0.22113332331279123, + "flos": 22053376512000.0, + "grad_norm": 1.6958729395602445, + "language_loss": 0.83066964, + "learning_rate": 3.5368276734523457e-06, + "loss": 0.85194635, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.640625, + "step": 3678, + "time_per_iteration": 2.39967942237854 + }, + { + "auxiliary_loss_clip": 0.01093189, + "auxiliary_loss_mlp": 0.0104082, + "balance_loss_clip": 1.02174675, + "balance_loss_mlp": 1.02956486, + "epoch": 0.2211934465654592, + "flos": 26611636953600.0, + "grad_norm": 1.7033107421027307, + "language_loss": 0.8909936, + "learning_rate": 3.536585857400693e-06, + "loss": 0.91233373, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.63671875, + "step": 3679, + "time_per_iteration": 2.4268648624420166 + }, + { + "auxiliary_loss_clip": 0.01094554, + "auxiliary_loss_mlp": 0.01042149, + "balance_loss_clip": 1.02258635, + "balance_loss_mlp": 1.02932191, + "epoch": 0.22125356981812716, + "flos": 16359496410240.0, + "grad_norm": 2.1653341397111094, + "language_loss": 0.87012517, + "learning_rate": 3.5363439865122436e-06, + "loss": 0.89149213, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.65234375, + "step": 3680, + "time_per_iteration": 2.3618102073669434 + }, + { + "auxiliary_loss_clip": 0.01091267, + "auxiliary_loss_mlp": 0.01040509, + "balance_loss_clip": 1.02138829, + "balance_loss_mlp": 1.02905679, + "epoch": 0.22131369307079513, + "flos": 21650094864000.0, + "grad_norm": 1.7555962133591956, + "language_loss": 0.81380695, + "learning_rate": 3.5361020607956292e-06, + "loss": 0.83512473, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.62109375, + "step": 3681, + "time_per_iteration": 2.4112987518310547 + }, + { + "auxiliary_loss_clip": 0.01090275, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.02131629, + "balance_loss_mlp": 1.02861273, + "epoch": 0.2213738163234631, + "flos": 19608513667200.0, + "grad_norm": 2.0406900260686482, + "language_loss": 0.79659057, + "learning_rate": 3.535860080259484e-06, + "loss": 0.81789088, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6171875, + "step": 3682, + "time_per_iteration": 2.3819100856781006 + }, + { + "auxiliary_loss_clip": 0.01092671, + "auxiliary_loss_mlp": 0.01038685, + "balance_loss_clip": 1.01928973, + "balance_loss_mlp": 1.02692652, + "epoch": 0.22143393957613106, + "flos": 23622268544640.0, + "grad_norm": 1.5759452235882812, + "language_loss": 0.77408659, + "learning_rate": 3.5356180449124424e-06, + "loss": 0.79540014, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.65625, + "step": 3683, + "time_per_iteration": 2.40122652053833 + }, + { + "auxiliary_loss_clip": 0.01092856, + "auxiliary_loss_mlp": 0.0104137, + "balance_loss_clip": 1.02216589, + "balance_loss_mlp": 1.02644992, + "epoch": 0.22149406282879905, + "flos": 26176584101760.0, + "grad_norm": 1.753791680216186, + "language_loss": 0.72255617, + "learning_rate": 3.535375954763143e-06, + "loss": 0.74389839, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6640625, + "step": 3684, + "time_per_iteration": 2.4123446941375732 + }, + { + "auxiliary_loss_clip": 0.01095201, + "auxiliary_loss_mlp": 0.01042766, + "balance_loss_clip": 1.02344263, + "balance_loss_mlp": 1.02942276, + "epoch": 0.221554186081467, + "flos": 14537867978880.0, + "grad_norm": 1.8519296152254818, + "language_loss": 0.79865098, + "learning_rate": 3.535133809820226e-06, + "loss": 0.82003069, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.65625, + "step": 3685, + "time_per_iteration": 2.3655717372894287 + }, + { + "auxiliary_loss_clip": 0.01088917, + "auxiliary_loss_mlp": 0.01043412, + "balance_loss_clip": 1.02562642, + "balance_loss_mlp": 1.02571535, + "epoch": 0.22161430933413498, + "flos": 22237124330880.0, + "grad_norm": 1.503663529879709, + "language_loss": 0.87465549, + "learning_rate": 3.5348916100923318e-06, + "loss": 0.89597881, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6328125, + "step": 3686, + "time_per_iteration": 2.425128936767578 + }, + { + "auxiliary_loss_clip": 0.01089607, + "auxiliary_loss_mlp": 0.01036597, + "balance_loss_clip": 1.01724911, + "balance_loss_mlp": 1.02612162, + "epoch": 0.22167443258680294, + "flos": 23475423899520.0, + "grad_norm": 1.8229604735511442, + "language_loss": 0.77771688, + "learning_rate": 3.534649355588104e-06, + "loss": 0.79897892, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.63671875, + "step": 3687, + "time_per_iteration": 2.4009549617767334 + }, + { + "auxiliary_loss_clip": 0.01096379, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.02317119, + "balance_loss_mlp": 1.02852297, + "epoch": 0.2217345558394709, + "flos": 23220034237440.0, + "grad_norm": 1.7664825785154417, + "language_loss": 0.84929752, + "learning_rate": 3.534407046316189e-06, + "loss": 0.8706975, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.6796875, + "step": 3688, + "time_per_iteration": 3.7730188369750977 + }, + { + "auxiliary_loss_clip": 0.01097376, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.01797032, + "balance_loss_mlp": 1.02991366, + "epoch": 0.22179467909213887, + "flos": 20010049747200.0, + "grad_norm": 1.68473272440209, + "language_loss": 0.8142544, + "learning_rate": 3.5341646822852324e-06, + "loss": 0.83559656, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.671875, + "step": 3689, + "time_per_iteration": 2.377899646759033 + }, + { + "auxiliary_loss_clip": 0.01091099, + "auxiliary_loss_mlp": 0.010402, + "balance_loss_clip": 1.02169847, + "balance_loss_mlp": 1.02827299, + "epoch": 0.22185480234480687, + "flos": 19682005812480.0, + "grad_norm": 1.7421215786775817, + "language_loss": 0.69994974, + "learning_rate": 3.5339222635038852e-06, + "loss": 0.72126275, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.62890625, + "step": 3690, + "time_per_iteration": 3.8241169452667236 + }, + { + "auxiliary_loss_clip": 0.01093793, + "auxiliary_loss_mlp": 0.01037736, + "balance_loss_clip": 1.01738739, + "balance_loss_mlp": 1.02620494, + "epoch": 0.22191492559747483, + "flos": 21980233480320.0, + "grad_norm": 1.8785714147008459, + "language_loss": 0.72514445, + "learning_rate": 3.533679789980798e-06, + "loss": 0.74645978, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.67578125, + "step": 3691, + "time_per_iteration": 2.3743176460266113 + }, + { + "auxiliary_loss_clip": 0.0109577, + "auxiliary_loss_mlp": 0.01037011, + "balance_loss_clip": 1.01713848, + "balance_loss_mlp": 1.03043437, + "epoch": 0.2219750488501428, + "flos": 23220941932800.0, + "grad_norm": 1.8898359293696025, + "language_loss": 0.71459144, + "learning_rate": 3.5334372617246243e-06, + "loss": 0.73591924, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.65234375, + "step": 3692, + "time_per_iteration": 3.8005530834198 + }, + { + "auxiliary_loss_clip": 0.0109597, + "auxiliary_loss_mlp": 0.01042106, + "balance_loss_clip": 1.02129185, + "balance_loss_mlp": 1.0287292, + "epoch": 0.22203517210281076, + "flos": 22452643353600.0, + "grad_norm": 1.5534491366171796, + "language_loss": 0.88025165, + "learning_rate": 3.533194678744019e-06, + "loss": 0.90163249, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.671875, + "step": 3693, + "time_per_iteration": 2.387392520904541 + }, + { + "auxiliary_loss_clip": 0.01089581, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.01600492, + "balance_loss_mlp": 1.02734554, + "epoch": 0.22209529535547873, + "flos": 17563650802560.0, + "grad_norm": 2.050372563762818, + "language_loss": 0.85483646, + "learning_rate": 3.53295204104764e-06, + "loss": 0.87606072, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.62109375, + "step": 3694, + "time_per_iteration": 2.3603076934814453 + }, + { + "auxiliary_loss_clip": 0.01093952, + "auxiliary_loss_mlp": 0.01041397, + "balance_loss_clip": 1.02131057, + "balance_loss_mlp": 1.02634811, + "epoch": 0.2221554186081467, + "flos": 21467987879040.0, + "grad_norm": 2.895360832649079, + "language_loss": 0.69272387, + "learning_rate": 3.532709348644146e-06, + "loss": 0.71407735, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.67578125, + "step": 3695, + "time_per_iteration": 3.7850799560546875 + }, + { + "auxiliary_loss_clip": 0.01090176, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.01567209, + "balance_loss_mlp": 1.02860951, + "epoch": 0.22221554186081466, + "flos": 27672193457280.0, + "grad_norm": 1.5009527281737483, + "language_loss": 0.71522045, + "learning_rate": 3.532466601542197e-06, + "loss": 0.73644829, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.6171875, + "step": 3696, + "time_per_iteration": 2.4492974281311035 + }, + { + "auxiliary_loss_clip": 0.01091476, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.0167017, + "balance_loss_mlp": 1.02732062, + "epoch": 0.22227566511348265, + "flos": 25957713588480.0, + "grad_norm": 1.740135248140743, + "language_loss": 0.87784004, + "learning_rate": 3.532223799750458e-06, + "loss": 0.89910948, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.640625, + "step": 3697, + "time_per_iteration": 2.404921293258667 + }, + { + "auxiliary_loss_clip": 0.01085887, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.01768041, + "balance_loss_mlp": 1.02554131, + "epoch": 0.22233578836615062, + "flos": 39202085272320.0, + "grad_norm": 1.534990969190034, + "language_loss": 0.66207892, + "learning_rate": 3.5319809432775916e-06, + "loss": 0.6832754, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.6015625, + "step": 3698, + "time_per_iteration": 2.548260450363159 + }, + { + "auxiliary_loss_clip": 0.0109311, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.01574123, + "balance_loss_mlp": 1.02653658, + "epoch": 0.22239591161881858, + "flos": 36282298936320.0, + "grad_norm": 1.8441220316297018, + "language_loss": 0.82402086, + "learning_rate": 3.531738032132267e-06, + "loss": 0.84530926, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6640625, + "step": 3699, + "time_per_iteration": 2.4940197467803955 + }, + { + "auxiliary_loss_clip": 0.01091321, + "auxiliary_loss_mlp": 0.01036938, + "balance_loss_clip": 1.01679158, + "balance_loss_mlp": 1.02728462, + "epoch": 0.22245603487148655, + "flos": 19718559872640.0, + "grad_norm": 1.8064773091700361, + "language_loss": 0.79589581, + "learning_rate": 3.531495066323152e-06, + "loss": 0.81717837, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.640625, + "step": 3700, + "time_per_iteration": 2.3791542053222656 + }, + { + "auxiliary_loss_clip": 0.01097251, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.01687169, + "balance_loss_mlp": 1.02991736, + "epoch": 0.2225161581241545, + "flos": 46278700704000.0, + "grad_norm": 1.9619386117639426, + "language_loss": 0.72068286, + "learning_rate": 3.5312520458589176e-06, + "loss": 0.74202335, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.671875, + "step": 3701, + "time_per_iteration": 2.5742242336273193 + }, + { + "auxiliary_loss_clip": 0.01090285, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.01376891, + "balance_loss_mlp": 1.02655578, + "epoch": 0.22257628137682248, + "flos": 23695062462720.0, + "grad_norm": 1.7292443873516452, + "language_loss": 0.80025822, + "learning_rate": 3.5310089707482366e-06, + "loss": 0.82147467, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.63671875, + "step": 3702, + "time_per_iteration": 2.3922669887542725 + }, + { + "auxiliary_loss_clip": 0.01084204, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.01753962, + "balance_loss_mlp": 1.02507901, + "epoch": 0.22263640462949044, + "flos": 19352984359680.0, + "grad_norm": 1.942009494971027, + "language_loss": 0.78257668, + "learning_rate": 3.5307658409997834e-06, + "loss": 0.80377805, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.58984375, + "step": 3703, + "time_per_iteration": 2.350100040435791 + }, + { + "auxiliary_loss_clip": 0.01090873, + "auxiliary_loss_mlp": 0.01037792, + "balance_loss_clip": 1.01740718, + "balance_loss_mlp": 1.02613699, + "epoch": 0.22269652788215843, + "flos": 20775031747200.0, + "grad_norm": 1.8852992213241526, + "language_loss": 0.75073087, + "learning_rate": 3.530522656622235e-06, + "loss": 0.7720176, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.6484375, + "step": 3704, + "time_per_iteration": 2.369068145751953 + }, + { + "auxiliary_loss_clip": 0.01088072, + "auxiliary_loss_mlp": 0.01033246, + "balance_loss_clip": 1.0162226, + "balance_loss_mlp": 1.02771926, + "epoch": 0.2227566511348264, + "flos": 47957045448960.0, + "grad_norm": 1.9031711690997903, + "language_loss": 0.64479697, + "learning_rate": 3.53027941762427e-06, + "loss": 0.66601014, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.6015625, + "step": 3705, + "time_per_iteration": 2.602240800857544 + }, + { + "auxiliary_loss_clip": 0.01017685, + "auxiliary_loss_mlp": 0.01001016, + "balance_loss_clip": 0.998954, + "balance_loss_mlp": 1.0022943, + "epoch": 0.22281677438749437, + "flos": 66216166001280.0, + "grad_norm": 1.2868087929793475, + "language_loss": 0.65118122, + "learning_rate": 3.5300361240145692e-06, + "loss": 0.67136824, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.15429688, + "step": 3706, + "time_per_iteration": 2.882190704345703 + }, + { + "auxiliary_loss_clip": 0.01088894, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.01579905, + "balance_loss_mlp": 1.02643394, + "epoch": 0.22287689764016233, + "flos": 21870536388480.0, + "grad_norm": 1.7319909772015547, + "language_loss": 0.80544299, + "learning_rate": 3.5297927758018147e-06, + "loss": 0.8266716, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.625, + "step": 3707, + "time_per_iteration": 2.378695249557495 + }, + { + "auxiliary_loss_clip": 0.01089534, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.01420009, + "balance_loss_mlp": 1.02666414, + "epoch": 0.2229370208928303, + "flos": 27671250850560.0, + "grad_norm": 1.9187304729061032, + "language_loss": 0.78919291, + "learning_rate": 3.5295493729946913e-06, + "loss": 0.8104136, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.62890625, + "step": 3708, + "time_per_iteration": 2.4452548027038574 + }, + { + "auxiliary_loss_clip": 0.01092096, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.02138281, + "balance_loss_mlp": 1.02801836, + "epoch": 0.22299714414549826, + "flos": 30153331071360.0, + "grad_norm": 1.9911645034789165, + "language_loss": 0.80301565, + "learning_rate": 3.529305915601885e-06, + "loss": 0.82432365, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.640625, + "step": 3709, + "time_per_iteration": 2.4480786323547363 + }, + { + "auxiliary_loss_clip": 0.01089464, + "auxiliary_loss_mlp": 0.01041277, + "balance_loss_clip": 1.02328813, + "balance_loss_mlp": 1.0251708, + "epoch": 0.22305726739816625, + "flos": 23142178172160.0, + "grad_norm": 1.9254495213443301, + "language_loss": 0.68630362, + "learning_rate": 3.5290624036320843e-06, + "loss": 0.70761108, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.640625, + "step": 3710, + "time_per_iteration": 2.4239273071289062 + }, + { + "auxiliary_loss_clip": 0.01093177, + "auxiliary_loss_mlp": 0.01037456, + "balance_loss_clip": 1.01965773, + "balance_loss_mlp": 1.02766013, + "epoch": 0.22311739065083422, + "flos": 19171051931520.0, + "grad_norm": 2.317621370224953, + "language_loss": 0.90193641, + "learning_rate": 3.5288188370939796e-06, + "loss": 0.92324269, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.65625, + "step": 3711, + "time_per_iteration": 2.3515431880950928 + }, + { + "auxiliary_loss_clip": 0.01088843, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.02275085, + "balance_loss_mlp": 1.02645743, + "epoch": 0.22317751390350218, + "flos": 13617138936960.0, + "grad_norm": 4.652123448453264, + "language_loss": 0.89180648, + "learning_rate": 3.5285752159962636e-06, + "loss": 0.91310239, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 3712, + "time_per_iteration": 2.307796001434326 + }, + { + "auxiliary_loss_clip": 0.01090042, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.01963782, + "balance_loss_mlp": 1.02730691, + "epoch": 0.22323763715617015, + "flos": 11028468735360.0, + "grad_norm": 3.9441240187166886, + "language_loss": 0.74791253, + "learning_rate": 3.5283315403476293e-06, + "loss": 0.76920033, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.625, + "step": 3713, + "time_per_iteration": 2.323309898376465 + }, + { + "auxiliary_loss_clip": 0.01091054, + "auxiliary_loss_mlp": 0.01038528, + "balance_loss_clip": 1.01916838, + "balance_loss_mlp": 1.02821803, + "epoch": 0.22329776040883811, + "flos": 41350012070400.0, + "grad_norm": 2.2935807959358128, + "language_loss": 0.62543035, + "learning_rate": 3.5280878101567746e-06, + "loss": 0.64672613, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.62890625, + "step": 3714, + "time_per_iteration": 2.562187671661377 + }, + { + "auxiliary_loss_clip": 0.01089448, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.01393175, + "balance_loss_mlp": 1.02717614, + "epoch": 0.22335788366150608, + "flos": 25118296836480.0, + "grad_norm": 2.254111070255291, + "language_loss": 0.79423189, + "learning_rate": 3.527844025432396e-06, + "loss": 0.81543237, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.625, + "step": 3715, + "time_per_iteration": 2.4427640438079834 + }, + { + "auxiliary_loss_clip": 0.01092928, + "auxiliary_loss_mlp": 0.01043068, + "balance_loss_clip": 1.02509141, + "balance_loss_mlp": 1.02842188, + "epoch": 0.22341800691417404, + "flos": 16982416621440.0, + "grad_norm": 1.6876057861486706, + "language_loss": 0.76629359, + "learning_rate": 3.5276001861831945e-06, + "loss": 0.78765357, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6484375, + "step": 3716, + "time_per_iteration": 2.3579540252685547 + }, + { + "auxiliary_loss_clip": 0.01091692, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.02359092, + "balance_loss_mlp": 1.0271709, + "epoch": 0.22347813016684204, + "flos": 14135878051200.0, + "grad_norm": 2.5328957448662504, + "language_loss": 0.789105, + "learning_rate": 3.527356292417872e-06, + "loss": 0.81044239, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.64453125, + "step": 3717, + "time_per_iteration": 2.3554940223693848 + }, + { + "auxiliary_loss_clip": 0.01091346, + "auxiliary_loss_mlp": 0.01041562, + "balance_loss_clip": 1.02334726, + "balance_loss_mlp": 1.0271852, + "epoch": 0.22353825341951, + "flos": 23582118614400.0, + "grad_norm": 1.81745241849482, + "language_loss": 0.68541479, + "learning_rate": 3.527112344145132e-06, + "loss": 0.70674384, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.640625, + "step": 3718, + "time_per_iteration": 2.411811113357544 + }, + { + "auxiliary_loss_clip": 0.01091912, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.01513803, + "balance_loss_mlp": 1.02732539, + "epoch": 0.22359837667217797, + "flos": 29822948075520.0, + "grad_norm": 1.6166210451380636, + "language_loss": 0.80225945, + "learning_rate": 3.5268683413736808e-06, + "loss": 0.82352829, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.64453125, + "step": 3719, + "time_per_iteration": 2.4607787132263184 + }, + { + "auxiliary_loss_clip": 0.0109472, + "auxiliary_loss_mlp": 0.01043127, + "balance_loss_clip": 1.02308762, + "balance_loss_mlp": 1.02672648, + "epoch": 0.22365849992484593, + "flos": 17602124987520.0, + "grad_norm": 2.78199127452028, + "language_loss": 0.86761129, + "learning_rate": 3.526624284112226e-06, + "loss": 0.88898981, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.6796875, + "step": 3720, + "time_per_iteration": 2.3771848678588867 + }, + { + "auxiliary_loss_clip": 0.0108845, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.01518726, + "balance_loss_mlp": 1.02660191, + "epoch": 0.2237186231775139, + "flos": 22709848406400.0, + "grad_norm": 1.5872777188625669, + "language_loss": 0.74140322, + "learning_rate": 3.5263801723694774e-06, + "loss": 0.76262176, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6171875, + "step": 3721, + "time_per_iteration": 2.3775877952575684 + }, + { + "auxiliary_loss_clip": 0.01092908, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.01834702, + "balance_loss_mlp": 1.02702498, + "epoch": 0.22377874643018186, + "flos": 13370651671680.0, + "grad_norm": 1.954124855675532, + "language_loss": 0.79196149, + "learning_rate": 3.5261360061541464e-06, + "loss": 0.81326091, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.65625, + "step": 3722, + "time_per_iteration": 2.3554270267486572 + }, + { + "auxiliary_loss_clip": 0.01088159, + "auxiliary_loss_mlp": 0.01031285, + "balance_loss_clip": 1.01399994, + "balance_loss_mlp": 1.02750111, + "epoch": 0.22383886968284986, + "flos": 17893998887040.0, + "grad_norm": 1.9514543259974668, + "language_loss": 0.81833661, + "learning_rate": 3.5258917854749476e-06, + "loss": 0.839531, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.609375, + "step": 3723, + "time_per_iteration": 2.3552000522613525 + }, + { + "auxiliary_loss_clip": 0.01091717, + "auxiliary_loss_mlp": 0.01036152, + "balance_loss_clip": 1.01831865, + "balance_loss_mlp": 1.02737498, + "epoch": 0.22389899293551782, + "flos": 23877972408960.0, + "grad_norm": 2.2324367537490097, + "language_loss": 0.84569204, + "learning_rate": 3.5256475103405957e-06, + "loss": 0.86697072, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.640625, + "step": 3724, + "time_per_iteration": 2.4105513095855713 + }, + { + "auxiliary_loss_clip": 0.01088083, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.01564169, + "balance_loss_mlp": 1.0258913, + "epoch": 0.2239591161881858, + "flos": 27271181047680.0, + "grad_norm": 2.9199364331410944, + "language_loss": 0.78727692, + "learning_rate": 3.525403180759809e-06, + "loss": 0.8084929, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.62109375, + "step": 3725, + "time_per_iteration": 2.414835214614868 + }, + { + "auxiliary_loss_clip": 0.01089535, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.01965594, + "balance_loss_mlp": 1.02788854, + "epoch": 0.22401923944085375, + "flos": 22235762787840.0, + "grad_norm": 1.7324304899188787, + "language_loss": 0.77203864, + "learning_rate": 3.5251587967413065e-06, + "loss": 0.79330832, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6171875, + "step": 3726, + "time_per_iteration": 2.375723123550415 + }, + { + "auxiliary_loss_clip": 0.01098139, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.01871562, + "balance_loss_mlp": 1.02910829, + "epoch": 0.22407936269352172, + "flos": 12052959937920.0, + "grad_norm": 2.304369136807236, + "language_loss": 0.72655082, + "learning_rate": 3.5249143582938096e-06, + "loss": 0.74791902, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.69140625, + "step": 3727, + "time_per_iteration": 3.7323873043060303 + }, + { + "auxiliary_loss_clip": 0.0109475, + "auxiliary_loss_mlp": 0.01035442, + "balance_loss_clip": 1.01540256, + "balance_loss_mlp": 1.02777815, + "epoch": 0.22413948594618968, + "flos": 19352565423360.0, + "grad_norm": 1.9528337117237478, + "language_loss": 0.87160379, + "learning_rate": 3.5246698654260416e-06, + "loss": 0.89290571, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.66796875, + "step": 3728, + "time_per_iteration": 2.378406524658203 + }, + { + "auxiliary_loss_clip": 0.01092542, + "auxiliary_loss_mlp": 0.010429, + "balance_loss_clip": 1.02264631, + "balance_loss_mlp": 1.02828956, + "epoch": 0.22419960919885765, + "flos": 24168868790400.0, + "grad_norm": 2.361303933533291, + "language_loss": 0.80444628, + "learning_rate": 3.5244253181467284e-06, + "loss": 0.82580072, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.64453125, + "step": 3729, + "time_per_iteration": 3.749037742614746 + }, + { + "auxiliary_loss_clip": 0.01087705, + "auxiliary_loss_mlp": 0.01042811, + "balance_loss_clip": 1.02552557, + "balance_loss_mlp": 1.02664852, + "epoch": 0.22425973245152564, + "flos": 27377805939840.0, + "grad_norm": 1.6124074225621932, + "language_loss": 0.86935675, + "learning_rate": 3.5241807164645963e-06, + "loss": 0.89066195, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.609375, + "step": 3730, + "time_per_iteration": 2.427229881286621 + }, + { + "auxiliary_loss_clip": 0.01086321, + "auxiliary_loss_mlp": 0.01029149, + "balance_loss_clip": 1.01183999, + "balance_loss_mlp": 1.0268898, + "epoch": 0.2243198557041936, + "flos": 13734795818880.0, + "grad_norm": 1.7947927727917496, + "language_loss": 0.7302593, + "learning_rate": 3.5239360603883754e-06, + "loss": 0.75141394, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.59375, + "step": 3731, + "time_per_iteration": 2.356825113296509 + }, + { + "auxiliary_loss_clip": 0.01091264, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.01621199, + "balance_loss_mlp": 1.02796686, + "epoch": 0.22437997895686157, + "flos": 19529854640640.0, + "grad_norm": 1.7613538554927222, + "language_loss": 0.75165671, + "learning_rate": 3.523691349926797e-06, + "loss": 0.77290875, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6328125, + "step": 3732, + "time_per_iteration": 3.7950668334960938 + }, + { + "auxiliary_loss_clip": 0.01092827, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.01928651, + "balance_loss_mlp": 1.02970839, + "epoch": 0.22444010220952954, + "flos": 23695097374080.0, + "grad_norm": 1.8654483444328418, + "language_loss": 0.88087487, + "learning_rate": 3.523446585088593e-06, + "loss": 0.90217876, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.6328125, + "step": 3733, + "time_per_iteration": 2.41096830368042 + }, + { + "auxiliary_loss_clip": 0.01089176, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.01857305, + "balance_loss_mlp": 1.02695596, + "epoch": 0.2245002254621975, + "flos": 22381804471680.0, + "grad_norm": 1.4733569513093463, + "language_loss": 0.84390181, + "learning_rate": 3.5232017658825e-06, + "loss": 0.86516517, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.625, + "step": 3734, + "time_per_iteration": 2.37752366065979 + }, + { + "auxiliary_loss_clip": 0.01092387, + "auxiliary_loss_mlp": 0.01041879, + "balance_loss_clip": 1.02395022, + "balance_loss_mlp": 1.02973032, + "epoch": 0.22456034871486547, + "flos": 26941112254080.0, + "grad_norm": 2.1503147038180583, + "language_loss": 0.76111364, + "learning_rate": 3.522956892317253e-06, + "loss": 0.78245628, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 3735, + "time_per_iteration": 3.800471782684326 + }, + { + "auxiliary_loss_clip": 0.01086445, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.02121353, + "balance_loss_mlp": 1.02820969, + "epoch": 0.22462047196753343, + "flos": 28982344337280.0, + "grad_norm": 1.6132254381629896, + "language_loss": 0.84712738, + "learning_rate": 3.5227119644015922e-06, + "loss": 0.86837006, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.58203125, + "step": 3736, + "time_per_iteration": 2.451631546020508 + }, + { + "auxiliary_loss_clip": 0.01090463, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.01853168, + "balance_loss_mlp": 1.02825832, + "epoch": 0.22468059522020142, + "flos": 20010294126720.0, + "grad_norm": 1.6984357524143952, + "language_loss": 0.86714351, + "learning_rate": 3.5224669821442586e-06, + "loss": 0.88841492, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.62109375, + "step": 3737, + "time_per_iteration": 2.40842866897583 + }, + { + "auxiliary_loss_clip": 0.01091119, + "auxiliary_loss_mlp": 0.01042948, + "balance_loss_clip": 1.02404153, + "balance_loss_mlp": 1.02781999, + "epoch": 0.2247407184728694, + "flos": 29312971712640.0, + "grad_norm": 1.7691706746175149, + "language_loss": 0.7931546, + "learning_rate": 3.522221945553995e-06, + "loss": 0.81449533, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6328125, + "step": 3738, + "time_per_iteration": 2.4300787448883057 + }, + { + "auxiliary_loss_clip": 0.01090441, + "auxiliary_loss_mlp": 0.01035812, + "balance_loss_clip": 1.01818752, + "balance_loss_mlp": 1.02712739, + "epoch": 0.22480084172553735, + "flos": 22309254933120.0, + "grad_norm": 1.538876126115887, + "language_loss": 0.76541984, + "learning_rate": 3.521976854639546e-06, + "loss": 0.78668243, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6328125, + "step": 3739, + "time_per_iteration": 2.3852591514587402 + }, + { + "auxiliary_loss_clip": 0.01088489, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.01801443, + "balance_loss_mlp": 1.0266794, + "epoch": 0.22486096497820532, + "flos": 25590148128000.0, + "grad_norm": 1.7292536875838214, + "language_loss": 0.74429131, + "learning_rate": 3.5217317094096576e-06, + "loss": 0.7655344, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6171875, + "step": 3740, + "time_per_iteration": 2.423879384994507 + }, + { + "auxiliary_loss_clip": 0.01088777, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.01453042, + "balance_loss_mlp": 1.02751994, + "epoch": 0.22492108823087328, + "flos": 17638853604480.0, + "grad_norm": 1.6801050871333163, + "language_loss": 0.75905859, + "learning_rate": 3.5214865098730785e-06, + "loss": 0.78026378, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.61328125, + "step": 3741, + "time_per_iteration": 2.361703872680664 + }, + { + "auxiliary_loss_clip": 0.01088098, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.01457429, + "balance_loss_mlp": 1.02740383, + "epoch": 0.22498121148354125, + "flos": 16033721713920.0, + "grad_norm": 1.6865714909942253, + "language_loss": 0.87917626, + "learning_rate": 3.52124125603856e-06, + "loss": 0.90037847, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.60546875, + "step": 3742, + "time_per_iteration": 2.3657896518707275 + }, + { + "auxiliary_loss_clip": 0.01087726, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.02014816, + "balance_loss_mlp": 1.02707016, + "epoch": 0.22504133473620924, + "flos": 24022652549760.0, + "grad_norm": 1.645102043551272, + "language_loss": 0.81376117, + "learning_rate": 3.520995947914854e-06, + "loss": 0.83502257, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.609375, + "step": 3743, + "time_per_iteration": 2.4094078540802 + }, + { + "auxiliary_loss_clip": 0.01089297, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.01363063, + "balance_loss_mlp": 1.02649498, + "epoch": 0.2251014579888772, + "flos": 16763022437760.0, + "grad_norm": 1.9188485400483928, + "language_loss": 0.63366693, + "learning_rate": 3.520750585510715e-06, + "loss": 0.65486485, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.62890625, + "step": 3744, + "time_per_iteration": 2.356104850769043 + }, + { + "auxiliary_loss_clip": 0.0108808, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.01964712, + "balance_loss_mlp": 1.02572632, + "epoch": 0.22516158124154517, + "flos": 13990150569600.0, + "grad_norm": 2.771667982573645, + "language_loss": 0.76202762, + "learning_rate": 3.5205051688348997e-06, + "loss": 0.7832756, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.625, + "step": 3745, + "time_per_iteration": 2.3604066371917725 + }, + { + "auxiliary_loss_clip": 0.01087369, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.01923621, + "balance_loss_mlp": 1.02590382, + "epoch": 0.22522170449421314, + "flos": 14389207943040.0, + "grad_norm": 1.9527389856733632, + "language_loss": 0.80728346, + "learning_rate": 3.520259697896166e-06, + "loss": 0.82852626, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.61328125, + "step": 3746, + "time_per_iteration": 2.345534324645996 + }, + { + "auxiliary_loss_clip": 0.0108884, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.02028084, + "balance_loss_mlp": 1.02645183, + "epoch": 0.2252818277468811, + "flos": 23804410440960.0, + "grad_norm": 2.1405810057766455, + "language_loss": 0.86256254, + "learning_rate": 3.5200141727032744e-06, + "loss": 0.88383621, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.625, + "step": 3747, + "time_per_iteration": 2.391200304031372 + }, + { + "auxiliary_loss_clip": 0.01087546, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.01382697, + "balance_loss_mlp": 1.02491987, + "epoch": 0.22534195099954907, + "flos": 24716865490560.0, + "grad_norm": 1.869984033909797, + "language_loss": 0.76360589, + "learning_rate": 3.519768593264987e-06, + "loss": 0.78479242, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.625, + "step": 3748, + "time_per_iteration": 2.4164044857025146 + }, + { + "auxiliary_loss_clip": 0.01091219, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.01700997, + "balance_loss_mlp": 1.02786016, + "epoch": 0.22540207425221703, + "flos": 21031294193280.0, + "grad_norm": 1.5998245596982745, + "language_loss": 0.79927492, + "learning_rate": 3.519522959590068e-06, + "loss": 0.82052028, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.6328125, + "step": 3749, + "time_per_iteration": 2.3905251026153564 + }, + { + "auxiliary_loss_clip": 0.01084441, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.01809907, + "balance_loss_mlp": 1.02500296, + "epoch": 0.22546219750488503, + "flos": 19389363863040.0, + "grad_norm": 1.5118862951767982, + "language_loss": 0.79424167, + "learning_rate": 3.5192772716872827e-06, + "loss": 0.81543308, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59375, + "step": 3750, + "time_per_iteration": 2.358569860458374 + }, + { + "auxiliary_loss_clip": 0.01090923, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.0186379, + "balance_loss_mlp": 1.02761602, + "epoch": 0.225522320757553, + "flos": 25191439868160.0, + "grad_norm": 1.724506224620897, + "language_loss": 0.8158868, + "learning_rate": 3.5190315295653996e-06, + "loss": 0.83716011, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6328125, + "step": 3751, + "time_per_iteration": 2.4105162620544434 + }, + { + "auxiliary_loss_clip": 0.01091251, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.01785338, + "balance_loss_mlp": 1.02820849, + "epoch": 0.22558244401022096, + "flos": 17162219456640.0, + "grad_norm": 1.9508434743411043, + "language_loss": 0.83576322, + "learning_rate": 3.518785733233189e-06, + "loss": 0.85702997, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.62890625, + "step": 3752, + "time_per_iteration": 2.3523709774017334 + }, + { + "auxiliary_loss_clip": 0.01087461, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.01646626, + "balance_loss_mlp": 1.02681637, + "epoch": 0.22564256726288892, + "flos": 15230125883520.0, + "grad_norm": 1.6685247434861339, + "language_loss": 0.78270149, + "learning_rate": 3.518539882699422e-06, + "loss": 0.80390561, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.60546875, + "step": 3753, + "time_per_iteration": 2.360663890838623 + }, + { + "auxiliary_loss_clip": 0.01086084, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.01841629, + "balance_loss_mlp": 1.02530885, + "epoch": 0.2257026905155569, + "flos": 34567225574400.0, + "grad_norm": 2.207977685473492, + "language_loss": 0.7851181, + "learning_rate": 3.518293977972873e-06, + "loss": 0.80634522, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.60546875, + "step": 3754, + "time_per_iteration": 2.481651544570923 + }, + { + "auxiliary_loss_clip": 0.01088618, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.01536787, + "balance_loss_mlp": 1.02941537, + "epoch": 0.22576281376822485, + "flos": 19937395474560.0, + "grad_norm": 4.7922080843468144, + "language_loss": 0.70141995, + "learning_rate": 3.5180480190623173e-06, + "loss": 0.72264206, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.59375, + "step": 3755, + "time_per_iteration": 2.3769328594207764 + }, + { + "auxiliary_loss_clip": 0.01093101, + "auxiliary_loss_mlp": 0.01042235, + "balance_loss_clip": 1.02322125, + "balance_loss_mlp": 1.02921903, + "epoch": 0.22582293702089282, + "flos": 24601023999360.0, + "grad_norm": 2.122203370092252, + "language_loss": 0.77696723, + "learning_rate": 3.517802005976533e-06, + "loss": 0.79832059, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.640625, + "step": 3756, + "time_per_iteration": 2.3991057872772217 + }, + { + "auxiliary_loss_clip": 0.01090352, + "auxiliary_loss_mlp": 0.01037603, + "balance_loss_clip": 1.0203414, + "balance_loss_mlp": 1.0274297, + "epoch": 0.2258830602735608, + "flos": 23034436116480.0, + "grad_norm": 1.6967341676801726, + "language_loss": 0.82532358, + "learning_rate": 3.5175559387242988e-06, + "loss": 0.8466031, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.62890625, + "step": 3757, + "time_per_iteration": 2.410648822784424 + }, + { + "auxiliary_loss_clip": 0.01089515, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.01335442, + "balance_loss_mlp": 1.0271461, + "epoch": 0.22594318352622877, + "flos": 22157487786240.0, + "grad_norm": 1.7322563283534598, + "language_loss": 0.73117363, + "learning_rate": 3.517309817314397e-06, + "loss": 0.75238585, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.625, + "step": 3758, + "time_per_iteration": 2.3524677753448486 + }, + { + "auxiliary_loss_clip": 0.01092195, + "auxiliary_loss_mlp": 0.01040809, + "balance_loss_clip": 1.02203381, + "balance_loss_mlp": 1.02854705, + "epoch": 0.22600330677889674, + "flos": 20593273875840.0, + "grad_norm": 2.1786804289924566, + "language_loss": 0.77346629, + "learning_rate": 3.5170636417556113e-06, + "loss": 0.79479635, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.63671875, + "step": 3759, + "time_per_iteration": 2.3560667037963867 + }, + { + "auxiliary_loss_clip": 0.01091526, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.01812804, + "balance_loss_mlp": 1.02664661, + "epoch": 0.2260634300315647, + "flos": 35658436118400.0, + "grad_norm": 2.3520698153818103, + "language_loss": 0.78370064, + "learning_rate": 3.516817412056726e-06, + "loss": 0.80498278, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6484375, + "step": 3760, + "time_per_iteration": 2.4952027797698975 + }, + { + "auxiliary_loss_clip": 0.01017567, + "auxiliary_loss_mlp": 0.01005238, + "balance_loss_clip": 1.00330639, + "balance_loss_mlp": 1.00258195, + "epoch": 0.22612355328423267, + "flos": 72087579699840.0, + "grad_norm": 0.9459283340013351, + "language_loss": 0.60087264, + "learning_rate": 3.516571128226529e-06, + "loss": 0.62110072, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.15039062, + "step": 3761, + "time_per_iteration": 2.972327470779419 + }, + { + "auxiliary_loss_clip": 0.01091507, + "auxiliary_loss_mlp": 0.01038033, + "balance_loss_clip": 1.01953173, + "balance_loss_mlp": 1.02764082, + "epoch": 0.22618367653690064, + "flos": 22782677235840.0, + "grad_norm": 1.9425600130414724, + "language_loss": 0.7698741, + "learning_rate": 3.51632479027381e-06, + "loss": 0.79116946, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.640625, + "step": 3762, + "time_per_iteration": 2.386636972427368 + }, + { + "auxiliary_loss_clip": 0.01091936, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.01662946, + "balance_loss_mlp": 1.02746713, + "epoch": 0.22624379978956863, + "flos": 20447232192000.0, + "grad_norm": 2.0975437539423507, + "language_loss": 0.78804028, + "learning_rate": 3.5160783982073595e-06, + "loss": 0.80930692, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.64453125, + "step": 3763, + "time_per_iteration": 2.3685462474823 + }, + { + "auxiliary_loss_clip": 0.01093227, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.01939213, + "balance_loss_mlp": 1.02874374, + "epoch": 0.2263039230422366, + "flos": 17493335591040.0, + "grad_norm": 1.612111878829377, + "language_loss": 0.9122529, + "learning_rate": 3.5158319520359703e-06, + "loss": 0.93356991, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.64453125, + "step": 3764, + "time_per_iteration": 2.3483502864837646 + }, + { + "auxiliary_loss_clip": 0.01090226, + "auxiliary_loss_mlp": 0.01038333, + "balance_loss_clip": 1.02058291, + "balance_loss_mlp": 1.0291301, + "epoch": 0.22636404629490456, + "flos": 28328490794880.0, + "grad_norm": 1.8936628916638818, + "language_loss": 0.75164557, + "learning_rate": 3.515585451768438e-06, + "loss": 0.7729311, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.609375, + "step": 3765, + "time_per_iteration": 2.42386794090271 + }, + { + "auxiliary_loss_clip": 0.01089541, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.01898909, + "balance_loss_mlp": 1.02846622, + "epoch": 0.22642416954757252, + "flos": 17488308355200.0, + "grad_norm": 2.0950946375948987, + "language_loss": 0.89427751, + "learning_rate": 3.51533889741356e-06, + "loss": 0.91554701, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.609375, + "step": 3766, + "time_per_iteration": 3.7598423957824707 + }, + { + "auxiliary_loss_clip": 0.01088469, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.01396644, + "balance_loss_mlp": 1.02780557, + "epoch": 0.2264842928002405, + "flos": 24383515029120.0, + "grad_norm": 1.5057994214588577, + "language_loss": 0.74437904, + "learning_rate": 3.515092288980135e-06, + "loss": 0.76559377, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.60546875, + "step": 3767, + "time_per_iteration": 2.406935453414917 + }, + { + "auxiliary_loss_clip": 0.01088493, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.01472366, + "balance_loss_mlp": 1.02614951, + "epoch": 0.22654441605290845, + "flos": 19829443950720.0, + "grad_norm": 1.371887647497329, + "language_loss": 0.71427721, + "learning_rate": 3.5148456264769625e-06, + "loss": 0.7355063, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.625, + "step": 3768, + "time_per_iteration": 2.3927323818206787 + }, + { + "auxiliary_loss_clip": 0.01093473, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_clip": 1.02324569, + "balance_loss_mlp": 1.03060412, + "epoch": 0.22660453930557642, + "flos": 27453322944000.0, + "grad_norm": 1.9943603153548417, + "language_loss": 0.78653377, + "learning_rate": 3.5145989099128465e-06, + "loss": 0.80789483, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.62890625, + "step": 3769, + "time_per_iteration": 3.8303472995758057 + }, + { + "auxiliary_loss_clip": 0.01092521, + "auxiliary_loss_mlp": 0.01037786, + "balance_loss_clip": 1.0190109, + "balance_loss_mlp": 1.02721548, + "epoch": 0.2266646625582444, + "flos": 23987006184960.0, + "grad_norm": 1.8630213958713637, + "language_loss": 0.68793172, + "learning_rate": 3.5143521392965914e-06, + "loss": 0.70923483, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.65234375, + "step": 3770, + "time_per_iteration": 2.429872751235962 + }, + { + "auxiliary_loss_clip": 0.01091515, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.01748979, + "balance_loss_mlp": 1.02674448, + "epoch": 0.22672478581091238, + "flos": 26026946547840.0, + "grad_norm": 1.4792494688847235, + "language_loss": 0.80242562, + "learning_rate": 3.5141053146370047e-06, + "loss": 0.82370043, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6484375, + "step": 3771, + "time_per_iteration": 3.772441864013672 + }, + { + "auxiliary_loss_clip": 0.01087493, + "auxiliary_loss_mlp": 0.01044862, + "balance_loss_clip": 1.02613378, + "balance_loss_mlp": 1.02693558, + "epoch": 0.22678490906358034, + "flos": 23840685210240.0, + "grad_norm": 1.475174598611432, + "language_loss": 0.78800523, + "learning_rate": 3.513858435942893e-06, + "loss": 0.80932879, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.60546875, + "step": 3772, + "time_per_iteration": 2.4286961555480957 + }, + { + "auxiliary_loss_clip": 0.01017631, + "auxiliary_loss_mlp": 0.01005329, + "balance_loss_clip": 1.00324333, + "balance_loss_mlp": 1.00297904, + "epoch": 0.2268450323162483, + "flos": 65044409351040.0, + "grad_norm": 0.6507708474992, + "language_loss": 0.54394597, + "learning_rate": 3.5136115032230683e-06, + "loss": 0.56417555, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.14648438, + "step": 3773, + "time_per_iteration": 3.0998849868774414 + }, + { + "auxiliary_loss_clip": 0.0108632, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.01757145, + "balance_loss_mlp": 1.0259006, + "epoch": 0.22690515556891627, + "flos": 22525053246720.0, + "grad_norm": 1.9145992280613224, + "language_loss": 0.70580399, + "learning_rate": 3.5133645164863427e-06, + "loss": 0.7270208, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 3774, + "time_per_iteration": 2.4120614528656006 + }, + { + "auxiliary_loss_clip": 0.01086498, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.02208352, + "balance_loss_mlp": 1.02557778, + "epoch": 0.22696527882158424, + "flos": 18222461758080.0, + "grad_norm": 2.097856629454698, + "language_loss": 0.74524856, + "learning_rate": 3.5131174757415298e-06, + "loss": 0.76650709, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.609375, + "step": 3775, + "time_per_iteration": 3.832169532775879 + }, + { + "auxiliary_loss_clip": 0.01088387, + "auxiliary_loss_mlp": 0.01034693, + "balance_loss_clip": 1.01764596, + "balance_loss_mlp": 1.02649021, + "epoch": 0.22702540207425223, + "flos": 17018307365760.0, + "grad_norm": 1.7580701183983654, + "language_loss": 0.82575047, + "learning_rate": 3.512870380997446e-06, + "loss": 0.84698129, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.62109375, + "step": 3776, + "time_per_iteration": 2.366807460784912 + }, + { + "auxiliary_loss_clip": 0.01090168, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.01935768, + "balance_loss_mlp": 1.02692699, + "epoch": 0.2270855253269202, + "flos": 21324634369920.0, + "grad_norm": 3.686259464893189, + "language_loss": 0.83219683, + "learning_rate": 3.5126232322629114e-06, + "loss": 0.85347235, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6328125, + "step": 3777, + "time_per_iteration": 2.394103765487671 + }, + { + "auxiliary_loss_clip": 0.01090977, + "auxiliary_loss_mlp": 0.01036228, + "balance_loss_clip": 1.01834691, + "balance_loss_mlp": 1.02901387, + "epoch": 0.22714564857958816, + "flos": 23549334981120.0, + "grad_norm": 2.746636482275372, + "language_loss": 0.72792417, + "learning_rate": 3.5123760295467435e-06, + "loss": 0.74919617, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.62109375, + "step": 3778, + "time_per_iteration": 2.4154746532440186 + }, + { + "auxiliary_loss_clip": 0.01088858, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.01973939, + "balance_loss_mlp": 1.0261116, + "epoch": 0.22720577183225613, + "flos": 25988821476480.0, + "grad_norm": 3.9146729769934834, + "language_loss": 0.64117897, + "learning_rate": 3.5121287728577657e-06, + "loss": 0.66243505, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.625, + "step": 3779, + "time_per_iteration": 2.416792154312134 + }, + { + "auxiliary_loss_clip": 0.01088829, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.01594448, + "balance_loss_mlp": 1.02821076, + "epoch": 0.2272658950849241, + "flos": 20813017173120.0, + "grad_norm": 1.5792572457746858, + "language_loss": 0.70214581, + "learning_rate": 3.5118814622048012e-06, + "loss": 0.72337043, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.609375, + "step": 3780, + "time_per_iteration": 2.3873915672302246 + }, + { + "auxiliary_loss_clip": 0.01089188, + "auxiliary_loss_mlp": 0.0103907, + "balance_loss_clip": 1.02037811, + "balance_loss_mlp": 1.02833617, + "epoch": 0.22732601833759206, + "flos": 23908347158400.0, + "grad_norm": 1.6702840634077138, + "language_loss": 0.89330554, + "learning_rate": 3.5116340975966766e-06, + "loss": 0.91458809, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.609375, + "step": 3781, + "time_per_iteration": 2.38491153717041 + }, + { + "auxiliary_loss_clip": 0.01088734, + "auxiliary_loss_mlp": 0.01035524, + "balance_loss_clip": 1.01687992, + "balance_loss_mlp": 1.02617013, + "epoch": 0.22738614159026002, + "flos": 15923500951680.0, + "grad_norm": 1.9916366175952125, + "language_loss": 0.74606478, + "learning_rate": 3.5113866790422195e-06, + "loss": 0.7673074, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.625, + "step": 3782, + "time_per_iteration": 2.357717514038086 + }, + { + "auxiliary_loss_clip": 0.01087326, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.01683199, + "balance_loss_mlp": 1.02649641, + "epoch": 0.22744626484292801, + "flos": 24204410421120.0, + "grad_norm": 1.4565571588063595, + "language_loss": 0.82687902, + "learning_rate": 3.51113920655026e-06, + "loss": 0.84808826, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.609375, + "step": 3783, + "time_per_iteration": 2.4019362926483154 + }, + { + "auxiliary_loss_clip": 0.01088729, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.01919305, + "balance_loss_mlp": 1.02750099, + "epoch": 0.22750638809559598, + "flos": 24790427458560.0, + "grad_norm": 1.8598769008253748, + "language_loss": 0.76036566, + "learning_rate": 3.510891680129629e-06, + "loss": 0.78163207, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.61328125, + "step": 3784, + "time_per_iteration": 2.408007860183716 + }, + { + "auxiliary_loss_clip": 0.01085311, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.0153892, + "balance_loss_mlp": 1.02449834, + "epoch": 0.22756651134826394, + "flos": 22235413674240.0, + "grad_norm": 1.684686076380219, + "language_loss": 0.7122314, + "learning_rate": 3.51064409978916e-06, + "loss": 0.73342031, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.609375, + "step": 3785, + "time_per_iteration": 2.3783719539642334 + }, + { + "auxiliary_loss_clip": 0.0101713, + "auxiliary_loss_mlp": 0.01009888, + "balance_loss_clip": 1.00788498, + "balance_loss_mlp": 1.00312281, + "epoch": 0.2276266346009319, + "flos": 62704006894080.0, + "grad_norm": 0.8252731897482487, + "language_loss": 0.61938071, + "learning_rate": 3.5103964655376894e-06, + "loss": 0.63965088, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.140625, + "step": 3786, + "time_per_iteration": 2.984421730041504 + }, + { + "auxiliary_loss_clip": 0.01094468, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.01802897, + "balance_loss_mlp": 1.02929771, + "epoch": 0.22768675785359987, + "flos": 18613245139200.0, + "grad_norm": 2.307840027034818, + "language_loss": 0.86449611, + "learning_rate": 3.510148777384054e-06, + "loss": 0.88581491, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.65234375, + "step": 3787, + "time_per_iteration": 2.3531198501586914 + }, + { + "auxiliary_loss_clip": 0.01085635, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.01651263, + "balance_loss_mlp": 1.02698684, + "epoch": 0.22774688110626784, + "flos": 26868981651840.0, + "grad_norm": 1.231732535080988, + "language_loss": 0.72669089, + "learning_rate": 3.5099010353370934e-06, + "loss": 0.74787927, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5859375, + "step": 3788, + "time_per_iteration": 2.4292914867401123 + }, + { + "auxiliary_loss_clip": 0.01087532, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.01235449, + "balance_loss_mlp": 1.02739811, + "epoch": 0.2278070043589358, + "flos": 15552863291520.0, + "grad_norm": 2.5254162728632545, + "language_loss": 0.67666602, + "learning_rate": 3.5096532394056487e-06, + "loss": 0.69783568, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.6015625, + "step": 3789, + "time_per_iteration": 2.3270905017852783 + }, + { + "auxiliary_loss_clip": 0.01087963, + "auxiliary_loss_mlp": 0.01039137, + "balance_loss_clip": 1.02088594, + "balance_loss_mlp": 1.02717233, + "epoch": 0.2278671276116038, + "flos": 22415775091200.0, + "grad_norm": 1.8408681320616835, + "language_loss": 0.75489384, + "learning_rate": 3.5094053895985632e-06, + "loss": 0.77616483, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.609375, + "step": 3790, + "time_per_iteration": 2.4211106300354004 + }, + { + "auxiliary_loss_clip": 0.01085204, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.01396954, + "balance_loss_mlp": 1.02503514, + "epoch": 0.22792725086427176, + "flos": 20630316695040.0, + "grad_norm": 1.87915916690466, + "language_loss": 0.90536761, + "learning_rate": 3.5091574859246818e-06, + "loss": 0.92653483, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6015625, + "step": 3791, + "time_per_iteration": 2.3681387901306152 + }, + { + "auxiliary_loss_clip": 0.01087979, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.01642704, + "balance_loss_mlp": 1.02538478, + "epoch": 0.22798737411693973, + "flos": 31427661029760.0, + "grad_norm": 2.0857950402727568, + "language_loss": 0.82091236, + "learning_rate": 3.508909528392852e-06, + "loss": 0.84214282, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.625, + "step": 3792, + "time_per_iteration": 2.462547779083252 + }, + { + "auxiliary_loss_clip": 0.01016086, + "auxiliary_loss_mlp": 0.01011285, + "balance_loss_clip": 1.00915098, + "balance_loss_mlp": 1.00226247, + "epoch": 0.2280474973696077, + "flos": 52394121095040.0, + "grad_norm": 1.1432011257847694, + "language_loss": 0.65706909, + "learning_rate": 3.5086615170119224e-06, + "loss": 0.67734277, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.13867188, + "step": 3793, + "time_per_iteration": 2.7637650966644287 + }, + { + "auxiliary_loss_clip": 0.0109298, + "auxiliary_loss_mlp": 0.01042921, + "balance_loss_clip": 1.02335882, + "balance_loss_mlp": 1.02766323, + "epoch": 0.22810762062227566, + "flos": 26394861121920.0, + "grad_norm": 2.602653448535137, + "language_loss": 0.76358742, + "learning_rate": 3.508413451790744e-06, + "loss": 0.78494644, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.65234375, + "step": 3794, + "time_per_iteration": 2.426140069961548 + }, + { + "auxiliary_loss_clip": 0.01088427, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.01600027, + "balance_loss_mlp": 1.02610791, + "epoch": 0.22816774387494362, + "flos": 25629076160640.0, + "grad_norm": 1.725615627529767, + "language_loss": 0.80870014, + "learning_rate": 3.50816533273817e-06, + "loss": 0.82992846, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.625, + "step": 3795, + "time_per_iteration": 2.411367654800415 + }, + { + "auxiliary_loss_clip": 0.01086954, + "auxiliary_loss_mlp": 0.01039503, + "balance_loss_clip": 1.02106178, + "balance_loss_mlp": 1.02629185, + "epoch": 0.22822786712761162, + "flos": 22450618494720.0, + "grad_norm": 1.6378856943664284, + "language_loss": 0.76962423, + "learning_rate": 3.507917159863054e-06, + "loss": 0.79088885, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.60546875, + "step": 3796, + "time_per_iteration": 2.4178855419158936 + }, + { + "auxiliary_loss_clip": 0.01086449, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.01664233, + "balance_loss_mlp": 1.02601695, + "epoch": 0.22828799038027958, + "flos": 12201759619200.0, + "grad_norm": 2.2935202414850733, + "language_loss": 0.95839965, + "learning_rate": 3.507668933174254e-06, + "loss": 0.97959107, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.60546875, + "step": 3797, + "time_per_iteration": 2.3302457332611084 + }, + { + "auxiliary_loss_clip": 0.01088704, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.0200386, + "balance_loss_mlp": 1.02784896, + "epoch": 0.22834811363294755, + "flos": 22084763690880.0, + "grad_norm": 1.497063928582699, + "language_loss": 0.81556934, + "learning_rate": 3.5074206526806274e-06, + "loss": 0.83682853, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.609375, + "step": 3798, + "time_per_iteration": 2.421319007873535 + }, + { + "auxiliary_loss_clip": 0.01086642, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.01555872, + "balance_loss_mlp": 1.02524805, + "epoch": 0.2284082368856155, + "flos": 24859555683840.0, + "grad_norm": 1.9221700156500818, + "language_loss": 0.79203105, + "learning_rate": 3.507172318391036e-06, + "loss": 0.8132441, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.61328125, + "step": 3799, + "time_per_iteration": 2.40585994720459 + }, + { + "auxiliary_loss_clip": 0.01087693, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.01581931, + "balance_loss_mlp": 1.02675533, + "epoch": 0.22846836013828348, + "flos": 23291815726080.0, + "grad_norm": 1.4823114103646389, + "language_loss": 0.74984872, + "learning_rate": 3.506923930314341e-06, + "loss": 0.77104974, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.609375, + "step": 3800, + "time_per_iteration": 2.3957200050354004 + }, + { + "auxiliary_loss_clip": 0.01088192, + "auxiliary_loss_mlp": 0.01038338, + "balance_loss_clip": 1.0217557, + "balance_loss_mlp": 1.0274626, + "epoch": 0.22852848339095144, + "flos": 27415093138560.0, + "grad_norm": 1.7382232068920265, + "language_loss": 0.64025426, + "learning_rate": 3.5066754884594072e-06, + "loss": 0.66151953, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.609375, + "step": 3801, + "time_per_iteration": 2.4217402935028076 + }, + { + "auxiliary_loss_clip": 0.01084292, + "auxiliary_loss_mlp": 0.01028622, + "balance_loss_clip": 1.0131011, + "balance_loss_mlp": 1.02608454, + "epoch": 0.2285886066436194, + "flos": 26320007433600.0, + "grad_norm": 1.5845091426353741, + "language_loss": 0.81693745, + "learning_rate": 3.5064269928351005e-06, + "loss": 0.83806658, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.58203125, + "step": 3802, + "time_per_iteration": 2.4141721725463867 + }, + { + "auxiliary_loss_clip": 0.01089418, + "auxiliary_loss_mlp": 0.01041514, + "balance_loss_clip": 1.02356124, + "balance_loss_mlp": 1.02765429, + "epoch": 0.2286487298962874, + "flos": 29715171108480.0, + "grad_norm": 1.7399230486354544, + "language_loss": 0.78634125, + "learning_rate": 3.5061784434502897e-06, + "loss": 0.80765057, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6171875, + "step": 3803, + "time_per_iteration": 2.431398391723633 + }, + { + "auxiliary_loss_clip": 0.01086104, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.0178566, + "balance_loss_mlp": 1.02444887, + "epoch": 0.22870885314895537, + "flos": 21286160184960.0, + "grad_norm": 1.7096143204037866, + "language_loss": 0.85129672, + "learning_rate": 3.505929840313845e-06, + "loss": 0.87251282, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.6171875, + "step": 3804, + "time_per_iteration": 2.4004709720611572 + }, + { + "auxiliary_loss_clip": 0.0109045, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.01934481, + "balance_loss_mlp": 1.02662039, + "epoch": 0.22876897640162333, + "flos": 14938566186240.0, + "grad_norm": 1.8684811704061102, + "language_loss": 0.76703346, + "learning_rate": 3.5056811834346382e-06, + "loss": 0.78830791, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.640625, + "step": 3805, + "time_per_iteration": 2.343780040740967 + }, + { + "auxiliary_loss_clip": 0.01089323, + "auxiliary_loss_mlp": 0.0103756, + "balance_loss_clip": 1.01934469, + "balance_loss_mlp": 1.02592516, + "epoch": 0.2288290996542913, + "flos": 18112939223040.0, + "grad_norm": 2.326606822596895, + "language_loss": 0.78419352, + "learning_rate": 3.5054324728215423e-06, + "loss": 0.80546236, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6328125, + "step": 3806, + "time_per_iteration": 3.7545509338378906 + }, + { + "auxiliary_loss_clip": 0.01088861, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.02324224, + "balance_loss_mlp": 1.02702498, + "epoch": 0.22888922290695926, + "flos": 39853983778560.0, + "grad_norm": 3.2673323541602923, + "language_loss": 0.70302856, + "learning_rate": 3.505183708483434e-06, + "loss": 0.72431827, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.6171875, + "step": 3807, + "time_per_iteration": 2.5256102085113525 + }, + { + "auxiliary_loss_clip": 0.01094187, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.02417517, + "balance_loss_mlp": 1.02921653, + "epoch": 0.22894934615962723, + "flos": 23402664892800.0, + "grad_norm": 2.0048278010572735, + "language_loss": 0.65318346, + "learning_rate": 3.504934890429191e-06, + "loss": 0.67455387, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6484375, + "step": 3808, + "time_per_iteration": 2.4050979614257812 + }, + { + "auxiliary_loss_clip": 0.01089716, + "auxiliary_loss_mlp": 0.01041493, + "balance_loss_clip": 1.02494645, + "balance_loss_mlp": 1.02788758, + "epoch": 0.22900946941229522, + "flos": 18842030478720.0, + "grad_norm": 1.9090335901439184, + "language_loss": 0.75185037, + "learning_rate": 3.5046860186676936e-06, + "loss": 0.77316242, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.6171875, + "step": 3809, + "time_per_iteration": 3.7812273502349854 + }, + { + "auxiliary_loss_clip": 0.01087151, + "auxiliary_loss_mlp": 0.01036026, + "balance_loss_clip": 1.01870501, + "balance_loss_mlp": 1.02752638, + "epoch": 0.22906959266496318, + "flos": 22928299983360.0, + "grad_norm": 1.434623301717465, + "language_loss": 0.81609118, + "learning_rate": 3.504437093207822e-06, + "loss": 0.83732295, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59375, + "step": 3810, + "time_per_iteration": 3.8533847332000732 + }, + { + "auxiliary_loss_clip": 0.01087667, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.01723254, + "balance_loss_mlp": 1.02792406, + "epoch": 0.22912971591763115, + "flos": 19353508030080.0, + "grad_norm": 2.011777331327998, + "language_loss": 0.7841962, + "learning_rate": 3.5041881140584602e-06, + "loss": 0.80540693, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.59765625, + "step": 3811, + "time_per_iteration": 2.3599648475646973 + }, + { + "auxiliary_loss_clip": 0.01087382, + "auxiliary_loss_mlp": 0.01036212, + "balance_loss_clip": 1.01943886, + "balance_loss_mlp": 1.02636838, + "epoch": 0.22918983917029911, + "flos": 19932647529600.0, + "grad_norm": 1.8143441437452754, + "language_loss": 0.83240467, + "learning_rate": 3.5039390812284937e-06, + "loss": 0.85364068, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.609375, + "step": 3812, + "time_per_iteration": 2.364509105682373 + }, + { + "auxiliary_loss_clip": 0.01092594, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.0198257, + "balance_loss_mlp": 1.02847826, + "epoch": 0.22924996242296708, + "flos": 16689949228800.0, + "grad_norm": 2.681985172712628, + "language_loss": 0.83799887, + "learning_rate": 3.5036899947268105e-06, + "loss": 0.85931039, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.640625, + "step": 3813, + "time_per_iteration": 2.3506107330322266 + }, + { + "auxiliary_loss_clip": 0.01087346, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.01410723, + "balance_loss_mlp": 1.02632737, + "epoch": 0.22931008567563504, + "flos": 33034782867840.0, + "grad_norm": 1.673959294344394, + "language_loss": 0.70345366, + "learning_rate": 3.5034408545623e-06, + "loss": 0.72464311, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.609375, + "step": 3814, + "time_per_iteration": 3.8504204750061035 + }, + { + "auxiliary_loss_clip": 0.01084833, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.01902843, + "balance_loss_mlp": 1.02492142, + "epoch": 0.229370208928303, + "flos": 23329591683840.0, + "grad_norm": 2.337415803708085, + "language_loss": 0.66801226, + "learning_rate": 3.5031916607438516e-06, + "loss": 0.68921727, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59765625, + "step": 3815, + "time_per_iteration": 2.3961167335510254 + }, + { + "auxiliary_loss_clip": 0.01089226, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.02741861, + "balance_loss_mlp": 1.02737093, + "epoch": 0.229430332180971, + "flos": 28616070597120.0, + "grad_norm": 1.7927608595653053, + "language_loss": 0.83907104, + "learning_rate": 3.50294241328036e-06, + "loss": 0.86041558, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6171875, + "step": 3816, + "time_per_iteration": 2.436598539352417 + }, + { + "auxiliary_loss_clip": 0.01088234, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.01387787, + "balance_loss_mlp": 1.02649224, + "epoch": 0.22949045543363897, + "flos": 17237247701760.0, + "grad_norm": 2.5401060833844453, + "language_loss": 0.8700307, + "learning_rate": 3.5026931121807195e-06, + "loss": 0.89123505, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6171875, + "step": 3817, + "time_per_iteration": 2.340454578399658 + }, + { + "auxiliary_loss_clip": 0.01088736, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.0173552, + "balance_loss_mlp": 1.02608013, + "epoch": 0.22955057868630693, + "flos": 27488236170240.0, + "grad_norm": 1.7309962909696994, + "language_loss": 0.75099266, + "learning_rate": 3.5024437574538275e-06, + "loss": 0.77223289, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 3818, + "time_per_iteration": 2.4216606616973877 + }, + { + "auxiliary_loss_clip": 0.01088797, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.013134, + "balance_loss_mlp": 1.02624011, + "epoch": 0.2296107019389749, + "flos": 23475319165440.0, + "grad_norm": 1.5652278066751535, + "language_loss": 0.7429148, + "learning_rate": 3.5021943491085823e-06, + "loss": 0.76410252, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.625, + "step": 3819, + "time_per_iteration": 2.4085023403167725 + }, + { + "auxiliary_loss_clip": 0.01088546, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.01664972, + "balance_loss_mlp": 1.02787375, + "epoch": 0.22967082519164286, + "flos": 31283818761600.0, + "grad_norm": 1.8277362002645299, + "language_loss": 0.81004488, + "learning_rate": 3.5019448871538853e-06, + "loss": 0.83127153, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.609375, + "step": 3820, + "time_per_iteration": 2.443455219268799 + }, + { + "auxiliary_loss_clip": 0.01090382, + "auxiliary_loss_mlp": 0.01036679, + "balance_loss_clip": 1.01803446, + "balance_loss_mlp": 1.02673411, + "epoch": 0.22973094844431083, + "flos": 14642188721280.0, + "grad_norm": 1.8653661172565414, + "language_loss": 0.69101381, + "learning_rate": 3.501695371598638e-06, + "loss": 0.71228445, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.63671875, + "step": 3821, + "time_per_iteration": 2.3479127883911133 + }, + { + "auxiliary_loss_clip": 0.01087111, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.01444376, + "balance_loss_mlp": 1.02677619, + "epoch": 0.2297910716969788, + "flos": 22822652609280.0, + "grad_norm": 1.6122764700202343, + "language_loss": 0.82934833, + "learning_rate": 3.501445802451746e-06, + "loss": 0.85054123, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 3822, + "time_per_iteration": 2.4323556423187256 + }, + { + "auxiliary_loss_clip": 0.01085935, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.01349413, + "balance_loss_mlp": 1.02483392, + "epoch": 0.2298511949496468, + "flos": 23037927252480.0, + "grad_norm": 1.6174899692367815, + "language_loss": 0.6632266, + "learning_rate": 3.5011961797221158e-06, + "loss": 0.68439901, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.61328125, + "step": 3823, + "time_per_iteration": 2.385617971420288 + }, + { + "auxiliary_loss_clip": 0.01018244, + "auxiliary_loss_mlp": 0.0100774, + "balance_loss_clip": 1.00547528, + "balance_loss_mlp": 1.00400758, + "epoch": 0.22991131820231475, + "flos": 66887684691840.0, + "grad_norm": 0.8083559781092068, + "language_loss": 0.56830817, + "learning_rate": 3.5009465034186554e-06, + "loss": 0.58856803, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.14257812, + "step": 3824, + "time_per_iteration": 3.1239378452301025 + }, + { + "auxiliary_loss_clip": 0.01084447, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.01605201, + "balance_loss_mlp": 1.02550197, + "epoch": 0.22997144145498272, + "flos": 17886492944640.0, + "grad_norm": 3.065801308947649, + "language_loss": 0.73238909, + "learning_rate": 3.500696773550275e-06, + "loss": 0.75357115, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.58984375, + "step": 3825, + "time_per_iteration": 2.321202516555786 + }, + { + "auxiliary_loss_clip": 0.01091978, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.0186305, + "balance_loss_mlp": 1.0295465, + "epoch": 0.23003156470765068, + "flos": 24675807864960.0, + "grad_norm": 1.7841718054769142, + "language_loss": 0.8789047, + "learning_rate": 3.5004469901258873e-06, + "loss": 0.90020657, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.625, + "step": 3826, + "time_per_iteration": 2.4327785968780518 + }, + { + "auxiliary_loss_clip": 0.01089859, + "auxiliary_loss_mlp": 0.01041345, + "balance_loss_clip": 1.02131748, + "balance_loss_mlp": 1.02550173, + "epoch": 0.23009168796031865, + "flos": 15813245278080.0, + "grad_norm": 2.330247921206323, + "language_loss": 0.81820428, + "learning_rate": 3.5001971531544053e-06, + "loss": 0.83951628, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.64453125, + "step": 3827, + "time_per_iteration": 2.365079164505005 + }, + { + "auxiliary_loss_clip": 0.01085776, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.01772785, + "balance_loss_mlp": 1.02626896, + "epoch": 0.2301518112129866, + "flos": 16212023360640.0, + "grad_norm": 1.9029908701929552, + "language_loss": 0.86464047, + "learning_rate": 3.499947262644747e-06, + "loss": 0.88585246, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.59375, + "step": 3828, + "time_per_iteration": 2.3360729217529297 + }, + { + "auxiliary_loss_clip": 0.01088078, + "auxiliary_loss_mlp": 0.01039298, + "balance_loss_clip": 1.02073741, + "balance_loss_mlp": 1.02658987, + "epoch": 0.2302119344656546, + "flos": 20594391039360.0, + "grad_norm": 1.8951397423855496, + "language_loss": 0.70642465, + "learning_rate": 3.4996973186058284e-06, + "loss": 0.72769845, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6171875, + "step": 3829, + "time_per_iteration": 2.386470079421997 + }, + { + "auxiliary_loss_clip": 0.0108747, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.01601517, + "balance_loss_mlp": 1.02867401, + "epoch": 0.23027205771832257, + "flos": 26795698974720.0, + "grad_norm": 1.4732655685815619, + "language_loss": 0.84161735, + "learning_rate": 3.4994473210465706e-06, + "loss": 0.86281681, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5859375, + "step": 3830, + "time_per_iteration": 2.4395828247070312 + }, + { + "auxiliary_loss_clip": 0.01087966, + "auxiliary_loss_mlp": 0.010417, + "balance_loss_clip": 1.02379513, + "balance_loss_mlp": 1.0273571, + "epoch": 0.23033218097099054, + "flos": 43871439260160.0, + "grad_norm": 1.6509844910798381, + "language_loss": 0.67261213, + "learning_rate": 3.499197269975895e-06, + "loss": 0.69390881, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.60546875, + "step": 3831, + "time_per_iteration": 2.5767300128936768 + }, + { + "auxiliary_loss_clip": 0.01089142, + "auxiliary_loss_mlp": 0.01037978, + "balance_loss_clip": 1.01898837, + "balance_loss_mlp": 1.02638268, + "epoch": 0.2303923042236585, + "flos": 26066468073600.0, + "grad_norm": 2.0001274322563645, + "language_loss": 0.74681842, + "learning_rate": 3.4989471654027247e-06, + "loss": 0.76808959, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.62890625, + "step": 3832, + "time_per_iteration": 2.4149653911590576 + }, + { + "auxiliary_loss_clip": 0.01087654, + "auxiliary_loss_mlp": 0.01038064, + "balance_loss_clip": 1.01906228, + "balance_loss_mlp": 1.02671075, + "epoch": 0.23045242747632647, + "flos": 18295395321600.0, + "grad_norm": 1.6620348391588846, + "language_loss": 0.87407881, + "learning_rate": 3.4986970073359865e-06, + "loss": 0.89533603, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.609375, + "step": 3833, + "time_per_iteration": 2.358140468597412 + }, + { + "auxiliary_loss_clip": 0.01087184, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.02097583, + "balance_loss_mlp": 1.02623963, + "epoch": 0.23051255072899443, + "flos": 25519344157440.0, + "grad_norm": 1.7262268838511203, + "language_loss": 0.74829298, + "learning_rate": 3.498446795784607e-06, + "loss": 0.76955724, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.609375, + "step": 3834, + "time_per_iteration": 2.412421941757202 + }, + { + "auxiliary_loss_clip": 0.01087917, + "auxiliary_loss_mlp": 0.0104055, + "balance_loss_clip": 1.02178693, + "balance_loss_mlp": 1.02836227, + "epoch": 0.2305726739816624, + "flos": 21214134316800.0, + "grad_norm": 1.6548479875892168, + "language_loss": 0.85558653, + "learning_rate": 3.4981965307575153e-06, + "loss": 0.87687123, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.59375, + "step": 3835, + "time_per_iteration": 2.3953325748443604 + }, + { + "auxiliary_loss_clip": 0.01094482, + "auxiliary_loss_mlp": 0.01041777, + "balance_loss_clip": 1.02287018, + "balance_loss_mlp": 1.02779126, + "epoch": 0.2306327972343304, + "flos": 23330010620160.0, + "grad_norm": 2.0091674081892386, + "language_loss": 0.81818467, + "learning_rate": 3.4979462122636436e-06, + "loss": 0.83954728, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.66796875, + "step": 3836, + "time_per_iteration": 2.389854907989502 + }, + { + "auxiliary_loss_clip": 0.0109041, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.01990986, + "balance_loss_mlp": 1.02915406, + "epoch": 0.23069292048699835, + "flos": 20665718680320.0, + "grad_norm": 3.6008190810214966, + "language_loss": 0.79711235, + "learning_rate": 3.497695840311925e-06, + "loss": 0.81840348, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.61328125, + "step": 3837, + "time_per_iteration": 2.3949389457702637 + }, + { + "auxiliary_loss_clip": 0.01018597, + "auxiliary_loss_mlp": 0.01004495, + "balance_loss_clip": 1.00242114, + "balance_loss_mlp": 1.00478077, + "epoch": 0.23075304373966632, + "flos": 70451828680320.0, + "grad_norm": 0.9064880113181014, + "language_loss": 0.65390468, + "learning_rate": 3.4974454149112943e-06, + "loss": 0.67413557, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.13867188, + "step": 3838, + "time_per_iteration": 2.934767246246338 + }, + { + "auxiliary_loss_clip": 0.01085654, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.01905131, + "balance_loss_mlp": 1.02633429, + "epoch": 0.23081316699233428, + "flos": 16617050576640.0, + "grad_norm": 1.8372022434081068, + "language_loss": 0.75830615, + "learning_rate": 3.4971949360706887e-06, + "loss": 0.77953029, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.59375, + "step": 3839, + "time_per_iteration": 2.355003833770752 + }, + { + "auxiliary_loss_clip": 0.01093763, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.01964688, + "balance_loss_mlp": 1.03051066, + "epoch": 0.23087329024500225, + "flos": 13297229349120.0, + "grad_norm": 1.7005344560926294, + "language_loss": 0.68687391, + "learning_rate": 3.4969444037990466e-06, + "loss": 0.70819044, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6328125, + "step": 3840, + "time_per_iteration": 2.3609533309936523 + }, + { + "auxiliary_loss_clip": 0.01090251, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.01127279, + "balance_loss_mlp": 1.02743244, + "epoch": 0.23093341349767021, + "flos": 17784755642880.0, + "grad_norm": 1.9811088588370027, + "language_loss": 0.79281104, + "learning_rate": 3.49669381810531e-06, + "loss": 0.81402135, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.62890625, + "step": 3841, + "time_per_iteration": 2.3632378578186035 + }, + { + "auxiliary_loss_clip": 0.01088904, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.01825762, + "balance_loss_mlp": 1.02759445, + "epoch": 0.23099353675033818, + "flos": 23986936362240.0, + "grad_norm": 1.742101409675352, + "language_loss": 0.82951319, + "learning_rate": 3.4964431789984204e-06, + "loss": 0.85074925, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.609375, + "step": 3842, + "time_per_iteration": 2.3822836875915527 + }, + { + "auxiliary_loss_clip": 0.01086767, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.02311015, + "balance_loss_mlp": 1.0251447, + "epoch": 0.23105366000300617, + "flos": 35993601970560.0, + "grad_norm": 1.4223679128032825, + "language_loss": 0.66324598, + "learning_rate": 3.496192486487323e-06, + "loss": 0.68453968, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6171875, + "step": 3843, + "time_per_iteration": 2.578294038772583 + }, + { + "auxiliary_loss_clip": 0.01087501, + "auxiliary_loss_mlp": 0.0103935, + "balance_loss_clip": 1.02137303, + "balance_loss_mlp": 1.02754712, + "epoch": 0.23111378325567414, + "flos": 31244087767680.0, + "grad_norm": 1.8211004731245037, + "language_loss": 0.77522135, + "learning_rate": 3.495941740580965e-06, + "loss": 0.79648989, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6015625, + "step": 3844, + "time_per_iteration": 2.45005202293396 + }, + { + "auxiliary_loss_clip": 0.01089918, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.01710296, + "balance_loss_mlp": 1.02741623, + "epoch": 0.2311739065083421, + "flos": 19207221966720.0, + "grad_norm": 1.6325071452183224, + "language_loss": 0.77953732, + "learning_rate": 3.495690941288294e-06, + "loss": 0.80079699, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.625, + "step": 3845, + "time_per_iteration": 3.743535041809082 + }, + { + "auxiliary_loss_clip": 0.01083719, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.01330805, + "balance_loss_mlp": 1.02568829, + "epoch": 0.23123402976101007, + "flos": 23359268206080.0, + "grad_norm": 2.5102203572769435, + "language_loss": 0.76146102, + "learning_rate": 3.495440088618261e-06, + "loss": 0.78260636, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.578125, + "step": 3846, + "time_per_iteration": 2.3775031566619873 + }, + { + "auxiliary_loss_clip": 0.01086575, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.01675582, + "balance_loss_mlp": 1.02657318, + "epoch": 0.23129415301367803, + "flos": 13734516528000.0, + "grad_norm": 1.716294172686985, + "language_loss": 0.69583297, + "learning_rate": 3.4951891825798177e-06, + "loss": 0.71705341, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6015625, + "step": 3847, + "time_per_iteration": 2.3594911098480225 + }, + { + "auxiliary_loss_clip": 0.01016762, + "auxiliary_loss_mlp": 0.0100073, + "balance_loss_clip": 0.99875158, + "balance_loss_mlp": 1.00283837, + "epoch": 0.231354276266346, + "flos": 69733699591680.0, + "grad_norm": 0.7874101911604106, + "language_loss": 0.61031818, + "learning_rate": 3.4949382231819186e-06, + "loss": 0.6304931, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.13867188, + "step": 3848, + "time_per_iteration": 2.929974317550659 + }, + { + "auxiliary_loss_clip": 0.01085614, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.01906395, + "balance_loss_mlp": 1.02494383, + "epoch": 0.231414399519014, + "flos": 18835118029440.0, + "grad_norm": 2.3678579967272486, + "language_loss": 0.78285599, + "learning_rate": 3.4946872104335192e-06, + "loss": 0.80408239, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.609375, + "step": 3849, + "time_per_iteration": 3.8106307983398438 + }, + { + "auxiliary_loss_clip": 0.01087009, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.01940298, + "balance_loss_mlp": 1.02645183, + "epoch": 0.23147452277168196, + "flos": 36134057836800.0, + "grad_norm": 1.8836628050263342, + "language_loss": 0.71054161, + "learning_rate": 3.4944361443435788e-06, + "loss": 0.73179066, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.60546875, + "step": 3850, + "time_per_iteration": 3.8503291606903076 + }, + { + "auxiliary_loss_clip": 0.01085889, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.01435411, + "balance_loss_mlp": 1.02487469, + "epoch": 0.23153464602434992, + "flos": 20811900009600.0, + "grad_norm": 1.6771053465701393, + "language_loss": 0.76121211, + "learning_rate": 3.4941850249210562e-06, + "loss": 0.7823965, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.609375, + "step": 3851, + "time_per_iteration": 2.372323513031006 + }, + { + "auxiliary_loss_clip": 0.01085647, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.01492655, + "balance_loss_mlp": 1.02648795, + "epoch": 0.2315947692770179, + "flos": 19938198435840.0, + "grad_norm": 1.6970532262689262, + "language_loss": 0.69931245, + "learning_rate": 3.4939338521749137e-06, + "loss": 0.72049105, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.59375, + "step": 3852, + "time_per_iteration": 2.37731671333313 + }, + { + "auxiliary_loss_clip": 0.01088036, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.01686323, + "balance_loss_mlp": 1.02657413, + "epoch": 0.23165489252968585, + "flos": 12854845111680.0, + "grad_norm": 2.373656272034946, + "language_loss": 0.67155361, + "learning_rate": 3.493682626114115e-06, + "loss": 0.69277483, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6171875, + "step": 3853, + "time_per_iteration": 2.344114303588867 + }, + { + "auxiliary_loss_clip": 0.01087381, + "auxiliary_loss_mlp": 0.01034954, + "balance_loss_clip": 1.01583314, + "balance_loss_mlp": 1.02618957, + "epoch": 0.23171501578235382, + "flos": 30626962842240.0, + "grad_norm": 1.6026857997278943, + "language_loss": 0.79938722, + "learning_rate": 3.4934313467476255e-06, + "loss": 0.82061064, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.61328125, + "step": 3854, + "time_per_iteration": 3.821682929992676 + }, + { + "auxiliary_loss_clip": 0.01090967, + "auxiliary_loss_mlp": 0.01037838, + "balance_loss_clip": 1.01839483, + "balance_loss_mlp": 1.02631581, + "epoch": 0.23177513903502178, + "flos": 23841627816960.0, + "grad_norm": 2.244247077098742, + "language_loss": 0.65230674, + "learning_rate": 3.4931800140844123e-06, + "loss": 0.67359477, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6484375, + "step": 3855, + "time_per_iteration": 2.4144818782806396 + }, + { + "auxiliary_loss_clip": 0.01087764, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.02582347, + "epoch": 0.23183526228768978, + "flos": 29568989779200.0, + "grad_norm": 2.3325060406619604, + "language_loss": 0.70744312, + "learning_rate": 3.4929286281334455e-06, + "loss": 0.72872829, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6171875, + "step": 3856, + "time_per_iteration": 2.4286093711853027 + }, + { + "auxiliary_loss_clip": 0.01086147, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.02171731, + "balance_loss_mlp": 1.02714205, + "epoch": 0.23189538554035774, + "flos": 34457284103040.0, + "grad_norm": 1.524265996803013, + "language_loss": 0.7678349, + "learning_rate": 3.4926771889036964e-06, + "loss": 0.78906745, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.58984375, + "step": 3857, + "time_per_iteration": 2.5084476470947266 + }, + { + "auxiliary_loss_clip": 0.01091045, + "auxiliary_loss_mlp": 0.01043371, + "balance_loss_clip": 1.02281952, + "balance_loss_mlp": 1.02686715, + "epoch": 0.2319555087930257, + "flos": 18002858106240.0, + "grad_norm": 2.153866176548299, + "language_loss": 0.91147965, + "learning_rate": 3.4924256964041387e-06, + "loss": 0.93282378, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.640625, + "step": 3858, + "time_per_iteration": 2.3367738723754883 + }, + { + "auxiliary_loss_clip": 0.01085803, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.0198946, + "balance_loss_mlp": 1.02741051, + "epoch": 0.23201563204569367, + "flos": 23142876399360.0, + "grad_norm": 2.0352397765961436, + "language_loss": 0.76528925, + "learning_rate": 3.492174150643746e-06, + "loss": 0.78650975, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5859375, + "step": 3859, + "time_per_iteration": 2.3958740234375 + }, + { + "auxiliary_loss_clip": 0.01083474, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.01096749, + "balance_loss_mlp": 1.02491474, + "epoch": 0.23207575529836164, + "flos": 20666940577920.0, + "grad_norm": 1.711314371365438, + "language_loss": 0.84139782, + "learning_rate": 3.4919225516314967e-06, + "loss": 0.86251998, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.5859375, + "step": 3860, + "time_per_iteration": 2.3686769008636475 + }, + { + "auxiliary_loss_clip": 0.0108558, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.01955855, + "balance_loss_mlp": 1.02602863, + "epoch": 0.2321358785510296, + "flos": 16471253272320.0, + "grad_norm": 2.2656897178590616, + "language_loss": 0.83054185, + "learning_rate": 3.491670899376369e-06, + "loss": 0.85176235, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59375, + "step": 3861, + "time_per_iteration": 2.350154399871826 + }, + { + "auxiliary_loss_clip": 0.01085996, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.01777101, + "balance_loss_mlp": 1.02532959, + "epoch": 0.2321960018036976, + "flos": 21615251460480.0, + "grad_norm": 1.4941762555325888, + "language_loss": 0.85749722, + "learning_rate": 3.491419193887344e-06, + "loss": 0.87871373, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.609375, + "step": 3862, + "time_per_iteration": 2.37319278717041 + }, + { + "auxiliary_loss_clip": 0.01084899, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.01992798, + "balance_loss_mlp": 1.02606058, + "epoch": 0.23225612505636556, + "flos": 22270431634560.0, + "grad_norm": 1.3822871416728404, + "language_loss": 0.74682367, + "learning_rate": 3.4911674351734036e-06, + "loss": 0.76803184, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.58984375, + "step": 3863, + "time_per_iteration": 2.3927905559539795 + }, + { + "auxiliary_loss_clip": 0.01086858, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.01740432, + "balance_loss_mlp": 1.02881217, + "epoch": 0.23231624830903352, + "flos": 17051475024000.0, + "grad_norm": 1.7194030568120946, + "language_loss": 0.7429074, + "learning_rate": 3.490915623243534e-06, + "loss": 0.76412261, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.58203125, + "step": 3864, + "time_per_iteration": 2.334650754928589 + }, + { + "auxiliary_loss_clip": 0.01085904, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.01183629, + "balance_loss_mlp": 1.02545595, + "epoch": 0.2323763715617015, + "flos": 34638657949440.0, + "grad_norm": 1.6110785405592347, + "language_loss": 0.70623219, + "learning_rate": 3.490663758106721e-06, + "loss": 0.72738063, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.60546875, + "step": 3865, + "time_per_iteration": 2.5042667388916016 + }, + { + "auxiliary_loss_clip": 0.01093552, + "auxiliary_loss_mlp": 0.01043105, + "balance_loss_clip": 1.02183831, + "balance_loss_mlp": 1.02737474, + "epoch": 0.23243649481436945, + "flos": 25550661513600.0, + "grad_norm": 1.8160159251501384, + "language_loss": 0.95263124, + "learning_rate": 3.4904118397719527e-06, + "loss": 0.97399777, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.66015625, + "step": 3866, + "time_per_iteration": 2.4118688106536865 + }, + { + "auxiliary_loss_clip": 0.01084664, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.01782107, + "balance_loss_mlp": 1.02631032, + "epoch": 0.23249661806703742, + "flos": 20482494531840.0, + "grad_norm": 2.6971448797499766, + "language_loss": 0.7372874, + "learning_rate": 3.4901598682482198e-06, + "loss": 0.75848639, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.58203125, + "step": 3867, + "time_per_iteration": 2.367206573486328 + }, + { + "auxiliary_loss_clip": 0.0108622, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.02017939, + "balance_loss_mlp": 1.02629495, + "epoch": 0.23255674131970538, + "flos": 20375555437440.0, + "grad_norm": 1.6609717679460123, + "language_loss": 0.82445127, + "learning_rate": 3.489907843544514e-06, + "loss": 0.845698, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.59765625, + "step": 3868, + "time_per_iteration": 2.369361639022827 + }, + { + "auxiliary_loss_clip": 0.01084422, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01701486, + "balance_loss_mlp": 1.02724826, + "epoch": 0.23261686457237338, + "flos": 17055140716800.0, + "grad_norm": 59.581132554126874, + "language_loss": 0.72886205, + "learning_rate": 3.48965576566983e-06, + "loss": 0.75003791, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5703125, + "step": 3869, + "time_per_iteration": 2.351043701171875 + }, + { + "auxiliary_loss_clip": 0.01086381, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.01924992, + "balance_loss_mlp": 1.02770185, + "epoch": 0.23267698782504134, + "flos": 29168570862720.0, + "grad_norm": 1.7408269110206762, + "language_loss": 0.7938329, + "learning_rate": 3.4894036346331633e-06, + "loss": 0.81506222, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5859375, + "step": 3870, + "time_per_iteration": 2.4447643756866455 + }, + { + "auxiliary_loss_clip": 0.0109148, + "auxiliary_loss_mlp": 0.01036928, + "balance_loss_clip": 1.01787853, + "balance_loss_mlp": 1.02846575, + "epoch": 0.2327371110777093, + "flos": 21173705095680.0, + "grad_norm": 1.785973332768856, + "language_loss": 0.70797658, + "learning_rate": 3.4891514504435122e-06, + "loss": 0.72926068, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6328125, + "step": 3871, + "time_per_iteration": 2.384000062942505 + }, + { + "auxiliary_loss_clip": 0.01091167, + "auxiliary_loss_mlp": 0.01050932, + "balance_loss_clip": 1.03105998, + "balance_loss_mlp": 1.02725577, + "epoch": 0.23279723433037727, + "flos": 24861964567680.0, + "grad_norm": 1.9030353513864107, + "language_loss": 0.68676955, + "learning_rate": 3.488899213109877e-06, + "loss": 0.7081905, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.63671875, + "step": 3872, + "time_per_iteration": 2.4129700660705566 + }, + { + "auxiliary_loss_clip": 0.01092725, + "auxiliary_loss_mlp": 0.01037721, + "balance_loss_clip": 1.01759875, + "balance_loss_mlp": 1.02817512, + "epoch": 0.23285735758304524, + "flos": 38799082915200.0, + "grad_norm": 1.5032490787504662, + "language_loss": 0.77853107, + "learning_rate": 3.4886469226412574e-06, + "loss": 0.79983556, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.6484375, + "step": 3873, + "time_per_iteration": 2.5239038467407227 + }, + { + "auxiliary_loss_clip": 0.0101591, + "auxiliary_loss_mlp": 0.010095, + "balance_loss_clip": 1.00760484, + "balance_loss_mlp": 1.00248373, + "epoch": 0.2329174808357132, + "flos": 53941086927360.0, + "grad_norm": 0.8483968145996765, + "language_loss": 0.60481763, + "learning_rate": 3.48839457904666e-06, + "loss": 0.62507164, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.13476562, + "step": 3874, + "time_per_iteration": 2.94453501701355 + }, + { + "auxiliary_loss_clip": 0.01088082, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.02206349, + "balance_loss_mlp": 1.0273155, + "epoch": 0.23297760408838117, + "flos": 21214937278080.0, + "grad_norm": 3.6098683362584985, + "language_loss": 0.80544692, + "learning_rate": 3.488142182335088e-06, + "loss": 0.82673866, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.609375, + "step": 3875, + "time_per_iteration": 2.3684475421905518 + }, + { + "auxiliary_loss_clip": 0.01088315, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.01553428, + "balance_loss_mlp": 1.02850652, + "epoch": 0.23303772734104916, + "flos": 28401738560640.0, + "grad_norm": 1.8928251504185174, + "language_loss": 0.61316186, + "learning_rate": 3.4878897325155493e-06, + "loss": 0.63436735, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.59765625, + "step": 3876, + "time_per_iteration": 2.4443576335906982 + }, + { + "auxiliary_loss_clip": 0.01091508, + "auxiliary_loss_mlp": 0.01041779, + "balance_loss_clip": 1.02286053, + "balance_loss_mlp": 1.02828074, + "epoch": 0.23309785059371713, + "flos": 24313618753920.0, + "grad_norm": 1.8395140767558795, + "language_loss": 0.70228851, + "learning_rate": 3.4876372295970533e-06, + "loss": 0.72362137, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6328125, + "step": 3877, + "time_per_iteration": 2.39897084236145 + }, + { + "auxiliary_loss_clip": 0.01090161, + "auxiliary_loss_mlp": 0.01050481, + "balance_loss_clip": 1.03028655, + "balance_loss_mlp": 1.02755427, + "epoch": 0.2331579738463851, + "flos": 15992140417920.0, + "grad_norm": 2.1233250462097635, + "language_loss": 0.80935645, + "learning_rate": 3.4873846735886113e-06, + "loss": 0.8307628, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.625, + "step": 3878, + "time_per_iteration": 2.351627826690674 + }, + { + "auxiliary_loss_clip": 0.01093163, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.02279115, + "balance_loss_mlp": 1.02850604, + "epoch": 0.23321809709905306, + "flos": 36425547711360.0, + "grad_norm": 1.592302194329684, + "language_loss": 0.75221008, + "learning_rate": 3.487132064499237e-06, + "loss": 0.7735616, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.64453125, + "step": 3879, + "time_per_iteration": 2.490724802017212 + }, + { + "auxiliary_loss_clip": 0.01089623, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.01984656, + "balance_loss_mlp": 1.02678251, + "epoch": 0.23327822035172102, + "flos": 21323691763200.0, + "grad_norm": 1.9218040636305243, + "language_loss": 0.8951329, + "learning_rate": 3.4868794023379433e-06, + "loss": 0.91641128, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.62890625, + "step": 3880, + "time_per_iteration": 2.3741278648376465 + }, + { + "auxiliary_loss_clip": 0.01091293, + "auxiliary_loss_mlp": 0.01035241, + "balance_loss_clip": 1.01701427, + "balance_loss_mlp": 1.02880943, + "epoch": 0.233338343604389, + "flos": 19170877374720.0, + "grad_norm": 1.6104074644906894, + "language_loss": 0.71677834, + "learning_rate": 3.4866266871137495e-06, + "loss": 0.73804367, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.625, + "step": 3881, + "time_per_iteration": 2.383903741836548 + }, + { + "auxiliary_loss_clip": 0.01085002, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.01941347, + "balance_loss_mlp": 1.02560568, + "epoch": 0.23339846685705698, + "flos": 26907106723200.0, + "grad_norm": 1.5624443064870686, + "language_loss": 0.68101043, + "learning_rate": 3.486373918835673e-06, + "loss": 0.70223552, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 3882, + "time_per_iteration": 2.4449117183685303 + }, + { + "auxiliary_loss_clip": 0.01088653, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.01885712, + "balance_loss_mlp": 1.02729297, + "epoch": 0.23345859010972494, + "flos": 32341791824640.0, + "grad_norm": 1.8848536996230683, + "language_loss": 0.76615065, + "learning_rate": 3.486121097512735e-06, + "loss": 0.78741682, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6171875, + "step": 3883, + "time_per_iteration": 2.4524500370025635 + }, + { + "auxiliary_loss_clip": 0.01015285, + "auxiliary_loss_mlp": 0.01002841, + "balance_loss_clip": 1.0010047, + "balance_loss_mlp": 1.00191069, + "epoch": 0.2335187133623929, + "flos": 58480633013760.0, + "grad_norm": 0.777824197148817, + "language_loss": 0.59107447, + "learning_rate": 3.4858682231539575e-06, + "loss": 0.61125576, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.13378906, + "step": 3884, + "time_per_iteration": 3.1615242958068848 + }, + { + "auxiliary_loss_clip": 0.01087233, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.0174377, + "balance_loss_mlp": 1.02715921, + "epoch": 0.23357883661506088, + "flos": 24501067176960.0, + "grad_norm": 1.6848883906985879, + "language_loss": 0.8042841, + "learning_rate": 3.4856152957683654e-06, + "loss": 0.82551551, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6015625, + "step": 3885, + "time_per_iteration": 3.812267780303955 + }, + { + "auxiliary_loss_clip": 0.01088251, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.01920128, + "balance_loss_mlp": 1.02714074, + "epoch": 0.23363895986772884, + "flos": 18947642941440.0, + "grad_norm": 2.032075325271931, + "language_loss": 0.88071245, + "learning_rate": 3.4853623153649843e-06, + "loss": 0.90197825, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.609375, + "step": 3886, + "time_per_iteration": 2.342637777328491 + }, + { + "auxiliary_loss_clip": 0.01092208, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.01576889, + "balance_loss_mlp": 1.02930665, + "epoch": 0.2336990831203968, + "flos": 31685459575680.0, + "grad_norm": 1.705614046026682, + "language_loss": 0.72942907, + "learning_rate": 3.4851092819528434e-06, + "loss": 0.75070429, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.62890625, + "step": 3887, + "time_per_iteration": 2.468019485473633 + }, + { + "auxiliary_loss_clip": 0.01090694, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.01852798, + "balance_loss_mlp": 1.02882934, + "epoch": 0.23375920637306477, + "flos": 27708503137920.0, + "grad_norm": 1.7049365643049947, + "language_loss": 0.83646351, + "learning_rate": 3.4848561955409723e-06, + "loss": 0.85773647, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6171875, + "step": 3888, + "time_per_iteration": 3.776660442352295 + }, + { + "auxiliary_loss_clip": 0.01088951, + "auxiliary_loss_mlp": 0.01040598, + "balance_loss_clip": 1.02158463, + "balance_loss_mlp": 1.02729416, + "epoch": 0.23381932962573276, + "flos": 17674674526080.0, + "grad_norm": 2.582089726648754, + "language_loss": 0.8758015, + "learning_rate": 3.4846030561384036e-06, + "loss": 0.89709705, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.6171875, + "step": 3889, + "time_per_iteration": 2.367008924484253 + }, + { + "auxiliary_loss_clip": 0.01090311, + "auxiliary_loss_mlp": 0.01035352, + "balance_loss_clip": 1.01644588, + "balance_loss_mlp": 1.02713513, + "epoch": 0.23387945287840073, + "flos": 14390010904320.0, + "grad_norm": 6.79406742982316, + "language_loss": 0.76318294, + "learning_rate": 3.48434986375417e-06, + "loss": 0.78443956, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6328125, + "step": 3890, + "time_per_iteration": 3.715498924255371 + }, + { + "auxiliary_loss_clip": 0.01089352, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.01415098, + "balance_loss_mlp": 1.02785158, + "epoch": 0.2339395761310687, + "flos": 46096244605440.0, + "grad_norm": 1.6402951909836185, + "language_loss": 0.73124486, + "learning_rate": 3.4840966183973085e-06, + "loss": 0.75246024, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6171875, + "step": 3891, + "time_per_iteration": 2.5878403186798096 + }, + { + "auxiliary_loss_clip": 0.01084362, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.0168438, + "balance_loss_mlp": 1.0261457, + "epoch": 0.23399969938373666, + "flos": 22380966599040.0, + "grad_norm": 1.624153552412621, + "language_loss": 0.82987958, + "learning_rate": 3.483843320076856e-06, + "loss": 0.85106862, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58203125, + "step": 3892, + "time_per_iteration": 2.4133431911468506 + }, + { + "auxiliary_loss_clip": 0.01089271, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.02114964, + "balance_loss_mlp": 1.02633345, + "epoch": 0.23405982263640462, + "flos": 43506841265280.0, + "grad_norm": 1.5790185781033186, + "language_loss": 0.64797843, + "learning_rate": 3.4835899688018522e-06, + "loss": 0.66926789, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.62890625, + "step": 3893, + "time_per_iteration": 2.57216477394104 + }, + { + "auxiliary_loss_clip": 0.01087905, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.01685905, + "balance_loss_mlp": 1.02709126, + "epoch": 0.2341199458890726, + "flos": 22563597254400.0, + "grad_norm": 1.9864293071099186, + "language_loss": 0.79282415, + "learning_rate": 3.4833365645813384e-06, + "loss": 0.81407309, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.609375, + "step": 3894, + "time_per_iteration": 3.8073062896728516 + }, + { + "auxiliary_loss_clip": 0.010865, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.01431835, + "balance_loss_mlp": 1.02638721, + "epoch": 0.23418006914174055, + "flos": 25632672030720.0, + "grad_norm": 1.399182172015132, + "language_loss": 0.81676078, + "learning_rate": 3.483083107424359e-06, + "loss": 0.83794707, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 3895, + "time_per_iteration": 2.4315969944000244 + }, + { + "auxiliary_loss_clip": 0.0108965, + "auxiliary_loss_mlp": 0.01042139, + "balance_loss_clip": 1.02311325, + "balance_loss_mlp": 1.02690482, + "epoch": 0.23424019239440855, + "flos": 13545287625600.0, + "grad_norm": 2.4338500250449786, + "language_loss": 0.80449915, + "learning_rate": 3.4828295973399576e-06, + "loss": 0.82581705, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.62890625, + "step": 3896, + "time_per_iteration": 2.3407487869262695 + }, + { + "auxiliary_loss_clip": 0.01089227, + "auxiliary_loss_mlp": 0.01039079, + "balance_loss_clip": 1.01896858, + "balance_loss_mlp": 1.02640009, + "epoch": 0.2343003156470765, + "flos": 22418393443200.0, + "grad_norm": 1.6071938501328131, + "language_loss": 0.83172464, + "learning_rate": 3.4825760343371826e-06, + "loss": 0.85300767, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.62890625, + "step": 3897, + "time_per_iteration": 2.3793880939483643 + }, + { + "auxiliary_loss_clip": 0.01090787, + "auxiliary_loss_mlp": 0.01040324, + "balance_loss_clip": 1.02080941, + "balance_loss_mlp": 1.0267725, + "epoch": 0.23436043889974448, + "flos": 14790010884480.0, + "grad_norm": 1.5581287520768663, + "language_loss": 0.79288226, + "learning_rate": 3.482322418425083e-06, + "loss": 0.81419337, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.640625, + "step": 3898, + "time_per_iteration": 2.358301877975464 + }, + { + "auxiliary_loss_clip": 0.01088283, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.01794958, + "balance_loss_mlp": 1.02875996, + "epoch": 0.23442056215241244, + "flos": 22964609664000.0, + "grad_norm": 2.0556150322534488, + "language_loss": 0.73653591, + "learning_rate": 3.4820687496127086e-06, + "loss": 0.7577855, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.59375, + "step": 3899, + "time_per_iteration": 2.374284029006958 + }, + { + "auxiliary_loss_clip": 0.01089848, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.01480436, + "balance_loss_mlp": 1.02681553, + "epoch": 0.2344806854050804, + "flos": 23070885442560.0, + "grad_norm": 1.7523844490759546, + "language_loss": 0.75317299, + "learning_rate": 3.481815027909113e-06, + "loss": 0.77442276, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.62890625, + "step": 3900, + "time_per_iteration": 2.3924379348754883 + }, + { + "auxiliary_loss_clip": 0.01086836, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.02541137, + "balance_loss_mlp": 1.02613676, + "epoch": 0.23454080865774837, + "flos": 16326119283840.0, + "grad_norm": 1.8746186612364086, + "language_loss": 0.67176574, + "learning_rate": 3.481561253323351e-06, + "loss": 0.69307637, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.60546875, + "step": 3901, + "time_per_iteration": 2.3454980850219727 + }, + { + "auxiliary_loss_clip": 0.01016168, + "auxiliary_loss_mlp": 0.01003361, + "balance_loss_clip": 1.00156093, + "balance_loss_mlp": 1.00265777, + "epoch": 0.23460093191041637, + "flos": 67757860218240.0, + "grad_norm": 0.7569055479015939, + "language_loss": 0.58216643, + "learning_rate": 3.4813074258644786e-06, + "loss": 0.60236168, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.13476562, + "step": 3902, + "time_per_iteration": 2.9768431186676025 + }, + { + "auxiliary_loss_clip": 0.01089177, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.02316713, + "balance_loss_mlp": 1.02695894, + "epoch": 0.23466105516308433, + "flos": 20076769088640.0, + "grad_norm": 1.847964685177598, + "language_loss": 0.80235386, + "learning_rate": 3.4810535455415547e-06, + "loss": 0.82366347, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.62109375, + "step": 3903, + "time_per_iteration": 2.3543636798858643 + }, + { + "auxiliary_loss_clip": 0.01087197, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.01584053, + "balance_loss_mlp": 1.02472389, + "epoch": 0.2347211784157523, + "flos": 24534549037440.0, + "grad_norm": 1.847324200959818, + "language_loss": 0.82054985, + "learning_rate": 3.4807996123636394e-06, + "loss": 0.84177727, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.625, + "step": 3904, + "time_per_iteration": 2.3945391178131104 + }, + { + "auxiliary_loss_clip": 0.01087658, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.01990783, + "balance_loss_mlp": 1.02744102, + "epoch": 0.23478130166842026, + "flos": 23803921681920.0, + "grad_norm": 1.8055937303680838, + "language_loss": 0.71191037, + "learning_rate": 3.4805456263397954e-06, + "loss": 0.73316991, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6015625, + "step": 3905, + "time_per_iteration": 2.3860695362091064 + }, + { + "auxiliary_loss_clip": 0.01085371, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.0176506, + "balance_loss_mlp": 1.02658606, + "epoch": 0.23484142492108823, + "flos": 24092583736320.0, + "grad_norm": 1.705053924116823, + "language_loss": 0.7110635, + "learning_rate": 3.480291587479086e-06, + "loss": 0.73228228, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.58984375, + "step": 3906, + "time_per_iteration": 2.4205853939056396 + }, + { + "auxiliary_loss_clip": 0.01089727, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.01469874, + "balance_loss_mlp": 1.02454805, + "epoch": 0.2349015481737562, + "flos": 29094555047040.0, + "grad_norm": 1.8734093881306393, + "language_loss": 0.73802781, + "learning_rate": 3.4800374957905777e-06, + "loss": 0.75927734, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.65234375, + "step": 3907, + "time_per_iteration": 2.4347403049468994 + }, + { + "auxiliary_loss_clip": 0.01089199, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02795398, + "balance_loss_mlp": 1.02631307, + "epoch": 0.23496167142642416, + "flos": 18915313155840.0, + "grad_norm": 1.6152982109399794, + "language_loss": 0.7262612, + "learning_rate": 3.4797833512833376e-06, + "loss": 0.74761975, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.62890625, + "step": 3908, + "time_per_iteration": 2.3700857162475586 + }, + { + "auxiliary_loss_clip": 0.01014905, + "auxiliary_loss_mlp": 0.01001075, + "balance_loss_clip": 0.99928665, + "balance_loss_mlp": 1.00184536, + "epoch": 0.23502179467909215, + "flos": 55865255621760.0, + "grad_norm": 1.0311515853158357, + "language_loss": 0.73314607, + "learning_rate": 3.479529153966437e-06, + "loss": 0.75330579, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.13085938, + "step": 3909, + "time_per_iteration": 2.7518317699432373 + }, + { + "auxiliary_loss_clip": 0.01085447, + "auxiliary_loss_mlp": 0.01042184, + "balance_loss_clip": 1.02412355, + "balance_loss_mlp": 1.02521765, + "epoch": 0.23508191793176011, + "flos": 23400709856640.0, + "grad_norm": 1.610049055586779, + "language_loss": 0.8800478, + "learning_rate": 3.479274903848947e-06, + "loss": 0.90132415, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6015625, + "step": 3910, + "time_per_iteration": 2.4032609462738037 + }, + { + "auxiliary_loss_clip": 0.01089212, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.0178293, + "balance_loss_mlp": 1.02690864, + "epoch": 0.23514204118442808, + "flos": 20046638718720.0, + "grad_norm": 2.4689970801754817, + "language_loss": 0.76217383, + "learning_rate": 3.4790206009399396e-06, + "loss": 0.78342724, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.625, + "step": 3911, + "time_per_iteration": 2.3631575107574463 + }, + { + "auxiliary_loss_clip": 0.01086303, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.01840854, + "balance_loss_mlp": 1.02771223, + "epoch": 0.23520216443709605, + "flos": 21579500361600.0, + "grad_norm": 1.5249697522270447, + "language_loss": 0.82007813, + "learning_rate": 3.4787662452484923e-06, + "loss": 0.84129679, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 3912, + "time_per_iteration": 2.392092227935791 + }, + { + "auxiliary_loss_clip": 0.01086741, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.02833557, + "balance_loss_mlp": 1.02640569, + "epoch": 0.235262287689764, + "flos": 23184667163520.0, + "grad_norm": 1.9560258940231139, + "language_loss": 0.71403623, + "learning_rate": 3.4785118367836816e-06, + "loss": 0.73537213, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6015625, + "step": 3913, + "time_per_iteration": 2.378814697265625 + }, + { + "auxiliary_loss_clip": 0.01092186, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.01817274, + "balance_loss_mlp": 1.02705812, + "epoch": 0.23532241094243198, + "flos": 23184108581760.0, + "grad_norm": 1.590253749853247, + "language_loss": 0.76320422, + "learning_rate": 3.4782573755545866e-06, + "loss": 0.78451097, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.65234375, + "step": 3914, + "time_per_iteration": 2.3976385593414307 + }, + { + "auxiliary_loss_clip": 0.01088772, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.01784861, + "balance_loss_mlp": 1.02729559, + "epoch": 0.23538253419509997, + "flos": 17018377188480.0, + "grad_norm": 2.153764461088028, + "language_loss": 0.89869112, + "learning_rate": 3.478002861570288e-06, + "loss": 0.91994566, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.61328125, + "step": 3915, + "time_per_iteration": 2.331655740737915 + }, + { + "auxiliary_loss_clip": 0.01014715, + "auxiliary_loss_mlp": 0.01002028, + "balance_loss_clip": 1.00016809, + "balance_loss_mlp": 1.00118518, + "epoch": 0.23544265744776793, + "flos": 63445807751040.0, + "grad_norm": 0.8060170169430493, + "language_loss": 0.59416699, + "learning_rate": 3.47774829483987e-06, + "loss": 0.61433446, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.13574219, + "step": 3916, + "time_per_iteration": 2.953676700592041 + }, + { + "auxiliary_loss_clip": 0.01014091, + "auxiliary_loss_mlp": 0.0100179, + "balance_loss_clip": 0.999942, + "balance_loss_mlp": 1.00077128, + "epoch": 0.2355027807004359, + "flos": 70511668617600.0, + "grad_norm": 0.8909998172565021, + "language_loss": 0.49440813, + "learning_rate": 3.4774936753724156e-06, + "loss": 0.51456696, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.1328125, + "step": 3917, + "time_per_iteration": 3.037407398223877 + }, + { + "auxiliary_loss_clip": 0.01092182, + "auxiliary_loss_mlp": 0.01038689, + "balance_loss_clip": 1.02008009, + "balance_loss_mlp": 1.02693367, + "epoch": 0.23556290395310386, + "flos": 21433214298240.0, + "grad_norm": 2.0099807227836446, + "language_loss": 0.76812083, + "learning_rate": 3.4772390031770126e-06, + "loss": 0.78942955, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.65234375, + "step": 3918, + "time_per_iteration": 2.3877439498901367 + }, + { + "auxiliary_loss_clip": 0.01093324, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.01768243, + "balance_loss_mlp": 1.02766883, + "epoch": 0.23562302720577183, + "flos": 18185453850240.0, + "grad_norm": 1.8471874145199307, + "language_loss": 0.69392735, + "learning_rate": 3.47698427826275e-06, + "loss": 0.71522635, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.65625, + "step": 3919, + "time_per_iteration": 2.3740577697753906 + }, + { + "auxiliary_loss_clip": 0.01086102, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.01590323, + "balance_loss_mlp": 1.02619362, + "epoch": 0.2356831504584398, + "flos": 33729065631360.0, + "grad_norm": 1.6244493627930483, + "language_loss": 0.70738161, + "learning_rate": 3.4767295006387174e-06, + "loss": 0.72857904, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 3920, + "time_per_iteration": 2.4963719844818115 + }, + { + "auxiliary_loss_clip": 0.01088191, + "auxiliary_loss_mlp": 0.01041816, + "balance_loss_clip": 1.02422094, + "balance_loss_mlp": 1.02713609, + "epoch": 0.23574327371110776, + "flos": 24931721197440.0, + "grad_norm": 1.505679621024444, + "language_loss": 0.7673465, + "learning_rate": 3.4764746703140077e-06, + "loss": 0.78864658, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.609375, + "step": 3921, + "time_per_iteration": 2.418783664703369 + }, + { + "auxiliary_loss_clip": 0.01088373, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_clip": 1.02302134, + "balance_loss_mlp": 1.02737713, + "epoch": 0.23580339696377575, + "flos": 17821135146240.0, + "grad_norm": 2.095434254877917, + "language_loss": 0.75107485, + "learning_rate": 3.476219787297715e-06, + "loss": 0.77237666, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.609375, + "step": 3922, + "time_per_iteration": 2.382678508758545 + }, + { + "auxiliary_loss_clip": 0.01085767, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.0165149, + "balance_loss_mlp": 1.02481282, + "epoch": 0.23586352021644372, + "flos": 26285408409600.0, + "grad_norm": 1.9576847834165292, + "language_loss": 0.80365449, + "learning_rate": 3.4759648515989356e-06, + "loss": 0.82485676, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.609375, + "step": 3923, + "time_per_iteration": 2.4159023761749268 + }, + { + "auxiliary_loss_clip": 0.01084837, + "auxiliary_loss_mlp": 0.01042515, + "balance_loss_clip": 1.0240252, + "balance_loss_mlp": 1.02530074, + "epoch": 0.23592364346911168, + "flos": 14245819522560.0, + "grad_norm": 2.599898710072028, + "language_loss": 0.74348283, + "learning_rate": 3.4757098632267663e-06, + "loss": 0.76475632, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.59375, + "step": 3924, + "time_per_iteration": 3.7444300651550293 + }, + { + "auxiliary_loss_clip": 0.01089198, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.0174135, + "balance_loss_mlp": 1.02811205, + "epoch": 0.23598376672177965, + "flos": 18586955018880.0, + "grad_norm": 1.565306928828022, + "language_loss": 0.82739925, + "learning_rate": 3.4754548221903086e-06, + "loss": 0.84864497, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.609375, + "step": 3925, + "time_per_iteration": 2.3710172176361084 + }, + { + "auxiliary_loss_clip": 0.01091045, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_clip": 1.02396703, + "balance_loss_mlp": 1.02750456, + "epoch": 0.2360438899744476, + "flos": 22674411509760.0, + "grad_norm": 1.61479247563879, + "language_loss": 0.59434247, + "learning_rate": 3.475199728498664e-06, + "loss": 0.61567783, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6328125, + "step": 3926, + "time_per_iteration": 2.402881383895874 + }, + { + "auxiliary_loss_clip": 0.01082674, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.0156306, + "balance_loss_mlp": 1.02566981, + "epoch": 0.23610401322711558, + "flos": 29568850133760.0, + "grad_norm": 1.9188225539479868, + "language_loss": 0.71234751, + "learning_rate": 3.474944582160935e-06, + "loss": 0.7334981, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 3927, + "time_per_iteration": 2.434513568878174 + }, + { + "auxiliary_loss_clip": 0.01087163, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.01372635, + "balance_loss_mlp": 1.02647984, + "epoch": 0.23616413647978354, + "flos": 17857549560960.0, + "grad_norm": 1.6424193366904158, + "language_loss": 0.75205117, + "learning_rate": 3.4746893831862287e-06, + "loss": 0.77322876, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.609375, + "step": 3928, + "time_per_iteration": 3.757261037826538 + }, + { + "auxiliary_loss_clip": 0.01087093, + "auxiliary_loss_mlp": 0.01041277, + "balance_loss_clip": 1.02341962, + "balance_loss_mlp": 1.02587724, + "epoch": 0.23622425973245154, + "flos": 11034089464320.0, + "grad_norm": 5.534124166541476, + "language_loss": 0.81882471, + "learning_rate": 3.474434131583651e-06, + "loss": 0.84010839, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.609375, + "step": 3929, + "time_per_iteration": 3.7931859493255615 + }, + { + "auxiliary_loss_clip": 0.01092217, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.01809752, + "balance_loss_mlp": 1.02778602, + "epoch": 0.2362843829851195, + "flos": 23402944183680.0, + "grad_norm": 1.7850875375794433, + "language_loss": 0.7202158, + "learning_rate": 3.474178827362312e-06, + "loss": 0.74151981, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.64453125, + "step": 3930, + "time_per_iteration": 2.425452470779419 + }, + { + "auxiliary_loss_clip": 0.01086405, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.01482701, + "balance_loss_mlp": 1.02544904, + "epoch": 0.23634450623778747, + "flos": 39528313816320.0, + "grad_norm": 1.7034464418132031, + "language_loss": 0.73838532, + "learning_rate": 3.473923470531323e-06, + "loss": 0.75958979, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.609375, + "step": 3931, + "time_per_iteration": 2.5556492805480957 + }, + { + "auxiliary_loss_clip": 0.01089561, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.01506436, + "balance_loss_mlp": 1.02752662, + "epoch": 0.23640462949045543, + "flos": 24206016343680.0, + "grad_norm": 1.906088396646724, + "language_loss": 0.80198288, + "learning_rate": 3.4736680610997965e-06, + "loss": 0.82322443, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.62109375, + "step": 3932, + "time_per_iteration": 2.400491952896118 + }, + { + "auxiliary_loss_clip": 0.01087914, + "auxiliary_loss_mlp": 0.01036607, + "balance_loss_clip": 1.01971507, + "balance_loss_mlp": 1.0282166, + "epoch": 0.2364647527431234, + "flos": 26176409544960.0, + "grad_norm": 1.8649058480496419, + "language_loss": 0.85352182, + "learning_rate": 3.4734125990768476e-06, + "loss": 0.87476707, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.59765625, + "step": 3933, + "time_per_iteration": 3.808643341064453 + }, + { + "auxiliary_loss_clip": 0.01094796, + "auxiliary_loss_mlp": 0.01035201, + "balance_loss_clip": 1.01617527, + "balance_loss_mlp": 1.03128254, + "epoch": 0.23652487599579136, + "flos": 22635937324800.0, + "grad_norm": 2.366228012432977, + "language_loss": 0.82972109, + "learning_rate": 3.473157084471593e-06, + "loss": 0.85102105, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.63671875, + "step": 3934, + "time_per_iteration": 2.3947384357452393 + }, + { + "auxiliary_loss_clip": 0.01089428, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.01931226, + "balance_loss_mlp": 1.0274868, + "epoch": 0.23658499924845935, + "flos": 21761188410240.0, + "grad_norm": 1.9397582946968468, + "language_loss": 0.78524363, + "learning_rate": 3.472901517293152e-06, + "loss": 0.80651736, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.6171875, + "step": 3935, + "time_per_iteration": 2.3859567642211914 + }, + { + "auxiliary_loss_clip": 0.01088847, + "auxiliary_loss_mlp": 0.01036016, + "balance_loss_clip": 1.01830173, + "balance_loss_mlp": 1.02928877, + "epoch": 0.23664512250112732, + "flos": 21797917027200.0, + "grad_norm": 2.019692854853557, + "language_loss": 0.79821914, + "learning_rate": 3.472645897550644e-06, + "loss": 0.81946778, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.59375, + "step": 3936, + "time_per_iteration": 2.403423547744751 + }, + { + "auxiliary_loss_clip": 0.01088847, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.02030861, + "balance_loss_mlp": 1.0280695, + "epoch": 0.23670524575379528, + "flos": 22636775197440.0, + "grad_norm": 1.7256088803435925, + "language_loss": 0.79123998, + "learning_rate": 3.4723902252531925e-06, + "loss": 0.81251156, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.609375, + "step": 3937, + "time_per_iteration": 2.4090025424957275 + }, + { + "auxiliary_loss_clip": 0.01086062, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.0198313, + "balance_loss_mlp": 1.02775407, + "epoch": 0.23676536900646325, + "flos": 16724129316480.0, + "grad_norm": 1.7795879807522792, + "language_loss": 0.85066378, + "learning_rate": 3.472134500409921e-06, + "loss": 0.87188578, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.58203125, + "step": 3938, + "time_per_iteration": 2.3687632083892822 + }, + { + "auxiliary_loss_clip": 0.01084712, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.024683, + "balance_loss_mlp": 1.02572882, + "epoch": 0.23682549225913122, + "flos": 11135093627520.0, + "grad_norm": 2.1358145243909723, + "language_loss": 0.94085848, + "learning_rate": 3.471878723029956e-06, + "loss": 0.96213019, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58984375, + "step": 3939, + "time_per_iteration": 2.309140920639038 + }, + { + "auxiliary_loss_clip": 0.01086833, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.0177362, + "balance_loss_mlp": 1.0254091, + "epoch": 0.23688561551179918, + "flos": 22558290727680.0, + "grad_norm": 1.570453226292029, + "language_loss": 0.8218323, + "learning_rate": 3.4716228931224253e-06, + "loss": 0.84306526, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.61328125, + "step": 3940, + "time_per_iteration": 2.395319700241089 + }, + { + "auxiliary_loss_clip": 0.01092457, + "auxiliary_loss_mlp": 0.0103986, + "balance_loss_clip": 1.02151346, + "balance_loss_mlp": 1.02784467, + "epoch": 0.23694573876446715, + "flos": 18513916721280.0, + "grad_norm": 2.05952993011614, + "language_loss": 0.89007425, + "learning_rate": 3.4713670106964596e-06, + "loss": 0.91139746, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6484375, + "step": 3941, + "time_per_iteration": 2.3423094749450684 + }, + { + "auxiliary_loss_clip": 0.01086054, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.01593494, + "balance_loss_mlp": 1.02536297, + "epoch": 0.23700586201713514, + "flos": 15334970296320.0, + "grad_norm": 1.912205034027911, + "language_loss": 0.84782934, + "learning_rate": 3.4711110757611897e-06, + "loss": 0.86903226, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.60546875, + "step": 3942, + "time_per_iteration": 2.341895818710327 + }, + { + "auxiliary_loss_clip": 0.0108676, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.0150528, + "balance_loss_mlp": 1.02570486, + "epoch": 0.2370659852698031, + "flos": 23946576963840.0, + "grad_norm": 1.8294419142178115, + "language_loss": 0.81383359, + "learning_rate": 3.4708550883257496e-06, + "loss": 0.83502716, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.609375, + "step": 3943, + "time_per_iteration": 2.427168846130371 + }, + { + "auxiliary_loss_clip": 0.01088801, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.01532364, + "balance_loss_mlp": 1.02520633, + "epoch": 0.23712610852247107, + "flos": 15331863185280.0, + "grad_norm": 9.09334172791455, + "language_loss": 0.67308927, + "learning_rate": 3.4705990483992746e-06, + "loss": 0.69431949, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.63671875, + "step": 3944, + "time_per_iteration": 2.3187944889068604 + }, + { + "auxiliary_loss_clip": 0.0108925, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.01998532, + "balance_loss_mlp": 1.02612245, + "epoch": 0.23718623177513903, + "flos": 19681551964800.0, + "grad_norm": 1.671039651844236, + "language_loss": 0.75468224, + "learning_rate": 3.470342955990903e-06, + "loss": 0.77596867, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6328125, + "step": 3945, + "time_per_iteration": 2.37501859664917 + }, + { + "auxiliary_loss_clip": 0.01086521, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.01713407, + "balance_loss_mlp": 1.02697992, + "epoch": 0.237246355027807, + "flos": 24972150418560.0, + "grad_norm": 1.4428215725623434, + "language_loss": 0.63798368, + "learning_rate": 3.470086811109773e-06, + "loss": 0.65918934, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.59375, + "step": 3946, + "time_per_iteration": 2.4165070056915283 + }, + { + "auxiliary_loss_clip": 0.01085289, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.01466691, + "balance_loss_mlp": 1.02429485, + "epoch": 0.23730647828047496, + "flos": 15376516680960.0, + "grad_norm": 3.4707146167044516, + "language_loss": 0.77266467, + "learning_rate": 3.469830613765026e-06, + "loss": 0.79385793, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.609375, + "step": 3947, + "time_per_iteration": 2.347360134124756 + }, + { + "auxiliary_loss_clip": 0.01091679, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.01659989, + "balance_loss_mlp": 1.02870166, + "epoch": 0.23736660153314296, + "flos": 28149316364160.0, + "grad_norm": 1.4133151306437455, + "language_loss": 0.80498493, + "learning_rate": 3.4695743639658065e-06, + "loss": 0.82624948, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6328125, + "step": 3948, + "time_per_iteration": 2.452707529067993 + }, + { + "auxiliary_loss_clip": 0.01089203, + "auxiliary_loss_mlp": 0.01036481, + "balance_loss_clip": 1.0187546, + "balance_loss_mlp": 1.02731228, + "epoch": 0.23742672478581092, + "flos": 22085601563520.0, + "grad_norm": 1.7015740092423202, + "language_loss": 0.80448139, + "learning_rate": 3.4693180617212568e-06, + "loss": 0.82573825, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6171875, + "step": 3949, + "time_per_iteration": 2.372466802597046 + }, + { + "auxiliary_loss_clip": 0.01087709, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.01535463, + "balance_loss_mlp": 1.02515912, + "epoch": 0.2374868480384789, + "flos": 19536068862720.0, + "grad_norm": 1.7302396041739636, + "language_loss": 0.73320466, + "learning_rate": 3.4690617070405255e-06, + "loss": 0.75441492, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 3950, + "time_per_iteration": 2.3847105503082275 + }, + { + "auxiliary_loss_clip": 0.01082571, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.01471925, + "balance_loss_mlp": 1.024845, + "epoch": 0.23754697129114685, + "flos": 19421623825920.0, + "grad_norm": 1.8489448929771304, + "language_loss": 0.7334522, + "learning_rate": 3.4688052999327607e-06, + "loss": 0.75458527, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.578125, + "step": 3951, + "time_per_iteration": 2.384700059890747 + }, + { + "auxiliary_loss_clip": 0.01090804, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.01792455, + "balance_loss_mlp": 1.02802944, + "epoch": 0.23760709454381482, + "flos": 19499968650240.0, + "grad_norm": 1.677284176615736, + "language_loss": 0.73947823, + "learning_rate": 3.4685488404071133e-06, + "loss": 0.7607522, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.62890625, + "step": 3952, + "time_per_iteration": 2.361034631729126 + }, + { + "auxiliary_loss_clip": 0.0108799, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.01976752, + "balance_loss_mlp": 1.02603579, + "epoch": 0.23766721779648278, + "flos": 27635360106240.0, + "grad_norm": 1.5815686078865576, + "language_loss": 0.71242404, + "learning_rate": 3.468292328472735e-06, + "loss": 0.73366463, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.6171875, + "step": 3953, + "time_per_iteration": 2.4313089847564697 + }, + { + "auxiliary_loss_clip": 0.01088287, + "auxiliary_loss_mlp": 0.01037609, + "balance_loss_clip": 1.01906037, + "balance_loss_mlp": 1.02595127, + "epoch": 0.23772734104915075, + "flos": 23403223474560.0, + "grad_norm": 1.7210777377658004, + "language_loss": 0.81412822, + "learning_rate": 3.468035764138781e-06, + "loss": 0.83538717, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.625, + "step": 3954, + "time_per_iteration": 2.399097442626953 + }, + { + "auxiliary_loss_clip": 0.01088944, + "auxiliary_loss_mlp": 0.01039018, + "balance_loss_clip": 1.01989686, + "balance_loss_mlp": 1.02651834, + "epoch": 0.23778746430181874, + "flos": 15704595527040.0, + "grad_norm": 2.0400836021449007, + "language_loss": 0.66015524, + "learning_rate": 3.467779147414406e-06, + "loss": 0.68143481, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.625, + "step": 3955, + "time_per_iteration": 2.3557753562927246 + }, + { + "auxiliary_loss_clip": 0.01087135, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02224684, + "balance_loss_mlp": 1.02688372, + "epoch": 0.2378475875544867, + "flos": 19425464075520.0, + "grad_norm": 1.3208309825519677, + "language_loss": 0.83773601, + "learning_rate": 3.467522478308769e-06, + "loss": 0.85900116, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6015625, + "step": 3956, + "time_per_iteration": 2.382812976837158 + }, + { + "auxiliary_loss_clip": 0.0108376, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.01636314, + "balance_loss_mlp": 1.02641165, + "epoch": 0.23790771080715467, + "flos": 22267603814400.0, + "grad_norm": 2.0295910925093192, + "language_loss": 0.84805679, + "learning_rate": 3.46726575683103e-06, + "loss": 0.8692261, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.57421875, + "step": 3957, + "time_per_iteration": 2.378727674484253 + }, + { + "auxiliary_loss_clip": 0.01088326, + "auxiliary_loss_mlp": 0.01040559, + "balance_loss_clip": 1.02335739, + "balance_loss_mlp": 1.02813232, + "epoch": 0.23796783405982264, + "flos": 20046289605120.0, + "grad_norm": 1.800800681265894, + "language_loss": 0.69876516, + "learning_rate": 3.4670089829903503e-06, + "loss": 0.72005403, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6015625, + "step": 3958, + "time_per_iteration": 2.3807365894317627 + }, + { + "auxiliary_loss_clip": 0.01086791, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.01586759, + "balance_loss_mlp": 1.02595806, + "epoch": 0.2380279573124906, + "flos": 14245086384000.0, + "grad_norm": 2.2052733126330866, + "language_loss": 0.65799189, + "learning_rate": 3.466752156795893e-06, + "loss": 0.67920673, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.609375, + "step": 3959, + "time_per_iteration": 2.356825828552246 + }, + { + "auxiliary_loss_clip": 0.01086594, + "auxiliary_loss_mlp": 0.01037715, + "balance_loss_clip": 1.02021527, + "balance_loss_mlp": 1.02528739, + "epoch": 0.23808808056515857, + "flos": 21178103927040.0, + "grad_norm": 1.7360593288961228, + "language_loss": 0.72312939, + "learning_rate": 3.4664952782568253e-06, + "loss": 0.74437243, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.61328125, + "step": 3960, + "time_per_iteration": 2.375762462615967 + }, + { + "auxiliary_loss_clip": 0.01088092, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.01455569, + "balance_loss_mlp": 1.02788997, + "epoch": 0.23814820381782653, + "flos": 22527217751040.0, + "grad_norm": 1.5031513570709232, + "language_loss": 0.74265003, + "learning_rate": 3.466238347382313e-06, + "loss": 0.7638545, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 3961, + "time_per_iteration": 2.3914270401000977 + }, + { + "auxiliary_loss_clip": 0.0108734, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.01850939, + "balance_loss_mlp": 1.0248363, + "epoch": 0.23820832707049452, + "flos": 22303389824640.0, + "grad_norm": 1.7264029518346724, + "language_loss": 0.66661024, + "learning_rate": 3.465981364181525e-06, + "loss": 0.6878646, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.625, + "step": 3962, + "time_per_iteration": 2.3761768341064453 + }, + { + "auxiliary_loss_clip": 0.01086207, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.01980567, + "balance_loss_mlp": 1.02590299, + "epoch": 0.2382684503231625, + "flos": 24863046819840.0, + "grad_norm": 1.5728260116930093, + "language_loss": 0.74494505, + "learning_rate": 3.4657243286636332e-06, + "loss": 0.766177, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6015625, + "step": 3963, + "time_per_iteration": 3.786365270614624 + }, + { + "auxiliary_loss_clip": 0.01090366, + "auxiliary_loss_mlp": 0.01036066, + "balance_loss_clip": 1.01745772, + "balance_loss_mlp": 1.02874279, + "epoch": 0.23832857357583045, + "flos": 21870536388480.0, + "grad_norm": 1.8762828014026218, + "language_loss": 0.77434373, + "learning_rate": 3.4654672408378107e-06, + "loss": 0.79560804, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6171875, + "step": 3964, + "time_per_iteration": 2.381047248840332 + }, + { + "auxiliary_loss_clip": 0.01086723, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.01869905, + "balance_loss_mlp": 1.02703702, + "epoch": 0.23838869682849842, + "flos": 21286998057600.0, + "grad_norm": 1.821916289597902, + "language_loss": 0.7098124, + "learning_rate": 3.4652101007132323e-06, + "loss": 0.73103452, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.59765625, + "step": 3965, + "time_per_iteration": 2.37895131111145 + }, + { + "auxiliary_loss_clip": 0.01083686, + "auxiliary_loss_mlp": 0.01040786, + "balance_loss_clip": 1.02383471, + "balance_loss_mlp": 1.02591228, + "epoch": 0.23844882008116638, + "flos": 16179658663680.0, + "grad_norm": 1.7384444598575801, + "language_loss": 0.77981144, + "learning_rate": 3.4649529082990743e-06, + "loss": 0.80105615, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.578125, + "step": 3966, + "time_per_iteration": 2.3644492626190186 + }, + { + "auxiliary_loss_clip": 0.01085032, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.0161047, + "balance_loss_mlp": 1.02626991, + "epoch": 0.23850894333383435, + "flos": 21068651214720.0, + "grad_norm": 1.667296043610137, + "language_loss": 0.68478042, + "learning_rate": 3.4646956636045152e-06, + "loss": 0.70595366, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5859375, + "step": 3967, + "time_per_iteration": 2.3833632469177246 + }, + { + "auxiliary_loss_clip": 0.01086896, + "auxiliary_loss_mlp": 0.01039739, + "balance_loss_clip": 1.021667, + "balance_loss_mlp": 1.02540779, + "epoch": 0.23856906658650234, + "flos": 17200658730240.0, + "grad_norm": 1.8830429758951257, + "language_loss": 0.68007058, + "learning_rate": 3.4644383666387347e-06, + "loss": 0.70133692, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6171875, + "step": 3968, + "time_per_iteration": 3.6999571323394775 + }, + { + "auxiliary_loss_clip": 0.01084379, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01556206, + "balance_loss_mlp": 1.02466679, + "epoch": 0.2386291898391703, + "flos": 29493018927360.0, + "grad_norm": 1.8585158487140065, + "language_loss": 0.76495361, + "learning_rate": 3.464181017410917e-06, + "loss": 0.78612304, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59765625, + "step": 3969, + "time_per_iteration": 3.7654290199279785 + }, + { + "auxiliary_loss_clip": 0.01082321, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.0156877, + "balance_loss_mlp": 1.02609754, + "epoch": 0.23868931309183827, + "flos": 21141375310080.0, + "grad_norm": 2.1294684203951957, + "language_loss": 0.76286387, + "learning_rate": 3.463923615930245e-06, + "loss": 0.7839992, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5625, + "step": 3970, + "time_per_iteration": 2.417985439300537 + }, + { + "auxiliary_loss_clip": 0.01085232, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.01732349, + "balance_loss_mlp": 1.02476001, + "epoch": 0.23874943634450624, + "flos": 25658403569280.0, + "grad_norm": 2.0755100414343644, + "language_loss": 0.85321903, + "learning_rate": 3.4636661622059042e-06, + "loss": 0.87443066, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.60546875, + "step": 3971, + "time_per_iteration": 2.4149224758148193 + }, + { + "auxiliary_loss_clip": 0.01014603, + "auxiliary_loss_mlp": 0.01003239, + "balance_loss_clip": 1.00128365, + "balance_loss_mlp": 1.00180185, + "epoch": 0.2388095595971742, + "flos": 58983243079680.0, + "grad_norm": 0.7724852864188745, + "language_loss": 0.62812436, + "learning_rate": 3.4634086562470835e-06, + "loss": 0.64830279, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.12792969, + "step": 3972, + "time_per_iteration": 3.096389055252075 + }, + { + "auxiliary_loss_clip": 0.01086765, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.01713181, + "balance_loss_mlp": 1.02610457, + "epoch": 0.23886968284984217, + "flos": 16799401941120.0, + "grad_norm": 1.9784460532311081, + "language_loss": 0.81820315, + "learning_rate": 3.463151098062972e-06, + "loss": 0.8394084, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.60546875, + "step": 3973, + "time_per_iteration": 3.750425338745117 + }, + { + "auxiliary_loss_clip": 0.0108626, + "auxiliary_loss_mlp": 0.01039221, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.02646852, + "epoch": 0.23892980610251013, + "flos": 22381560092160.0, + "grad_norm": 1.5480586435699946, + "language_loss": 0.79407525, + "learning_rate": 3.4628934876627615e-06, + "loss": 0.81533009, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.6015625, + "step": 3974, + "time_per_iteration": 2.3793485164642334 + }, + { + "auxiliary_loss_clip": 0.01085883, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.01733375, + "balance_loss_mlp": 1.02519321, + "epoch": 0.23898992935517813, + "flos": 12822375680640.0, + "grad_norm": 2.6404490165027985, + "language_loss": 0.84646249, + "learning_rate": 3.4626358250556458e-06, + "loss": 0.86768401, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.609375, + "step": 3975, + "time_per_iteration": 2.3446414470672607 + }, + { + "auxiliary_loss_clip": 0.01084979, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.01221466, + "balance_loss_mlp": 1.025787, + "epoch": 0.2390500526078461, + "flos": 22344587095680.0, + "grad_norm": 2.070458941367138, + "language_loss": 0.83398485, + "learning_rate": 3.4623781102508193e-06, + "loss": 0.85512519, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.59375, + "step": 3976, + "time_per_iteration": 2.3877251148223877 + }, + { + "auxiliary_loss_clip": 0.01083738, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.01259685, + "balance_loss_mlp": 1.0247519, + "epoch": 0.23911017586051406, + "flos": 22634121934080.0, + "grad_norm": 1.7267919942100842, + "language_loss": 0.74378598, + "learning_rate": 3.46212034325748e-06, + "loss": 0.76490867, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.58984375, + "step": 3977, + "time_per_iteration": 2.3767457008361816 + }, + { + "auxiliary_loss_clip": 0.01088694, + "auxiliary_loss_mlp": 0.0103766, + "balance_loss_clip": 1.02079177, + "balance_loss_mlp": 1.02736568, + "epoch": 0.23917029911318202, + "flos": 23652329091840.0, + "grad_norm": 1.725665327127498, + "language_loss": 0.72010094, + "learning_rate": 3.4618625240848264e-06, + "loss": 0.74136448, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.61328125, + "step": 3978, + "time_per_iteration": 2.3961164951324463 + }, + { + "auxiliary_loss_clip": 0.01090494, + "auxiliary_loss_mlp": 0.01036339, + "balance_loss_clip": 1.01925635, + "balance_loss_mlp": 1.02804995, + "epoch": 0.23923042236585, + "flos": 22782502679040.0, + "grad_norm": 2.442932903960139, + "language_loss": 0.78815722, + "learning_rate": 3.4616046527420597e-06, + "loss": 0.80942559, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.625, + "step": 3979, + "time_per_iteration": 2.384162425994873 + }, + { + "auxiliary_loss_clip": 0.01085504, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.02966619, + "balance_loss_mlp": 1.02607298, + "epoch": 0.23929054561851795, + "flos": 28146453632640.0, + "grad_norm": 1.6629882301459435, + "language_loss": 0.81845599, + "learning_rate": 3.4613467292383832e-06, + "loss": 0.83978856, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.59375, + "step": 3980, + "time_per_iteration": 2.433626890182495 + }, + { + "auxiliary_loss_clip": 0.01084021, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.01403391, + "balance_loss_mlp": 1.02522397, + "epoch": 0.23935066887118592, + "flos": 21685531760640.0, + "grad_norm": 1.6318570764469262, + "language_loss": 0.83637077, + "learning_rate": 3.4610887535830005e-06, + "loss": 0.85751653, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5859375, + "step": 3981, + "time_per_iteration": 2.396655321121216 + }, + { + "auxiliary_loss_clip": 0.01087803, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.02614427, + "epoch": 0.2394107921238539, + "flos": 32120966275200.0, + "grad_norm": 1.686774687864199, + "language_loss": 0.76628423, + "learning_rate": 3.4608307257851186e-06, + "loss": 0.78753537, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6171875, + "step": 3982, + "time_per_iteration": 2.5224783420562744 + }, + { + "auxiliary_loss_clip": 0.01084124, + "auxiliary_loss_mlp": 0.0103397, + "balance_loss_clip": 1.01786518, + "balance_loss_mlp": 1.02578878, + "epoch": 0.23947091537652188, + "flos": 17018237543040.0, + "grad_norm": 1.5623356818917253, + "language_loss": 0.77890348, + "learning_rate": 3.460572645853946e-06, + "loss": 0.80008441, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5859375, + "step": 3983, + "time_per_iteration": 2.391585111618042 + }, + { + "auxiliary_loss_clip": 0.010859, + "auxiliary_loss_mlp": 0.01039087, + "balance_loss_clip": 1.02035952, + "balance_loss_mlp": 1.02555084, + "epoch": 0.23953103862918984, + "flos": 20592575648640.0, + "grad_norm": 2.045453054094221, + "language_loss": 0.73051536, + "learning_rate": 3.4603145137986925e-06, + "loss": 0.75176525, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6015625, + "step": 3984, + "time_per_iteration": 2.386662006378174 + }, + { + "auxiliary_loss_clip": 0.01087001, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.0162853, + "balance_loss_mlp": 1.02588665, + "epoch": 0.2395911618818578, + "flos": 20703354992640.0, + "grad_norm": 2.915130429326987, + "language_loss": 0.79223025, + "learning_rate": 3.4600563296285704e-06, + "loss": 0.81343389, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.609375, + "step": 3985, + "time_per_iteration": 2.398803234100342 + }, + { + "auxiliary_loss_clip": 0.01087986, + "auxiliary_loss_mlp": 0.01040145, + "balance_loss_clip": 1.02210879, + "balance_loss_mlp": 1.02877975, + "epoch": 0.23965128513452577, + "flos": 27052275623040.0, + "grad_norm": 1.7163930383911268, + "language_loss": 0.73366785, + "learning_rate": 3.459798093352794e-06, + "loss": 0.75494915, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.59375, + "step": 3986, + "time_per_iteration": 2.4272451400756836 + }, + { + "auxiliary_loss_clip": 0.01088714, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.0219568, + "balance_loss_mlp": 1.02616405, + "epoch": 0.23971140838719374, + "flos": 23143330247040.0, + "grad_norm": 1.7421897543799214, + "language_loss": 0.87564272, + "learning_rate": 3.4595398049805783e-06, + "loss": 0.8969205, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.625, + "step": 3987, + "time_per_iteration": 2.3867604732513428 + }, + { + "auxiliary_loss_clip": 0.01081476, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.02188313, + "balance_loss_mlp": 1.02601147, + "epoch": 0.23977153163986173, + "flos": 18033756526080.0, + "grad_norm": 2.2483503373333007, + "language_loss": 0.82897425, + "learning_rate": 3.459281464521142e-06, + "loss": 0.8501659, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5546875, + "step": 3988, + "time_per_iteration": 2.333495855331421 + }, + { + "auxiliary_loss_clip": 0.01085015, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01317835, + "balance_loss_mlp": 1.02549815, + "epoch": 0.2398316548925297, + "flos": 18112415552640.0, + "grad_norm": 1.66702502877762, + "language_loss": 0.85587507, + "learning_rate": 3.459023071983703e-06, + "loss": 0.87703288, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 3989, + "time_per_iteration": 2.374199867248535 + }, + { + "auxiliary_loss_clip": 0.0108426, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.01782095, + "balance_loss_mlp": 1.0245297, + "epoch": 0.23989177814519766, + "flos": 12566916195840.0, + "grad_norm": 2.0491048247908528, + "language_loss": 0.8434158, + "learning_rate": 3.458764627377484e-06, + "loss": 0.86460602, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.59765625, + "step": 3990, + "time_per_iteration": 2.3547685146331787 + }, + { + "auxiliary_loss_clip": 0.01082816, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.01781273, + "balance_loss_mlp": 1.02518666, + "epoch": 0.23995190139786562, + "flos": 25263430824960.0, + "grad_norm": 1.8276982580432377, + "language_loss": 0.75659311, + "learning_rate": 3.458506130711708e-06, + "loss": 0.77775925, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.578125, + "step": 3991, + "time_per_iteration": 2.4176342487335205 + }, + { + "auxiliary_loss_clip": 0.01016676, + "auxiliary_loss_mlp": 0.01002435, + "balance_loss_clip": 1.00033665, + "balance_loss_mlp": 1.00356472, + "epoch": 0.2400120246505336, + "flos": 61957425047040.0, + "grad_norm": 0.8861831914865831, + "language_loss": 0.63729334, + "learning_rate": 3.4582475819955995e-06, + "loss": 0.65748447, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.13085938, + "step": 3992, + "time_per_iteration": 2.9383041858673096 + }, + { + "auxiliary_loss_clip": 0.01015881, + "auxiliary_loss_mlp": 0.01003887, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00301957, + "epoch": 0.24007214790320155, + "flos": 66705333327360.0, + "grad_norm": 0.7551399710724225, + "language_loss": 0.62935436, + "learning_rate": 3.457988981238386e-06, + "loss": 0.64955205, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.12890625, + "step": 3993, + "time_per_iteration": 3.1410489082336426 + }, + { + "auxiliary_loss_clip": 0.01090296, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.02278697, + "balance_loss_mlp": 1.02814472, + "epoch": 0.24013227115586952, + "flos": 25807971300480.0, + "grad_norm": 1.4519346805010132, + "language_loss": 0.76740205, + "learning_rate": 3.457730328449296e-06, + "loss": 0.78871101, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.62109375, + "step": 3994, + "time_per_iteration": 2.4315245151519775 + }, + { + "auxiliary_loss_clip": 0.01087546, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_clip": 1.02358913, + "balance_loss_mlp": 1.02564549, + "epoch": 0.2401923944085375, + "flos": 25556282242560.0, + "grad_norm": 1.8370602115244423, + "language_loss": 0.7836957, + "learning_rate": 3.457471623637561e-06, + "loss": 0.80501139, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.6171875, + "step": 3995, + "time_per_iteration": 2.4415955543518066 + }, + { + "auxiliary_loss_clip": 0.01015788, + "auxiliary_loss_mlp": 0.01010019, + "balance_loss_clip": 1.00781357, + "balance_loss_mlp": 1.00253904, + "epoch": 0.24025251766120548, + "flos": 54937751909760.0, + "grad_norm": 0.9058747089563907, + "language_loss": 0.63457066, + "learning_rate": 3.457212866812412e-06, + "loss": 0.65482873, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.1328125, + "step": 3996, + "time_per_iteration": 3.1012489795684814 + }, + { + "auxiliary_loss_clip": 0.0108904, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.01726937, + "balance_loss_mlp": 1.02666843, + "epoch": 0.24031264091387344, + "flos": 20630037404160.0, + "grad_norm": 2.4189437709741135, + "language_loss": 0.70645809, + "learning_rate": 3.4569540579830853e-06, + "loss": 0.72769713, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.625, + "step": 3997, + "time_per_iteration": 2.3884925842285156 + }, + { + "auxiliary_loss_clip": 0.01086762, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.01834548, + "balance_loss_mlp": 1.02772617, + "epoch": 0.2403727641665414, + "flos": 20885217598080.0, + "grad_norm": 1.6862980998775594, + "language_loss": 0.87110609, + "learning_rate": 3.456695197158815e-06, + "loss": 0.89232552, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.58984375, + "step": 3998, + "time_per_iteration": 2.3747923374176025 + }, + { + "auxiliary_loss_clip": 0.01087042, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.01695061, + "balance_loss_mlp": 1.02427578, + "epoch": 0.24043288741920937, + "flos": 22818952005120.0, + "grad_norm": 1.8146267175905189, + "language_loss": 0.75909293, + "learning_rate": 3.4564362843488403e-06, + "loss": 0.78031266, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.62890625, + "step": 3999, + "time_per_iteration": 2.4066219329833984 + }, + { + "auxiliary_loss_clip": 0.01088464, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.02061713, + "balance_loss_mlp": 1.02898049, + "epoch": 0.24049301067187734, + "flos": 27958551361920.0, + "grad_norm": 1.994489415676596, + "language_loss": 0.72343725, + "learning_rate": 3.4561773195624015e-06, + "loss": 0.74469668, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.59375, + "step": 4000, + "time_per_iteration": 2.4440486431121826 + }, + { + "auxiliary_loss_clip": 0.01092571, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.01569605, + "balance_loss_mlp": 1.0288868, + "epoch": 0.24055313392454533, + "flos": 27450250744320.0, + "grad_norm": 1.828478248269627, + "language_loss": 0.66823041, + "learning_rate": 3.4559183028087394e-06, + "loss": 0.6894989, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.63671875, + "step": 4001, + "time_per_iteration": 2.4392964839935303 + }, + { + "auxiliary_loss_clip": 0.0108736, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.01525068, + "balance_loss_mlp": 1.02682328, + "epoch": 0.2406132571772133, + "flos": 25555444369920.0, + "grad_norm": 2.6887092498307257, + "language_loss": 0.82976794, + "learning_rate": 3.4556592340970983e-06, + "loss": 0.85096872, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.60546875, + "step": 4002, + "time_per_iteration": 2.4116885662078857 + }, + { + "auxiliary_loss_clip": 0.01088875, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.02040708, + "balance_loss_mlp": 1.02849126, + "epoch": 0.24067338042988126, + "flos": 24790217990400.0, + "grad_norm": 1.8993811145234223, + "language_loss": 0.83022451, + "learning_rate": 3.4554001134367237e-06, + "loss": 0.85148919, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6015625, + "step": 4003, + "time_per_iteration": 3.7910866737365723 + }, + { + "auxiliary_loss_clip": 0.01088616, + "auxiliary_loss_mlp": 0.01030393, + "balance_loss_clip": 1.01308382, + "balance_loss_mlp": 1.02802551, + "epoch": 0.24073350368254923, + "flos": 21176882029440.0, + "grad_norm": 1.9592529864304187, + "language_loss": 0.87321031, + "learning_rate": 3.4551409408368627e-06, + "loss": 0.89440036, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.60546875, + "step": 4004, + "time_per_iteration": 2.387267589569092 + }, + { + "auxiliary_loss_clip": 0.01089989, + "auxiliary_loss_mlp": 0.01047377, + "balance_loss_clip": 1.02936447, + "balance_loss_mlp": 1.02769518, + "epoch": 0.2407936269352172, + "flos": 22493142397440.0, + "grad_norm": 1.7521791139733762, + "language_loss": 0.78025109, + "learning_rate": 3.4548817163067643e-06, + "loss": 0.80162477, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 4005, + "time_per_iteration": 2.4593660831451416 + }, + { + "auxiliary_loss_clip": 0.01086327, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.01314926, + "balance_loss_mlp": 1.02741086, + "epoch": 0.24085375018788516, + "flos": 18550156579200.0, + "grad_norm": 1.6009108353701194, + "language_loss": 0.79030287, + "learning_rate": 3.4546224398556804e-06, + "loss": 0.81147361, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.58984375, + "step": 4006, + "time_per_iteration": 2.377345085144043 + }, + { + "auxiliary_loss_clip": 0.01090476, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.01683617, + "balance_loss_mlp": 1.02639747, + "epoch": 0.24091387344055312, + "flos": 24169392460800.0, + "grad_norm": 1.7063835853425924, + "language_loss": 0.70962864, + "learning_rate": 3.4543631114928627e-06, + "loss": 0.73090243, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.640625, + "step": 4007, + "time_per_iteration": 3.82952880859375 + }, + { + "auxiliary_loss_clip": 0.01087084, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.01698875, + "balance_loss_mlp": 1.02690339, + "epoch": 0.24097399669322112, + "flos": 11035520830080.0, + "grad_norm": 1.7695115017731504, + "language_loss": 0.76055849, + "learning_rate": 3.454103731227567e-06, + "loss": 0.78176832, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.6015625, + "step": 4008, + "time_per_iteration": 3.7759323120117188 + }, + { + "auxiliary_loss_clip": 0.0108743, + "auxiliary_loss_mlp": 0.01035047, + "balance_loss_clip": 1.01684332, + "balance_loss_mlp": 1.02693796, + "epoch": 0.24103411994588908, + "flos": 17164139581440.0, + "grad_norm": 2.4837077594938743, + "language_loss": 0.74269611, + "learning_rate": 3.4538442990690494e-06, + "loss": 0.7639209, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6015625, + "step": 4009, + "time_per_iteration": 2.35188364982605 + }, + { + "auxiliary_loss_clip": 0.01086402, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.01790273, + "balance_loss_mlp": 1.02748787, + "epoch": 0.24109424319855705, + "flos": 20666905666560.0, + "grad_norm": 1.6192510377422105, + "language_loss": 0.80016541, + "learning_rate": 3.4535848150265684e-06, + "loss": 0.82137728, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.58984375, + "step": 4010, + "time_per_iteration": 2.394073009490967 + }, + { + "auxiliary_loss_clip": 0.01087002, + "auxiliary_loss_mlp": 0.01038271, + "balance_loss_clip": 1.01880407, + "balance_loss_mlp": 1.02544391, + "epoch": 0.241154366451225, + "flos": 28180598808960.0, + "grad_norm": 1.7287118550449876, + "language_loss": 0.70196462, + "learning_rate": 3.453325279109385e-06, + "loss": 0.72321737, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.6171875, + "step": 4011, + "time_per_iteration": 2.4279544353485107 + }, + { + "auxiliary_loss_clip": 0.01085882, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.01656342, + "balance_loss_mlp": 1.02537155, + "epoch": 0.24121448970389298, + "flos": 21688638871680.0, + "grad_norm": 1.6895419506254177, + "language_loss": 0.6934607, + "learning_rate": 3.45306569132676e-06, + "loss": 0.71464986, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.60546875, + "step": 4012, + "time_per_iteration": 3.7878193855285645 + }, + { + "auxiliary_loss_clip": 0.01084459, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.01620817, + "balance_loss_mlp": 1.02454436, + "epoch": 0.24127461295656094, + "flos": 39674634791040.0, + "grad_norm": 2.202969993406684, + "language_loss": 0.73634732, + "learning_rate": 3.4528060516879587e-06, + "loss": 0.75753653, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.6015625, + "step": 4013, + "time_per_iteration": 2.538541555404663 + }, + { + "auxiliary_loss_clip": 0.01088306, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.01327085, + "balance_loss_mlp": 1.02741277, + "epoch": 0.2413347362092289, + "flos": 19134846984960.0, + "grad_norm": 2.1172922822267872, + "language_loss": 0.88291633, + "learning_rate": 3.4525463602022465e-06, + "loss": 0.90409851, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.609375, + "step": 4014, + "time_per_iteration": 2.369629144668579 + }, + { + "auxiliary_loss_clip": 0.01089427, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.01686883, + "balance_loss_mlp": 1.02668977, + "epoch": 0.2413948594618969, + "flos": 26938319345280.0, + "grad_norm": 1.854821965618708, + "language_loss": 0.94809508, + "learning_rate": 3.452286616878891e-06, + "loss": 0.96934193, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.62890625, + "step": 4015, + "time_per_iteration": 2.415951728820801 + }, + { + "auxiliary_loss_clip": 0.01088434, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.01816916, + "balance_loss_mlp": 1.02596617, + "epoch": 0.24145498271456486, + "flos": 25226946587520.0, + "grad_norm": 1.5234313401014352, + "language_loss": 0.82684982, + "learning_rate": 3.4520268217271616e-06, + "loss": 0.84809065, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.625, + "step": 4016, + "time_per_iteration": 2.421196222305298 + }, + { + "auxiliary_loss_clip": 0.01085515, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.01752841, + "balance_loss_mlp": 1.02704406, + "epoch": 0.24151510596723283, + "flos": 40660163049600.0, + "grad_norm": 1.74949835870112, + "language_loss": 0.68899834, + "learning_rate": 3.4517669747563305e-06, + "loss": 0.71020174, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5859375, + "step": 4017, + "time_per_iteration": 2.546879291534424 + }, + { + "auxiliary_loss_clip": 0.01089601, + "auxiliary_loss_mlp": 0.0103811, + "balance_loss_clip": 1.01883423, + "balance_loss_mlp": 1.02611876, + "epoch": 0.2415752292199008, + "flos": 18145792679040.0, + "grad_norm": 1.6398130322302436, + "language_loss": 0.8472954, + "learning_rate": 3.4515070759756704e-06, + "loss": 0.86857247, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.6328125, + "step": 4018, + "time_per_iteration": 2.3574776649475098 + }, + { + "auxiliary_loss_clip": 0.01016053, + "auxiliary_loss_mlp": 0.01008812, + "balance_loss_clip": 1.00705934, + "balance_loss_mlp": 1.00344074, + "epoch": 0.24163535247256876, + "flos": 67285275788160.0, + "grad_norm": 0.8154254800086314, + "language_loss": 0.60669744, + "learning_rate": 3.4512471253944563e-06, + "loss": 0.62694609, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.12597656, + "step": 4019, + "time_per_iteration": 3.075002908706665 + }, + { + "auxiliary_loss_clip": 0.01085485, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.01525688, + "balance_loss_mlp": 1.02541828, + "epoch": 0.24169547572523672, + "flos": 24928963200000.0, + "grad_norm": 1.7501434214385798, + "language_loss": 0.73478138, + "learning_rate": 3.4509871230219653e-06, + "loss": 0.75595617, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.6015625, + "step": 4020, + "time_per_iteration": 2.409864664077759 + }, + { + "auxiliary_loss_clip": 0.01088366, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.01759028, + "balance_loss_mlp": 1.02827859, + "epoch": 0.24175559897790472, + "flos": 18727480707840.0, + "grad_norm": 5.2569282255920795, + "language_loss": 0.81802928, + "learning_rate": 3.4507270688674767e-06, + "loss": 0.83925021, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.6015625, + "step": 4021, + "time_per_iteration": 2.3730390071868896 + }, + { + "auxiliary_loss_clip": 0.0109031, + "auxiliary_loss_mlp": 0.01039498, + "balance_loss_clip": 1.02059174, + "balance_loss_mlp": 1.02699518, + "epoch": 0.24181572223057268, + "flos": 23038171632000.0, + "grad_norm": 1.8459056461712018, + "language_loss": 0.7628122, + "learning_rate": 3.45046696294027e-06, + "loss": 0.78411031, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6328125, + "step": 4022, + "time_per_iteration": 2.4485411643981934 + }, + { + "auxiliary_loss_clip": 0.01089441, + "auxiliary_loss_mlp": 0.01044885, + "balance_loss_clip": 1.02590644, + "balance_loss_mlp": 1.02579141, + "epoch": 0.24187584548324065, + "flos": 20375101589760.0, + "grad_norm": 1.6864067988128646, + "language_loss": 0.74856055, + "learning_rate": 3.4502068052496283e-06, + "loss": 0.76990384, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.63671875, + "step": 4023, + "time_per_iteration": 2.3793880939483643 + }, + { + "auxiliary_loss_clip": 0.01086206, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.02359855, + "balance_loss_mlp": 1.02751613, + "epoch": 0.2419359687359086, + "flos": 21396450769920.0, + "grad_norm": 1.810165434136769, + "language_loss": 0.82164931, + "learning_rate": 3.449946595804837e-06, + "loss": 0.84292001, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5859375, + "step": 4024, + "time_per_iteration": 2.3825526237487793 + }, + { + "auxiliary_loss_clip": 0.01084842, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.01990604, + "balance_loss_mlp": 1.02621841, + "epoch": 0.24199609198857658, + "flos": 18368398707840.0, + "grad_norm": 1.6964064599763604, + "language_loss": 0.86108863, + "learning_rate": 3.4496863346151805e-06, + "loss": 0.88231087, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5859375, + "step": 4025, + "time_per_iteration": 2.3538613319396973 + }, + { + "auxiliary_loss_clip": 0.01088574, + "auxiliary_loss_mlp": 0.01045561, + "balance_loss_clip": 1.02785802, + "balance_loss_mlp": 1.02560258, + "epoch": 0.24205621524124454, + "flos": 19462856008320.0, + "grad_norm": 1.9397013807255439, + "language_loss": 0.71698177, + "learning_rate": 3.449426021689949e-06, + "loss": 0.73832315, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.62890625, + "step": 4026, + "time_per_iteration": 2.3856053352355957 + }, + { + "auxiliary_loss_clip": 0.01085522, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.01926684, + "balance_loss_mlp": 1.02678347, + "epoch": 0.2421163384939125, + "flos": 14975434448640.0, + "grad_norm": 1.8605379154607802, + "language_loss": 0.64380479, + "learning_rate": 3.44916565703843e-06, + "loss": 0.66501278, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5859375, + "step": 4027, + "time_per_iteration": 2.3626890182495117 + }, + { + "auxiliary_loss_clip": 0.01088122, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.01759148, + "balance_loss_mlp": 1.02771604, + "epoch": 0.2421764617465805, + "flos": 18661040657280.0, + "grad_norm": 2.0249456577952, + "language_loss": 0.70523262, + "learning_rate": 3.4489052406699167e-06, + "loss": 0.72644901, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.6015625, + "step": 4028, + "time_per_iteration": 2.4186532497406006 + }, + { + "auxiliary_loss_clip": 0.01014611, + "auxiliary_loss_mlp": 0.01008285, + "balance_loss_clip": 1.00647306, + "balance_loss_mlp": 1.00195169, + "epoch": 0.24223658499924847, + "flos": 64343877454080.0, + "grad_norm": 0.8748438877926056, + "language_loss": 0.55332172, + "learning_rate": 3.4486447725937024e-06, + "loss": 0.57355064, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.12695312, + "step": 4029, + "time_per_iteration": 3.024845600128174 + }, + { + "auxiliary_loss_clip": 0.01086545, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.01610661, + "balance_loss_mlp": 1.02595139, + "epoch": 0.24229670825191643, + "flos": 25774070503680.0, + "grad_norm": 3.093962376183483, + "language_loss": 0.73978829, + "learning_rate": 3.448384252819083e-06, + "loss": 0.7609964, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.609375, + "step": 4030, + "time_per_iteration": 2.437941789627075 + }, + { + "auxiliary_loss_clip": 0.0108547, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.02177572, + "balance_loss_mlp": 1.02543759, + "epoch": 0.2423568315045844, + "flos": 20666067793920.0, + "grad_norm": 1.906826339171084, + "language_loss": 0.76540571, + "learning_rate": 3.4481236813553544e-06, + "loss": 0.7866565, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6015625, + "step": 4031, + "time_per_iteration": 2.4073100090026855 + }, + { + "auxiliary_loss_clip": 0.01088417, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.01839709, + "balance_loss_mlp": 1.02473474, + "epoch": 0.24241695475725236, + "flos": 22415775091200.0, + "grad_norm": 2.01946046040732, + "language_loss": 0.65303868, + "learning_rate": 3.447863058211817e-06, + "loss": 0.67428446, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.63671875, + "step": 4032, + "time_per_iteration": 2.392939329147339 + }, + { + "auxiliary_loss_clip": 0.01084638, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.01661599, + "balance_loss_mlp": 1.02461696, + "epoch": 0.24247707800992033, + "flos": 17128039368960.0, + "grad_norm": 2.019517899431195, + "language_loss": 0.81508386, + "learning_rate": 3.447602383397772e-06, + "loss": 0.83626723, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.6015625, + "step": 4033, + "time_per_iteration": 2.3657474517822266 + }, + { + "auxiliary_loss_clip": 0.01082597, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.01432991, + "balance_loss_mlp": 1.02487993, + "epoch": 0.2425372012625883, + "flos": 31612386366720.0, + "grad_norm": 2.0192584786228083, + "language_loss": 0.69367337, + "learning_rate": 3.447341656922521e-06, + "loss": 0.71480614, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.578125, + "step": 4034, + "time_per_iteration": 2.4704880714416504 + }, + { + "auxiliary_loss_clip": 0.01084263, + "auxiliary_loss_mlp": 0.01031035, + "balance_loss_clip": 1.01366591, + "balance_loss_mlp": 1.02409565, + "epoch": 0.24259732451525629, + "flos": 16325106854400.0, + "grad_norm": 3.8398012747057058, + "language_loss": 0.96217966, + "learning_rate": 3.4470808787953693e-06, + "loss": 0.98333263, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.6015625, + "step": 4035, + "time_per_iteration": 2.3749608993530273 + }, + { + "auxiliary_loss_clip": 0.01080649, + "auxiliary_loss_mlp": 0.01035368, + "balance_loss_clip": 1.01975203, + "balance_loss_mlp": 1.02442312, + "epoch": 0.24265744776792425, + "flos": 22855540976640.0, + "grad_norm": 1.5507496232106701, + "language_loss": 0.77687532, + "learning_rate": 3.4468200490256236e-06, + "loss": 0.7980355, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5625, + "step": 4036, + "time_per_iteration": 2.391878366470337 + }, + { + "auxiliary_loss_clip": 0.01085758, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.01980233, + "balance_loss_mlp": 1.02556109, + "epoch": 0.24271757102059222, + "flos": 21870501477120.0, + "grad_norm": 1.6914231535311977, + "language_loss": 0.74482965, + "learning_rate": 3.4465591676225916e-06, + "loss": 0.76605654, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.6015625, + "step": 4037, + "time_per_iteration": 2.3816864490509033 + }, + { + "auxiliary_loss_clip": 0.01087143, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_clip": 1.01744318, + "balance_loss_mlp": 1.02559233, + "epoch": 0.24277769427326018, + "flos": 19207571080320.0, + "grad_norm": 2.739470830703685, + "language_loss": 0.76286781, + "learning_rate": 3.446298234595584e-06, + "loss": 0.78409111, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6171875, + "step": 4038, + "time_per_iteration": 2.3826780319213867 + }, + { + "auxiliary_loss_clip": 0.01087325, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.01804256, + "balance_loss_mlp": 1.02585304, + "epoch": 0.24283781752592815, + "flos": 19498886398080.0, + "grad_norm": 1.621592340513611, + "language_loss": 0.71952415, + "learning_rate": 3.4460372499539133e-06, + "loss": 0.74076843, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.61328125, + "step": 4039, + "time_per_iteration": 2.373342990875244 + }, + { + "auxiliary_loss_clip": 0.01086069, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.02503335, + "balance_loss_mlp": 1.02581549, + "epoch": 0.2428979407785961, + "flos": 19901155616640.0, + "grad_norm": 1.665761264889598, + "language_loss": 0.78207022, + "learning_rate": 3.4457762137068923e-06, + "loss": 0.80336183, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6015625, + "step": 4040, + "time_per_iteration": 2.3768017292022705 + }, + { + "auxiliary_loss_clip": 0.01079687, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01310945, + "balance_loss_mlp": 1.02332282, + "epoch": 0.2429580640312641, + "flos": 24714770808960.0, + "grad_norm": 2.7083814955089975, + "language_loss": 0.805282, + "learning_rate": 3.4455151258638377e-06, + "loss": 0.8263731, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5625, + "step": 4041, + "time_per_iteration": 2.411120653152466 + }, + { + "auxiliary_loss_clip": 0.01084251, + "auxiliary_loss_mlp": 0.0103811, + "balance_loss_clip": 1.02089632, + "balance_loss_mlp": 1.0257473, + "epoch": 0.24301818728393207, + "flos": 25629145983360.0, + "grad_norm": 1.9368423125732326, + "language_loss": 0.7684803, + "learning_rate": 3.445253986434066e-06, + "loss": 0.78970385, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 4042, + "time_per_iteration": 3.937272548675537 + }, + { + "auxiliary_loss_clip": 0.01083316, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.01465082, + "balance_loss_mlp": 1.02557111, + "epoch": 0.24307831053660003, + "flos": 26140169687040.0, + "grad_norm": 1.6918458469240896, + "language_loss": 0.81556666, + "learning_rate": 3.4449927954268977e-06, + "loss": 0.83670294, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.578125, + "step": 4043, + "time_per_iteration": 2.4226481914520264 + }, + { + "auxiliary_loss_clip": 0.01085304, + "auxiliary_loss_mlp": 0.01033457, + "balance_loss_clip": 1.01509857, + "balance_loss_mlp": 1.02388227, + "epoch": 0.243138433789268, + "flos": 14971629110400.0, + "grad_norm": 2.114715848266188, + "language_loss": 0.72919255, + "learning_rate": 3.444731552851653e-06, + "loss": 0.75038016, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.61328125, + "step": 4044, + "time_per_iteration": 2.3478853702545166 + }, + { + "auxiliary_loss_clip": 0.01087591, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.01803792, + "balance_loss_mlp": 1.02715635, + "epoch": 0.24319855704193596, + "flos": 25190532172800.0, + "grad_norm": 1.7308929888142233, + "language_loss": 0.83334029, + "learning_rate": 3.4444702587176556e-06, + "loss": 0.85457766, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6015625, + "step": 4045, + "time_per_iteration": 2.4102683067321777 + }, + { + "auxiliary_loss_clip": 0.01088269, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.01668811, + "balance_loss_mlp": 1.02836776, + "epoch": 0.24325868029460393, + "flos": 22126135518720.0, + "grad_norm": 1.5821011833520624, + "language_loss": 0.86685628, + "learning_rate": 3.4442089130342303e-06, + "loss": 0.88808119, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.59765625, + "step": 4046, + "time_per_iteration": 2.3933961391448975 + }, + { + "auxiliary_loss_clip": 0.01081384, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01439667, + "balance_loss_mlp": 1.02443242, + "epoch": 0.2433188035472719, + "flos": 23581106184960.0, + "grad_norm": 1.7697531404225275, + "language_loss": 0.72610867, + "learning_rate": 3.443947515810704e-06, + "loss": 0.74722838, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5703125, + "step": 4047, + "time_per_iteration": 3.804840087890625 + }, + { + "auxiliary_loss_clip": 0.01085507, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.01139784, + "balance_loss_mlp": 1.02518833, + "epoch": 0.2433789267999399, + "flos": 24461650385280.0, + "grad_norm": 2.5013067149024284, + "language_loss": 0.73022771, + "learning_rate": 3.4436860670564053e-06, + "loss": 0.7513746, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 4048, + "time_per_iteration": 3.843885898590088 + }, + { + "auxiliary_loss_clip": 0.01083257, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.01883984, + "balance_loss_mlp": 1.02464604, + "epoch": 0.24343905005260785, + "flos": 16726957136640.0, + "grad_norm": 1.7660263379588135, + "language_loss": 0.73191977, + "learning_rate": 3.443424566780664e-06, + "loss": 0.75309694, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5859375, + "step": 4049, + "time_per_iteration": 2.374965190887451 + }, + { + "auxiliary_loss_clip": 0.01081639, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.01581013, + "balance_loss_mlp": 1.02405167, + "epoch": 0.24349917330527582, + "flos": 20042833380480.0, + "grad_norm": 1.6627961175457093, + "language_loss": 0.73763454, + "learning_rate": 3.4431630149928126e-06, + "loss": 0.75877011, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.57421875, + "step": 4050, + "time_per_iteration": 2.358772039413452 + }, + { + "auxiliary_loss_clip": 0.0108224, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01356351, + "balance_loss_mlp": 1.02507544, + "epoch": 0.24355929655794378, + "flos": 17419599066240.0, + "grad_norm": 2.8060132166891796, + "language_loss": 0.74937081, + "learning_rate": 3.442901411702186e-06, + "loss": 0.77048314, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5703125, + "step": 4051, + "time_per_iteration": 2.374464750289917 + }, + { + "auxiliary_loss_clip": 0.01081998, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.01667809, + "balance_loss_mlp": 1.023803, + "epoch": 0.24361941981061175, + "flos": 25409751799680.0, + "grad_norm": 2.1546092405486124, + "language_loss": 0.70442474, + "learning_rate": 3.44263975691812e-06, + "loss": 0.72558182, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.578125, + "step": 4052, + "time_per_iteration": 3.8116774559020996 + }, + { + "auxiliary_loss_clip": 0.01084685, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.0171299, + "balance_loss_mlp": 1.02413774, + "epoch": 0.2436795430632797, + "flos": 22819685143680.0, + "grad_norm": 1.5679740817017886, + "language_loss": 0.80948436, + "learning_rate": 3.4423780506499513e-06, + "loss": 0.83067811, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.60546875, + "step": 4053, + "time_per_iteration": 2.396677017211914 + }, + { + "auxiliary_loss_clip": 0.01085847, + "auxiliary_loss_mlp": 0.01034954, + "balance_loss_clip": 1.01727521, + "balance_loss_mlp": 1.02519655, + "epoch": 0.2437396663159477, + "flos": 15156913029120.0, + "grad_norm": 1.5736054686827436, + "language_loss": 0.78104687, + "learning_rate": 3.44211629290702e-06, + "loss": 0.80225492, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.60546875, + "step": 4054, + "time_per_iteration": 2.352160930633545 + }, + { + "auxiliary_loss_clip": 0.01083136, + "auxiliary_loss_mlp": 0.01040874, + "balance_loss_clip": 1.02467966, + "balance_loss_mlp": 1.02459168, + "epoch": 0.24379978956861567, + "flos": 22090035306240.0, + "grad_norm": 1.711466330754344, + "language_loss": 0.8373248, + "learning_rate": 3.441854483698668e-06, + "loss": 0.85856485, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5859375, + "step": 4055, + "time_per_iteration": 2.4138126373291016 + }, + { + "auxiliary_loss_clip": 0.01087056, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.01502132, + "balance_loss_mlp": 1.02472568, + "epoch": 0.24385991282128364, + "flos": 31466414505600.0, + "grad_norm": 2.374355288857586, + "language_loss": 0.54764944, + "learning_rate": 3.441592623034239e-06, + "loss": 0.56884593, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.625, + "step": 4056, + "time_per_iteration": 2.4502336978912354 + }, + { + "auxiliary_loss_clip": 0.01087786, + "auxiliary_loss_mlp": 0.01043258, + "balance_loss_clip": 1.02503085, + "balance_loss_mlp": 1.02590561, + "epoch": 0.2439200360739516, + "flos": 23837752656000.0, + "grad_norm": 2.9310997272899773, + "language_loss": 0.80442935, + "learning_rate": 3.4413307109230772e-06, + "loss": 0.8257398, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6171875, + "step": 4057, + "time_per_iteration": 2.383955240249634 + }, + { + "auxiliary_loss_clip": 0.01083531, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.01710176, + "balance_loss_mlp": 1.02569675, + "epoch": 0.24398015932661957, + "flos": 19169027072640.0, + "grad_norm": 1.657710442182054, + "language_loss": 0.7917918, + "learning_rate": 3.44106874737453e-06, + "loss": 0.81295723, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.578125, + "step": 4058, + "time_per_iteration": 2.3650920391082764 + }, + { + "auxiliary_loss_clip": 0.01087073, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.01982701, + "balance_loss_mlp": 1.02571154, + "epoch": 0.24404028257928753, + "flos": 25261371054720.0, + "grad_norm": 1.5991901212591748, + "language_loss": 0.85366106, + "learning_rate": 3.440806732397945e-06, + "loss": 0.87489927, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.61328125, + "step": 4059, + "time_per_iteration": 2.430112600326538 + }, + { + "auxiliary_loss_clip": 0.01082426, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.0167253, + "balance_loss_mlp": 1.02541184, + "epoch": 0.2441004058319555, + "flos": 26466433142400.0, + "grad_norm": 1.5466199952626698, + "language_loss": 0.74175167, + "learning_rate": 3.4405446660026753e-06, + "loss": 0.76290363, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5703125, + "step": 4060, + "time_per_iteration": 2.416564464569092 + }, + { + "auxiliary_loss_clip": 0.01089481, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_clip": 1.0230453, + "balance_loss_mlp": 1.02646375, + "epoch": 0.2441605290846235, + "flos": 26759319471360.0, + "grad_norm": 1.666582484146722, + "language_loss": 0.73178411, + "learning_rate": 3.4402825481980707e-06, + "loss": 0.7531172, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.6328125, + "step": 4061, + "time_per_iteration": 2.4373011589050293 + }, + { + "auxiliary_loss_clip": 0.01084766, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.01852179, + "balance_loss_mlp": 1.02573442, + "epoch": 0.24422065233729146, + "flos": 21104786338560.0, + "grad_norm": 1.6867490590711338, + "language_loss": 0.7636472, + "learning_rate": 3.4400203789934876e-06, + "loss": 0.78483665, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.58984375, + "step": 4062, + "time_per_iteration": 2.389997959136963 + }, + { + "auxiliary_loss_clip": 0.01083111, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.0172472, + "balance_loss_mlp": 1.02626681, + "epoch": 0.24428077558995942, + "flos": 25262034370560.0, + "grad_norm": 1.5326945868476514, + "language_loss": 0.85370314, + "learning_rate": 3.4397581583982814e-06, + "loss": 0.87487519, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.56640625, + "step": 4063, + "time_per_iteration": 2.4098868370056152 + }, + { + "auxiliary_loss_clip": 0.01087889, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.01442289, + "balance_loss_mlp": 1.02639985, + "epoch": 0.24434089884262739, + "flos": 20484240099840.0, + "grad_norm": 2.3194674917998577, + "language_loss": 0.6861009, + "learning_rate": 3.43949588642181e-06, + "loss": 0.70730186, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.61328125, + "step": 4064, + "time_per_iteration": 2.369448661804199 + }, + { + "auxiliary_loss_clip": 0.01090568, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.01345634, + "balance_loss_mlp": 1.0276202, + "epoch": 0.24440102209529535, + "flos": 23620802267520.0, + "grad_norm": 1.7129569413561863, + "language_loss": 0.70268422, + "learning_rate": 3.439233563073433e-06, + "loss": 0.72391045, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.62890625, + "step": 4065, + "time_per_iteration": 2.415884017944336 + }, + { + "auxiliary_loss_clip": 0.0109049, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.01882362, + "balance_loss_mlp": 1.0269953, + "epoch": 0.24446114534796332, + "flos": 20553787261440.0, + "grad_norm": 1.8505396824588383, + "language_loss": 0.83956051, + "learning_rate": 3.4389711883625124e-06, + "loss": 0.86085689, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6328125, + "step": 4066, + "time_per_iteration": 2.3772501945495605 + }, + { + "auxiliary_loss_clip": 0.01018032, + "auxiliary_loss_mlp": 0.01005863, + "balance_loss_clip": 1.00388443, + "balance_loss_mlp": 1.00498223, + "epoch": 0.24452126860063128, + "flos": 60386717623680.0, + "grad_norm": 0.9313977616385747, + "language_loss": 0.52255923, + "learning_rate": 3.4387087622984114e-06, + "loss": 0.54279816, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.13085938, + "step": 4067, + "time_per_iteration": 2.9381279945373535 + }, + { + "auxiliary_loss_clip": 0.01088623, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.02146351, + "balance_loss_mlp": 1.02648866, + "epoch": 0.24458139185329927, + "flos": 15120777905280.0, + "grad_norm": 2.6971374172915916, + "language_loss": 0.72052568, + "learning_rate": 3.4384462848904956e-06, + "loss": 0.74181682, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.62109375, + "step": 4068, + "time_per_iteration": 2.349055767059326 + }, + { + "auxiliary_loss_clip": 0.01087688, + "auxiliary_loss_mlp": 0.01039753, + "balance_loss_clip": 1.02128696, + "balance_loss_mlp": 1.02718902, + "epoch": 0.24464151510596724, + "flos": 27997549217280.0, + "grad_norm": 1.942589941292615, + "language_loss": 0.77926159, + "learning_rate": 3.438183756148132e-06, + "loss": 0.80053604, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.60546875, + "step": 4069, + "time_per_iteration": 2.4349539279937744 + }, + { + "auxiliary_loss_clip": 0.01088186, + "auxiliary_loss_mlp": 0.01040825, + "balance_loss_clip": 1.02281272, + "balance_loss_mlp": 1.02837825, + "epoch": 0.2447016383586352, + "flos": 19791842549760.0, + "grad_norm": 1.8921631164805177, + "language_loss": 0.80191195, + "learning_rate": 3.4379211760806895e-06, + "loss": 0.82320201, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.59765625, + "step": 4070, + "time_per_iteration": 2.3638057708740234 + }, + { + "auxiliary_loss_clip": 0.0108554, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.01531875, + "balance_loss_mlp": 1.02607942, + "epoch": 0.24476176161130317, + "flos": 26066153871360.0, + "grad_norm": 1.5368435014631852, + "language_loss": 0.84227765, + "learning_rate": 3.4376585446975394e-06, + "loss": 0.86345899, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.59375, + "step": 4071, + "time_per_iteration": 2.4472923278808594 + }, + { + "auxiliary_loss_clip": 0.01089303, + "auxiliary_loss_mlp": 0.01039843, + "balance_loss_clip": 1.02109122, + "balance_loss_mlp": 1.02579284, + "epoch": 0.24482188486397113, + "flos": 18842554149120.0, + "grad_norm": 1.9747597626384026, + "language_loss": 0.80001962, + "learning_rate": 3.4373958620080535e-06, + "loss": 0.82131112, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6328125, + "step": 4072, + "time_per_iteration": 2.3589060306549072 + }, + { + "auxiliary_loss_clip": 0.01086477, + "auxiliary_loss_mlp": 0.01040998, + "balance_loss_clip": 1.02470779, + "balance_loss_mlp": 1.02635396, + "epoch": 0.2448820081166391, + "flos": 21250723288320.0, + "grad_norm": 1.4779839136236708, + "language_loss": 0.70185995, + "learning_rate": 3.437133128021607e-06, + "loss": 0.7231347, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.6015625, + "step": 4073, + "time_per_iteration": 2.378868341445923 + }, + { + "auxiliary_loss_clip": 0.01084447, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.01822329, + "balance_loss_mlp": 1.02527618, + "epoch": 0.2449421313693071, + "flos": 23949474606720.0, + "grad_norm": 1.9629998189776336, + "language_loss": 0.67284667, + "learning_rate": 3.436870342747576e-06, + "loss": 0.69403696, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.58984375, + "step": 4074, + "time_per_iteration": 2.404540538787842 + }, + { + "auxiliary_loss_clip": 0.0101472, + "auxiliary_loss_mlp": 0.0100345, + "balance_loss_clip": 1.00168526, + "balance_loss_mlp": 1.00193989, + "epoch": 0.24500225462197506, + "flos": 60684631188480.0, + "grad_norm": 0.8952262686954836, + "language_loss": 0.68701637, + "learning_rate": 3.4366075061953383e-06, + "loss": 0.70719802, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.12695312, + "step": 4075, + "time_per_iteration": 3.095064878463745 + }, + { + "auxiliary_loss_clip": 0.01086011, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.01684546, + "balance_loss_mlp": 1.02662492, + "epoch": 0.24506237787464302, + "flos": 26283069348480.0, + "grad_norm": 1.7342231317190084, + "language_loss": 0.79356635, + "learning_rate": 3.4363446183742745e-06, + "loss": 0.81477296, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.59375, + "step": 4076, + "time_per_iteration": 2.4168450832366943 + }, + { + "auxiliary_loss_clip": 0.01090587, + "auxiliary_loss_mlp": 0.01037191, + "balance_loss_clip": 1.01723552, + "balance_loss_mlp": 1.02680612, + "epoch": 0.245122501127311, + "flos": 20551413288960.0, + "grad_norm": 1.7433185223663015, + "language_loss": 0.81870311, + "learning_rate": 3.436081679293765e-06, + "loss": 0.83998084, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.640625, + "step": 4077, + "time_per_iteration": 2.394547700881958 + }, + { + "auxiliary_loss_clip": 0.01086478, + "auxiliary_loss_mlp": 0.0103991, + "balance_loss_clip": 1.0213728, + "balance_loss_mlp": 1.02566075, + "epoch": 0.24518262437997895, + "flos": 29131318575360.0, + "grad_norm": 1.9140340180836666, + "language_loss": 0.6226418, + "learning_rate": 3.435818688963195e-06, + "loss": 0.64390564, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.609375, + "step": 4078, + "time_per_iteration": 2.4321401119232178 + }, + { + "auxiliary_loss_clip": 0.01085197, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.01416183, + "balance_loss_mlp": 1.02681994, + "epoch": 0.24524274763264692, + "flos": 23475807924480.0, + "grad_norm": 1.5585717701530637, + "language_loss": 0.75791383, + "learning_rate": 3.4355556473919496e-06, + "loss": 0.77906722, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5859375, + "step": 4079, + "time_per_iteration": 2.4183995723724365 + }, + { + "auxiliary_loss_clip": 0.01084062, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_clip": 1.0213238, + "balance_loss_mlp": 1.02516735, + "epoch": 0.24530287088531488, + "flos": 17200239793920.0, + "grad_norm": 1.632000518799865, + "language_loss": 0.74624711, + "learning_rate": 3.4352925545894158e-06, + "loss": 0.76748633, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.5859375, + "step": 4080, + "time_per_iteration": 2.383319139480591 + }, + { + "auxiliary_loss_clip": 0.01082398, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.0119822, + "balance_loss_mlp": 1.02472401, + "epoch": 0.24536299413798288, + "flos": 14866540318080.0, + "grad_norm": 1.7022497527737606, + "language_loss": 0.82644224, + "learning_rate": 3.4350294105649823e-06, + "loss": 0.84755707, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.57421875, + "step": 4081, + "time_per_iteration": 3.764127731323242 + }, + { + "auxiliary_loss_clip": 0.01084602, + "auxiliary_loss_mlp": 0.01039664, + "balance_loss_clip": 1.02171159, + "balance_loss_mlp": 1.02598345, + "epoch": 0.24542311739065084, + "flos": 35260600642560.0, + "grad_norm": 2.0308930023785985, + "language_loss": 0.73408455, + "learning_rate": 3.4347662153280407e-06, + "loss": 0.75532722, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5859375, + "step": 4082, + "time_per_iteration": 2.5416347980499268 + }, + { + "auxiliary_loss_clip": 0.01082989, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.02315247, + "balance_loss_mlp": 1.02551281, + "epoch": 0.2454832406433188, + "flos": 21502167966720.0, + "grad_norm": 1.7776691365028165, + "language_loss": 0.71883935, + "learning_rate": 3.4345029688879837e-06, + "loss": 0.74006146, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.57421875, + "step": 4083, + "time_per_iteration": 2.378974676132202 + }, + { + "auxiliary_loss_clip": 0.01087814, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.01862836, + "balance_loss_mlp": 1.02536213, + "epoch": 0.24554336389598677, + "flos": 14755795885440.0, + "grad_norm": 1.9786899534505793, + "language_loss": 0.74808884, + "learning_rate": 3.4342396712542057e-06, + "loss": 0.76934499, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.625, + "step": 4084, + "time_per_iteration": 2.385164976119995 + }, + { + "auxiliary_loss_clip": 0.01085357, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01353431, + "balance_loss_mlp": 1.02561307, + "epoch": 0.24560348714865474, + "flos": 14975504271360.0, + "grad_norm": 2.325602264681821, + "language_loss": 0.85318172, + "learning_rate": 3.433976322436103e-06, + "loss": 0.87434542, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.59765625, + "step": 4085, + "time_per_iteration": 2.3711678981781006 + }, + { + "auxiliary_loss_clip": 0.01085579, + "auxiliary_loss_mlp": 0.01039234, + "balance_loss_clip": 1.0209831, + "balance_loss_mlp": 1.02607286, + "epoch": 0.2456636104013227, + "flos": 22674202041600.0, + "grad_norm": 1.657697617162889, + "language_loss": 0.79277724, + "learning_rate": 3.433712922443074e-06, + "loss": 0.8140254, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.59375, + "step": 4086, + "time_per_iteration": 3.832991361618042 + }, + { + "auxiliary_loss_clip": 0.01083081, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.01735139, + "balance_loss_mlp": 1.02670062, + "epoch": 0.2457237336539907, + "flos": 27416629238400.0, + "grad_norm": 1.4285693402224882, + "language_loss": 0.75361806, + "learning_rate": 3.433449471284519e-06, + "loss": 0.77479619, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 4087, + "time_per_iteration": 2.438711643218994 + }, + { + "auxiliary_loss_clip": 0.01089521, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.0160439, + "balance_loss_mlp": 1.02893436, + "epoch": 0.24578385690665866, + "flos": 20411341447680.0, + "grad_norm": 2.8593998768559756, + "language_loss": 0.79570776, + "learning_rate": 3.433185968969839e-06, + "loss": 0.81694144, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.60546875, + "step": 4088, + "time_per_iteration": 3.823397636413574 + }, + { + "auxiliary_loss_clip": 0.01082546, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.00992155, + "balance_loss_mlp": 1.02513552, + "epoch": 0.24584398015932662, + "flos": 23914247178240.0, + "grad_norm": 1.441071505866467, + "language_loss": 0.77050972, + "learning_rate": 3.4329224155084386e-06, + "loss": 0.79159987, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.57421875, + "step": 4089, + "time_per_iteration": 2.4054677486419678 + }, + { + "auxiliary_loss_clip": 0.01085736, + "auxiliary_loss_mlp": 0.01040307, + "balance_loss_clip": 1.02196074, + "balance_loss_mlp": 1.0253849, + "epoch": 0.2459041034119946, + "flos": 41494866768000.0, + "grad_norm": 2.22920423312802, + "language_loss": 0.81344736, + "learning_rate": 3.4326588109097236e-06, + "loss": 0.8347078, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6015625, + "step": 4090, + "time_per_iteration": 2.5362229347229004 + }, + { + "auxiliary_loss_clip": 0.01088724, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.01815963, + "balance_loss_mlp": 1.02650738, + "epoch": 0.24596422666466256, + "flos": 19935824463360.0, + "grad_norm": 1.7131929053449881, + "language_loss": 0.73610687, + "learning_rate": 3.4323951551831004e-06, + "loss": 0.75736898, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.62109375, + "step": 4091, + "time_per_iteration": 2.367933988571167 + }, + { + "auxiliary_loss_clip": 0.01087415, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.0196197, + "balance_loss_mlp": 1.02809274, + "epoch": 0.24602434991733052, + "flos": 21543295415040.0, + "grad_norm": 2.608448135050141, + "language_loss": 0.7709353, + "learning_rate": 3.432131448337979e-06, + "loss": 0.7921859, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.59375, + "step": 4092, + "time_per_iteration": 3.747690439224243 + }, + { + "auxiliary_loss_clip": 0.01088136, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.01592183, + "balance_loss_mlp": 1.02500153, + "epoch": 0.24608447316999849, + "flos": 23183968936320.0, + "grad_norm": 2.343426035774861, + "language_loss": 0.81301463, + "learning_rate": 3.43186769038377e-06, + "loss": 0.83423603, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6328125, + "step": 4093, + "time_per_iteration": 2.376940965652466 + }, + { + "auxiliary_loss_clip": 0.01090146, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.01833272, + "balance_loss_mlp": 1.02576494, + "epoch": 0.24614459642266648, + "flos": 19641052920960.0, + "grad_norm": 3.50730343237232, + "language_loss": 0.8697294, + "learning_rate": 3.431603881329886e-06, + "loss": 0.89100766, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.64453125, + "step": 4094, + "time_per_iteration": 2.37156081199646 + }, + { + "auxiliary_loss_clip": 0.01017112, + "auxiliary_loss_mlp": 0.01004962, + "balance_loss_clip": 1.00329256, + "balance_loss_mlp": 1.00362504, + "epoch": 0.24620471967533444, + "flos": 61736913699840.0, + "grad_norm": 0.7448628737259854, + "language_loss": 0.57499409, + "learning_rate": 3.4313400211857424e-06, + "loss": 0.59521484, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.13476562, + "step": 4095, + "time_per_iteration": 3.10062837600708 + }, + { + "auxiliary_loss_clip": 0.01016659, + "auxiliary_loss_mlp": 0.01002304, + "balance_loss_clip": 1.00047994, + "balance_loss_mlp": 1.00323367, + "epoch": 0.2462648429280024, + "flos": 69151103867520.0, + "grad_norm": 0.6407135851517143, + "language_loss": 0.56290764, + "learning_rate": 3.431076109960755e-06, + "loss": 0.58309728, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.13476562, + "step": 4096, + "time_per_iteration": 3.1554715633392334 + }, + { + "auxiliary_loss_clip": 0.01087476, + "auxiliary_loss_mlp": 0.01034048, + "balance_loss_clip": 1.01619077, + "balance_loss_mlp": 1.02752233, + "epoch": 0.24632496618067037, + "flos": 29458350080640.0, + "grad_norm": 2.7101616297545945, + "language_loss": 0.77540457, + "learning_rate": 3.4308121476643423e-06, + "loss": 0.79661977, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6015625, + "step": 4097, + "time_per_iteration": 2.458148241043091 + }, + { + "auxiliary_loss_clip": 0.01090136, + "auxiliary_loss_mlp": 0.01037499, + "balance_loss_clip": 1.01834202, + "balance_loss_mlp": 1.02733064, + "epoch": 0.24638508943333834, + "flos": 24315294499200.0, + "grad_norm": 1.7717785365262917, + "language_loss": 0.76084214, + "learning_rate": 3.4305481343059254e-06, + "loss": 0.7821185, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.62890625, + "step": 4098, + "time_per_iteration": 2.4008679389953613 + }, + { + "auxiliary_loss_clip": 0.01087306, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.01951289, + "balance_loss_mlp": 1.0264585, + "epoch": 0.2464452126860063, + "flos": 26612090801280.0, + "grad_norm": 2.4152333366728453, + "language_loss": 0.68078029, + "learning_rate": 3.4302840698949247e-06, + "loss": 0.70201969, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.609375, + "step": 4099, + "time_per_iteration": 2.4404308795928955 + }, + { + "auxiliary_loss_clip": 0.01082462, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.01868868, + "balance_loss_mlp": 1.02639103, + "epoch": 0.24650533593867427, + "flos": 31211059754880.0, + "grad_norm": 1.7694417215531217, + "language_loss": 0.65903497, + "learning_rate": 3.430019954440764e-06, + "loss": 0.68020833, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5625, + "step": 4100, + "time_per_iteration": 2.4761061668395996 + }, + { + "auxiliary_loss_clip": 0.01016516, + "auxiliary_loss_mlp": 0.01008392, + "balance_loss_clip": 1.00666356, + "balance_loss_mlp": 1.0029285, + "epoch": 0.24656545919134226, + "flos": 68490791723520.0, + "grad_norm": 0.7184272720027738, + "language_loss": 0.61550868, + "learning_rate": 3.429755787952871e-06, + "loss": 0.63575774, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.13574219, + "step": 4101, + "time_per_iteration": 3.150977373123169 + }, + { + "auxiliary_loss_clip": 0.01082988, + "auxiliary_loss_mlp": 0.01036277, + "balance_loss_clip": 1.01843143, + "balance_loss_mlp": 1.02551293, + "epoch": 0.24662558244401023, + "flos": 20083157867520.0, + "grad_norm": 1.6896791858925082, + "language_loss": 0.72792119, + "learning_rate": 3.429491570440671e-06, + "loss": 0.74911392, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.578125, + "step": 4102, + "time_per_iteration": 2.373002052307129 + }, + { + "auxiliary_loss_clip": 0.01086001, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.01811719, + "balance_loss_mlp": 1.02502012, + "epoch": 0.2466857056966782, + "flos": 30700036051200.0, + "grad_norm": 2.3135662645464823, + "language_loss": 0.76031542, + "learning_rate": 3.4292273019135936e-06, + "loss": 0.78152585, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.609375, + "step": 4103, + "time_per_iteration": 2.459496021270752 + }, + { + "auxiliary_loss_clip": 0.01087263, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.01391983, + "balance_loss_mlp": 1.02669179, + "epoch": 0.24674582894934616, + "flos": 22527427219200.0, + "grad_norm": 1.96725708445454, + "language_loss": 0.78242195, + "learning_rate": 3.4289629823810707e-06, + "loss": 0.80361378, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.60546875, + "step": 4104, + "time_per_iteration": 2.374256134033203 + }, + { + "auxiliary_loss_clip": 0.01087911, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.01605129, + "balance_loss_mlp": 1.02754247, + "epoch": 0.24680595220201412, + "flos": 20703250258560.0, + "grad_norm": 1.705733690607604, + "language_loss": 0.83070034, + "learning_rate": 3.4286986118525345e-06, + "loss": 0.85192949, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.6015625, + "step": 4105, + "time_per_iteration": 2.4024829864501953 + }, + { + "auxiliary_loss_clip": 0.0108874, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.02142143, + "balance_loss_mlp": 1.02883244, + "epoch": 0.2468660754546821, + "flos": 21830211901440.0, + "grad_norm": 1.8265082566322628, + "language_loss": 0.76055944, + "learning_rate": 3.4284341903374196e-06, + "loss": 0.78183043, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.6015625, + "step": 4106, + "time_per_iteration": 2.3864710330963135 + }, + { + "auxiliary_loss_clip": 0.01084883, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.01749706, + "balance_loss_mlp": 1.02491546, + "epoch": 0.24692619870735008, + "flos": 15266819589120.0, + "grad_norm": 2.2441239554161334, + "language_loss": 0.71969068, + "learning_rate": 3.4281697178451638e-06, + "loss": 0.74089891, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.6015625, + "step": 4107, + "time_per_iteration": 2.363778829574585 + }, + { + "auxiliary_loss_clip": 0.01088094, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.01671481, + "balance_loss_mlp": 1.02747977, + "epoch": 0.24698632196001805, + "flos": 29678791605120.0, + "grad_norm": 1.5707344941189116, + "language_loss": 0.65555215, + "learning_rate": 3.4279051943852037e-06, + "loss": 0.67678356, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.60546875, + "step": 4108, + "time_per_iteration": 2.4472367763519287 + }, + { + "auxiliary_loss_clip": 0.01087845, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.02007771, + "balance_loss_mlp": 1.02634561, + "epoch": 0.247046445212686, + "flos": 39163925289600.0, + "grad_norm": 2.2702505861453557, + "language_loss": 0.72676706, + "learning_rate": 3.42764061996698e-06, + "loss": 0.74804205, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.61328125, + "step": 4109, + "time_per_iteration": 2.537430763244629 + }, + { + "auxiliary_loss_clip": 0.01088715, + "auxiliary_loss_mlp": 0.0103969, + "balance_loss_clip": 1.02140296, + "balance_loss_mlp": 1.02731705, + "epoch": 0.24710656846535398, + "flos": 22997847144960.0, + "grad_norm": 1.760139337317379, + "language_loss": 0.78744268, + "learning_rate": 3.4273759945999356e-06, + "loss": 0.80872673, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.61328125, + "step": 4110, + "time_per_iteration": 2.3913421630859375 + }, + { + "auxiliary_loss_clip": 0.01086692, + "auxiliary_loss_mlp": 0.01041434, + "balance_loss_clip": 1.02338529, + "balance_loss_mlp": 1.02663243, + "epoch": 0.24716669171802194, + "flos": 26431589738880.0, + "grad_norm": 2.6811779058260323, + "language_loss": 0.80902535, + "learning_rate": 3.4271113182935134e-06, + "loss": 0.83030665, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6015625, + "step": 4111, + "time_per_iteration": 2.443218946456909 + }, + { + "auxiliary_loss_clip": 0.01086681, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.01852179, + "balance_loss_mlp": 1.02719283, + "epoch": 0.2472268149706899, + "flos": 23328788722560.0, + "grad_norm": 1.8822407601850477, + "language_loss": 0.748734, + "learning_rate": 3.4268465910571587e-06, + "loss": 0.7699486, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.59375, + "step": 4112, + "time_per_iteration": 2.400817632675171 + }, + { + "auxiliary_loss_clip": 0.01086286, + "auxiliary_loss_mlp": 0.01038362, + "balance_loss_clip": 1.02139902, + "balance_loss_mlp": 1.02570081, + "epoch": 0.24728693822335787, + "flos": 23767612001280.0, + "grad_norm": 1.8161260878033203, + "language_loss": 0.82026696, + "learning_rate": 3.42658181290032e-06, + "loss": 0.84151351, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.60546875, + "step": 4113, + "time_per_iteration": 2.4078712463378906 + }, + { + "auxiliary_loss_clip": 0.01083606, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.01504803, + "balance_loss_mlp": 1.02510583, + "epoch": 0.24734706147602586, + "flos": 19316500122240.0, + "grad_norm": 2.1578054812164176, + "language_loss": 0.86835074, + "learning_rate": 3.4263169838324458e-06, + "loss": 0.88951635, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5859375, + "step": 4114, + "time_per_iteration": 2.366503953933716 + }, + { + "auxiliary_loss_clip": 0.01086202, + "auxiliary_loss_mlp": 0.01034469, + "balance_loss_clip": 1.01860249, + "balance_loss_mlp": 1.0259521, + "epoch": 0.24740718472869383, + "flos": 28035709200000.0, + "grad_norm": 1.5478396973111115, + "language_loss": 0.75643438, + "learning_rate": 3.4260521038629878e-06, + "loss": 0.77764106, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.6015625, + "step": 4115, + "time_per_iteration": 2.4414401054382324 + }, + { + "auxiliary_loss_clip": 0.01016377, + "auxiliary_loss_mlp": 0.01006983, + "balance_loss_clip": 1.00533772, + "balance_loss_mlp": 1.00268722, + "epoch": 0.2474673079813618, + "flos": 68103953326080.0, + "grad_norm": 0.68895193068118, + "language_loss": 0.58217251, + "learning_rate": 3.4257871730013974e-06, + "loss": 0.60240614, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.13671875, + "step": 4116, + "time_per_iteration": 3.0879950523376465 + }, + { + "auxiliary_loss_clip": 0.01084034, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.01683486, + "balance_loss_mlp": 1.02543807, + "epoch": 0.24752743123402976, + "flos": 29460793875840.0, + "grad_norm": 1.3854555118940515, + "language_loss": 0.83491755, + "learning_rate": 3.4255221912571315e-06, + "loss": 0.85610104, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5859375, + "step": 4117, + "time_per_iteration": 2.466036081314087 + }, + { + "auxiliary_loss_clip": 0.01016117, + "auxiliary_loss_mlp": 0.01006709, + "balance_loss_clip": 1.00502861, + "balance_loss_mlp": 1.00260723, + "epoch": 0.24758755448669773, + "flos": 58347545310720.0, + "grad_norm": 0.9030819957565785, + "language_loss": 0.63517618, + "learning_rate": 3.425257158639645e-06, + "loss": 0.65540445, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.13476562, + "step": 4118, + "time_per_iteration": 2.903324604034424 + }, + { + "auxiliary_loss_clip": 0.01084254, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.02037787, + "balance_loss_mlp": 1.02594364, + "epoch": 0.2476476777393657, + "flos": 20483402227200.0, + "grad_norm": 1.489226425737867, + "language_loss": 0.79383802, + "learning_rate": 3.424992075158397e-06, + "loss": 0.81505442, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5859375, + "step": 4119, + "time_per_iteration": 2.4023826122283936 + }, + { + "auxiliary_loss_clip": 0.01085144, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.01795745, + "balance_loss_mlp": 1.02686667, + "epoch": 0.24770780099203366, + "flos": 20484798681600.0, + "grad_norm": 1.4906477756345116, + "language_loss": 0.74584258, + "learning_rate": 3.4247269408228467e-06, + "loss": 0.76702964, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.58203125, + "step": 4120, + "time_per_iteration": 2.384490728378296 + }, + { + "auxiliary_loss_clip": 0.01087263, + "auxiliary_loss_mlp": 0.01043653, + "balance_loss_clip": 1.02555668, + "balance_loss_mlp": 1.02644324, + "epoch": 0.24776792424470165, + "flos": 15152653843200.0, + "grad_norm": 1.8729883823275078, + "language_loss": 0.88248277, + "learning_rate": 3.424461755642457e-06, + "loss": 0.90379196, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.609375, + "step": 4121, + "time_per_iteration": 3.769474506378174 + }, + { + "auxiliary_loss_clip": 0.01086276, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.01711369, + "balance_loss_mlp": 1.02580178, + "epoch": 0.2478280474973696, + "flos": 21724389970560.0, + "grad_norm": 4.090215250538317, + "language_loss": 0.69454342, + "learning_rate": 3.4241965196266912e-06, + "loss": 0.71576416, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.60546875, + "step": 4122, + "time_per_iteration": 2.377908229827881 + }, + { + "auxiliary_loss_clip": 0.01085065, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.02011788, + "balance_loss_mlp": 1.02521086, + "epoch": 0.24788817075003758, + "flos": 20411166890880.0, + "grad_norm": 2.0516835943803082, + "language_loss": 0.8037625, + "learning_rate": 3.4239312327850155e-06, + "loss": 0.82498991, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59765625, + "step": 4123, + "time_per_iteration": 2.3808038234710693 + }, + { + "auxiliary_loss_clip": 0.01084513, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.02137208, + "balance_loss_mlp": 1.02572, + "epoch": 0.24794829400270554, + "flos": 22593553067520.0, + "grad_norm": 1.7197151615863469, + "language_loss": 0.74973655, + "learning_rate": 3.423665895126897e-06, + "loss": 0.77095872, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5859375, + "step": 4124, + "time_per_iteration": 2.4803411960601807 + }, + { + "auxiliary_loss_clip": 0.01084843, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.01540172, + "balance_loss_mlp": 1.02780402, + "epoch": 0.2480084172553735, + "flos": 39674495145600.0, + "grad_norm": 1.4040776014202108, + "language_loss": 0.73531151, + "learning_rate": 3.4234005066618047e-06, + "loss": 0.75646973, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5703125, + "step": 4125, + "time_per_iteration": 3.9552083015441895 + }, + { + "auxiliary_loss_clip": 0.01088419, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.02046132, + "balance_loss_mlp": 1.02543163, + "epoch": 0.24806854050804147, + "flos": 22052643373440.0, + "grad_norm": 1.9689184165616531, + "language_loss": 0.69233143, + "learning_rate": 3.4231350673992093e-06, + "loss": 0.71359944, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.62890625, + "step": 4126, + "time_per_iteration": 2.3816680908203125 + }, + { + "auxiliary_loss_clip": 0.01086558, + "auxiliary_loss_mlp": 0.01040118, + "balance_loss_clip": 1.02257013, + "balance_loss_mlp": 1.02685738, + "epoch": 0.24812866376070947, + "flos": 15485864659200.0, + "grad_norm": 2.0665858456834374, + "language_loss": 0.80779505, + "learning_rate": 3.422869577348584e-06, + "loss": 0.82906175, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.59765625, + "step": 4127, + "time_per_iteration": 3.7433838844299316 + }, + { + "auxiliary_loss_clip": 0.01088358, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.01672888, + "balance_loss_mlp": 1.02749491, + "epoch": 0.24818878701337743, + "flos": 14756529024000.0, + "grad_norm": 3.4168117397882307, + "language_loss": 0.844118, + "learning_rate": 3.422604036519404e-06, + "loss": 0.865336, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.609375, + "step": 4128, + "time_per_iteration": 2.366837501525879 + }, + { + "auxiliary_loss_clip": 0.01085909, + "auxiliary_loss_mlp": 0.01035099, + "balance_loss_clip": 1.0185169, + "balance_loss_mlp": 1.02622509, + "epoch": 0.2482489102660454, + "flos": 27088271101440.0, + "grad_norm": 2.6987227673288805, + "language_loss": 0.6540767, + "learning_rate": 3.4223384449211457e-06, + "loss": 0.67528689, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59765625, + "step": 4129, + "time_per_iteration": 2.4251272678375244 + }, + { + "auxiliary_loss_clip": 0.01085872, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.01737642, + "balance_loss_mlp": 1.0268786, + "epoch": 0.24830903351871336, + "flos": 26466363319680.0, + "grad_norm": 2.252155087549559, + "language_loss": 0.75110793, + "learning_rate": 3.4220728025632863e-06, + "loss": 0.77231193, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58984375, + "step": 4130, + "time_per_iteration": 2.4295639991760254 + }, + { + "auxiliary_loss_clip": 0.01086163, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.01736295, + "balance_loss_mlp": 1.02593648, + "epoch": 0.24836915677138133, + "flos": 10227805459200.0, + "grad_norm": 2.069456691917796, + "language_loss": 0.76421893, + "learning_rate": 3.421807109455307e-06, + "loss": 0.78542954, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6015625, + "step": 4131, + "time_per_iteration": 3.7015538215637207 + }, + { + "auxiliary_loss_clip": 0.01083747, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.01816547, + "balance_loss_mlp": 1.02656412, + "epoch": 0.2484292800240493, + "flos": 30079140698880.0, + "grad_norm": 1.6020381465042943, + "language_loss": 0.84020936, + "learning_rate": 3.4215413656066893e-06, + "loss": 0.86138105, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5703125, + "step": 4132, + "time_per_iteration": 2.455026865005493 + }, + { + "auxiliary_loss_clip": 0.01087027, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.01451731, + "balance_loss_mlp": 1.02686334, + "epoch": 0.24848940327671726, + "flos": 13442118958080.0, + "grad_norm": 1.695454739942384, + "language_loss": 0.7100206, + "learning_rate": 3.4212755710269163e-06, + "loss": 0.7312181, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6015625, + "step": 4133, + "time_per_iteration": 2.4285826683044434 + }, + { + "auxiliary_loss_clip": 0.01090741, + "auxiliary_loss_mlp": 0.01038368, + "balance_loss_clip": 1.01745844, + "balance_loss_mlp": 1.02635288, + "epoch": 0.24854952652938525, + "flos": 19969341235200.0, + "grad_norm": 2.252359131562968, + "language_loss": 0.60830545, + "learning_rate": 3.4210097257254748e-06, + "loss": 0.62959659, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.64453125, + "step": 4134, + "time_per_iteration": 2.362736463546753 + }, + { + "auxiliary_loss_clip": 0.01085969, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.01839435, + "balance_loss_mlp": 1.02588129, + "epoch": 0.24860964978205322, + "flos": 18149213992320.0, + "grad_norm": 1.9686896426832157, + "language_loss": 0.7874074, + "learning_rate": 3.420743829711851e-06, + "loss": 0.80863166, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6015625, + "step": 4135, + "time_per_iteration": 2.378121852874756 + }, + { + "auxiliary_loss_clip": 0.01088973, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.01552403, + "balance_loss_mlp": 1.02883184, + "epoch": 0.24866977303472118, + "flos": 11727848557440.0, + "grad_norm": 8.824193874715622, + "language_loss": 0.83314431, + "learning_rate": 3.420477882995535e-06, + "loss": 0.85435045, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.6015625, + "step": 4136, + "time_per_iteration": 2.3484787940979004 + }, + { + "auxiliary_loss_clip": 0.01086764, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.02116978, + "balance_loss_mlp": 1.02579129, + "epoch": 0.24872989628738915, + "flos": 34822161388800.0, + "grad_norm": 1.8902196463562428, + "language_loss": 0.70785689, + "learning_rate": 3.420211885586017e-06, + "loss": 0.72910678, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.609375, + "step": 4137, + "time_per_iteration": 2.5101962089538574 + }, + { + "auxiliary_loss_clip": 0.01086507, + "auxiliary_loss_mlp": 0.01037768, + "balance_loss_clip": 1.0215317, + "balance_loss_mlp": 1.02494049, + "epoch": 0.2487900195400571, + "flos": 13698486138240.0, + "grad_norm": 1.8895909219766391, + "language_loss": 0.66491926, + "learning_rate": 3.41994583749279e-06, + "loss": 0.68616199, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.61328125, + "step": 4138, + "time_per_iteration": 2.365785598754883 + }, + { + "auxiliary_loss_clip": 0.01083291, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.0165236, + "balance_loss_mlp": 1.02615142, + "epoch": 0.24885014279272508, + "flos": 25336643679360.0, + "grad_norm": 1.858364232849583, + "language_loss": 0.83803201, + "learning_rate": 3.4196797387253482e-06, + "loss": 0.85918236, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5703125, + "step": 4139, + "time_per_iteration": 2.4217495918273926 + }, + { + "auxiliary_loss_clip": 0.01086816, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.01742208, + "balance_loss_mlp": 1.02614295, + "epoch": 0.24891026604539307, + "flos": 20630386517760.0, + "grad_norm": 1.484568005547756, + "language_loss": 0.78808331, + "learning_rate": 3.419413589293189e-06, + "loss": 0.80931115, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.60546875, + "step": 4140, + "time_per_iteration": 2.404123067855835 + }, + { + "auxiliary_loss_clip": 0.01015239, + "auxiliary_loss_mlp": 0.01004717, + "balance_loss_clip": 1.00309598, + "balance_loss_mlp": 1.00217843, + "epoch": 0.24897038929806103, + "flos": 66957162030720.0, + "grad_norm": 0.83112928286704, + "language_loss": 0.61025429, + "learning_rate": 3.4191473892058094e-06, + "loss": 0.63045382, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.13085938, + "step": 4141, + "time_per_iteration": 3.082777261734009 + }, + { + "auxiliary_loss_clip": 0.01089157, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.02350843, + "balance_loss_mlp": 1.02753234, + "epoch": 0.249030512550729, + "flos": 36391088332800.0, + "grad_norm": 1.8390721993300967, + "language_loss": 0.72601914, + "learning_rate": 3.4188811384727104e-06, + "loss": 0.74731827, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.6171875, + "step": 4142, + "time_per_iteration": 2.5076329708099365 + }, + { + "auxiliary_loss_clip": 0.01088461, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.01664865, + "balance_loss_mlp": 1.02794802, + "epoch": 0.24909063580339696, + "flos": 20153612724480.0, + "grad_norm": 1.6955684921265535, + "language_loss": 0.80873203, + "learning_rate": 3.418614837103393e-06, + "loss": 0.82994366, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.60546875, + "step": 4143, + "time_per_iteration": 2.396929979324341 + }, + { + "auxiliary_loss_clip": 0.01080377, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.01424301, + "balance_loss_mlp": 1.02441239, + "epoch": 0.24915075905606493, + "flos": 26395349880960.0, + "grad_norm": 1.850173296193879, + "language_loss": 0.58839977, + "learning_rate": 3.418348485107362e-06, + "loss": 0.60950112, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.55859375, + "step": 4144, + "time_per_iteration": 2.4307267665863037 + }, + { + "auxiliary_loss_clip": 0.01082384, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.01903057, + "balance_loss_mlp": 1.02435446, + "epoch": 0.2492108823087329, + "flos": 27525977216640.0, + "grad_norm": 8.220650966989995, + "language_loss": 0.78757977, + "learning_rate": 3.4180820824941213e-06, + "loss": 0.80876213, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.578125, + "step": 4145, + "time_per_iteration": 2.43618106842041 + }, + { + "auxiliary_loss_clip": 0.01094052, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.01956058, + "balance_loss_mlp": 1.02610385, + "epoch": 0.24927100556140086, + "flos": 16690437987840.0, + "grad_norm": 1.9456226615546552, + "language_loss": 0.65626216, + "learning_rate": 3.4178156292731787e-06, + "loss": 0.67760944, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.6796875, + "step": 4146, + "time_per_iteration": 2.3641622066497803 + }, + { + "auxiliary_loss_clip": 0.01015156, + "auxiliary_loss_mlp": 0.01001284, + "balance_loss_clip": 0.99965078, + "balance_loss_mlp": 1.00193524, + "epoch": 0.24933112881406885, + "flos": 62769225047040.0, + "grad_norm": 0.9470214086913717, + "language_loss": 0.6726746, + "learning_rate": 3.4175491254540436e-06, + "loss": 0.69283903, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.13183594, + "step": 4147, + "time_per_iteration": 3.1399781703948975 + }, + { + "auxiliary_loss_clip": 0.01088461, + "auxiliary_loss_mlp": 0.01040376, + "balance_loss_clip": 1.02336502, + "balance_loss_mlp": 1.02815604, + "epoch": 0.24939125206673682, + "flos": 26650669720320.0, + "grad_norm": 1.7022715726379114, + "language_loss": 0.89207381, + "learning_rate": 3.4172825710462267e-06, + "loss": 0.91336215, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.6015625, + "step": 4148, + "time_per_iteration": 2.417715072631836 + }, + { + "auxiliary_loss_clip": 0.01090791, + "auxiliary_loss_mlp": 0.01041572, + "balance_loss_clip": 1.02139008, + "balance_loss_mlp": 1.02708268, + "epoch": 0.24945137531940478, + "flos": 20703285169920.0, + "grad_norm": 1.952899303527093, + "language_loss": 0.68199652, + "learning_rate": 3.4170159660592404e-06, + "loss": 0.70332015, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.63671875, + "step": 4149, + "time_per_iteration": 2.385103702545166 + }, + { + "auxiliary_loss_clip": 0.01084708, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01358342, + "balance_loss_mlp": 1.02614653, + "epoch": 0.24951149857207275, + "flos": 23767542178560.0, + "grad_norm": 1.691960060504696, + "language_loss": 0.7100659, + "learning_rate": 3.416749310502599e-06, + "loss": 0.73122334, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5859375, + "step": 4150, + "time_per_iteration": 2.413113594055176 + }, + { + "auxiliary_loss_clip": 0.01087064, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.01902509, + "balance_loss_mlp": 1.02616155, + "epoch": 0.2495716218247407, + "flos": 15664096483200.0, + "grad_norm": 1.7552566589860064, + "language_loss": 0.72874904, + "learning_rate": 3.4164826043858195e-06, + "loss": 0.74999797, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.609375, + "step": 4151, + "time_per_iteration": 2.353670358657837 + }, + { + "auxiliary_loss_clip": 0.01092564, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.02511668, + "balance_loss_mlp": 1.02713895, + "epoch": 0.24963174507740868, + "flos": 24051595933440.0, + "grad_norm": 2.5621696544024126, + "language_loss": 0.63709629, + "learning_rate": 3.416215847718419e-06, + "loss": 0.65846318, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.65234375, + "step": 4152, + "time_per_iteration": 2.3920669555664062 + }, + { + "auxiliary_loss_clip": 0.01085249, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.02147388, + "balance_loss_mlp": 1.02740765, + "epoch": 0.24969186833007664, + "flos": 21798405786240.0, + "grad_norm": 2.2758752004914005, + "language_loss": 0.77126729, + "learning_rate": 3.4159490405099183e-06, + "loss": 0.79250455, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.578125, + "step": 4153, + "time_per_iteration": 2.3825888633728027 + }, + { + "auxiliary_loss_clip": 0.01084138, + "auxiliary_loss_mlp": 0.01033125, + "balance_loss_clip": 1.01632833, + "balance_loss_mlp": 1.02591348, + "epoch": 0.24975199158274464, + "flos": 19937116183680.0, + "grad_norm": 1.7964037245080868, + "language_loss": 0.76339287, + "learning_rate": 3.4156821827698387e-06, + "loss": 0.78456545, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5859375, + "step": 4154, + "time_per_iteration": 2.3665268421173096 + }, + { + "auxiliary_loss_clip": 0.01090074, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.01710677, + "balance_loss_mlp": 1.02616119, + "epoch": 0.2498121148354126, + "flos": 25337202261120.0, + "grad_norm": 2.1806513294402747, + "language_loss": 0.70880032, + "learning_rate": 3.4154152745077027e-06, + "loss": 0.73007047, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.640625, + "step": 4155, + "time_per_iteration": 2.402557134628296 + }, + { + "auxiliary_loss_clip": 0.01088748, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.02145052, + "balance_loss_mlp": 1.02694702, + "epoch": 0.24987223808808057, + "flos": 20557732245120.0, + "grad_norm": 1.5898528415838775, + "language_loss": 0.75153297, + "learning_rate": 3.4151483157330373e-06, + "loss": 0.77282143, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.6171875, + "step": 4156, + "time_per_iteration": 2.3852381706237793 + }, + { + "auxiliary_loss_clip": 0.01085636, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.0144881, + "balance_loss_mlp": 1.02557111, + "epoch": 0.24993236134074853, + "flos": 19748201483520.0, + "grad_norm": 3.2139917966064724, + "language_loss": 0.76728547, + "learning_rate": 3.4148813064553686e-06, + "loss": 0.78845096, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.6015625, + "step": 4157, + "time_per_iteration": 2.372711181640625 + }, + { + "auxiliary_loss_clip": 0.01086456, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.01759982, + "balance_loss_mlp": 1.02622128, + "epoch": 0.2499924845934165, + "flos": 18769306383360.0, + "grad_norm": 1.5277581234599116, + "language_loss": 0.81389397, + "learning_rate": 3.4146142466842253e-06, + "loss": 0.83511102, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6015625, + "step": 4158, + "time_per_iteration": 2.3913378715515137 + }, + { + "auxiliary_loss_clip": 0.01087098, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.01334131, + "balance_loss_mlp": 1.0268569, + "epoch": 0.25005260784608446, + "flos": 16871288163840.0, + "grad_norm": 1.8097729144184502, + "language_loss": 0.76135957, + "learning_rate": 3.414347136429138e-06, + "loss": 0.78252989, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.6015625, + "step": 4159, + "time_per_iteration": 2.3434834480285645 + }, + { + "auxiliary_loss_clip": 0.01089061, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.01550508, + "balance_loss_mlp": 1.02670324, + "epoch": 0.2501127310987524, + "flos": 22123901191680.0, + "grad_norm": 1.867266487831828, + "language_loss": 0.7131983, + "learning_rate": 3.4140799756996403e-06, + "loss": 0.73442852, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.625, + "step": 4160, + "time_per_iteration": 2.400459051132202 + }, + { + "auxiliary_loss_clip": 0.01015589, + "auxiliary_loss_mlp": 0.01007315, + "balance_loss_clip": 1.00575364, + "balance_loss_mlp": 1.00244224, + "epoch": 0.2501728543514204, + "flos": 69454393781760.0, + "grad_norm": 0.7484803095786234, + "language_loss": 0.56746018, + "learning_rate": 3.4138127645052653e-06, + "loss": 0.58768922, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.13085938, + "step": 4161, + "time_per_iteration": 4.48866081237793 + }, + { + "auxiliary_loss_clip": 0.01093328, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_clip": 1.02357638, + "balance_loss_mlp": 1.02814245, + "epoch": 0.25023297760408836, + "flos": 16289041553280.0, + "grad_norm": 1.6335203249332388, + "language_loss": 0.80808181, + "learning_rate": 3.41354550285555e-06, + "loss": 0.8294434, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.65234375, + "step": 4162, + "time_per_iteration": 2.3679251670837402 + }, + { + "auxiliary_loss_clip": 0.01087607, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.01810074, + "balance_loss_mlp": 1.02499318, + "epoch": 0.2502931008567563, + "flos": 12237231427200.0, + "grad_norm": 2.073921826240484, + "language_loss": 0.87346721, + "learning_rate": 3.413278190760031e-06, + "loss": 0.89470905, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.625, + "step": 4163, + "time_per_iteration": 2.352168083190918 + }, + { + "auxiliary_loss_clip": 0.01087454, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.01659131, + "balance_loss_mlp": 1.0266645, + "epoch": 0.25035322410942434, + "flos": 23180861825280.0, + "grad_norm": 1.5284149766660347, + "language_loss": 0.83000046, + "learning_rate": 3.413010828228249e-06, + "loss": 0.85121864, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.60546875, + "step": 4164, + "time_per_iteration": 2.3776345252990723 + }, + { + "auxiliary_loss_clip": 0.01086314, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.02285218, + "balance_loss_mlp": 1.02929795, + "epoch": 0.2504133473620923, + "flos": 20916639688320.0, + "grad_norm": 1.6790768031802228, + "language_loss": 0.7416389, + "learning_rate": 3.4127434152697453e-06, + "loss": 0.76288569, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5703125, + "step": 4165, + "time_per_iteration": 3.7848379611968994 + }, + { + "auxiliary_loss_clip": 0.01090005, + "auxiliary_loss_mlp": 0.0103736, + "balance_loss_clip": 1.01851296, + "balance_loss_mlp": 1.02677178, + "epoch": 0.2504734706147603, + "flos": 20775520506240.0, + "grad_norm": 1.6360191852191746, + "language_loss": 0.73811507, + "learning_rate": 3.4124759518940637e-06, + "loss": 0.75938869, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6328125, + "step": 4166, + "time_per_iteration": 2.3740110397338867 + }, + { + "auxiliary_loss_clip": 0.01082551, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.02244079, + "balance_loss_mlp": 1.02522874, + "epoch": 0.25053359386742824, + "flos": 24348322512000.0, + "grad_norm": 1.6573165904586133, + "language_loss": 0.8177495, + "learning_rate": 3.412208438110748e-06, + "loss": 0.83897543, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.57421875, + "step": 4167, + "time_per_iteration": 3.774843215942383 + }, + { + "auxiliary_loss_clip": 0.01085933, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.01667368, + "balance_loss_mlp": 1.02652943, + "epoch": 0.2505937171200962, + "flos": 21213296444160.0, + "grad_norm": 2.054142096798254, + "language_loss": 0.79331625, + "learning_rate": 3.411940873929346e-06, + "loss": 0.81451225, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59375, + "step": 4168, + "time_per_iteration": 2.3814868927001953 + }, + { + "auxiliary_loss_clip": 0.01089124, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.01776874, + "balance_loss_mlp": 1.02649546, + "epoch": 0.25065384037276417, + "flos": 41425633808640.0, + "grad_norm": 1.949754426550837, + "language_loss": 0.69879848, + "learning_rate": 3.411673259359406e-06, + "loss": 0.72006875, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.625, + "step": 4169, + "time_per_iteration": 2.551462173461914 + }, + { + "auxiliary_loss_clip": 0.01082874, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.02077127, + "balance_loss_mlp": 1.02544272, + "epoch": 0.25071396362543213, + "flos": 26101241654400.0, + "grad_norm": 1.689448057975862, + "language_loss": 0.7732088, + "learning_rate": 3.411405594410479e-06, + "loss": 0.79440737, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.57421875, + "step": 4170, + "time_per_iteration": 2.441685676574707 + }, + { + "auxiliary_loss_clip": 0.01085652, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.01724577, + "balance_loss_mlp": 1.02592623, + "epoch": 0.2507740868781001, + "flos": 19097978722560.0, + "grad_norm": 2.3655046764559784, + "language_loss": 0.76594567, + "learning_rate": 3.4111378790921162e-06, + "loss": 0.78714424, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59765625, + "step": 4171, + "time_per_iteration": 3.7297091484069824 + }, + { + "auxiliary_loss_clip": 0.01019602, + "auxiliary_loss_mlp": 0.01005348, + "balance_loss_clip": 1.0035243, + "balance_loss_mlp": 1.00611258, + "epoch": 0.25083421013076806, + "flos": 64338570927360.0, + "grad_norm": 0.8355812473618281, + "language_loss": 0.60083926, + "learning_rate": 3.4108701134138727e-06, + "loss": 0.62108874, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.13476562, + "step": 4172, + "time_per_iteration": 3.0390610694885254 + }, + { + "auxiliary_loss_clip": 0.01086664, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.0154984, + "balance_loss_mlp": 1.02559733, + "epoch": 0.25089433338343603, + "flos": 24278461148160.0, + "grad_norm": 1.419423819463803, + "language_loss": 0.78949177, + "learning_rate": 3.4106022973853045e-06, + "loss": 0.8106997, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.609375, + "step": 4173, + "time_per_iteration": 2.4250080585479736 + }, + { + "auxiliary_loss_clip": 0.01086158, + "auxiliary_loss_mlp": 0.01038105, + "balance_loss_clip": 1.01974726, + "balance_loss_mlp": 1.0268693, + "epoch": 0.250954456636104, + "flos": 14720568456960.0, + "grad_norm": 1.7970204160651273, + "language_loss": 0.83641088, + "learning_rate": 3.4103344310159685e-06, + "loss": 0.8576535, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.59375, + "step": 4174, + "time_per_iteration": 2.3676953315734863 + }, + { + "auxiliary_loss_clip": 0.01089318, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02039671, + "balance_loss_mlp": 1.02797008, + "epoch": 0.25101457988877196, + "flos": 22272491404800.0, + "grad_norm": 2.0000421087695583, + "language_loss": 0.71564239, + "learning_rate": 3.4100665143154245e-06, + "loss": 0.7369293, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.61328125, + "step": 4175, + "time_per_iteration": 2.391385555267334 + }, + { + "auxiliary_loss_clip": 0.01085807, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.01374292, + "balance_loss_mlp": 1.02490354, + "epoch": 0.2510747031414399, + "flos": 25187843998080.0, + "grad_norm": 2.066901444341313, + "language_loss": 0.80781257, + "learning_rate": 3.409798547293234e-06, + "loss": 0.82898736, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.609375, + "step": 4176, + "time_per_iteration": 2.411552906036377 + }, + { + "auxiliary_loss_clip": 0.01090539, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.01437354, + "balance_loss_mlp": 1.02882624, + "epoch": 0.25113482639410795, + "flos": 20703145524480.0, + "grad_norm": 1.8239141578092002, + "language_loss": 0.8288976, + "learning_rate": 3.4095305299589593e-06, + "loss": 0.85013407, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6171875, + "step": 4177, + "time_per_iteration": 2.3847196102142334 + }, + { + "auxiliary_loss_clip": 0.01090298, + "auxiliary_loss_mlp": 0.01039912, + "balance_loss_clip": 1.02176857, + "balance_loss_mlp": 1.02995527, + "epoch": 0.2511949496467759, + "flos": 21505868570880.0, + "grad_norm": 2.534211058518571, + "language_loss": 0.82993323, + "learning_rate": 3.409262462322166e-06, + "loss": 0.85123539, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6015625, + "step": 4178, + "time_per_iteration": 2.3937385082244873 + }, + { + "auxiliary_loss_clip": 0.01085071, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.02128625, + "balance_loss_mlp": 1.0263536, + "epoch": 0.2512550728994439, + "flos": 20701015931520.0, + "grad_norm": 2.069309614923113, + "language_loss": 0.75970161, + "learning_rate": 3.40899434439242e-06, + "loss": 0.78093529, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5859375, + "step": 4179, + "time_per_iteration": 2.383949041366577 + }, + { + "auxiliary_loss_clip": 0.01088564, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.02212667, + "balance_loss_mlp": 1.02778363, + "epoch": 0.25131519615211184, + "flos": 18477641952000.0, + "grad_norm": 1.8852729951139797, + "language_loss": 0.70328605, + "learning_rate": 3.4087261761792908e-06, + "loss": 0.72458005, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.609375, + "step": 4180, + "time_per_iteration": 2.3751630783081055 + }, + { + "auxiliary_loss_clip": 0.01088551, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.02038801, + "balance_loss_mlp": 1.02811086, + "epoch": 0.2513753194047798, + "flos": 20483925897600.0, + "grad_norm": 2.393589595119289, + "language_loss": 0.86134291, + "learning_rate": 3.4084579576923477e-06, + "loss": 0.88261461, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.60546875, + "step": 4181, + "time_per_iteration": 2.387640953063965 + }, + { + "auxiliary_loss_clip": 0.01086036, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.01811576, + "balance_loss_mlp": 1.02726042, + "epoch": 0.25143544265744777, + "flos": 37668560313600.0, + "grad_norm": 1.9317609465821208, + "language_loss": 0.68366534, + "learning_rate": 3.4081896889411634e-06, + "loss": 0.7048772, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5859375, + "step": 4182, + "time_per_iteration": 2.5509138107299805 + }, + { + "auxiliary_loss_clip": 0.01018334, + "auxiliary_loss_mlp": 0.01004811, + "balance_loss_clip": 1.00305831, + "balance_loss_mlp": 1.00486362, + "epoch": 0.25149556591011574, + "flos": 69364283829120.0, + "grad_norm": 0.8462462626347791, + "language_loss": 0.700665, + "learning_rate": 3.407921369935311e-06, + "loss": 0.72089636, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.13476562, + "step": 4183, + "time_per_iteration": 3.0337069034576416 + }, + { + "auxiliary_loss_clip": 0.01085617, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.02190304, + "balance_loss_mlp": 1.02508175, + "epoch": 0.2515556891627837, + "flos": 13989557076480.0, + "grad_norm": 1.8504628554017286, + "language_loss": 0.74262583, + "learning_rate": 3.407653000684367e-06, + "loss": 0.7638849, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.60546875, + "step": 4184, + "time_per_iteration": 2.3725948333740234 + }, + { + "auxiliary_loss_clip": 0.01089891, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.01989603, + "balance_loss_mlp": 1.03025913, + "epoch": 0.25161581241545167, + "flos": 22162445199360.0, + "grad_norm": 1.7187934828601634, + "language_loss": 0.83149958, + "learning_rate": 3.407384581197908e-06, + "loss": 0.85277438, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.59375, + "step": 4185, + "time_per_iteration": 2.37890362739563 + }, + { + "auxiliary_loss_clip": 0.01016642, + "auxiliary_loss_mlp": 0.01001929, + "balance_loss_clip": 1.00006974, + "balance_loss_mlp": 1.00283837, + "epoch": 0.25167593566811963, + "flos": 69355486166400.0, + "grad_norm": 0.7935930278087194, + "language_loss": 0.61524451, + "learning_rate": 3.4071161114855134e-06, + "loss": 0.63543022, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.13867188, + "step": 4186, + "time_per_iteration": 2.9305331707000732 + }, + { + "auxiliary_loss_clip": 0.01085627, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.01971817, + "balance_loss_mlp": 1.02617478, + "epoch": 0.2517360589207876, + "flos": 13260605466240.0, + "grad_norm": 1.849702776315767, + "language_loss": 0.78915787, + "learning_rate": 3.406847591556764e-06, + "loss": 0.81038737, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 4187, + "time_per_iteration": 2.353400468826294 + }, + { + "auxiliary_loss_clip": 0.0108619, + "auxiliary_loss_mlp": 0.01044634, + "balance_loss_clip": 1.02737236, + "balance_loss_mlp": 1.02780402, + "epoch": 0.25179618217345556, + "flos": 20375764905600.0, + "grad_norm": 1.4541123230854403, + "language_loss": 0.79604644, + "learning_rate": 3.406579021421244e-06, + "loss": 0.81735468, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5859375, + "step": 4188, + "time_per_iteration": 2.3837485313415527 + }, + { + "auxiliary_loss_clip": 0.01083816, + "auxiliary_loss_mlp": 0.01040109, + "balance_loss_clip": 1.02266884, + "balance_loss_mlp": 1.02533996, + "epoch": 0.25185630542612353, + "flos": 27663709996800.0, + "grad_norm": 1.8785720766922807, + "language_loss": 0.74433601, + "learning_rate": 3.406310401088536e-06, + "loss": 0.76557529, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5859375, + "step": 4189, + "time_per_iteration": 2.4144246578216553 + }, + { + "auxiliary_loss_clip": 0.01082466, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.01698756, + "balance_loss_mlp": 1.0256393, + "epoch": 0.25191642867879155, + "flos": 20995368537600.0, + "grad_norm": 1.9791243396858431, + "language_loss": 0.84094393, + "learning_rate": 3.4060417305682274e-06, + "loss": 0.86210507, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.56640625, + "step": 4190, + "time_per_iteration": 2.387373685836792 + }, + { + "auxiliary_loss_clip": 0.01088221, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.01832414, + "balance_loss_mlp": 1.02850175, + "epoch": 0.2519765519314595, + "flos": 21104611781760.0, + "grad_norm": 2.8854020559471363, + "language_loss": 0.75412244, + "learning_rate": 3.4057730098699065e-06, + "loss": 0.77537614, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.59765625, + "step": 4191, + "time_per_iteration": 2.384077548980713 + }, + { + "auxiliary_loss_clip": 0.01022705, + "auxiliary_loss_mlp": 0.01001011, + "balance_loss_clip": 0.99906814, + "balance_loss_mlp": 1.00911868, + "epoch": 0.2520366751841275, + "flos": 62741503560960.0, + "grad_norm": 1.3038135781693232, + "language_loss": 0.57152498, + "learning_rate": 3.405504239003163e-06, + "loss": 0.59176207, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.13574219, + "step": 4192, + "time_per_iteration": 3.1122801303863525 + }, + { + "auxiliary_loss_clip": 0.0108913, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.01792598, + "balance_loss_mlp": 1.03009593, + "epoch": 0.25209679843679544, + "flos": 22229792945280.0, + "grad_norm": 1.962746330273317, + "language_loss": 0.76137161, + "learning_rate": 3.4052354179775883e-06, + "loss": 0.78262341, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 4193, + "time_per_iteration": 2.3875482082366943 + }, + { + "auxiliary_loss_clip": 0.01089056, + "auxiliary_loss_mlp": 0.01036504, + "balance_loss_clip": 1.01816988, + "balance_loss_mlp": 1.02842879, + "epoch": 0.2521569216894634, + "flos": 12165833963520.0, + "grad_norm": 2.4049709684284055, + "language_loss": 0.83728766, + "learning_rate": 3.4049665468027763e-06, + "loss": 0.85854328, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.60546875, + "step": 4194, + "time_per_iteration": 2.385765314102173 + }, + { + "auxiliary_loss_clip": 0.01088363, + "auxiliary_loss_mlp": 0.01038036, + "balance_loss_clip": 1.0210371, + "balance_loss_mlp": 1.02698064, + "epoch": 0.2522170449421314, + "flos": 23698553598720.0, + "grad_norm": 1.4476206676743426, + "language_loss": 0.88394225, + "learning_rate": 3.404697625488322e-06, + "loss": 0.90520626, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.61328125, + "step": 4195, + "time_per_iteration": 2.4313528537750244 + }, + { + "auxiliary_loss_clip": 0.01088156, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.01648068, + "balance_loss_mlp": 1.02781153, + "epoch": 0.25227716819479934, + "flos": 20954520380160.0, + "grad_norm": 2.4660462571920565, + "language_loss": 0.86479378, + "learning_rate": 3.4044286540438233e-06, + "loss": 0.88603729, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.6015625, + "step": 4196, + "time_per_iteration": 2.407398223876953 + }, + { + "auxiliary_loss_clip": 0.01087827, + "auxiliary_loss_mlp": 0.01037191, + "balance_loss_clip": 1.01951194, + "balance_loss_mlp": 1.0276016, + "epoch": 0.2523372914474673, + "flos": 23330220088320.0, + "grad_norm": 1.7398215060718052, + "language_loss": 0.83336049, + "learning_rate": 3.4041596324788778e-06, + "loss": 0.85461068, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.6015625, + "step": 4197, + "time_per_iteration": 2.426469326019287 + }, + { + "auxiliary_loss_clip": 0.01092347, + "auxiliary_loss_mlp": 0.01037642, + "balance_loss_clip": 1.01761484, + "balance_loss_mlp": 1.02999306, + "epoch": 0.25239741470013527, + "flos": 36969005934720.0, + "grad_norm": 1.8691213880443476, + "language_loss": 0.72345132, + "learning_rate": 3.403890560803088e-06, + "loss": 0.74475121, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.625, + "step": 4198, + "time_per_iteration": 2.512997627258301 + }, + { + "auxiliary_loss_clip": 0.0109069, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.02180219, + "balance_loss_mlp": 1.02847815, + "epoch": 0.25245753795280323, + "flos": 18514754593920.0, + "grad_norm": 1.7566041300172366, + "language_loss": 0.77064091, + "learning_rate": 3.4036214390260546e-06, + "loss": 0.79195631, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.62109375, + "step": 4199, + "time_per_iteration": 2.366682529449463 + }, + { + "auxiliary_loss_clip": 0.01086724, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.01598668, + "balance_loss_mlp": 1.02713084, + "epoch": 0.2525176612054712, + "flos": 32343467569920.0, + "grad_norm": 1.9349715598519899, + "language_loss": 0.73080075, + "learning_rate": 3.403352267157383e-06, + "loss": 0.75199652, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.59375, + "step": 4200, + "time_per_iteration": 2.491309404373169 + }, + { + "auxiliary_loss_clip": 0.01088802, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.02017355, + "balance_loss_mlp": 1.02801931, + "epoch": 0.25257778445813917, + "flos": 45256513651200.0, + "grad_norm": 1.5135461051709393, + "language_loss": 0.82346237, + "learning_rate": 3.4030830452066785e-06, + "loss": 0.84471977, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.60546875, + "step": 4201, + "time_per_iteration": 4.075268268585205 + }, + { + "auxiliary_loss_clip": 0.01089096, + "auxiliary_loss_mlp": 0.01036666, + "balance_loss_clip": 1.01867688, + "balance_loss_mlp": 1.02672911, + "epoch": 0.25263790771080713, + "flos": 23366669414400.0, + "grad_norm": 2.7475426529850826, + "language_loss": 0.74723589, + "learning_rate": 3.4028137731835492e-06, + "loss": 0.76849353, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 4202, + "time_per_iteration": 2.4181506633758545 + }, + { + "auxiliary_loss_clip": 0.01086922, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.02253139, + "balance_loss_mlp": 1.02769208, + "epoch": 0.25269803096347515, + "flos": 18514056366720.0, + "grad_norm": 1.9061824622614174, + "language_loss": 0.73042041, + "learning_rate": 3.4025444510976045e-06, + "loss": 0.75168931, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.59375, + "step": 4203, + "time_per_iteration": 2.3866050243377686 + }, + { + "auxiliary_loss_clip": 0.01085818, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.01728582, + "balance_loss_mlp": 1.0260551, + "epoch": 0.2527581542161431, + "flos": 24609332903040.0, + "grad_norm": 2.1212812715222102, + "language_loss": 0.77547503, + "learning_rate": 3.4022750789584568e-06, + "loss": 0.79668248, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59765625, + "step": 4204, + "time_per_iteration": 2.4628543853759766 + }, + { + "auxiliary_loss_clip": 0.01086552, + "auxiliary_loss_mlp": 0.01041565, + "balance_loss_clip": 1.0235641, + "balance_loss_mlp": 1.02526116, + "epoch": 0.2528182774688111, + "flos": 12640443252480.0, + "grad_norm": 1.992688282780428, + "language_loss": 0.72095698, + "learning_rate": 3.4020056567757183e-06, + "loss": 0.74223816, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.61328125, + "step": 4205, + "time_per_iteration": 3.713306188583374 + }, + { + "auxiliary_loss_clip": 0.01083553, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.01643646, + "balance_loss_mlp": 1.02674294, + "epoch": 0.25287840072147905, + "flos": 46935032952960.0, + "grad_norm": 1.311174117916942, + "language_loss": 0.75730765, + "learning_rate": 3.401736184559005e-06, + "loss": 0.77846253, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5703125, + "step": 4206, + "time_per_iteration": 2.612483263015747 + }, + { + "auxiliary_loss_clip": 0.0108488, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.01982069, + "balance_loss_mlp": 1.02526879, + "epoch": 0.252938523974147, + "flos": 18878724184320.0, + "grad_norm": 1.7209736363570025, + "language_loss": 0.79218537, + "learning_rate": 3.401466662317932e-06, + "loss": 0.81340957, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.59375, + "step": 4207, + "time_per_iteration": 3.774508476257324 + }, + { + "auxiliary_loss_clip": 0.01085521, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.01836991, + "balance_loss_mlp": 1.0268259, + "epoch": 0.252998647226815, + "flos": 21433633234560.0, + "grad_norm": 1.4719136538962954, + "language_loss": 0.7642712, + "learning_rate": 3.4011970900621192e-06, + "loss": 0.78547311, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5859375, + "step": 4208, + "time_per_iteration": 2.397585391998291 + }, + { + "auxiliary_loss_clip": 0.01082874, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.01245856, + "balance_loss_mlp": 1.0249666, + "epoch": 0.25305877047948294, + "flos": 25441138978560.0, + "grad_norm": 2.1309049178116477, + "language_loss": 0.69913232, + "learning_rate": 3.400927467801186e-06, + "loss": 0.72025931, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.578125, + "step": 4209, + "time_per_iteration": 2.41082501411438 + }, + { + "auxiliary_loss_clip": 0.01020904, + "auxiliary_loss_mlp": 0.01007153, + "balance_loss_clip": 1.00536501, + "balance_loss_mlp": 1.00777543, + "epoch": 0.2531188937321509, + "flos": 60182335324800.0, + "grad_norm": 0.7702165785040264, + "language_loss": 0.55134249, + "learning_rate": 3.400657795544756e-06, + "loss": 0.57162297, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.13085938, + "step": 4210, + "time_per_iteration": 4.450890779495239 + }, + { + "auxiliary_loss_clip": 0.01084906, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.01618409, + "balance_loss_mlp": 1.02576089, + "epoch": 0.25317901698481887, + "flos": 19681377408000.0, + "grad_norm": 2.7408645727011143, + "language_loss": 0.79702961, + "learning_rate": 3.400388073302452e-06, + "loss": 0.8182056, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.58984375, + "step": 4211, + "time_per_iteration": 2.3642923831939697 + }, + { + "auxiliary_loss_clip": 0.01083493, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.01818037, + "balance_loss_mlp": 1.02719128, + "epoch": 0.25323914023748684, + "flos": 24423246023040.0, + "grad_norm": 1.536516128160232, + "language_loss": 0.78452933, + "learning_rate": 3.4001183010838995e-06, + "loss": 0.80570906, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5625, + "step": 4212, + "time_per_iteration": 2.4152677059173584 + }, + { + "auxiliary_loss_clip": 0.01085529, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.01636815, + "balance_loss_mlp": 1.02613354, + "epoch": 0.2532992634901548, + "flos": 25446270948480.0, + "grad_norm": 2.47862791972638, + "language_loss": 0.53626394, + "learning_rate": 3.3998484788987264e-06, + "loss": 0.55745685, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59375, + "step": 4213, + "time_per_iteration": 2.415127754211426 + }, + { + "auxiliary_loss_clip": 0.01087302, + "auxiliary_loss_mlp": 0.0104273, + "balance_loss_clip": 1.02405012, + "balance_loss_mlp": 1.02718604, + "epoch": 0.25335938674282277, + "flos": 18879527145600.0, + "grad_norm": 2.1809503383736786, + "language_loss": 0.64555001, + "learning_rate": 3.3995786067565623e-06, + "loss": 0.66685027, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.6015625, + "step": 4214, + "time_per_iteration": 2.403613567352295 + }, + { + "auxiliary_loss_clip": 0.01016404, + "auxiliary_loss_mlp": 0.01008317, + "balance_loss_clip": 1.00636172, + "balance_loss_mlp": 1.00312185, + "epoch": 0.25341950999549073, + "flos": 53059809588480.0, + "grad_norm": 0.8423539403247228, + "language_loss": 0.58051109, + "learning_rate": 3.3993086846670376e-06, + "loss": 0.60075825, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.1328125, + "step": 4215, + "time_per_iteration": 2.7951724529266357 + }, + { + "auxiliary_loss_clip": 0.01085358, + "auxiliary_loss_mlp": 0.01030724, + "balance_loss_clip": 1.01349866, + "balance_loss_mlp": 1.02700901, + "epoch": 0.2534796332481587, + "flos": 39018686567040.0, + "grad_norm": 1.6232707828678796, + "language_loss": 0.83765221, + "learning_rate": 3.3990387126397854e-06, + "loss": 0.85881305, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58203125, + "step": 4216, + "time_per_iteration": 2.5465731620788574 + }, + { + "auxiliary_loss_clip": 0.01088035, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01587868, + "balance_loss_mlp": 1.02852917, + "epoch": 0.2535397565008267, + "flos": 23585854129920.0, + "grad_norm": 2.016891040914528, + "language_loss": 0.80153388, + "learning_rate": 3.3987686906844404e-06, + "loss": 0.82274836, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 4217, + "time_per_iteration": 2.393773078918457 + }, + { + "auxiliary_loss_clip": 0.01083478, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.01786864, + "balance_loss_mlp": 1.02516747, + "epoch": 0.2535998797534947, + "flos": 19280364998400.0, + "grad_norm": 2.1267848717555147, + "language_loss": 0.75011122, + "learning_rate": 3.398498618810639e-06, + "loss": 0.77129006, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5859375, + "step": 4218, + "time_per_iteration": 2.372349739074707 + }, + { + "auxiliary_loss_clip": 0.01086645, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.01753581, + "balance_loss_mlp": 1.02585709, + "epoch": 0.25366000300616265, + "flos": 24023246042880.0, + "grad_norm": 1.6683740752352614, + "language_loss": 0.74832523, + "learning_rate": 3.398228497028019e-06, + "loss": 0.76954031, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.609375, + "step": 4219, + "time_per_iteration": 2.4075748920440674 + }, + { + "auxiliary_loss_clip": 0.0108855, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02486539, + "balance_loss_mlp": 1.02807808, + "epoch": 0.2537201262588306, + "flos": 16288448060160.0, + "grad_norm": 1.708694570165322, + "language_loss": 0.81267452, + "learning_rate": 3.397958325346221e-06, + "loss": 0.83398163, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.6015625, + "step": 4220, + "time_per_iteration": 2.3648316860198975 + }, + { + "auxiliary_loss_clip": 0.01088201, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.0197705, + "balance_loss_mlp": 1.02815259, + "epoch": 0.2537802495114986, + "flos": 23293561294080.0, + "grad_norm": 3.126759026973794, + "language_loss": 0.70966601, + "learning_rate": 3.397688103774886e-06, + "loss": 0.7309202, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.6015625, + "step": 4221, + "time_per_iteration": 2.44842267036438 + }, + { + "auxiliary_loss_clip": 0.01085814, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.01663923, + "balance_loss_mlp": 1.02692389, + "epoch": 0.25384037276416654, + "flos": 17638190288640.0, + "grad_norm": 1.6753200340995202, + "language_loss": 0.61910427, + "learning_rate": 3.397417832323658e-06, + "loss": 0.64030015, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 4222, + "time_per_iteration": 2.399949312210083 + }, + { + "auxiliary_loss_clip": 0.01090904, + "auxiliary_loss_mlp": 0.01039178, + "balance_loss_clip": 1.02009249, + "balance_loss_mlp": 1.028826, + "epoch": 0.2539004960168345, + "flos": 21505973304960.0, + "grad_norm": 1.7557900932408543, + "language_loss": 0.7456938, + "learning_rate": 3.397147511002182e-06, + "loss": 0.7669946, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.62109375, + "step": 4223, + "time_per_iteration": 2.3815507888793945 + }, + { + "auxiliary_loss_clip": 0.01086685, + "auxiliary_loss_mlp": 0.0103843, + "balance_loss_clip": 1.02089465, + "balance_loss_mlp": 1.02803266, + "epoch": 0.2539606192695025, + "flos": 23949788808960.0, + "grad_norm": 1.487638072061383, + "language_loss": 0.79764968, + "learning_rate": 3.3968771398201056e-06, + "loss": 0.81890082, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5859375, + "step": 4224, + "time_per_iteration": 2.4221978187561035 + }, + { + "auxiliary_loss_clip": 0.01082935, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.01698387, + "balance_loss_mlp": 1.02560687, + "epoch": 0.25402074252217044, + "flos": 24168659322240.0, + "grad_norm": 1.4043746669844293, + "language_loss": 0.77372491, + "learning_rate": 3.396606718787077e-06, + "loss": 0.79489988, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5703125, + "step": 4225, + "time_per_iteration": 2.4019360542297363 + }, + { + "auxiliary_loss_clip": 0.01087068, + "auxiliary_loss_mlp": 0.01045849, + "balance_loss_clip": 1.02768099, + "balance_loss_mlp": 1.02729058, + "epoch": 0.2540808657748384, + "flos": 22302831242880.0, + "grad_norm": 2.6425992178423963, + "language_loss": 0.84850371, + "learning_rate": 3.396336247912747e-06, + "loss": 0.86983287, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59765625, + "step": 4226, + "time_per_iteration": 2.39050555229187 + }, + { + "auxiliary_loss_clip": 0.01083659, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02712059, + "balance_loss_mlp": 1.02492332, + "epoch": 0.25414098902750637, + "flos": 27598317287040.0, + "grad_norm": 1.5257596202667512, + "language_loss": 0.70935285, + "learning_rate": 3.396065727206768e-06, + "loss": 0.73063207, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 4227, + "time_per_iteration": 2.4347763061523438 + }, + { + "auxiliary_loss_clip": 0.01017919, + "auxiliary_loss_mlp": 0.01000842, + "balance_loss_clip": 0.99894607, + "balance_loss_mlp": 1.00497365, + "epoch": 0.25420111228017434, + "flos": 58167847209600.0, + "grad_norm": 0.9906463064364873, + "language_loss": 0.61949646, + "learning_rate": 3.395795156678795e-06, + "loss": 0.63968408, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.12890625, + "step": 4228, + "time_per_iteration": 2.874880790710449 + }, + { + "auxiliary_loss_clip": 0.01087209, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.01291287, + "balance_loss_mlp": 1.02655411, + "epoch": 0.2542612355328423, + "flos": 11463870700800.0, + "grad_norm": 2.3320528795276307, + "language_loss": 0.85919857, + "learning_rate": 3.395524536338483e-06, + "loss": 0.88038766, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.60546875, + "step": 4229, + "time_per_iteration": 2.342778205871582 + }, + { + "auxiliary_loss_clip": 0.01087712, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.02093852, + "balance_loss_mlp": 1.0279516, + "epoch": 0.2543213587855103, + "flos": 22964784220800.0, + "grad_norm": 2.0199073729481953, + "language_loss": 0.77261305, + "learning_rate": 3.3952538661954893e-06, + "loss": 0.79388565, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.59765625, + "step": 4230, + "time_per_iteration": 2.4160499572753906 + }, + { + "auxiliary_loss_clip": 0.01084105, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.01538348, + "balance_loss_mlp": 1.02499151, + "epoch": 0.2543814820381783, + "flos": 18252382659840.0, + "grad_norm": 2.2727168383174248, + "language_loss": 0.75731349, + "learning_rate": 3.3949831462594743e-06, + "loss": 0.77849293, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.58984375, + "step": 4231, + "time_per_iteration": 2.355001449584961 + }, + { + "auxiliary_loss_clip": 0.01085398, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.01989961, + "balance_loss_mlp": 1.0256772, + "epoch": 0.25444160529084625, + "flos": 15631801608960.0, + "grad_norm": 1.8001754541316275, + "language_loss": 0.73925924, + "learning_rate": 3.3947123765400994e-06, + "loss": 0.76048803, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 4232, + "time_per_iteration": 2.379716634750366 + }, + { + "auxiliary_loss_clip": 0.01086377, + "auxiliary_loss_mlp": 0.01045583, + "balance_loss_clip": 1.02578199, + "balance_loss_mlp": 1.02716863, + "epoch": 0.2545017285435142, + "flos": 24600639974400.0, + "grad_norm": 1.7991696273225741, + "language_loss": 0.8663975, + "learning_rate": 3.394441557047028e-06, + "loss": 0.88771713, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.59375, + "step": 4233, + "time_per_iteration": 2.4171550273895264 + }, + { + "auxiliary_loss_clip": 0.01080724, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.02109265, + "balance_loss_mlp": 1.02476168, + "epoch": 0.2545618517961822, + "flos": 24677972369280.0, + "grad_norm": 1.5881667641686952, + "language_loss": 0.7487973, + "learning_rate": 3.3941706877899236e-06, + "loss": 0.76998085, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5625, + "step": 4234, + "time_per_iteration": 2.4654290676116943 + }, + { + "auxiliary_loss_clip": 0.01086916, + "auxiliary_loss_mlp": 0.01038339, + "balance_loss_clip": 1.02132797, + "balance_loss_mlp": 1.02555203, + "epoch": 0.25462197504885015, + "flos": 23913967887360.0, + "grad_norm": 1.3432440338004685, + "language_loss": 0.74730933, + "learning_rate": 3.393899768778454e-06, + "loss": 0.76856196, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.6171875, + "step": 4235, + "time_per_iteration": 2.4672210216522217 + }, + { + "auxiliary_loss_clip": 0.01091606, + "auxiliary_loss_mlp": 0.01042467, + "balance_loss_clip": 1.02248764, + "balance_loss_mlp": 1.02715826, + "epoch": 0.2546820983015181, + "flos": 24788262954240.0, + "grad_norm": 2.5389337739658586, + "language_loss": 0.64470553, + "learning_rate": 3.393628800022287e-06, + "loss": 0.66604626, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.64453125, + "step": 4236, + "time_per_iteration": 2.437122106552124 + }, + { + "auxiliary_loss_clip": 0.01084494, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.01757097, + "balance_loss_mlp": 1.02604151, + "epoch": 0.2547422215541861, + "flos": 18733136348160.0, + "grad_norm": 1.7228645594771752, + "language_loss": 0.66689718, + "learning_rate": 3.393357781531093e-06, + "loss": 0.68808103, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5859375, + "step": 4237, + "time_per_iteration": 2.3892457485198975 + }, + { + "auxiliary_loss_clip": 0.01086732, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.02211189, + "balance_loss_mlp": 1.02712703, + "epoch": 0.25480234480685404, + "flos": 21031398927360.0, + "grad_norm": 2.1291563061300707, + "language_loss": 0.75285828, + "learning_rate": 3.393086713314544e-06, + "loss": 0.77412361, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.59765625, + "step": 4238, + "time_per_iteration": 2.3946433067321777 + }, + { + "auxiliary_loss_clip": 0.01090295, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02094567, + "balance_loss_mlp": 1.02938437, + "epoch": 0.254862468059522, + "flos": 25081009637760.0, + "grad_norm": 2.803089062677574, + "language_loss": 0.80558288, + "learning_rate": 3.3928155953823137e-06, + "loss": 0.82688469, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.609375, + "step": 4239, + "time_per_iteration": 2.4331414699554443 + }, + { + "auxiliary_loss_clip": 0.0108447, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.01623678, + "balance_loss_mlp": 1.02648723, + "epoch": 0.25492259131219, + "flos": 20557348220160.0, + "grad_norm": 1.748066251671453, + "language_loss": 0.77362287, + "learning_rate": 3.3925444277440774e-06, + "loss": 0.79480696, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.578125, + "step": 4240, + "time_per_iteration": 3.7819595336914062 + }, + { + "auxiliary_loss_clip": 0.01089638, + "auxiliary_loss_mlp": 0.01035941, + "balance_loss_clip": 1.01597381, + "balance_loss_mlp": 1.02600455, + "epoch": 0.25498271456485794, + "flos": 25041418289280.0, + "grad_norm": 1.719907468259334, + "language_loss": 0.83467364, + "learning_rate": 3.392273210409512e-06, + "loss": 0.85592937, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.6328125, + "step": 4241, + "time_per_iteration": 2.411308765411377 + }, + { + "auxiliary_loss_clip": 0.01086503, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_clip": 1.02353048, + "balance_loss_mlp": 1.02611661, + "epoch": 0.2550428378175259, + "flos": 26177177594880.0, + "grad_norm": 1.7606285079797142, + "language_loss": 0.7337594, + "learning_rate": 3.392001943388298e-06, + "loss": 0.75504154, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.60546875, + "step": 4242, + "time_per_iteration": 2.4295384883880615 + }, + { + "auxiliary_loss_clip": 0.01085302, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.01708043, + "balance_loss_mlp": 1.0259043, + "epoch": 0.2551029610701939, + "flos": 15266295918720.0, + "grad_norm": 2.2515804794555145, + "language_loss": 0.72305548, + "learning_rate": 3.3917306266901146e-06, + "loss": 0.74425465, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 4243, + "time_per_iteration": 2.3567705154418945 + }, + { + "auxiliary_loss_clip": 0.0108642, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.01590562, + "balance_loss_mlp": 1.0265739, + "epoch": 0.2551630843228619, + "flos": 18111263477760.0, + "grad_norm": 1.5851356583756018, + "language_loss": 0.81755608, + "learning_rate": 3.3914592603246458e-06, + "loss": 0.83875787, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.59765625, + "step": 4244, + "time_per_iteration": 3.7711994647979736 + }, + { + "auxiliary_loss_clip": 0.01016159, + "auxiliary_loss_mlp": 0.01010972, + "balance_loss_clip": 1.00915968, + "balance_loss_mlp": 1.00335169, + "epoch": 0.25522320757552985, + "flos": 70516381651200.0, + "grad_norm": 0.6901398321048748, + "language_loss": 0.57706642, + "learning_rate": 3.391187844301575e-06, + "loss": 0.59733772, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.12890625, + "step": 4245, + "time_per_iteration": 3.1313469409942627 + }, + { + "auxiliary_loss_clip": 0.01086005, + "auxiliary_loss_mlp": 0.01046074, + "balance_loss_clip": 1.02726281, + "balance_loss_mlp": 1.02540946, + "epoch": 0.2552833308281978, + "flos": 22891990302720.0, + "grad_norm": 3.2779920259881727, + "language_loss": 0.65447509, + "learning_rate": 3.3909163786305884e-06, + "loss": 0.67579591, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.609375, + "step": 4246, + "time_per_iteration": 3.786039352416992 + }, + { + "auxiliary_loss_clip": 0.01081173, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.02039862, + "balance_loss_mlp": 1.02552485, + "epoch": 0.2553434540808658, + "flos": 22052538639360.0, + "grad_norm": 2.3662608062947648, + "language_loss": 0.81410658, + "learning_rate": 3.390644863321374e-06, + "loss": 0.8352915, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5546875, + "step": 4247, + "time_per_iteration": 2.4006686210632324 + }, + { + "auxiliary_loss_clip": 0.01090575, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.01817489, + "balance_loss_mlp": 1.02558231, + "epoch": 0.25540357733353375, + "flos": 16543279140480.0, + "grad_norm": 3.738766888506868, + "language_loss": 0.83157945, + "learning_rate": 3.390373298383622e-06, + "loss": 0.8528769, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.6484375, + "step": 4248, + "time_per_iteration": 2.34535551071167 + }, + { + "auxiliary_loss_clip": 0.01087325, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.01693141, + "balance_loss_mlp": 1.02715528, + "epoch": 0.2554637005862017, + "flos": 17564104650240.0, + "grad_norm": 1.8196853331893625, + "language_loss": 0.84300339, + "learning_rate": 3.390101683827023e-06, + "loss": 0.86422402, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6015625, + "step": 4249, + "time_per_iteration": 2.344696044921875 + }, + { + "auxiliary_loss_clip": 0.01014514, + "auxiliary_loss_mlp": 0.01004274, + "balance_loss_clip": 1.0023433, + "balance_loss_mlp": 1.00165677, + "epoch": 0.2555238238388697, + "flos": 72241650996480.0, + "grad_norm": 0.769060138190621, + "language_loss": 0.5633142, + "learning_rate": 3.389830019661271e-06, + "loss": 0.58350205, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.12890625, + "step": 4250, + "time_per_iteration": 4.34425950050354 + }, + { + "auxiliary_loss_clip": 0.01086496, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.0136838, + "balance_loss_mlp": 1.02591741, + "epoch": 0.25558394709153764, + "flos": 24388262974080.0, + "grad_norm": 5.30759907953262, + "language_loss": 0.80202079, + "learning_rate": 3.3895583058960604e-06, + "loss": 0.82320523, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.60546875, + "step": 4251, + "time_per_iteration": 2.4071388244628906 + }, + { + "auxiliary_loss_clip": 0.01014107, + "auxiliary_loss_mlp": 0.01002477, + "balance_loss_clip": 1.00067675, + "balance_loss_mlp": 1.00154877, + "epoch": 0.2556440703442056, + "flos": 69227772946560.0, + "grad_norm": 0.860774705426065, + "language_loss": 0.66104627, + "learning_rate": 3.3892865425410884e-06, + "loss": 0.68121207, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.125, + "step": 4252, + "time_per_iteration": 3.0563292503356934 + }, + { + "auxiliary_loss_clip": 0.01084835, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.01695287, + "balance_loss_mlp": 1.02676415, + "epoch": 0.2557041935968736, + "flos": 24862732617600.0, + "grad_norm": 3.350375536497017, + "language_loss": 0.73127812, + "learning_rate": 3.389014729606054e-06, + "loss": 0.75245857, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.58203125, + "step": 4253, + "time_per_iteration": 2.4125022888183594 + }, + { + "auxiliary_loss_clip": 0.01086663, + "auxiliary_loss_mlp": 0.01040386, + "balance_loss_clip": 1.02300525, + "balance_loss_mlp": 1.02779603, + "epoch": 0.25576431684954154, + "flos": 22491012804480.0, + "grad_norm": 2.293465887887409, + "language_loss": 0.72769624, + "learning_rate": 3.388742867100656e-06, + "loss": 0.74896675, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.58984375, + "step": 4254, + "time_per_iteration": 2.400686264038086 + }, + { + "auxiliary_loss_clip": 0.01086128, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.01901293, + "balance_loss_mlp": 1.02678394, + "epoch": 0.2558244401022095, + "flos": 19825778257920.0, + "grad_norm": 1.6383043855915715, + "language_loss": 0.80807006, + "learning_rate": 3.388470955034598e-06, + "loss": 0.82931328, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.59375, + "step": 4255, + "time_per_iteration": 2.3769102096557617 + }, + { + "auxiliary_loss_clip": 0.01085968, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.01915765, + "balance_loss_mlp": 1.02709889, + "epoch": 0.2558845633548775, + "flos": 23219405832960.0, + "grad_norm": 1.5961840010848884, + "language_loss": 0.85364938, + "learning_rate": 3.3881989934175822e-06, + "loss": 0.8748762, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.58984375, + "step": 4256, + "time_per_iteration": 2.415609121322632 + }, + { + "auxiliary_loss_clip": 0.01088586, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_clip": 1.01798713, + "balance_loss_mlp": 1.02706409, + "epoch": 0.2559446866075455, + "flos": 16836898608000.0, + "grad_norm": 2.0173160266685497, + "language_loss": 0.75138247, + "learning_rate": 3.387926982259316e-06, + "loss": 0.77263653, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6171875, + "step": 4257, + "time_per_iteration": 2.3902125358581543 + }, + { + "auxiliary_loss_clip": 0.01018854, + "auxiliary_loss_mlp": 0.01003803, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00605226, + "epoch": 0.25600480986021346, + "flos": 57590627834880.0, + "grad_norm": 0.7968367734150151, + "language_loss": 0.65308678, + "learning_rate": 3.387654921569505e-06, + "loss": 0.67331338, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.12792969, + "step": 4258, + "time_per_iteration": 2.994932174682617 + }, + { + "auxiliary_loss_clip": 0.01082901, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.015203, + "balance_loss_mlp": 1.02647078, + "epoch": 0.2560649331128814, + "flos": 27818270052480.0, + "grad_norm": 1.6186365576918889, + "language_loss": 0.7640518, + "learning_rate": 3.3873828113578604e-06, + "loss": 0.78519106, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5625, + "step": 4259, + "time_per_iteration": 2.4414870738983154 + }, + { + "auxiliary_loss_clip": 0.01017088, + "auxiliary_loss_mlp": 0.01002281, + "balance_loss_clip": 1.00038564, + "balance_loss_mlp": 1.00432873, + "epoch": 0.2561250563655494, + "flos": 70946896026240.0, + "grad_norm": 0.7973289789284315, + "language_loss": 0.58468884, + "learning_rate": 3.387110651634092e-06, + "loss": 0.60488254, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.12695312, + "step": 4260, + "time_per_iteration": 3.1298787593841553 + }, + { + "auxiliary_loss_clip": 0.01083809, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.01862574, + "balance_loss_mlp": 1.02459311, + "epoch": 0.25618517961821735, + "flos": 27011217997440.0, + "grad_norm": 1.8000391782145087, + "language_loss": 0.77640504, + "learning_rate": 3.3868384424079122e-06, + "loss": 0.79761088, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 4261, + "time_per_iteration": 2.428130626678467 + }, + { + "auxiliary_loss_clip": 0.01083037, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.0212872, + "balance_loss_mlp": 1.02597356, + "epoch": 0.2562453028708853, + "flos": 23067394306560.0, + "grad_norm": 1.5256314587185946, + "language_loss": 0.83025563, + "learning_rate": 3.3865661836890356e-06, + "loss": 0.85145652, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5703125, + "step": 4262, + "time_per_iteration": 2.4298012256622314 + }, + { + "auxiliary_loss_clip": 0.01088283, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.01607275, + "balance_loss_mlp": 1.02493632, + "epoch": 0.2563054261235533, + "flos": 15120079678080.0, + "grad_norm": 2.158784763976801, + "language_loss": 0.74157685, + "learning_rate": 3.3862938754871786e-06, + "loss": 0.76281887, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.6328125, + "step": 4263, + "time_per_iteration": 2.3613972663879395 + }, + { + "auxiliary_loss_clip": 0.0108525, + "auxiliary_loss_mlp": 0.01049369, + "balance_loss_clip": 1.03203559, + "balance_loss_mlp": 1.02811015, + "epoch": 0.25636554937622125, + "flos": 27853637126400.0, + "grad_norm": 1.9657386555212584, + "language_loss": 0.82568431, + "learning_rate": 3.3860215178120597e-06, + "loss": 0.84703052, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5703125, + "step": 4264, + "time_per_iteration": 2.458491086959839 + }, + { + "auxiliary_loss_clip": 0.01084013, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.01804233, + "balance_loss_mlp": 1.02565813, + "epoch": 0.2564256726288892, + "flos": 28905430878720.0, + "grad_norm": 1.7246676496995823, + "language_loss": 0.74102837, + "learning_rate": 3.385749110673398e-06, + "loss": 0.7622267, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58203125, + "step": 4265, + "time_per_iteration": 2.4494590759277344 + }, + { + "auxiliary_loss_clip": 0.01079177, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.01693678, + "balance_loss_mlp": 1.02260828, + "epoch": 0.2564857958815572, + "flos": 18513951632640.0, + "grad_norm": 1.623519102787576, + "language_loss": 0.81270957, + "learning_rate": 3.3854766540809143e-06, + "loss": 0.8338303, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.56640625, + "step": 4266, + "time_per_iteration": 2.3781354427337646 + }, + { + "auxiliary_loss_clip": 0.01082828, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.0170517, + "balance_loss_mlp": 1.02566898, + "epoch": 0.25654591913422514, + "flos": 25807203250560.0, + "grad_norm": 1.4080021820802395, + "language_loss": 0.79135948, + "learning_rate": 3.3852041480443337e-06, + "loss": 0.81251562, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5703125, + "step": 4267, + "time_per_iteration": 2.4150991439819336 + }, + { + "auxiliary_loss_clip": 0.01080846, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.01916432, + "balance_loss_mlp": 1.02583539, + "epoch": 0.2566060423868931, + "flos": 19098642038400.0, + "grad_norm": 1.5835765877395969, + "language_loss": 0.7891286, + "learning_rate": 3.3849315925733793e-06, + "loss": 0.81029481, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.546875, + "step": 4268, + "time_per_iteration": 2.4076755046844482 + }, + { + "auxiliary_loss_clip": 0.01083921, + "auxiliary_loss_mlp": 0.01041253, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.02735698, + "epoch": 0.25666616563956113, + "flos": 23841523082880.0, + "grad_norm": 1.5222660198042288, + "language_loss": 0.67860067, + "learning_rate": 3.384658987677779e-06, + "loss": 0.69985247, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5625, + "step": 4269, + "time_per_iteration": 2.4145662784576416 + }, + { + "auxiliary_loss_clip": 0.01086155, + "auxiliary_loss_mlp": 0.01037277, + "balance_loss_clip": 1.01988482, + "balance_loss_mlp": 1.02690673, + "epoch": 0.2567262888922291, + "flos": 14603574890880.0, + "grad_norm": 2.464202610279144, + "language_loss": 0.78836644, + "learning_rate": 3.3843863333672617e-06, + "loss": 0.80960071, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59375, + "step": 4270, + "time_per_iteration": 2.348062038421631 + }, + { + "auxiliary_loss_clip": 0.01086721, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.01806307, + "balance_loss_mlp": 1.02600288, + "epoch": 0.25678641214489706, + "flos": 32921839019520.0, + "grad_norm": 2.483350980795855, + "language_loss": 0.67900097, + "learning_rate": 3.3841136296515574e-06, + "loss": 0.70023876, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.609375, + "step": 4271, + "time_per_iteration": 2.4771621227264404 + }, + { + "auxiliary_loss_clip": 0.01086933, + "auxiliary_loss_mlp": 0.01042427, + "balance_loss_clip": 1.02485585, + "balance_loss_mlp": 1.02654815, + "epoch": 0.256846535397565, + "flos": 24097750617600.0, + "grad_norm": 1.3568710177011531, + "language_loss": 0.87126815, + "learning_rate": 3.3838408765403974e-06, + "loss": 0.89256179, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.60546875, + "step": 4272, + "time_per_iteration": 2.4306132793426514 + }, + { + "auxiliary_loss_clip": 0.01083198, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.01661706, + "balance_loss_mlp": 1.0257237, + "epoch": 0.256906658650233, + "flos": 19717442709120.0, + "grad_norm": 1.7764758828185845, + "language_loss": 0.82027292, + "learning_rate": 3.3835680740435164e-06, + "loss": 0.8414526, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.57421875, + "step": 4273, + "time_per_iteration": 2.3793649673461914 + }, + { + "auxiliary_loss_clip": 0.01078188, + "auxiliary_loss_mlp": 0.01040335, + "balance_loss_clip": 1.02480221, + "balance_loss_mlp": 1.02476501, + "epoch": 0.25696678190290095, + "flos": 22925018315520.0, + "grad_norm": 1.5924830037239621, + "language_loss": 0.77150172, + "learning_rate": 3.38329522217065e-06, + "loss": 0.79268694, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.53515625, + "step": 4274, + "time_per_iteration": 2.4188337326049805 + }, + { + "auxiliary_loss_clip": 0.01079178, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.01338482, + "balance_loss_mlp": 1.02376032, + "epoch": 0.2570269051555689, + "flos": 27306617944320.0, + "grad_norm": 1.6610631604241781, + "language_loss": 0.83655322, + "learning_rate": 3.383022320931535e-06, + "loss": 0.85763896, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5546875, + "step": 4275, + "time_per_iteration": 2.4397976398468018 + }, + { + "auxiliary_loss_clip": 0.01082623, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.01073003, + "balance_loss_mlp": 1.02453136, + "epoch": 0.2570870284082369, + "flos": 27562182163200.0, + "grad_norm": 1.9879882609096986, + "language_loss": 0.78657633, + "learning_rate": 3.3827493703359116e-06, + "loss": 0.80768776, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58203125, + "step": 4276, + "time_per_iteration": 2.4634361267089844 + }, + { + "auxiliary_loss_clip": 0.01081022, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.0200851, + "balance_loss_mlp": 1.02425969, + "epoch": 0.25714715166090485, + "flos": 28729573027200.0, + "grad_norm": 1.5803680779159166, + "language_loss": 0.79060209, + "learning_rate": 3.38247637039352e-06, + "loss": 0.81177282, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5703125, + "step": 4277, + "time_per_iteration": 2.441990613937378 + }, + { + "auxiliary_loss_clip": 0.01079616, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.01493597, + "balance_loss_mlp": 1.02376914, + "epoch": 0.2572072749135728, + "flos": 20115243273600.0, + "grad_norm": 4.284323005793387, + "language_loss": 0.78460282, + "learning_rate": 3.3822033211141018e-06, + "loss": 0.80570781, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55859375, + "step": 4278, + "time_per_iteration": 2.3912453651428223 + }, + { + "auxiliary_loss_clip": 0.01084793, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01468754, + "balance_loss_mlp": 1.02562034, + "epoch": 0.2572673981662408, + "flos": 26029669633920.0, + "grad_norm": 2.938846801497052, + "language_loss": 0.74501789, + "learning_rate": 3.381930222507403e-06, + "loss": 0.76617998, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.59375, + "step": 4279, + "time_per_iteration": 2.410918951034546 + }, + { + "auxiliary_loss_clip": 0.01081741, + "auxiliary_loss_mlp": 0.01041469, + "balance_loss_clip": 1.02433884, + "balance_loss_mlp": 1.02301311, + "epoch": 0.25732752141890874, + "flos": 16105712670720.0, + "grad_norm": 2.87928010463899, + "language_loss": 0.85400975, + "learning_rate": 3.3816570745831696e-06, + "loss": 0.87524188, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 4280, + "time_per_iteration": 3.738787889480591 + }, + { + "auxiliary_loss_clip": 0.01083485, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.01500511, + "balance_loss_mlp": 1.02460313, + "epoch": 0.2573876446715767, + "flos": 22523447324160.0, + "grad_norm": 2.8416407300649067, + "language_loss": 0.78913325, + "learning_rate": 3.3813838773511496e-06, + "loss": 0.81028682, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5859375, + "step": 4281, + "time_per_iteration": 2.3913938999176025 + }, + { + "auxiliary_loss_clip": 0.01082384, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.01530564, + "balance_loss_mlp": 1.02549577, + "epoch": 0.2574477679242447, + "flos": 23949718986240.0, + "grad_norm": 1.6785965482775702, + "language_loss": 0.87130249, + "learning_rate": 3.3811106308210916e-06, + "loss": 0.89245594, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5703125, + "step": 4282, + "time_per_iteration": 2.4007761478424072 + }, + { + "auxiliary_loss_clip": 0.01084002, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.01863384, + "balance_loss_mlp": 1.02449942, + "epoch": 0.2575078911769127, + "flos": 21980617505280.0, + "grad_norm": 1.4833750982305345, + "language_loss": 0.7042622, + "learning_rate": 3.380837335002748e-06, + "loss": 0.72545123, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.59375, + "step": 4283, + "time_per_iteration": 3.8296406269073486 + }, + { + "auxiliary_loss_clip": 0.0108106, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.01710677, + "balance_loss_mlp": 1.02648377, + "epoch": 0.25756801442958066, + "flos": 21944307824640.0, + "grad_norm": 1.657453372544856, + "language_loss": 0.80645716, + "learning_rate": 3.380563989905872e-06, + "loss": 0.8275938, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.546875, + "step": 4284, + "time_per_iteration": 2.3946008682250977 + }, + { + "auxiliary_loss_clip": 0.01081427, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.01440346, + "balance_loss_mlp": 1.02537215, + "epoch": 0.2576281376822486, + "flos": 35260530819840.0, + "grad_norm": 2.1585192943874416, + "language_loss": 0.69971889, + "learning_rate": 3.3802905955402185e-06, + "loss": 0.72082543, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.55859375, + "step": 4285, + "time_per_iteration": 3.8747050762176514 + }, + { + "auxiliary_loss_clip": 0.01083203, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.01509285, + "balance_loss_mlp": 1.02660656, + "epoch": 0.2576882609349166, + "flos": 14131549042560.0, + "grad_norm": 1.759252741359028, + "language_loss": 0.58124995, + "learning_rate": 3.3800171519155443e-06, + "loss": 0.60238665, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5625, + "step": 4286, + "time_per_iteration": 2.3480701446533203 + }, + { + "auxiliary_loss_clip": 0.01087779, + "auxiliary_loss_mlp": 0.01040366, + "balance_loss_clip": 1.02334321, + "balance_loss_mlp": 1.02740264, + "epoch": 0.25774838418758456, + "flos": 23257216702080.0, + "grad_norm": 2.6005133627988863, + "language_loss": 0.64120221, + "learning_rate": 3.379743659041607e-06, + "loss": 0.66248363, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.60546875, + "step": 4287, + "time_per_iteration": 2.4109575748443604 + }, + { + "auxiliary_loss_clip": 0.01085027, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.01774263, + "balance_loss_mlp": 1.02515745, + "epoch": 0.2578085074402525, + "flos": 22600640073600.0, + "grad_norm": 1.7000196163437455, + "language_loss": 0.717278, + "learning_rate": 3.3794701169281686e-06, + "loss": 0.73848414, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 4288, + "time_per_iteration": 2.376340389251709 + }, + { + "auxiliary_loss_clip": 0.01079433, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.01621902, + "balance_loss_mlp": 1.02430677, + "epoch": 0.2578686306929205, + "flos": 24570684161280.0, + "grad_norm": 1.4056603383260875, + "language_loss": 0.76661074, + "learning_rate": 3.37919652558499e-06, + "loss": 0.78772056, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.55078125, + "step": 4289, + "time_per_iteration": 2.418536901473999 + }, + { + "auxiliary_loss_clip": 0.0108003, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.01683259, + "balance_loss_mlp": 1.02402079, + "epoch": 0.25792875394558845, + "flos": 18112974134400.0, + "grad_norm": 62.9597934246925, + "language_loss": 0.85113913, + "learning_rate": 3.3789228850218347e-06, + "loss": 0.87226784, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5625, + "step": 4290, + "time_per_iteration": 3.752499580383301 + }, + { + "auxiliary_loss_clip": 0.01083866, + "auxiliary_loss_mlp": 0.01038385, + "balance_loss_clip": 1.01965714, + "balance_loss_mlp": 1.02597737, + "epoch": 0.2579888771982564, + "flos": 17711926813440.0, + "grad_norm": 1.761805413096687, + "language_loss": 0.72238749, + "learning_rate": 3.3786491952484686e-06, + "loss": 0.74360996, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.58203125, + "step": 4291, + "time_per_iteration": 2.3929483890533447 + }, + { + "auxiliary_loss_clip": 0.0108336, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.01665545, + "balance_loss_mlp": 1.02428102, + "epoch": 0.2580490004509244, + "flos": 16433966073600.0, + "grad_norm": 2.5180673896506716, + "language_loss": 0.80971766, + "learning_rate": 3.378375456274659e-06, + "loss": 0.83089674, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.58984375, + "step": 4292, + "time_per_iteration": 2.368997573852539 + }, + { + "auxiliary_loss_clip": 0.01084007, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.02002382, + "balance_loss_mlp": 1.02616954, + "epoch": 0.25810912370359235, + "flos": 33833840221440.0, + "grad_norm": 2.157634695913549, + "language_loss": 0.67968988, + "learning_rate": 3.378101668110175e-06, + "loss": 0.70090276, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.578125, + "step": 4293, + "time_per_iteration": 2.503197431564331 + }, + { + "auxiliary_loss_clip": 0.01077826, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.01525354, + "balance_loss_mlp": 1.02417076, + "epoch": 0.2581692469562603, + "flos": 25191020931840.0, + "grad_norm": 1.8207685713190493, + "language_loss": 0.75422269, + "learning_rate": 3.377827830764788e-06, + "loss": 0.77530217, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.53515625, + "step": 4294, + "time_per_iteration": 2.422917366027832 + }, + { + "auxiliary_loss_clip": 0.01080427, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.01683974, + "balance_loss_mlp": 1.0239749, + "epoch": 0.2582293702089283, + "flos": 34930811139840.0, + "grad_norm": 2.302495236749481, + "language_loss": 0.80801058, + "learning_rate": 3.3775539442482695e-06, + "loss": 0.82915831, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.56640625, + "step": 4295, + "time_per_iteration": 2.4947853088378906 + }, + { + "auxiliary_loss_clip": 0.01085873, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.01964092, + "balance_loss_mlp": 1.02579057, + "epoch": 0.2582894934615963, + "flos": 26832532325760.0, + "grad_norm": 1.9080965762459037, + "language_loss": 0.72517002, + "learning_rate": 3.377280008570394e-06, + "loss": 0.74640274, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 4296, + "time_per_iteration": 2.4470784664154053 + }, + { + "auxiliary_loss_clip": 0.01084967, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.01725817, + "balance_loss_mlp": 1.02626336, + "epoch": 0.25834961671426426, + "flos": 23514072641280.0, + "grad_norm": 2.297984378339985, + "language_loss": 0.87064862, + "learning_rate": 3.3770060237409382e-06, + "loss": 0.89184773, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5859375, + "step": 4297, + "time_per_iteration": 2.4318974018096924 + }, + { + "auxiliary_loss_clip": 0.01083034, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.02636003, + "balance_loss_mlp": 1.02587152, + "epoch": 0.25840973996693223, + "flos": 22450059912960.0, + "grad_norm": 1.6756887965074723, + "language_loss": 0.84725773, + "learning_rate": 3.3767319897696795e-06, + "loss": 0.86850429, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5703125, + "step": 4298, + "time_per_iteration": 2.3790769577026367 + }, + { + "auxiliary_loss_clip": 0.01082709, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.01518261, + "balance_loss_mlp": 1.02583182, + "epoch": 0.2584698632196002, + "flos": 11290072619520.0, + "grad_norm": 2.024330256562373, + "language_loss": 0.83507544, + "learning_rate": 3.376457906666397e-06, + "loss": 0.85621876, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5703125, + "step": 4299, + "time_per_iteration": 2.352186918258667 + }, + { + "auxiliary_loss_clip": 0.01079615, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.01738024, + "balance_loss_mlp": 1.02467215, + "epoch": 0.25852998647226816, + "flos": 17929051758720.0, + "grad_norm": 1.9727076145250175, + "language_loss": 0.82848322, + "learning_rate": 3.3761837744408728e-06, + "loss": 0.84959817, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.55078125, + "step": 4300, + "time_per_iteration": 2.355560541152954 + }, + { + "auxiliary_loss_clip": 0.01082258, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.01826572, + "balance_loss_mlp": 1.0248692, + "epoch": 0.2585901097249361, + "flos": 33254700721920.0, + "grad_norm": 1.7783040865562103, + "language_loss": 0.67306131, + "learning_rate": 3.375909593102889e-06, + "loss": 0.6942364, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.57421875, + "step": 4301, + "time_per_iteration": 2.46988582611084 + }, + { + "auxiliary_loss_clip": 0.01085816, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.01676941, + "balance_loss_mlp": 1.02482903, + "epoch": 0.2586502329776041, + "flos": 18440319841920.0, + "grad_norm": 3.446607948280226, + "language_loss": 0.80717486, + "learning_rate": 3.3756353626622325e-06, + "loss": 0.82838249, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.609375, + "step": 4302, + "time_per_iteration": 2.3469271659851074 + }, + { + "auxiliary_loss_clip": 0.01084192, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.01944971, + "balance_loss_mlp": 1.0264796, + "epoch": 0.25871035623027205, + "flos": 17967141918720.0, + "grad_norm": 1.760854526492328, + "language_loss": 0.92042071, + "learning_rate": 3.375361083128687e-06, + "loss": 0.94162166, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.578125, + "step": 4303, + "time_per_iteration": 2.3587801456451416 + }, + { + "auxiliary_loss_clip": 0.01082902, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.01561487, + "balance_loss_mlp": 1.02608013, + "epoch": 0.25877047948294, + "flos": 27776618933760.0, + "grad_norm": 1.7360749554289387, + "language_loss": 0.67793036, + "learning_rate": 3.3750867545120434e-06, + "loss": 0.69908363, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 4304, + "time_per_iteration": 2.4346425533294678 + }, + { + "auxiliary_loss_clip": 0.01084052, + "auxiliary_loss_mlp": 0.01042595, + "balance_loss_clip": 1.02508342, + "balance_loss_mlp": 1.02554274, + "epoch": 0.258830602735608, + "flos": 27124615693440.0, + "grad_norm": 2.5129854023402016, + "language_loss": 0.72535753, + "learning_rate": 3.3748123768220902e-06, + "loss": 0.74662399, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5859375, + "step": 4305, + "time_per_iteration": 2.425720691680908 + }, + { + "auxiliary_loss_clip": 0.01082325, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.01633763, + "balance_loss_mlp": 1.02396238, + "epoch": 0.25889072598827595, + "flos": 17890612485120.0, + "grad_norm": 1.9504844839412772, + "language_loss": 0.75319511, + "learning_rate": 3.3745379500686197e-06, + "loss": 0.77434468, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5859375, + "step": 4306, + "time_per_iteration": 2.3431077003479004 + }, + { + "auxiliary_loss_clip": 0.01021114, + "auxiliary_loss_mlp": 0.01007076, + "balance_loss_clip": 1.00535893, + "balance_loss_mlp": 1.00742817, + "epoch": 0.2589508492409439, + "flos": 53932184530560.0, + "grad_norm": 0.8516271944646232, + "language_loss": 0.57163322, + "learning_rate": 3.3742634742614256e-06, + "loss": 0.59191501, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.13671875, + "step": 4307, + "time_per_iteration": 2.960516929626465 + }, + { + "auxiliary_loss_clip": 0.01080167, + "auxiliary_loss_mlp": 0.0102746, + "balance_loss_clip": 1.01196325, + "balance_loss_mlp": 1.02441955, + "epoch": 0.2590109724936119, + "flos": 22124739064320.0, + "grad_norm": 1.473653396558904, + "language_loss": 0.71911383, + "learning_rate": 3.373988949410303e-06, + "loss": 0.74019015, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.55859375, + "step": 4308, + "time_per_iteration": 2.3743488788604736 + }, + { + "auxiliary_loss_clip": 0.01083955, + "auxiliary_loss_mlp": 0.0103448, + "balance_loss_clip": 1.01788592, + "balance_loss_mlp": 1.02536333, + "epoch": 0.2590710957462799, + "flos": 13473610871040.0, + "grad_norm": 1.7904500554462124, + "language_loss": 0.84118432, + "learning_rate": 3.3737143755250488e-06, + "loss": 0.8623687, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5859375, + "step": 4309, + "time_per_iteration": 2.355070114135742 + }, + { + "auxiliary_loss_clip": 0.01082653, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.0214262, + "balance_loss_mlp": 1.02643061, + "epoch": 0.25913121899894787, + "flos": 22306077999360.0, + "grad_norm": 1.4843665105559463, + "language_loss": 0.8458032, + "learning_rate": 3.3734397526154626e-06, + "loss": 0.86700445, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5625, + "step": 4310, + "time_per_iteration": 2.380340099334717 + }, + { + "auxiliary_loss_clip": 0.0108319, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.01346493, + "balance_loss_mlp": 1.02490306, + "epoch": 0.25919134225161583, + "flos": 25810554741120.0, + "grad_norm": 1.6865712353990208, + "language_loss": 0.7702111, + "learning_rate": 3.373165080691344e-06, + "loss": 0.79134429, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.58203125, + "step": 4311, + "time_per_iteration": 2.4091074466705322 + }, + { + "auxiliary_loss_clip": 0.01081977, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.01733518, + "balance_loss_mlp": 1.02419209, + "epoch": 0.2592514655042838, + "flos": 31210920109440.0, + "grad_norm": 1.6445433309728497, + "language_loss": 0.72107434, + "learning_rate": 3.3728903597624967e-06, + "loss": 0.74222994, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.578125, + "step": 4312, + "time_per_iteration": 2.4867470264434814 + }, + { + "auxiliary_loss_clip": 0.01081404, + "auxiliary_loss_mlp": 0.01032719, + "balance_loss_clip": 1.01629162, + "balance_loss_mlp": 1.02468252, + "epoch": 0.25931158875695176, + "flos": 18474115904640.0, + "grad_norm": 1.6707254952935937, + "language_loss": 0.69457316, + "learning_rate": 3.372615589838724e-06, + "loss": 0.7157144, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.56640625, + "step": 4313, + "time_per_iteration": 2.34779953956604 + }, + { + "auxiliary_loss_clip": 0.01080702, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.01840973, + "balance_loss_mlp": 1.02468693, + "epoch": 0.2593717120096197, + "flos": 19206942675840.0, + "grad_norm": 1.5203121298846591, + "language_loss": 0.80340791, + "learning_rate": 3.3723407709298314e-06, + "loss": 0.8245461, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.55859375, + "step": 4314, + "time_per_iteration": 2.3789196014404297 + }, + { + "auxiliary_loss_clip": 0.01084503, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.0201509, + "balance_loss_mlp": 1.0246979, + "epoch": 0.2594318352622877, + "flos": 31246775942400.0, + "grad_norm": 2.262450722488703, + "language_loss": 0.67424631, + "learning_rate": 3.3720659030456262e-06, + "loss": 0.69546759, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.59765625, + "step": 4315, + "time_per_iteration": 2.4545397758483887 + }, + { + "auxiliary_loss_clip": 0.0107976, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.01591063, + "balance_loss_mlp": 1.02369475, + "epoch": 0.25949195851495566, + "flos": 22236042078720.0, + "grad_norm": 1.4924082349050867, + "language_loss": 0.79146779, + "learning_rate": 3.371790986195919e-06, + "loss": 0.8125807, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5625, + "step": 4316, + "time_per_iteration": 2.390685558319092 + }, + { + "auxiliary_loss_clip": 0.01081859, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.01837635, + "balance_loss_mlp": 1.02379608, + "epoch": 0.2595520817676236, + "flos": 28074427764480.0, + "grad_norm": 1.4888116581640165, + "language_loss": 0.77716893, + "learning_rate": 3.37151602039052e-06, + "loss": 0.79833806, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.578125, + "step": 4317, + "time_per_iteration": 2.4294650554656982 + }, + { + "auxiliary_loss_clip": 0.01084033, + "auxiliary_loss_mlp": 0.01044569, + "balance_loss_clip": 1.02638912, + "balance_loss_mlp": 1.02693534, + "epoch": 0.2596122050202916, + "flos": 20189992227840.0, + "grad_norm": 1.9285170049777474, + "language_loss": 0.78200823, + "learning_rate": 3.3712410056392418e-06, + "loss": 0.8032943, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.5703125, + "step": 4318, + "time_per_iteration": 2.3697478771209717 + }, + { + "auxiliary_loss_clip": 0.01078955, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.01239765, + "balance_loss_mlp": 1.02347076, + "epoch": 0.25967232827295955, + "flos": 22526868637440.0, + "grad_norm": 1.673626722566245, + "language_loss": 0.76285136, + "learning_rate": 3.3709659419518994e-06, + "loss": 0.78393841, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5546875, + "step": 4319, + "time_per_iteration": 2.406689405441284 + }, + { + "auxiliary_loss_clip": 0.01077989, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.01764274, + "balance_loss_mlp": 1.0242908, + "epoch": 0.2597324515256275, + "flos": 21067219848960.0, + "grad_norm": 1.548263438035279, + "language_loss": 0.76447415, + "learning_rate": 3.3706908293383095e-06, + "loss": 0.78559065, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53515625, + "step": 4320, + "time_per_iteration": 3.782745838165283 + }, + { + "auxiliary_loss_clip": 0.0108315, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.01718819, + "balance_loss_mlp": 1.02675653, + "epoch": 0.2597925747782955, + "flos": 22049047503360.0, + "grad_norm": 1.5114599799288724, + "language_loss": 0.71181488, + "learning_rate": 3.37041566780829e-06, + "loss": 0.73298526, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5625, + "step": 4321, + "time_per_iteration": 2.4091897010803223 + }, + { + "auxiliary_loss_clip": 0.01085197, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.02032089, + "balance_loss_mlp": 1.02483368, + "epoch": 0.2598526980309635, + "flos": 19535929217280.0, + "grad_norm": 1.8954433697780415, + "language_loss": 0.74305975, + "learning_rate": 3.3701404573716597e-06, + "loss": 0.76428312, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.6015625, + "step": 4322, + "time_per_iteration": 2.3573434352874756 + }, + { + "auxiliary_loss_clip": 0.0108302, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.01884699, + "balance_loss_mlp": 1.02527928, + "epoch": 0.25991282128363147, + "flos": 24494154727680.0, + "grad_norm": 2.23228778176635, + "language_loss": 0.74200404, + "learning_rate": 3.3698651980382417e-06, + "loss": 0.76319158, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.578125, + "step": 4323, + "time_per_iteration": 3.823972225189209 + }, + { + "auxiliary_loss_clip": 0.0108677, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.02110124, + "balance_loss_mlp": 1.02496362, + "epoch": 0.25997294453629943, + "flos": 24200465437440.0, + "grad_norm": 2.0590544925981082, + "language_loss": 0.66615325, + "learning_rate": 3.3695898898178573e-06, + "loss": 0.68741906, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.6171875, + "step": 4324, + "time_per_iteration": 2.414278030395508 + }, + { + "auxiliary_loss_clip": 0.01081461, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.01802182, + "balance_loss_mlp": 1.02527416, + "epoch": 0.2600330677889674, + "flos": 31430104824960.0, + "grad_norm": 1.9668820867681012, + "language_loss": 0.71053094, + "learning_rate": 3.3693145327203336e-06, + "loss": 0.73167843, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5625, + "step": 4325, + "time_per_iteration": 3.812192440032959 + }, + { + "auxiliary_loss_clip": 0.01080737, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01441014, + "balance_loss_mlp": 1.023157, + "epoch": 0.26009319104163536, + "flos": 32265262391040.0, + "grad_norm": 1.7332947526171827, + "language_loss": 0.72819197, + "learning_rate": 3.3690391267554963e-06, + "loss": 0.74930972, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.57421875, + "step": 4326, + "time_per_iteration": 2.4504213333129883 + }, + { + "auxiliary_loss_clip": 0.01078319, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.01828325, + "balance_loss_mlp": 1.02417922, + "epoch": 0.26015331429430333, + "flos": 26285548055040.0, + "grad_norm": 1.7490654353053325, + "language_loss": 0.8679921, + "learning_rate": 3.3687636719331744e-06, + "loss": 0.88910306, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5390625, + "step": 4327, + "time_per_iteration": 2.418240547180176 + }, + { + "auxiliary_loss_clip": 0.01085405, + "auxiliary_loss_mlp": 0.01041617, + "balance_loss_clip": 1.02375913, + "balance_loss_mlp": 1.02679777, + "epoch": 0.2602134375469713, + "flos": 21141270576000.0, + "grad_norm": 1.4251562769475314, + "language_loss": 0.801377, + "learning_rate": 3.368488168263198e-06, + "loss": 0.82264721, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5859375, + "step": 4328, + "time_per_iteration": 2.38944673538208 + }, + { + "auxiliary_loss_clip": 0.01079567, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.02085328, + "balance_loss_mlp": 1.02369261, + "epoch": 0.26027356079963926, + "flos": 25920147098880.0, + "grad_norm": 1.4861544983352568, + "language_loss": 0.87905395, + "learning_rate": 3.3682126157553983e-06, + "loss": 0.90021527, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.55859375, + "step": 4329, + "time_per_iteration": 3.8182191848754883 + }, + { + "auxiliary_loss_clip": 0.01079179, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.0176115, + "balance_loss_mlp": 1.02456927, + "epoch": 0.2603336840523072, + "flos": 26358027770880.0, + "grad_norm": 1.9163600045187557, + "language_loss": 0.77740896, + "learning_rate": 3.3679370144196106e-06, + "loss": 0.79852718, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.546875, + "step": 4330, + "time_per_iteration": 2.404296636581421 + }, + { + "auxiliary_loss_clip": 0.0108491, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.01608062, + "balance_loss_mlp": 1.02601981, + "epoch": 0.2603938073049752, + "flos": 23512536541440.0, + "grad_norm": 1.521642229427162, + "language_loss": 0.75395083, + "learning_rate": 3.367661364265669e-06, + "loss": 0.77512348, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5859375, + "step": 4331, + "time_per_iteration": 2.4064319133758545 + }, + { + "auxiliary_loss_clip": 0.01079989, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.01396835, + "balance_loss_mlp": 1.02551913, + "epoch": 0.26045393055764315, + "flos": 25373127916800.0, + "grad_norm": 1.3703309693452361, + "language_loss": 0.690584, + "learning_rate": 3.367385665303412e-06, + "loss": 0.71167016, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.546875, + "step": 4332, + "time_per_iteration": 2.4051527976989746 + }, + { + "auxiliary_loss_clip": 0.01082459, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.01947129, + "balance_loss_mlp": 1.02486897, + "epoch": 0.2605140538103111, + "flos": 27634068385920.0, + "grad_norm": 1.9334433825288309, + "language_loss": 0.79419458, + "learning_rate": 3.3671099175426773e-06, + "loss": 0.81537342, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.57421875, + "step": 4333, + "time_per_iteration": 2.426959753036499 + }, + { + "auxiliary_loss_clip": 0.01080782, + "auxiliary_loss_mlp": 0.01034208, + "balance_loss_clip": 1.01866937, + "balance_loss_mlp": 1.02513885, + "epoch": 0.2605741770629791, + "flos": 13769045729280.0, + "grad_norm": 1.8848828748133224, + "language_loss": 0.80427253, + "learning_rate": 3.366834120993307e-06, + "loss": 0.82542241, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.55859375, + "step": 4334, + "time_per_iteration": 2.33345627784729 + }, + { + "auxiliary_loss_clip": 0.01081458, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.01612377, + "balance_loss_mlp": 1.02395833, + "epoch": 0.26063430031564705, + "flos": 26030472595200.0, + "grad_norm": 1.868475617096109, + "language_loss": 0.79615092, + "learning_rate": 3.3665582756651424e-06, + "loss": 0.817294, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.57421875, + "step": 4335, + "time_per_iteration": 2.4325897693634033 + }, + { + "auxiliary_loss_clip": 0.01018495, + "auxiliary_loss_mlp": 0.01000548, + "balance_loss_clip": 0.99874836, + "balance_loss_mlp": 1.00523376, + "epoch": 0.26069442356831507, + "flos": 62440587619200.0, + "grad_norm": 0.854688124413243, + "language_loss": 0.60818154, + "learning_rate": 3.366282381568028e-06, + "loss": 0.62837195, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.1328125, + "step": 4336, + "time_per_iteration": 3.009134531021118 + }, + { + "auxiliary_loss_clip": 0.01081802, + "auxiliary_loss_mlp": 0.01038724, + "balance_loss_clip": 1.0222609, + "balance_loss_mlp": 1.02470827, + "epoch": 0.26075454682098304, + "flos": 13625517663360.0, + "grad_norm": 1.9892896970213614, + "language_loss": 0.76825356, + "learning_rate": 3.3660064387118104e-06, + "loss": 0.78945875, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5703125, + "step": 4337, + "time_per_iteration": 2.3651371002197266 + }, + { + "auxiliary_loss_clip": 0.01084815, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.01597285, + "balance_loss_mlp": 1.02653301, + "epoch": 0.260814670073651, + "flos": 12125823678720.0, + "grad_norm": 2.0704255675615615, + "language_loss": 0.74591124, + "learning_rate": 3.3657304471063363e-06, + "loss": 0.76708972, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5859375, + "step": 4338, + "time_per_iteration": 2.338366746902466 + }, + { + "auxiliary_loss_clip": 0.01088111, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.0206883, + "balance_loss_mlp": 1.02748883, + "epoch": 0.26087479332631897, + "flos": 15121615777920.0, + "grad_norm": 4.316681840861605, + "language_loss": 0.80428994, + "learning_rate": 3.3654544067614557e-06, + "loss": 0.82554621, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.60546875, + "step": 4339, + "time_per_iteration": 2.3600778579711914 + }, + { + "auxiliary_loss_clip": 0.010831, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.02179277, + "balance_loss_mlp": 1.02638113, + "epoch": 0.26093491657898693, + "flos": 24679787760000.0, + "grad_norm": 1.8164692196083743, + "language_loss": 0.76641595, + "learning_rate": 3.36517831768702e-06, + "loss": 0.78761917, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.56640625, + "step": 4340, + "time_per_iteration": 2.4018912315368652 + }, + { + "auxiliary_loss_clip": 0.0108625, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.02020526, + "balance_loss_mlp": 1.02616417, + "epoch": 0.2609950398316549, + "flos": 25115050080000.0, + "grad_norm": 1.4591585974620058, + "language_loss": 0.82838297, + "learning_rate": 3.3649021798928813e-06, + "loss": 0.84960902, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.6015625, + "step": 4341, + "time_per_iteration": 2.3946807384490967 + }, + { + "auxiliary_loss_clip": 0.01083036, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.01970696, + "balance_loss_mlp": 1.0253309, + "epoch": 0.26105516308432286, + "flos": 28547326396800.0, + "grad_norm": 3.6554276347206565, + "language_loss": 0.76271361, + "learning_rate": 3.364625993388895e-06, + "loss": 0.78391492, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.578125, + "step": 4342, + "time_per_iteration": 2.430886745452881 + }, + { + "auxiliary_loss_clip": 0.01081981, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.01296103, + "balance_loss_mlp": 1.02477884, + "epoch": 0.2611152863369908, + "flos": 39529046954880.0, + "grad_norm": 1.7236296549883814, + "language_loss": 0.66337711, + "learning_rate": 3.364349758184917e-06, + "loss": 0.68449366, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 4343, + "time_per_iteration": 2.5204410552978516 + }, + { + "auxiliary_loss_clip": 0.01084426, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.01945925, + "balance_loss_mlp": 1.02507138, + "epoch": 0.2611754095896588, + "flos": 13734481616640.0, + "grad_norm": 1.7901669156582372, + "language_loss": 0.73423326, + "learning_rate": 3.3640734742908066e-06, + "loss": 0.75543761, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59375, + "step": 4344, + "time_per_iteration": 2.355929374694824 + }, + { + "auxiliary_loss_clip": 0.01084147, + "auxiliary_loss_mlp": 0.0103658, + "balance_loss_clip": 1.0190804, + "balance_loss_mlp": 1.02618527, + "epoch": 0.26123553284232676, + "flos": 21505589280000.0, + "grad_norm": 2.259646251158903, + "language_loss": 0.86677957, + "learning_rate": 3.3637971417164213e-06, + "loss": 0.88798684, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.578125, + "step": 4345, + "time_per_iteration": 2.3687942028045654 + }, + { + "auxiliary_loss_clip": 0.01081716, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.02000141, + "balance_loss_mlp": 1.02591062, + "epoch": 0.2612956560949947, + "flos": 21138791869440.0, + "grad_norm": 1.8617342643171846, + "language_loss": 0.76585996, + "learning_rate": 3.3635207604716254e-06, + "loss": 0.78703523, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55859375, + "step": 4346, + "time_per_iteration": 2.3935353755950928 + }, + { + "auxiliary_loss_clip": 0.01081684, + "auxiliary_loss_mlp": 0.01039103, + "balance_loss_clip": 1.02169895, + "balance_loss_mlp": 1.02359319, + "epoch": 0.2613557793476627, + "flos": 25117842988800.0, + "grad_norm": 1.5974130175040966, + "language_loss": 0.75614232, + "learning_rate": 3.36324433056628e-06, + "loss": 0.77735019, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.58203125, + "step": 4347, + "time_per_iteration": 2.405407190322876 + }, + { + "auxiliary_loss_clip": 0.01081437, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.01563072, + "balance_loss_mlp": 1.02452922, + "epoch": 0.26141590260033065, + "flos": 26066502984960.0, + "grad_norm": 2.738091999126562, + "language_loss": 0.73471534, + "learning_rate": 3.3629678520102517e-06, + "loss": 0.75585037, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4348, + "time_per_iteration": 2.451572895050049 + }, + { + "auxiliary_loss_clip": 0.01084936, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.02075648, + "balance_loss_mlp": 1.02558911, + "epoch": 0.2614760258529987, + "flos": 25700368890240.0, + "grad_norm": 1.7207225096228225, + "language_loss": 0.90501082, + "learning_rate": 3.3626913248134065e-06, + "loss": 0.92623335, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59375, + "step": 4349, + "time_per_iteration": 2.4037556648254395 + }, + { + "auxiliary_loss_clip": 0.01081057, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.01292086, + "balance_loss_mlp": 1.02502859, + "epoch": 0.26153614910566664, + "flos": 17456188037760.0, + "grad_norm": 1.6986535503867581, + "language_loss": 0.80059385, + "learning_rate": 3.3624147489856134e-06, + "loss": 0.82169342, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55859375, + "step": 4350, + "time_per_iteration": 2.3875317573547363 + }, + { + "auxiliary_loss_clip": 0.0108049, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.01740551, + "balance_loss_mlp": 1.02485681, + "epoch": 0.2615962723583346, + "flos": 17711856990720.0, + "grad_norm": 1.8550050424244182, + "language_loss": 0.62284708, + "learning_rate": 3.3621381245367425e-06, + "loss": 0.64398295, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5546875, + "step": 4351, + "time_per_iteration": 2.353825092315674 + }, + { + "auxiliary_loss_clip": 0.0108404, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.01572168, + "balance_loss_mlp": 1.02454305, + "epoch": 0.26165639561100257, + "flos": 23256623208960.0, + "grad_norm": 1.7505439297140573, + "language_loss": 0.83453608, + "learning_rate": 3.361861451476665e-06, + "loss": 0.85570538, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.59375, + "step": 4352, + "time_per_iteration": 2.4056544303894043 + }, + { + "auxiliary_loss_clip": 0.01019163, + "auxiliary_loss_mlp": 0.01005607, + "balance_loss_clip": 1.00368738, + "balance_loss_mlp": 1.00525677, + "epoch": 0.26171651886367053, + "flos": 66734940026880.0, + "grad_norm": 0.7962985566204221, + "language_loss": 0.705495, + "learning_rate": 3.361584729815256e-06, + "loss": 0.7257427, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.13867188, + "step": 4353, + "time_per_iteration": 2.9259660243988037 + }, + { + "auxiliary_loss_clip": 0.01081611, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.02209008, + "balance_loss_mlp": 1.02363181, + "epoch": 0.2617766421163385, + "flos": 22348392433920.0, + "grad_norm": 1.7339058908910299, + "language_loss": 0.77563334, + "learning_rate": 3.36130795956239e-06, + "loss": 0.79683781, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.58203125, + "step": 4354, + "time_per_iteration": 2.4646551609039307 + }, + { + "auxiliary_loss_clip": 0.01086978, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.01889896, + "balance_loss_mlp": 1.02577257, + "epoch": 0.26183676536900646, + "flos": 26065944403200.0, + "grad_norm": 1.9438712587568305, + "language_loss": 0.6831277, + "learning_rate": 3.3610311407279456e-06, + "loss": 0.70435178, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.609375, + "step": 4355, + "time_per_iteration": 2.4147260189056396 + }, + { + "auxiliary_loss_clip": 0.01082708, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.01249969, + "balance_loss_mlp": 1.0246644, + "epoch": 0.26189688862167443, + "flos": 20995403448960.0, + "grad_norm": 1.7696500051215371, + "language_loss": 0.67444134, + "learning_rate": 3.3607542733218002e-06, + "loss": 0.69557124, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.578125, + "step": 4356, + "time_per_iteration": 2.4343392848968506 + }, + { + "auxiliary_loss_clip": 0.01017185, + "auxiliary_loss_mlp": 0.01002721, + "balance_loss_clip": 1.00067043, + "balance_loss_mlp": 1.00336766, + "epoch": 0.2619570118743424, + "flos": 65795007870720.0, + "grad_norm": 0.6852037391226983, + "language_loss": 0.53176242, + "learning_rate": 3.360477357353835e-06, + "loss": 0.55196142, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.13867188, + "step": 4357, + "time_per_iteration": 2.9522459506988525 + }, + { + "auxiliary_loss_clip": 0.01085013, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.0194509, + "balance_loss_mlp": 1.02569139, + "epoch": 0.26201713512701036, + "flos": 28765568505600.0, + "grad_norm": 1.9117433597624476, + "language_loss": 0.83305454, + "learning_rate": 3.3602003928339325e-06, + "loss": 0.85426438, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.59375, + "step": 4358, + "time_per_iteration": 2.458768367767334 + }, + { + "auxiliary_loss_clip": 0.01086796, + "auxiliary_loss_mlp": 0.01036543, + "balance_loss_clip": 1.01835179, + "balance_loss_mlp": 1.02564311, + "epoch": 0.2620772583796783, + "flos": 26431310448000.0, + "grad_norm": 2.0582838763570064, + "language_loss": 0.67504764, + "learning_rate": 3.359923379771977e-06, + "loss": 0.69628096, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.609375, + "step": 4359, + "time_per_iteration": 3.8571369647979736 + }, + { + "auxiliary_loss_clip": 0.0108345, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.0154438, + "balance_loss_mlp": 1.02413988, + "epoch": 0.2621373816323463, + "flos": 20155532849280.0, + "grad_norm": 2.1522994421529535, + "language_loss": 0.78343588, + "learning_rate": 3.359646318177854e-06, + "loss": 0.80459011, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.59375, + "step": 4360, + "time_per_iteration": 2.396752119064331 + }, + { + "auxiliary_loss_clip": 0.01083539, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.0194782, + "balance_loss_mlp": 1.02581263, + "epoch": 0.26219750488501425, + "flos": 28619980669440.0, + "grad_norm": 1.685938352541504, + "language_loss": 0.67987466, + "learning_rate": 3.3593692080614515e-06, + "loss": 0.70105791, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.578125, + "step": 4361, + "time_per_iteration": 2.4369585514068604 + }, + { + "auxiliary_loss_clip": 0.01084144, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.02305889, + "balance_loss_mlp": 1.02448654, + "epoch": 0.2622576281376823, + "flos": 15041839587840.0, + "grad_norm": 1.7258041002874784, + "language_loss": 0.84457237, + "learning_rate": 3.3590920494326585e-06, + "loss": 0.86583012, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.59765625, + "step": 4362, + "time_per_iteration": 3.718644618988037 + }, + { + "auxiliary_loss_clip": 0.01083127, + "auxiliary_loss_mlp": 0.01040111, + "balance_loss_clip": 1.02224183, + "balance_loss_mlp": 1.02676213, + "epoch": 0.26231775139035024, + "flos": 26394965856000.0, + "grad_norm": 3.6317635192113453, + "language_loss": 0.65254724, + "learning_rate": 3.3588148423013665e-06, + "loss": 0.67377967, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5625, + "step": 4363, + "time_per_iteration": 2.4228336811065674 + }, + { + "auxiliary_loss_clip": 0.0101675, + "auxiliary_loss_mlp": 0.01002206, + "balance_loss_clip": 1.00038254, + "balance_loss_mlp": 1.00312877, + "epoch": 0.2623778746430182, + "flos": 65405341653120.0, + "grad_norm": 0.885519350546974, + "language_loss": 0.61138755, + "learning_rate": 3.3585375866774683e-06, + "loss": 0.63157707, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.13671875, + "step": 4364, + "time_per_iteration": 3.109790325164795 + }, + { + "auxiliary_loss_clip": 0.0108592, + "auxiliary_loss_mlp": 0.01039824, + "balance_loss_clip": 1.0216918, + "balance_loss_mlp": 1.02690482, + "epoch": 0.26243799789568617, + "flos": 12603400433280.0, + "grad_norm": 2.486995179763905, + "language_loss": 0.71691263, + "learning_rate": 3.3582602825708577e-06, + "loss": 0.73817003, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.58984375, + "step": 4365, + "time_per_iteration": 3.7059733867645264 + }, + { + "auxiliary_loss_clip": 0.01084926, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.01810503, + "balance_loss_mlp": 1.02658844, + "epoch": 0.26249812114835414, + "flos": 28622494287360.0, + "grad_norm": 1.5865053590280984, + "language_loss": 0.77013832, + "learning_rate": 3.3579829299914314e-06, + "loss": 0.79133248, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5859375, + "step": 4366, + "time_per_iteration": 2.4810895919799805 + }, + { + "auxiliary_loss_clip": 0.01086018, + "auxiliary_loss_mlp": 0.01037542, + "balance_loss_clip": 1.02080488, + "balance_loss_mlp": 1.0271523, + "epoch": 0.2625582444010221, + "flos": 14464515479040.0, + "grad_norm": 2.1766757845115405, + "language_loss": 0.75699329, + "learning_rate": 3.3577055289490875e-06, + "loss": 0.77822882, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.58984375, + "step": 4367, + "time_per_iteration": 2.3700788021087646 + }, + { + "auxiliary_loss_clip": 0.01080943, + "auxiliary_loss_mlp": 0.01030906, + "balance_loss_clip": 1.01578438, + "balance_loss_mlp": 1.02506244, + "epoch": 0.26261836765369007, + "flos": 16612372454400.0, + "grad_norm": 1.5430765527569217, + "language_loss": 0.6939097, + "learning_rate": 3.357428079453726e-06, + "loss": 0.71502817, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.55859375, + "step": 4368, + "time_per_iteration": 2.3812553882598877 + }, + { + "auxiliary_loss_clip": 0.01081122, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.02010322, + "balance_loss_mlp": 1.02419424, + "epoch": 0.26267849090635803, + "flos": 20518943857920.0, + "grad_norm": 1.9212666150589235, + "language_loss": 0.78736794, + "learning_rate": 3.357150581515248e-06, + "loss": 0.8085472, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5703125, + "step": 4369, + "time_per_iteration": 3.818350076675415 + }, + { + "auxiliary_loss_clip": 0.01082732, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.01781893, + "balance_loss_mlp": 1.02509856, + "epoch": 0.262738614159026, + "flos": 21322888801920.0, + "grad_norm": 1.8764586658406883, + "language_loss": 0.83259284, + "learning_rate": 3.3568730351435565e-06, + "loss": 0.85376918, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.578125, + "step": 4370, + "time_per_iteration": 2.4190120697021484 + }, + { + "auxiliary_loss_clip": 0.01086615, + "auxiliary_loss_mlp": 0.01042301, + "balance_loss_clip": 1.02450299, + "balance_loss_mlp": 1.02636886, + "epoch": 0.26279873741169396, + "flos": 17602613746560.0, + "grad_norm": 1.9960765006733125, + "language_loss": 0.71356869, + "learning_rate": 3.356595440348557e-06, + "loss": 0.73485786, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.6015625, + "step": 4371, + "time_per_iteration": 2.3624267578125 + }, + { + "auxiliary_loss_clip": 0.01016725, + "auxiliary_loss_mlp": 0.01008475, + "balance_loss_clip": 1.00663948, + "balance_loss_mlp": 1.00319839, + "epoch": 0.2628588606643619, + "flos": 60946514363520.0, + "grad_norm": 0.6903965674816097, + "language_loss": 0.56374061, + "learning_rate": 3.356317797140156e-06, + "loss": 0.5839926, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.13476562, + "step": 4372, + "time_per_iteration": 3.1685798168182373 + }, + { + "auxiliary_loss_clip": 0.01078309, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.01613426, + "balance_loss_mlp": 1.02404928, + "epoch": 0.2629189839170299, + "flos": 27015093158400.0, + "grad_norm": 1.568344460884402, + "language_loss": 0.77413881, + "learning_rate": 3.3560401055282617e-06, + "loss": 0.79524326, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 4373, + "time_per_iteration": 2.413288116455078 + }, + { + "auxiliary_loss_clip": 0.01083175, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.01822376, + "balance_loss_mlp": 1.02676439, + "epoch": 0.26297910716969786, + "flos": 17018900858880.0, + "grad_norm": 2.2738660561378423, + "language_loss": 0.71958148, + "learning_rate": 3.3557623655227835e-06, + "loss": 0.7407496, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5625, + "step": 4374, + "time_per_iteration": 2.3885531425476074 + }, + { + "auxiliary_loss_clip": 0.01083907, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.01941419, + "balance_loss_mlp": 1.02640259, + "epoch": 0.2630392304223659, + "flos": 24896284300800.0, + "grad_norm": 1.9842920010001697, + "language_loss": 0.80709791, + "learning_rate": 3.355484577133634e-06, + "loss": 0.82831097, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.57421875, + "step": 4375, + "time_per_iteration": 2.4129323959350586 + }, + { + "auxiliary_loss_clip": 0.01080039, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.0147804, + "balance_loss_mlp": 1.02443516, + "epoch": 0.26309935367503384, + "flos": 32852640971520.0, + "grad_norm": 1.8311917808216, + "language_loss": 0.66401243, + "learning_rate": 3.3552067403707272e-06, + "loss": 0.68512112, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5546875, + "step": 4376, + "time_per_iteration": 2.492330551147461 + }, + { + "auxiliary_loss_clip": 0.01082875, + "auxiliary_loss_mlp": 0.01034176, + "balance_loss_clip": 1.01810658, + "balance_loss_mlp": 1.0257802, + "epoch": 0.2631594769277018, + "flos": 15887051625600.0, + "grad_norm": 2.1582891324182305, + "language_loss": 0.69157761, + "learning_rate": 3.3549288552439777e-06, + "loss": 0.71274817, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5703125, + "step": 4377, + "time_per_iteration": 2.3599071502685547 + }, + { + "auxiliary_loss_clip": 0.01082518, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01411593, + "balance_loss_mlp": 1.02396512, + "epoch": 0.2632196001803698, + "flos": 50803060348800.0, + "grad_norm": 1.7938992099265185, + "language_loss": 0.79095978, + "learning_rate": 3.3546509217633025e-06, + "loss": 0.81209087, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5859375, + "step": 4378, + "time_per_iteration": 2.6629154682159424 + }, + { + "auxiliary_loss_clip": 0.01081721, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.02221322, + "balance_loss_mlp": 1.02553117, + "epoch": 0.26327972343303774, + "flos": 13732247289600.0, + "grad_norm": 2.2174823951933673, + "language_loss": 0.76924193, + "learning_rate": 3.3543729399386207e-06, + "loss": 0.790429, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5625, + "step": 4379, + "time_per_iteration": 2.3615598678588867 + }, + { + "auxiliary_loss_clip": 0.01084724, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.01914012, + "balance_loss_mlp": 1.02540565, + "epoch": 0.2633398466857057, + "flos": 23767926203520.0, + "grad_norm": 6.362286528594949, + "language_loss": 0.77420974, + "learning_rate": 3.354094909779852e-06, + "loss": 0.79544008, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.59375, + "step": 4380, + "time_per_iteration": 2.4259772300720215 + }, + { + "auxiliary_loss_clip": 0.01083306, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.0150708, + "balance_loss_mlp": 1.02437663, + "epoch": 0.26339996993837367, + "flos": 27598980602880.0, + "grad_norm": 1.725418056129015, + "language_loss": 0.63635713, + "learning_rate": 3.353816831296919e-06, + "loss": 0.6575039, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.58984375, + "step": 4381, + "time_per_iteration": 2.430692434310913 + }, + { + "auxiliary_loss_clip": 0.01081333, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01564837, + "balance_loss_mlp": 1.0243516, + "epoch": 0.26346009319104163, + "flos": 16945373802240.0, + "grad_norm": 1.7225451930022155, + "language_loss": 0.82855475, + "learning_rate": 3.353538704499747e-06, + "loss": 0.84967685, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5703125, + "step": 4382, + "time_per_iteration": 2.3888862133026123 + }, + { + "auxiliary_loss_clip": 0.01088813, + "auxiliary_loss_mlp": 0.01039227, + "balance_loss_clip": 1.02026033, + "balance_loss_mlp": 1.02641439, + "epoch": 0.2635202164437096, + "flos": 37230714552960.0, + "grad_norm": 1.8838636120225258, + "language_loss": 0.70196539, + "learning_rate": 3.3532605293982592e-06, + "loss": 0.72324574, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.625, + "step": 4383, + "time_per_iteration": 2.5154879093170166 + }, + { + "auxiliary_loss_clip": 0.0108451, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.01813376, + "balance_loss_mlp": 1.02588332, + "epoch": 0.26358033969637756, + "flos": 20995298714880.0, + "grad_norm": 1.6678512838406812, + "language_loss": 0.69217885, + "learning_rate": 3.3529823060023847e-06, + "loss": 0.71336192, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5859375, + "step": 4384, + "time_per_iteration": 2.4252703189849854 + }, + { + "auxiliary_loss_clip": 0.01081172, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.01836967, + "balance_loss_mlp": 1.02483261, + "epoch": 0.26364046294904553, + "flos": 27744847729920.0, + "grad_norm": 1.9567158780207452, + "language_loss": 0.70250678, + "learning_rate": 3.352704034322052e-06, + "loss": 0.72366405, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5625, + "step": 4385, + "time_per_iteration": 2.419276237487793 + }, + { + "auxiliary_loss_clip": 0.01085956, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.0178926, + "balance_loss_mlp": 1.028512, + "epoch": 0.2637005862017135, + "flos": 22891990302720.0, + "grad_norm": 2.8594680097210334, + "language_loss": 0.85318404, + "learning_rate": 3.352425714367191e-06, + "loss": 0.87438822, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.57421875, + "step": 4386, + "time_per_iteration": 2.4315130710601807 + }, + { + "auxiliary_loss_clip": 0.01085615, + "auxiliary_loss_mlp": 0.01042264, + "balance_loss_clip": 1.02513361, + "balance_loss_mlp": 1.0259155, + "epoch": 0.26376070945438146, + "flos": 15047949075840.0, + "grad_norm": 3.1188781365018405, + "language_loss": 0.77222967, + "learning_rate": 3.352147346147736e-06, + "loss": 0.79350853, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.59765625, + "step": 4387, + "time_per_iteration": 2.3469078540802 + }, + { + "auxiliary_loss_clip": 0.01084967, + "auxiliary_loss_mlp": 0.01040062, + "balance_loss_clip": 1.02309895, + "balance_loss_mlp": 1.02795732, + "epoch": 0.2638208327070494, + "flos": 21140781816960.0, + "grad_norm": 1.8367250148165088, + "language_loss": 0.75953889, + "learning_rate": 3.35186892967362e-06, + "loss": 0.78078914, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5703125, + "step": 4388, + "time_per_iteration": 2.39621639251709 + }, + { + "auxiliary_loss_clip": 0.01081453, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.01557755, + "balance_loss_mlp": 1.0245564, + "epoch": 0.26388095595971744, + "flos": 21724529616000.0, + "grad_norm": 2.0267249034861945, + "language_loss": 0.86646807, + "learning_rate": 3.3515904649547797e-06, + "loss": 0.88760179, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4389, + "time_per_iteration": 2.369663715362549 + }, + { + "auxiliary_loss_clip": 0.01015645, + "auxiliary_loss_mlp": 0.01003079, + "balance_loss_clip": 1.00141013, + "balance_loss_mlp": 1.00241423, + "epoch": 0.2639410792123854, + "flos": 65512036368000.0, + "grad_norm": 0.8054941886791257, + "language_loss": 0.60376012, + "learning_rate": 3.351311952001152e-06, + "loss": 0.62394738, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.1328125, + "step": 4390, + "time_per_iteration": 3.0563459396362305 + }, + { + "auxiliary_loss_clip": 0.01083623, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.0171051, + "balance_loss_mlp": 1.02455616, + "epoch": 0.2640012024650534, + "flos": 23947519570560.0, + "grad_norm": 1.6027148621301397, + "language_loss": 0.77809501, + "learning_rate": 3.3510333908226765e-06, + "loss": 0.79927695, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.58984375, + "step": 4391, + "time_per_iteration": 2.4106431007385254 + }, + { + "auxiliary_loss_clip": 0.01014529, + "auxiliary_loss_mlp": 0.01001722, + "balance_loss_clip": 1.00002944, + "balance_loss_mlp": 1.00144744, + "epoch": 0.26406132571772134, + "flos": 56437620451200.0, + "grad_norm": 0.8326141864888711, + "language_loss": 0.58650523, + "learning_rate": 3.3507547814292953e-06, + "loss": 0.60666776, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.13085938, + "step": 4392, + "time_per_iteration": 3.031676769256592 + }, + { + "auxiliary_loss_clip": 0.0108808, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.01624668, + "balance_loss_mlp": 1.02769303, + "epoch": 0.2641214489703893, + "flos": 22089476724480.0, + "grad_norm": 1.7174289514152088, + "language_loss": 0.68520582, + "learning_rate": 3.35047612383095e-06, + "loss": 0.70641536, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.6015625, + "step": 4393, + "time_per_iteration": 2.3734850883483887 + }, + { + "auxiliary_loss_clip": 0.01086166, + "auxiliary_loss_mlp": 0.01036641, + "balance_loss_clip": 1.01744819, + "balance_loss_mlp": 1.02461278, + "epoch": 0.26418157222305727, + "flos": 16543837722240.0, + "grad_norm": 1.8403628843433328, + "language_loss": 0.83997947, + "learning_rate": 3.3501974180375857e-06, + "loss": 0.86120754, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6171875, + "step": 4394, + "time_per_iteration": 2.378603458404541 + }, + { + "auxiliary_loss_clip": 0.01089048, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.01870465, + "balance_loss_mlp": 1.0268085, + "epoch": 0.26424169547572524, + "flos": 18001566385920.0, + "grad_norm": 1.9561862411264384, + "language_loss": 0.70409507, + "learning_rate": 3.349918664059149e-06, + "loss": 0.72537494, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.625, + "step": 4395, + "time_per_iteration": 2.3475661277770996 + }, + { + "auxiliary_loss_clip": 0.01083131, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.01376581, + "balance_loss_mlp": 1.02572632, + "epoch": 0.2643018187283932, + "flos": 16982207153280.0, + "grad_norm": 5.891390896581395, + "language_loss": 0.86426324, + "learning_rate": 3.3496398619055876e-06, + "loss": 0.88540101, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.57421875, + "step": 4396, + "time_per_iteration": 2.4067912101745605 + }, + { + "auxiliary_loss_clip": 0.01016641, + "auxiliary_loss_mlp": 0.01002237, + "balance_loss_clip": 1.00038898, + "balance_loss_mlp": 1.00320745, + "epoch": 0.26436194198106117, + "flos": 59661396794880.0, + "grad_norm": 0.7841854801878994, + "language_loss": 0.5502708, + "learning_rate": 3.3493610115868505e-06, + "loss": 0.5704596, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.13476562, + "step": 4397, + "time_per_iteration": 2.792224884033203 + }, + { + "auxiliary_loss_clip": 0.01084127, + "auxiliary_loss_mlp": 0.01038971, + "balance_loss_clip": 1.02174485, + "balance_loss_mlp": 1.02656198, + "epoch": 0.26442206523372913, + "flos": 32920093451520.0, + "grad_norm": 2.3186804286987184, + "language_loss": 0.78261548, + "learning_rate": 3.3490821131128905e-06, + "loss": 0.80384642, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.578125, + "step": 4398, + "time_per_iteration": 2.5096499919891357 + }, + { + "auxiliary_loss_clip": 0.01089577, + "auxiliary_loss_mlp": 0.01036791, + "balance_loss_clip": 1.01837349, + "balance_loss_mlp": 1.02938032, + "epoch": 0.2644821884863971, + "flos": 21030281763840.0, + "grad_norm": 1.6897059478807268, + "language_loss": 0.67123854, + "learning_rate": 3.34880316649366e-06, + "loss": 0.6925022, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6015625, + "step": 4399, + "time_per_iteration": 3.792250633239746 + }, + { + "auxiliary_loss_clip": 0.01078808, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.01645744, + "balance_loss_mlp": 1.02631593, + "epoch": 0.26454231173906506, + "flos": 20775764885760.0, + "grad_norm": 1.8342268768383256, + "language_loss": 0.86510193, + "learning_rate": 3.3485241717391137e-06, + "loss": 0.88620263, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5234375, + "step": 4400, + "time_per_iteration": 2.416428327560425 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.01489949, + "balance_loss_mlp": 1.02763569, + "epoch": 0.264602434991733, + "flos": 16617713892480.0, + "grad_norm": 1.8259711791839561, + "language_loss": 0.75744885, + "learning_rate": 3.348245128859209e-06, + "loss": 0.77865481, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.59375, + "step": 4401, + "time_per_iteration": 2.401073932647705 + }, + { + "auxiliary_loss_clip": 0.01088378, + "auxiliary_loss_mlp": 0.01041038, + "balance_loss_clip": 1.02122557, + "balance_loss_mlp": 1.02647591, + "epoch": 0.26466255824440105, + "flos": 19061669041920.0, + "grad_norm": 1.7328212095312157, + "language_loss": 0.88417149, + "learning_rate": 3.3479660378639036e-06, + "loss": 0.9054656, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.62109375, + "step": 4402, + "time_per_iteration": 3.7659802436828613 + }, + { + "auxiliary_loss_clip": 0.01084767, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.01411796, + "balance_loss_mlp": 1.02577353, + "epoch": 0.264722681497069, + "flos": 22637438513280.0, + "grad_norm": 1.7664811494722308, + "language_loss": 0.78145206, + "learning_rate": 3.3476868987631575e-06, + "loss": 0.80261123, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.58984375, + "step": 4403, + "time_per_iteration": 2.400317430496216 + }, + { + "auxiliary_loss_clip": 0.01084942, + "auxiliary_loss_mlp": 0.01034145, + "balance_loss_clip": 1.01712227, + "balance_loss_mlp": 1.02524889, + "epoch": 0.264782804749737, + "flos": 22491152449920.0, + "grad_norm": 1.8419357354733938, + "language_loss": 0.88121796, + "learning_rate": 3.3474077115669327e-06, + "loss": 0.90240884, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59765625, + "step": 4404, + "time_per_iteration": 2.41160249710083 + }, + { + "auxiliary_loss_clip": 0.01084569, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.01737428, + "balance_loss_mlp": 1.02517438, + "epoch": 0.26484292800240494, + "flos": 16799332118400.0, + "grad_norm": 1.6810498335282373, + "language_loss": 0.76011705, + "learning_rate": 3.347128476285193e-06, + "loss": 0.78129321, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.59375, + "step": 4405, + "time_per_iteration": 3.7762291431427 + }, + { + "auxiliary_loss_clip": 0.0108679, + "auxiliary_loss_mlp": 0.01033609, + "balance_loss_clip": 1.01527512, + "balance_loss_mlp": 1.02725601, + "epoch": 0.2649030512550729, + "flos": 20448523912320.0, + "grad_norm": 1.7145958692588852, + "language_loss": 0.7003299, + "learning_rate": 3.346849192927903e-06, + "loss": 0.72153401, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.59375, + "step": 4406, + "time_per_iteration": 2.4083986282348633 + }, + { + "auxiliary_loss_clip": 0.01083162, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.01766515, + "balance_loss_mlp": 1.02575684, + "epoch": 0.2649631745077409, + "flos": 22415111775360.0, + "grad_norm": 1.6771126599034714, + "language_loss": 0.83475494, + "learning_rate": 3.3465698615050295e-06, + "loss": 0.85593432, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.57421875, + "step": 4407, + "time_per_iteration": 2.385935068130493 + }, + { + "auxiliary_loss_clip": 0.01082729, + "auxiliary_loss_mlp": 0.0102705, + "balance_loss_clip": 1.01056385, + "balance_loss_mlp": 1.02529013, + "epoch": 0.26502329776040884, + "flos": 35114663692800.0, + "grad_norm": 1.8707253853818477, + "language_loss": 0.75918061, + "learning_rate": 3.346290482026542e-06, + "loss": 0.78027844, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.57421875, + "step": 4408, + "time_per_iteration": 3.924562931060791 + }, + { + "auxiliary_loss_clip": 0.01081612, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.01716042, + "balance_loss_mlp": 1.02506995, + "epoch": 0.2650834210130768, + "flos": 38686069244160.0, + "grad_norm": 1.7151014031569332, + "language_loss": 0.71050882, + "learning_rate": 3.3460110545024094e-06, + "loss": 0.73166645, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.56640625, + "step": 4409, + "time_per_iteration": 2.530848979949951 + }, + { + "auxiliary_loss_clip": 0.01085514, + "auxiliary_loss_mlp": 0.01038395, + "balance_loss_clip": 1.02051401, + "balance_loss_mlp": 1.02572131, + "epoch": 0.26514354426574477, + "flos": 24715713415680.0, + "grad_norm": 1.9588266821955322, + "language_loss": 0.73921341, + "learning_rate": 3.3457315789426054e-06, + "loss": 0.76045251, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.59765625, + "step": 4410, + "time_per_iteration": 2.467231273651123 + }, + { + "auxiliary_loss_clip": 0.0109092, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.01836467, + "balance_loss_mlp": 1.0285635, + "epoch": 0.26520366751841273, + "flos": 20339001377280.0, + "grad_norm": 1.8507813084534541, + "language_loss": 0.73387146, + "learning_rate": 3.345452055357103e-06, + "loss": 0.75514489, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.625, + "step": 4411, + "time_per_iteration": 2.424858331680298 + }, + { + "auxiliary_loss_clip": 0.01084199, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.0188024, + "balance_loss_mlp": 1.02611351, + "epoch": 0.2652637907710807, + "flos": 22342841527680.0, + "grad_norm": 2.0768969364102667, + "language_loss": 0.81789207, + "learning_rate": 3.345172483755878e-06, + "loss": 0.83909839, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.578125, + "step": 4412, + "time_per_iteration": 2.4285106658935547 + }, + { + "auxiliary_loss_clip": 0.01085009, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.02158701, + "balance_loss_mlp": 1.02665496, + "epoch": 0.26532391402374866, + "flos": 19353228739200.0, + "grad_norm": 2.04123985777002, + "language_loss": 0.74686402, + "learning_rate": 3.3448928641489057e-06, + "loss": 0.76809126, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5859375, + "step": 4413, + "time_per_iteration": 2.3657784461975098 + }, + { + "auxiliary_loss_clip": 0.01016001, + "auxiliary_loss_mlp": 0.01002366, + "balance_loss_clip": 1.00072134, + "balance_loss_mlp": 1.00263, + "epoch": 0.26538403727641663, + "flos": 44784800138880.0, + "grad_norm": 0.8627612956571837, + "language_loss": 0.5696466, + "learning_rate": 3.344613196546168e-06, + "loss": 0.58983028, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.1328125, + "step": 4414, + "time_per_iteration": 2.9951398372650146 + }, + { + "auxiliary_loss_clip": 0.01080325, + "auxiliary_loss_mlp": 0.01033703, + "balance_loss_clip": 1.01805067, + "balance_loss_mlp": 1.02451253, + "epoch": 0.26544416052908465, + "flos": 28180913011200.0, + "grad_norm": 1.7867920966863948, + "language_loss": 0.74504185, + "learning_rate": 3.3443334809576434e-06, + "loss": 0.76618218, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.55859375, + "step": 4415, + "time_per_iteration": 2.4403231143951416 + }, + { + "auxiliary_loss_clip": 0.01086161, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.01561606, + "balance_loss_mlp": 1.0252192, + "epoch": 0.2655042837817526, + "flos": 17564349029760.0, + "grad_norm": 2.116248324309045, + "language_loss": 0.86852682, + "learning_rate": 3.344053717393315e-06, + "loss": 0.88973689, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.609375, + "step": 4416, + "time_per_iteration": 2.389538526535034 + }, + { + "auxiliary_loss_clip": 0.01087798, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.01535249, + "balance_loss_mlp": 1.02786207, + "epoch": 0.2655644070344206, + "flos": 23403502765440.0, + "grad_norm": 1.6650351412956075, + "language_loss": 0.76366687, + "learning_rate": 3.343773905863167e-06, + "loss": 0.78487742, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.59765625, + "step": 4417, + "time_per_iteration": 2.390695333480835 + }, + { + "auxiliary_loss_clip": 0.01082511, + "auxiliary_loss_mlp": 0.01034343, + "balance_loss_clip": 1.01598489, + "balance_loss_mlp": 1.02553582, + "epoch": 0.26562453028708854, + "flos": 26467271015040.0, + "grad_norm": 1.6762372542929278, + "language_loss": 0.79955584, + "learning_rate": 3.3434940463771847e-06, + "loss": 0.82072443, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.5703125, + "step": 4418, + "time_per_iteration": 2.4588255882263184 + }, + { + "auxiliary_loss_clip": 0.01086041, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.01663399, + "balance_loss_mlp": 1.02636266, + "epoch": 0.2656846535397565, + "flos": 19206593562240.0, + "grad_norm": 3.1997615779180197, + "language_loss": 0.67047536, + "learning_rate": 3.343214138945356e-06, + "loss": 0.69168615, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.59765625, + "step": 4419, + "time_per_iteration": 2.3462040424346924 + }, + { + "auxiliary_loss_clip": 0.0108648, + "auxiliary_loss_mlp": 0.01037498, + "balance_loss_clip": 1.01873422, + "balance_loss_mlp": 1.02627826, + "epoch": 0.2657447767924245, + "flos": 30550119206400.0, + "grad_norm": 1.6345564071009095, + "language_loss": 0.79285121, + "learning_rate": 3.3429341835776695e-06, + "loss": 0.81409103, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6015625, + "step": 4420, + "time_per_iteration": 2.4733612537384033 + }, + { + "auxiliary_loss_clip": 0.01087562, + "auxiliary_loss_mlp": 0.01038186, + "balance_loss_clip": 1.01818299, + "balance_loss_mlp": 1.02625501, + "epoch": 0.26580490004509244, + "flos": 20921701835520.0, + "grad_norm": 1.8012763149988291, + "language_loss": 0.81564724, + "learning_rate": 3.342654180284117e-06, + "loss": 0.83690476, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.61328125, + "step": 4421, + "time_per_iteration": 2.3722877502441406 + }, + { + "auxiliary_loss_clip": 0.0108215, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.01545858, + "balance_loss_mlp": 1.02542126, + "epoch": 0.2658650232977604, + "flos": 43943988798720.0, + "grad_norm": 1.6239842243624345, + "language_loss": 0.6596427, + "learning_rate": 3.3423741290746897e-06, + "loss": 0.68078327, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.56640625, + "step": 4422, + "time_per_iteration": 2.5988452434539795 + }, + { + "auxiliary_loss_clip": 0.01084078, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.0167774, + "balance_loss_mlp": 1.02452087, + "epoch": 0.26592514655042837, + "flos": 29715136197120.0, + "grad_norm": 2.050396479599726, + "language_loss": 0.73857653, + "learning_rate": 3.342094029959383e-06, + "loss": 0.75976324, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.59765625, + "step": 4423, + "time_per_iteration": 2.4294188022613525 + }, + { + "auxiliary_loss_clip": 0.01081754, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.02213025, + "balance_loss_mlp": 1.02350807, + "epoch": 0.26598526980309634, + "flos": 46676082332160.0, + "grad_norm": 1.6057609936293193, + "language_loss": 0.77617615, + "learning_rate": 3.341813882948193e-06, + "loss": 0.7973901, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.58203125, + "step": 4424, + "time_per_iteration": 2.6134867668151855 + }, + { + "auxiliary_loss_clip": 0.0108458, + "auxiliary_loss_mlp": 0.01038497, + "balance_loss_clip": 1.02152133, + "balance_loss_mlp": 1.02587628, + "epoch": 0.2660453930557643, + "flos": 11508663841920.0, + "grad_norm": 1.9080086544576587, + "language_loss": 0.78946781, + "learning_rate": 3.341533688051117e-06, + "loss": 0.81069863, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5859375, + "step": 4425, + "time_per_iteration": 2.344456195831299 + }, + { + "auxiliary_loss_clip": 0.01083499, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.02078414, + "balance_loss_mlp": 1.02756691, + "epoch": 0.26610551630843227, + "flos": 24790392547200.0, + "grad_norm": 2.648059298186932, + "language_loss": 0.8029412, + "learning_rate": 3.3412534452781543e-06, + "loss": 0.82414162, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.55859375, + "step": 4426, + "time_per_iteration": 2.4670002460479736 + }, + { + "auxiliary_loss_clip": 0.01087237, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.01826704, + "balance_loss_mlp": 1.02924085, + "epoch": 0.26616563956110023, + "flos": 27635150638080.0, + "grad_norm": 1.7151093890687164, + "language_loss": 0.76837152, + "learning_rate": 3.3409731546393067e-06, + "loss": 0.78960252, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.578125, + "step": 4427, + "time_per_iteration": 2.4607059955596924 + }, + { + "auxiliary_loss_clip": 0.01079872, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.01305056, + "balance_loss_mlp": 1.02486455, + "epoch": 0.26622576281376825, + "flos": 28361728275840.0, + "grad_norm": 1.4907260181840092, + "language_loss": 0.76644647, + "learning_rate": 3.3406928161445756e-06, + "loss": 0.78753412, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 4428, + "time_per_iteration": 2.4728152751922607 + }, + { + "auxiliary_loss_clip": 0.01084616, + "auxiliary_loss_mlp": 0.01028649, + "balance_loss_clip": 1.01249576, + "balance_loss_mlp": 1.02543139, + "epoch": 0.2662858860664362, + "flos": 18040354773120.0, + "grad_norm": 2.0009360026637766, + "language_loss": 0.82547939, + "learning_rate": 3.340412429803967e-06, + "loss": 0.84661198, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.58984375, + "step": 4429, + "time_per_iteration": 2.355893135070801 + }, + { + "auxiliary_loss_clip": 0.01081085, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.02099943, + "balance_loss_mlp": 1.02443218, + "epoch": 0.2663460093191042, + "flos": 22744761632640.0, + "grad_norm": 1.8525218053636021, + "language_loss": 0.72379601, + "learning_rate": 3.3401319956274872e-06, + "loss": 0.74498761, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.56640625, + "step": 4430, + "time_per_iteration": 2.426131010055542 + }, + { + "auxiliary_loss_clip": 0.01086066, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.02156508, + "balance_loss_mlp": 1.02631724, + "epoch": 0.26640613257177215, + "flos": 16507842243840.0, + "grad_norm": 4.51152311135043, + "language_loss": 0.81984192, + "learning_rate": 3.3398515136251435e-06, + "loss": 0.84110445, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.59765625, + "step": 4431, + "time_per_iteration": 2.3608851432800293 + }, + { + "auxiliary_loss_clip": 0.01089014, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.02395391, + "balance_loss_mlp": 1.02706099, + "epoch": 0.2664662558244401, + "flos": 23074830426240.0, + "grad_norm": 2.1298469512924907, + "language_loss": 0.7598294, + "learning_rate": 3.3395709838069463e-06, + "loss": 0.78114682, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.6171875, + "step": 4432, + "time_per_iteration": 2.405139446258545 + }, + { + "auxiliary_loss_clip": 0.01081227, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.01426446, + "balance_loss_mlp": 1.02397203, + "epoch": 0.2665263790771081, + "flos": 23768135671680.0, + "grad_norm": 1.8088381619840193, + "language_loss": 0.82675636, + "learning_rate": 3.3392904061829054e-06, + "loss": 0.84788167, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.57421875, + "step": 4433, + "time_per_iteration": 2.3934218883514404 + }, + { + "auxiliary_loss_clip": 0.01082703, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.02269769, + "balance_loss_mlp": 1.02573907, + "epoch": 0.26658650232977604, + "flos": 28000027923840.0, + "grad_norm": 2.320778181328883, + "language_loss": 0.76531565, + "learning_rate": 3.3390097807630353e-06, + "loss": 0.78654659, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5703125, + "step": 4434, + "time_per_iteration": 2.459339141845703 + }, + { + "auxiliary_loss_clip": 0.01083463, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.01747119, + "balance_loss_mlp": 1.02591419, + "epoch": 0.266646625582444, + "flos": 22162549933440.0, + "grad_norm": 2.067932987258131, + "language_loss": 0.82558548, + "learning_rate": 3.3387291075573508e-06, + "loss": 0.84675425, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.57421875, + "step": 4435, + "time_per_iteration": 2.3734118938446045 + }, + { + "auxiliary_loss_clip": 0.01086973, + "auxiliary_loss_mlp": 0.01043836, + "balance_loss_clip": 1.02633619, + "balance_loss_mlp": 1.02657735, + "epoch": 0.266706748835112, + "flos": 27852345406080.0, + "grad_norm": 8.327866476886152, + "language_loss": 0.88007295, + "learning_rate": 3.3384483865758677e-06, + "loss": 0.90138102, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.60546875, + "step": 4436, + "time_per_iteration": 2.447432041168213 + }, + { + "auxiliary_loss_clip": 0.01083402, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.01635647, + "balance_loss_mlp": 1.02495182, + "epoch": 0.26676687208777994, + "flos": 25810938766080.0, + "grad_norm": 1.6499649120763606, + "language_loss": 0.78677368, + "learning_rate": 3.3381676178286047e-06, + "loss": 0.80793798, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5859375, + "step": 4437, + "time_per_iteration": 2.450538396835327 + }, + { + "auxiliary_loss_clip": 0.01082209, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.01734996, + "balance_loss_mlp": 1.02518868, + "epoch": 0.2668269953404479, + "flos": 36063114220800.0, + "grad_norm": 1.9899374655950153, + "language_loss": 0.63907933, + "learning_rate": 3.337886801325582e-06, + "loss": 0.66023993, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5703125, + "step": 4438, + "time_per_iteration": 3.877885341644287 + }, + { + "auxiliary_loss_clip": 0.01082817, + "auxiliary_loss_mlp": 0.0103528, + "balance_loss_clip": 1.0182811, + "balance_loss_mlp": 1.02498817, + "epoch": 0.26688711859311587, + "flos": 26569985834880.0, + "grad_norm": 1.9465374522148835, + "language_loss": 0.75652754, + "learning_rate": 3.3376059370768202e-06, + "loss": 0.77770853, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.578125, + "step": 4439, + "time_per_iteration": 2.3986637592315674 + }, + { + "auxiliary_loss_clip": 0.01084893, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.01428986, + "balance_loss_mlp": 1.02523625, + "epoch": 0.26694724184578383, + "flos": 26760331900800.0, + "grad_norm": 1.7451553085331943, + "language_loss": 0.71426797, + "learning_rate": 3.337325025092344e-06, + "loss": 0.73544276, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.59765625, + "step": 4440, + "time_per_iteration": 2.4282066822052 + }, + { + "auxiliary_loss_clip": 0.01085171, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.02288067, + "balance_loss_mlp": 1.02618694, + "epoch": 0.2670073650984518, + "flos": 20958535186560.0, + "grad_norm": 1.8660410835930377, + "language_loss": 0.66998219, + "learning_rate": 3.337044065382177e-06, + "loss": 0.69124424, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.58984375, + "step": 4441, + "time_per_iteration": 3.757274866104126 + }, + { + "auxiliary_loss_clip": 0.01082918, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.0182941, + "balance_loss_mlp": 1.02545154, + "epoch": 0.2670674883511198, + "flos": 28364800475520.0, + "grad_norm": 1.4717365983727981, + "language_loss": 0.76371771, + "learning_rate": 3.3367630579563465e-06, + "loss": 0.78490448, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.57421875, + "step": 4442, + "time_per_iteration": 2.42803692817688 + }, + { + "auxiliary_loss_clip": 0.01016479, + "auxiliary_loss_mlp": 0.01003547, + "balance_loss_clip": 1.00148439, + "balance_loss_mlp": 1.00325418, + "epoch": 0.2671276116037878, + "flos": 58968370840320.0, + "grad_norm": 0.9241934998108896, + "language_loss": 0.61196792, + "learning_rate": 3.3364820028248816e-06, + "loss": 0.63216817, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.13183594, + "step": 4443, + "time_per_iteration": 2.927440643310547 + }, + { + "auxiliary_loss_clip": 0.01083943, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.01788235, + "balance_loss_mlp": 1.02477372, + "epoch": 0.26718773485645575, + "flos": 43943395305600.0, + "grad_norm": 1.5111749557406475, + "language_loss": 0.76085961, + "learning_rate": 3.336200899997812e-06, + "loss": 0.78205287, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 4444, + "time_per_iteration": 2.577420949935913 + }, + { + "auxiliary_loss_clip": 0.01084404, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.01355004, + "balance_loss_mlp": 1.02521873, + "epoch": 0.2672478581091237, + "flos": 25227156055680.0, + "grad_norm": 1.6970284162687346, + "language_loss": 0.80564058, + "learning_rate": 3.3359197494851687e-06, + "loss": 0.82680142, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 4445, + "time_per_iteration": 3.7857730388641357 + }, + { + "auxiliary_loss_clip": 0.01083295, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.00950766, + "balance_loss_mlp": 1.02388334, + "epoch": 0.2673079813617917, + "flos": 15267273436800.0, + "grad_norm": 1.8964928673104666, + "language_loss": 0.89086282, + "learning_rate": 3.335638551296986e-06, + "loss": 0.911973, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 4446, + "time_per_iteration": 2.3497743606567383 + }, + { + "auxiliary_loss_clip": 0.01083816, + "auxiliary_loss_mlp": 0.01034704, + "balance_loss_clip": 1.01788342, + "balance_loss_mlp": 1.02553654, + "epoch": 0.26736810461445965, + "flos": 25811532259200.0, + "grad_norm": 1.7236954988024644, + "language_loss": 0.67969894, + "learning_rate": 3.3353573054432997e-06, + "loss": 0.70088416, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.58203125, + "step": 4447, + "time_per_iteration": 2.416245222091675 + }, + { + "auxiliary_loss_clip": 0.01083702, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.01583028, + "balance_loss_mlp": 1.02472448, + "epoch": 0.2674282278671276, + "flos": 24311663717760.0, + "grad_norm": 1.8990166927242131, + "language_loss": 0.83607161, + "learning_rate": 3.335076011934146e-06, + "loss": 0.85724467, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58984375, + "step": 4448, + "time_per_iteration": 3.8704729080200195 + }, + { + "auxiliary_loss_clip": 0.0108171, + "auxiliary_loss_mlp": 0.01040913, + "balance_loss_clip": 1.0238775, + "balance_loss_mlp": 1.02470696, + "epoch": 0.2674883511197956, + "flos": 22814553173760.0, + "grad_norm": 1.6268594302914823, + "language_loss": 0.84376764, + "learning_rate": 3.3347946707795627e-06, + "loss": 0.86499381, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5703125, + "step": 4449, + "time_per_iteration": 2.4050981998443604 + }, + { + "auxiliary_loss_clip": 0.01089157, + "auxiliary_loss_mlp": 0.0104556, + "balance_loss_clip": 1.02492476, + "balance_loss_mlp": 1.02528596, + "epoch": 0.26754847437246354, + "flos": 25369113110400.0, + "grad_norm": 1.6909772018346736, + "language_loss": 0.83999503, + "learning_rate": 3.3345132819895918e-06, + "loss": 0.86134219, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.63671875, + "step": 4450, + "time_per_iteration": 2.420567750930786 + }, + { + "auxiliary_loss_clip": 0.01080093, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.01474607, + "balance_loss_mlp": 1.02417755, + "epoch": 0.2676085976251315, + "flos": 20229374108160.0, + "grad_norm": 1.837832292531925, + "language_loss": 0.81029725, + "learning_rate": 3.3342318455742748e-06, + "loss": 0.8314054, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.55859375, + "step": 4451, + "time_per_iteration": 2.363407850265503 + }, + { + "auxiliary_loss_clip": 0.01081973, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.01626468, + "balance_loss_mlp": 1.0250107, + "epoch": 0.26766872087779947, + "flos": 28036966008960.0, + "grad_norm": 1.5996466916580925, + "language_loss": 0.86545944, + "learning_rate": 3.333950361543655e-06, + "loss": 0.88660598, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4452, + "time_per_iteration": 2.436713695526123 + }, + { + "auxiliary_loss_clip": 0.01086419, + "auxiliary_loss_mlp": 0.01040497, + "balance_loss_clip": 1.02232981, + "balance_loss_mlp": 1.02634895, + "epoch": 0.26772884413046744, + "flos": 18324408528000.0, + "grad_norm": 2.205205885981051, + "language_loss": 0.83156228, + "learning_rate": 3.333668829907778e-06, + "loss": 0.85283148, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6015625, + "step": 4453, + "time_per_iteration": 2.3421266078948975 + }, + { + "auxiliary_loss_clip": 0.01083446, + "auxiliary_loss_mlp": 0.01040058, + "balance_loss_clip": 1.02271318, + "balance_loss_mlp": 1.02626002, + "epoch": 0.2677889673831354, + "flos": 22126414809600.0, + "grad_norm": 1.6591621254335382, + "language_loss": 0.79507649, + "learning_rate": 3.333387250676692e-06, + "loss": 0.8163116, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5703125, + "step": 4454, + "time_per_iteration": 2.399909734725952 + }, + { + "auxiliary_loss_clip": 0.01085202, + "auxiliary_loss_mlp": 0.01034051, + "balance_loss_clip": 1.01743317, + "balance_loss_mlp": 1.02547109, + "epoch": 0.2678490906358034, + "flos": 23728649057280.0, + "grad_norm": 1.984671084137888, + "language_loss": 0.72933257, + "learning_rate": 3.3331056238604437e-06, + "loss": 0.75052512, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59765625, + "step": 4455, + "time_per_iteration": 2.3977081775665283 + }, + { + "auxiliary_loss_clip": 0.0107994, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.01782513, + "balance_loss_mlp": 1.02473307, + "epoch": 0.2679092138884714, + "flos": 21761781903360.0, + "grad_norm": 1.4850696264004095, + "language_loss": 0.73548663, + "learning_rate": 3.3328239494690856e-06, + "loss": 0.75662315, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5546875, + "step": 4456, + "time_per_iteration": 2.3810369968414307 + }, + { + "auxiliary_loss_clip": 0.01084747, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.01532412, + "balance_loss_mlp": 1.0246042, + "epoch": 0.26796933714113935, + "flos": 19860272547840.0, + "grad_norm": 2.1995952153618297, + "language_loss": 0.70346195, + "learning_rate": 3.332542227512669e-06, + "loss": 0.72463691, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59765625, + "step": 4457, + "time_per_iteration": 2.371623992919922 + }, + { + "auxiliary_loss_clip": 0.01084452, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01416063, + "balance_loss_mlp": 1.0263896, + "epoch": 0.2680294603938073, + "flos": 20046848186880.0, + "grad_norm": 1.6161514704959818, + "language_loss": 0.78763568, + "learning_rate": 3.332260458001248e-06, + "loss": 0.80879956, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58203125, + "step": 4458, + "time_per_iteration": 2.3723626136779785 + }, + { + "auxiliary_loss_clip": 0.01015893, + "auxiliary_loss_mlp": 0.01003243, + "balance_loss_clip": 1.00110888, + "balance_loss_mlp": 1.00234771, + "epoch": 0.2680895836464753, + "flos": 72110237172480.0, + "grad_norm": 0.8507323796003085, + "language_loss": 0.58605993, + "learning_rate": 3.3319786409448776e-06, + "loss": 0.6062513, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.13574219, + "step": 4459, + "time_per_iteration": 3.0016634464263916 + }, + { + "auxiliary_loss_clip": 0.01080804, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.0156517, + "balance_loss_mlp": 1.02370119, + "epoch": 0.26814970689914325, + "flos": 20448000241920.0, + "grad_norm": 4.775208096770727, + "language_loss": 0.85241407, + "learning_rate": 3.3316967763536167e-06, + "loss": 0.87353843, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5703125, + "step": 4460, + "time_per_iteration": 2.3793869018554688 + }, + { + "auxiliary_loss_clip": 0.0108355, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.01696181, + "balance_loss_mlp": 1.02480114, + "epoch": 0.2682098301518112, + "flos": 17565710572800.0, + "grad_norm": 2.0053813378551646, + "language_loss": 0.68624145, + "learning_rate": 3.331414864237523e-06, + "loss": 0.70741045, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5859375, + "step": 4461, + "time_per_iteration": 2.3408210277557373 + }, + { + "auxiliary_loss_clip": 0.01081883, + "auxiliary_loss_mlp": 0.01034846, + "balance_loss_clip": 1.01706028, + "balance_loss_mlp": 1.0243696, + "epoch": 0.2682699534044792, + "flos": 18332263584000.0, + "grad_norm": 1.5110738980636826, + "language_loss": 0.67033225, + "learning_rate": 3.331132904606658e-06, + "loss": 0.69149953, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.57421875, + "step": 4462, + "time_per_iteration": 2.398047685623169 + }, + { + "auxiliary_loss_clip": 0.0108338, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.01815701, + "balance_loss_mlp": 1.02598441, + "epoch": 0.26833007665714714, + "flos": 25300124530560.0, + "grad_norm": 1.5166486785855047, + "language_loss": 0.69246829, + "learning_rate": 3.330850897471083e-06, + "loss": 0.71366394, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.57421875, + "step": 4463, + "time_per_iteration": 2.4224934577941895 + }, + { + "auxiliary_loss_clip": 0.01084357, + "auxiliary_loss_mlp": 0.01033975, + "balance_loss_clip": 1.01672494, + "balance_loss_mlp": 1.02491117, + "epoch": 0.2683901999098151, + "flos": 16099044600960.0, + "grad_norm": 5.2496402568894025, + "language_loss": 0.77723622, + "learning_rate": 3.3305688428408634e-06, + "loss": 0.79841954, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.59375, + "step": 4464, + "time_per_iteration": 2.3258721828460693 + }, + { + "auxiliary_loss_clip": 0.01081374, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01394343, + "balance_loss_mlp": 1.02392387, + "epoch": 0.2684503231624831, + "flos": 27306827412480.0, + "grad_norm": 1.7469384464010271, + "language_loss": 0.70595694, + "learning_rate": 3.330286740726064e-06, + "loss": 0.72707713, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.57421875, + "step": 4465, + "time_per_iteration": 2.4362828731536865 + }, + { + "auxiliary_loss_clip": 0.01014993, + "auxiliary_loss_mlp": 0.01005577, + "balance_loss_clip": 1.0037415, + "balance_loss_mlp": 1.00172997, + "epoch": 0.26851044641515104, + "flos": 71854498396800.0, + "grad_norm": 0.6721368002712322, + "language_loss": 0.53067428, + "learning_rate": 3.3300045911367527e-06, + "loss": 0.55088001, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.1328125, + "step": 4466, + "time_per_iteration": 3.1842169761657715 + }, + { + "auxiliary_loss_clip": 0.01080139, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.01872981, + "balance_loss_mlp": 1.02496719, + "epoch": 0.268570569667819, + "flos": 18732787234560.0, + "grad_norm": 1.793836553207938, + "language_loss": 0.74083984, + "learning_rate": 3.3297223940829993e-06, + "loss": 0.76200271, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.55078125, + "step": 4467, + "time_per_iteration": 2.346865177154541 + }, + { + "auxiliary_loss_clip": 0.01083256, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.019526, + "balance_loss_mlp": 1.02402246, + "epoch": 0.268630692920487, + "flos": 18177633705600.0, + "grad_norm": 2.191278880275011, + "language_loss": 0.80200934, + "learning_rate": 3.3294401495748733e-06, + "loss": 0.82320952, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.59375, + "step": 4468, + "time_per_iteration": 2.3384740352630615 + }, + { + "auxiliary_loss_clip": 0.01014854, + "auxiliary_loss_mlp": 0.01003605, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00198507, + "epoch": 0.268690816173155, + "flos": 68728025612160.0, + "grad_norm": 0.8446363794832309, + "language_loss": 0.594836, + "learning_rate": 3.3291578576224487e-06, + "loss": 0.61502063, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.12890625, + "step": 4469, + "time_per_iteration": 3.1167914867401123 + }, + { + "auxiliary_loss_clip": 0.01084916, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.01933658, + "balance_loss_mlp": 1.02664685, + "epoch": 0.26875093942582295, + "flos": 23292548864640.0, + "grad_norm": 2.008031001637529, + "language_loss": 0.81701338, + "learning_rate": 3.328875518235799e-06, + "loss": 0.83823645, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.58203125, + "step": 4470, + "time_per_iteration": 2.3879146575927734 + }, + { + "auxiliary_loss_clip": 0.01077523, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.01607752, + "balance_loss_mlp": 1.02323973, + "epoch": 0.2688110626784909, + "flos": 21542387719680.0, + "grad_norm": 1.6038821516622725, + "language_loss": 0.82837486, + "learning_rate": 3.328593131425e-06, + "loss": 0.84946591, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.54296875, + "step": 4471, + "time_per_iteration": 2.3821933269500732 + }, + { + "auxiliary_loss_clip": 0.0108063, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.017838, + "balance_loss_mlp": 1.02507091, + "epoch": 0.2688711859311589, + "flos": 28399399499520.0, + "grad_norm": 1.9848390976572985, + "language_loss": 0.69358706, + "learning_rate": 3.3283106972001303e-06, + "loss": 0.71472877, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5546875, + "step": 4472, + "time_per_iteration": 2.44398832321167 + }, + { + "auxiliary_loss_clip": 0.0108238, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.02001083, + "balance_loss_mlp": 1.02520609, + "epoch": 0.26893130918382685, + "flos": 25993743978240.0, + "grad_norm": 1.6237287978466413, + "language_loss": 0.67831284, + "learning_rate": 3.3280282155712684e-06, + "loss": 0.69949752, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5703125, + "step": 4473, + "time_per_iteration": 2.4261202812194824 + }, + { + "auxiliary_loss_clip": 0.01079509, + "auxiliary_loss_mlp": 0.01039558, + "balance_loss_clip": 1.02351272, + "balance_loss_mlp": 1.02467394, + "epoch": 0.2689914324364948, + "flos": 20338582440960.0, + "grad_norm": 1.6609874782940333, + "language_loss": 0.82910681, + "learning_rate": 3.3277456865484956e-06, + "loss": 0.85029745, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 4474, + "time_per_iteration": 2.38499116897583 + }, + { + "auxiliary_loss_clip": 0.01079055, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.02459157, + "balance_loss_mlp": 1.02461219, + "epoch": 0.2690515556891628, + "flos": 19463519324160.0, + "grad_norm": 1.931247857755048, + "language_loss": 0.70672166, + "learning_rate": 3.3274631101418942e-06, + "loss": 0.7279135, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.54296875, + "step": 4475, + "time_per_iteration": 2.363485336303711 + }, + { + "auxiliary_loss_clip": 0.01080789, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.02138877, + "balance_loss_mlp": 1.02387619, + "epoch": 0.26911167894183075, + "flos": 18145757767680.0, + "grad_norm": 1.6858005728302072, + "language_loss": 0.72981411, + "learning_rate": 3.32718048636155e-06, + "loss": 0.75100362, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 4476, + "time_per_iteration": 2.3534677028656006 + }, + { + "auxiliary_loss_clip": 0.01078047, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.01535809, + "balance_loss_mlp": 1.02385592, + "epoch": 0.2691718021944987, + "flos": 19974089180160.0, + "grad_norm": 1.7933882984678577, + "language_loss": 0.81010187, + "learning_rate": 3.3268978152175474e-06, + "loss": 0.83118671, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.54296875, + "step": 4477, + "time_per_iteration": 2.430464744567871 + }, + { + "auxiliary_loss_clip": 0.01080129, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.0198555, + "balance_loss_mlp": 1.02281356, + "epoch": 0.2692319254471667, + "flos": 37445814639360.0, + "grad_norm": 1.5332979734477428, + "language_loss": 0.64887929, + "learning_rate": 3.3266150967199752e-06, + "loss": 0.67003787, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5703125, + "step": 4478, + "time_per_iteration": 3.8750460147857666 + }, + { + "auxiliary_loss_clip": 0.01079824, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.01612389, + "balance_loss_mlp": 1.02383912, + "epoch": 0.26929204869983464, + "flos": 22126694100480.0, + "grad_norm": 1.9379773878178308, + "language_loss": 0.80459404, + "learning_rate": 3.3263323308789225e-06, + "loss": 0.82572174, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.55859375, + "step": 4479, + "time_per_iteration": 2.3893632888793945 + }, + { + "auxiliary_loss_clip": 0.01081605, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.02120578, + "balance_loss_mlp": 1.02322698, + "epoch": 0.2693521719525026, + "flos": 19791772727040.0, + "grad_norm": 2.4539283943618018, + "language_loss": 0.80995786, + "learning_rate": 3.3260495177044806e-06, + "loss": 0.83114529, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5859375, + "step": 4480, + "time_per_iteration": 2.3738739490509033 + }, + { + "auxiliary_loss_clip": 0.01075662, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.01440787, + "balance_loss_mlp": 1.02309871, + "epoch": 0.2694122952051706, + "flos": 20993378590080.0, + "grad_norm": 1.5241829458860563, + "language_loss": 0.78021401, + "learning_rate": 3.325766657206743e-06, + "loss": 0.80125993, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.5234375, + "step": 4481, + "time_per_iteration": 3.7754180431365967 + }, + { + "auxiliary_loss_clip": 0.01081858, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.02078819, + "balance_loss_mlp": 1.02476931, + "epoch": 0.2694724184578386, + "flos": 25848086319360.0, + "grad_norm": 1.9495649123639187, + "language_loss": 0.7387563, + "learning_rate": 3.3254837493958032e-06, + "loss": 0.75994426, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5703125, + "step": 4482, + "time_per_iteration": 2.448399543762207 + }, + { + "auxiliary_loss_clip": 0.01083359, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.01433611, + "balance_loss_mlp": 1.02650142, + "epoch": 0.26953254171050656, + "flos": 21725856247680.0, + "grad_norm": 1.8219840131068075, + "language_loss": 0.72292948, + "learning_rate": 3.3252007942817575e-06, + "loss": 0.74407113, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4483, + "time_per_iteration": 2.3748221397399902 + }, + { + "auxiliary_loss_clip": 0.01083569, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.02419567, + "epoch": 0.2695926649631745, + "flos": 19681901078400.0, + "grad_norm": 3.6791924321997707, + "language_loss": 0.86999154, + "learning_rate": 3.324917791874705e-06, + "loss": 0.8911882, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.59375, + "step": 4484, + "time_per_iteration": 3.836228132247925 + }, + { + "auxiliary_loss_clip": 0.01082217, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.01475477, + "balance_loss_mlp": 1.02469516, + "epoch": 0.2696527882158425, + "flos": 32885319870720.0, + "grad_norm": 1.4598561573104194, + "language_loss": 0.66220701, + "learning_rate": 3.324634742184744e-06, + "loss": 0.68332934, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.578125, + "step": 4485, + "time_per_iteration": 2.4773032665252686 + }, + { + "auxiliary_loss_clip": 0.0108219, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.01782048, + "balance_loss_mlp": 1.02494526, + "epoch": 0.26971291146851045, + "flos": 12124182844800.0, + "grad_norm": 2.2793826971817217, + "language_loss": 0.72640491, + "learning_rate": 3.324351645221977e-06, + "loss": 0.74757296, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 4486, + "time_per_iteration": 2.3354594707489014 + }, + { + "auxiliary_loss_clip": 0.01087052, + "auxiliary_loss_mlp": 0.01038748, + "balance_loss_clip": 1.02220225, + "balance_loss_mlp": 1.02663565, + "epoch": 0.2697730347211784, + "flos": 22633458618240.0, + "grad_norm": 1.7549295907656801, + "language_loss": 0.84021676, + "learning_rate": 3.3240685009965065e-06, + "loss": 0.86147475, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.60546875, + "step": 4487, + "time_per_iteration": 2.3800137042999268 + }, + { + "auxiliary_loss_clip": 0.01082813, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.0196538, + "balance_loss_mlp": 1.02519119, + "epoch": 0.2698331579738464, + "flos": 23511943048320.0, + "grad_norm": 2.158231889353338, + "language_loss": 0.71825504, + "learning_rate": 3.3237853095184365e-06, + "loss": 0.73944944, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.57421875, + "step": 4488, + "time_per_iteration": 3.789289712905884 + }, + { + "auxiliary_loss_clip": 0.01082345, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.0201546, + "balance_loss_mlp": 1.02542758, + "epoch": 0.26989328122651435, + "flos": 24639986943360.0, + "grad_norm": 1.7189116236507618, + "language_loss": 0.83743238, + "learning_rate": 3.3235020707978747e-06, + "loss": 0.85862446, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5703125, + "step": 4489, + "time_per_iteration": 2.407876491546631 + }, + { + "auxiliary_loss_clip": 0.01084075, + "auxiliary_loss_mlp": 0.01038257, + "balance_loss_clip": 1.02117491, + "balance_loss_mlp": 1.02525234, + "epoch": 0.2699534044791823, + "flos": 10771996821120.0, + "grad_norm": 2.4911137031080997, + "language_loss": 0.89687502, + "learning_rate": 3.323218784844928e-06, + "loss": 0.91809833, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5859375, + "step": 4490, + "time_per_iteration": 2.357886791229248 + }, + { + "auxiliary_loss_clip": 0.01079328, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.01703525, + "balance_loss_mlp": 1.02451301, + "epoch": 0.2700135277318503, + "flos": 36170192960640.0, + "grad_norm": 1.9868071240327854, + "language_loss": 0.73218596, + "learning_rate": 3.322935451669706e-06, + "loss": 0.75330651, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.546875, + "step": 4491, + "time_per_iteration": 2.493103504180908 + }, + { + "auxiliary_loss_clip": 0.01085746, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.0233376, + "balance_loss_mlp": 1.02712989, + "epoch": 0.27007365098451824, + "flos": 17417713852800.0, + "grad_norm": 2.7418608079080005, + "language_loss": 0.74333012, + "learning_rate": 3.322652071282322e-06, + "loss": 0.76459181, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5859375, + "step": 4492, + "time_per_iteration": 2.3537187576293945 + }, + { + "auxiliary_loss_clip": 0.01082365, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.01431274, + "balance_loss_mlp": 1.0255754, + "epoch": 0.2701337742371862, + "flos": 23184562429440.0, + "grad_norm": 1.9799655065005513, + "language_loss": 0.88832211, + "learning_rate": 3.3223686436928874e-06, + "loss": 0.90945053, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5703125, + "step": 4493, + "time_per_iteration": 2.388761281967163 + }, + { + "auxiliary_loss_clip": 0.0108122, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.01787066, + "balance_loss_mlp": 1.02600765, + "epoch": 0.2701938974898542, + "flos": 24388297885440.0, + "grad_norm": 1.453558530019349, + "language_loss": 0.71056789, + "learning_rate": 3.322085168911517e-06, + "loss": 0.73170859, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.55078125, + "step": 4494, + "time_per_iteration": 2.4287607669830322 + }, + { + "auxiliary_loss_clip": 0.01077588, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.01991987, + "balance_loss_mlp": 1.02271843, + "epoch": 0.2702540207425222, + "flos": 26213103250560.0, + "grad_norm": 1.9191194347989033, + "language_loss": 0.8621546, + "learning_rate": 3.321801646948328e-06, + "loss": 0.88328612, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.546875, + "step": 4495, + "time_per_iteration": 2.4040403366088867 + }, + { + "auxiliary_loss_clip": 0.01081717, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.01673281, + "balance_loss_mlp": 1.02571392, + "epoch": 0.27031414399519016, + "flos": 22925367429120.0, + "grad_norm": 1.605148411167534, + "language_loss": 0.76578534, + "learning_rate": 3.321518077813438e-06, + "loss": 0.78692782, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5625, + "step": 4496, + "time_per_iteration": 2.4081809520721436 + }, + { + "auxiliary_loss_clip": 0.01015898, + "auxiliary_loss_mlp": 0.01008557, + "balance_loss_clip": 1.00716174, + "balance_loss_mlp": 1.0027616, + "epoch": 0.2703742672478581, + "flos": 63016759653120.0, + "grad_norm": 0.7033280233919958, + "language_loss": 0.50172031, + "learning_rate": 3.321234461516967e-06, + "loss": 0.52196479, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.13085938, + "step": 4497, + "time_per_iteration": 3.089003086090088 + }, + { + "auxiliary_loss_clip": 0.01083106, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.02027988, + "balance_loss_mlp": 1.02663255, + "epoch": 0.2704343905005261, + "flos": 18839900885760.0, + "grad_norm": 1.5242792975239683, + "language_loss": 0.72006714, + "learning_rate": 3.3209507980690375e-06, + "loss": 0.74126148, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.56640625, + "step": 4498, + "time_per_iteration": 2.4045588970184326 + }, + { + "auxiliary_loss_clip": 0.01015125, + "auxiliary_loss_mlp": 0.0100553, + "balance_loss_clip": 1.00398076, + "balance_loss_mlp": 1.00229216, + "epoch": 0.27049451375319405, + "flos": 71230042085760.0, + "grad_norm": 0.7530257662081888, + "language_loss": 0.59190583, + "learning_rate": 3.3206670874797717e-06, + "loss": 0.6121124, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.12890625, + "step": 4499, + "time_per_iteration": 3.0591161251068115 + }, + { + "auxiliary_loss_clip": 0.01079214, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.01448965, + "balance_loss_mlp": 1.02431273, + "epoch": 0.270554637005862, + "flos": 24277483630080.0, + "grad_norm": 1.8792993719860758, + "language_loss": 0.7363081, + "learning_rate": 3.3203833297592943e-06, + "loss": 0.7574091, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 4500, + "time_per_iteration": 2.4270050525665283 + }, + { + "auxiliary_loss_clip": 0.01079428, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.02034807, + "balance_loss_mlp": 1.02346158, + "epoch": 0.27061476025853, + "flos": 17631557130240.0, + "grad_norm": 2.8435527376846386, + "language_loss": 0.80917323, + "learning_rate": 3.3200995249177324e-06, + "loss": 0.830329, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55859375, + "step": 4501, + "time_per_iteration": 2.3408093452453613 + }, + { + "auxiliary_loss_clip": 0.0108064, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.01761258, + "balance_loss_mlp": 1.02366543, + "epoch": 0.27067488351119795, + "flos": 22709045445120.0, + "grad_norm": 2.0256480088020057, + "language_loss": 0.72636497, + "learning_rate": 3.3198156729652144e-06, + "loss": 0.74751461, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5703125, + "step": 4502, + "time_per_iteration": 2.4176254272460938 + }, + { + "auxiliary_loss_clip": 0.01081711, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.01634669, + "balance_loss_mlp": 1.02251172, + "epoch": 0.2707350067638659, + "flos": 41717996467200.0, + "grad_norm": 1.7990912716135352, + "language_loss": 0.68265796, + "learning_rate": 3.31953177391187e-06, + "loss": 0.70382053, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 4503, + "time_per_iteration": 2.562025785446167 + }, + { + "auxiliary_loss_clip": 0.01080471, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01566768, + "balance_loss_mlp": 1.02419126, + "epoch": 0.2707951300165339, + "flos": 20192017086720.0, + "grad_norm": 1.8419824353840168, + "language_loss": 0.67978293, + "learning_rate": 3.319247827767831e-06, + "loss": 0.70090151, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5625, + "step": 4504, + "time_per_iteration": 2.3766422271728516 + }, + { + "auxiliary_loss_clip": 0.01081967, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.01815891, + "balance_loss_mlp": 1.02605009, + "epoch": 0.27085525326920185, + "flos": 21432900096000.0, + "grad_norm": 1.4023218067460208, + "language_loss": 0.75659472, + "learning_rate": 3.31896383454323e-06, + "loss": 0.77775782, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55859375, + "step": 4505, + "time_per_iteration": 2.384819746017456 + }, + { + "auxiliary_loss_clip": 0.01086919, + "auxiliary_loss_mlp": 0.01041346, + "balance_loss_clip": 1.02313077, + "balance_loss_mlp": 1.02600741, + "epoch": 0.2709153765218698, + "flos": 17674290501120.0, + "grad_norm": 2.908619247116778, + "language_loss": 0.73532224, + "learning_rate": 3.3186797942482025e-06, + "loss": 0.75660491, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.609375, + "step": 4506, + "time_per_iteration": 2.3712241649627686 + }, + { + "auxiliary_loss_clip": 0.01083816, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.0161016, + "balance_loss_mlp": 1.02535522, + "epoch": 0.2709754997745378, + "flos": 24455261606400.0, + "grad_norm": 1.9930554118530837, + "language_loss": 0.80757821, + "learning_rate": 3.3183957068928855e-06, + "loss": 0.8287493, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 4507, + "time_per_iteration": 2.4048187732696533 + }, + { + "auxiliary_loss_clip": 0.01015596, + "auxiliary_loss_mlp": 0.01003149, + "balance_loss_clip": 1.00131321, + "balance_loss_mlp": 1.00270295, + "epoch": 0.2710356230272058, + "flos": 65207664201600.0, + "grad_norm": 0.7384456755439164, + "language_loss": 0.50821882, + "learning_rate": 3.318111572487417e-06, + "loss": 0.52840626, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.12890625, + "step": 4508, + "time_per_iteration": 2.949289083480835 + }, + { + "auxiliary_loss_clip": 0.01078772, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.01543427, + "balance_loss_mlp": 1.02388668, + "epoch": 0.27109574627987376, + "flos": 25483243944960.0, + "grad_norm": 2.124940899839455, + "language_loss": 0.74865228, + "learning_rate": 3.3178273910419376e-06, + "loss": 0.76974213, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.546875, + "step": 4509, + "time_per_iteration": 2.4222006797790527 + }, + { + "auxiliary_loss_clip": 0.01077173, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.01814198, + "balance_loss_mlp": 1.02336025, + "epoch": 0.2711558695325417, + "flos": 19681761432960.0, + "grad_norm": 1.9374771585781625, + "language_loss": 0.71486199, + "learning_rate": 3.3175431625665876e-06, + "loss": 0.73595667, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.5390625, + "step": 4510, + "time_per_iteration": 2.387803316116333 + }, + { + "auxiliary_loss_clip": 0.01081239, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.01385069, + "balance_loss_mlp": 1.02615893, + "epoch": 0.2712159927852097, + "flos": 18586780462080.0, + "grad_norm": 2.3580156690873473, + "language_loss": 0.69878447, + "learning_rate": 3.317258887071512e-06, + "loss": 0.71989107, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.55078125, + "step": 4511, + "time_per_iteration": 2.3518013954162598 + }, + { + "auxiliary_loss_clip": 0.01082062, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.01885176, + "balance_loss_mlp": 1.02499878, + "epoch": 0.27127611603787766, + "flos": 25629041249280.0, + "grad_norm": 1.9857478699695057, + "language_loss": 0.80716193, + "learning_rate": 3.3169745645668546e-06, + "loss": 0.82833719, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5703125, + "step": 4512, + "time_per_iteration": 2.4412453174591064 + }, + { + "auxiliary_loss_clip": 0.01077305, + "auxiliary_loss_mlp": 0.01025574, + "balance_loss_clip": 1.01119757, + "balance_loss_mlp": 1.02384079, + "epoch": 0.2713362392905456, + "flos": 23147833812480.0, + "grad_norm": 1.585749548534528, + "language_loss": 0.80106896, + "learning_rate": 3.3166901950627627e-06, + "loss": 0.82209772, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.53515625, + "step": 4513, + "time_per_iteration": 2.399441957473755 + }, + { + "auxiliary_loss_clip": 0.01080465, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.01450968, + "balance_loss_mlp": 1.02370095, + "epoch": 0.2713963625432136, + "flos": 18365151951360.0, + "grad_norm": 1.8860261571152785, + "language_loss": 0.88569802, + "learning_rate": 3.3164057785693846e-06, + "loss": 0.90679801, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5703125, + "step": 4514, + "time_per_iteration": 2.365255832672119 + }, + { + "auxiliary_loss_clip": 0.01081314, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02045166, + "balance_loss_mlp": 1.02581787, + "epoch": 0.27145648579588155, + "flos": 22490663690880.0, + "grad_norm": 3.4338607898771696, + "language_loss": 0.92019075, + "learning_rate": 3.316121315096871e-06, + "loss": 0.94136542, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5546875, + "step": 4515, + "time_per_iteration": 2.397843360900879 + }, + { + "auxiliary_loss_clip": 0.01087085, + "auxiliary_loss_mlp": 0.01042334, + "balance_loss_clip": 1.02433372, + "balance_loss_mlp": 1.02682376, + "epoch": 0.2715166090485495, + "flos": 19238329854720.0, + "grad_norm": 2.0003139354196002, + "language_loss": 0.7318427, + "learning_rate": 3.3158368046553724e-06, + "loss": 0.75313687, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.60546875, + "step": 4516, + "time_per_iteration": 2.380821704864502 + }, + { + "auxiliary_loss_clip": 0.01081555, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01571, + "balance_loss_mlp": 1.02499664, + "epoch": 0.2715767323012175, + "flos": 17708714968320.0, + "grad_norm": 1.8621901672453118, + "language_loss": 0.7606324, + "learning_rate": 3.315552247255043e-06, + "loss": 0.78177291, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.56640625, + "step": 4517, + "time_per_iteration": 3.725590467453003 + }, + { + "auxiliary_loss_clip": 0.01080733, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.01642156, + "balance_loss_mlp": 1.02456748, + "epoch": 0.27163685555388545, + "flos": 22381734648960.0, + "grad_norm": 2.4437410929350625, + "language_loss": 0.77187204, + "learning_rate": 3.3152676429060385e-06, + "loss": 0.79300964, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5625, + "step": 4518, + "time_per_iteration": 2.3933472633361816 + }, + { + "auxiliary_loss_clip": 0.01080056, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.01262212, + "balance_loss_mlp": 1.02426314, + "epoch": 0.2716969788065534, + "flos": 22345599525120.0, + "grad_norm": 1.6206087204756392, + "language_loss": 0.68530637, + "learning_rate": 3.3149829916185147e-06, + "loss": 0.70638669, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.55859375, + "step": 4519, + "time_per_iteration": 2.3842036724090576 + }, + { + "auxiliary_loss_clip": 0.01079199, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.01446807, + "balance_loss_mlp": 1.02388358, + "epoch": 0.2717571020592214, + "flos": 25227295701120.0, + "grad_norm": 2.082781881877512, + "language_loss": 0.75273436, + "learning_rate": 3.314698293402631e-06, + "loss": 0.77381968, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5546875, + "step": 4520, + "time_per_iteration": 3.889113426208496 + }, + { + "auxiliary_loss_clip": 0.01085247, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.02028799, + "balance_loss_mlp": 1.02649176, + "epoch": 0.2718172253118894, + "flos": 20188840152960.0, + "grad_norm": 2.0582810574866595, + "language_loss": 0.76261693, + "learning_rate": 3.314413548268546e-06, + "loss": 0.78385329, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.5859375, + "step": 4521, + "time_per_iteration": 2.3611133098602295 + }, + { + "auxiliary_loss_clip": 0.01081571, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.01924443, + "balance_loss_mlp": 1.02396441, + "epoch": 0.27187734856455736, + "flos": 14318264327040.0, + "grad_norm": 2.250188091427427, + "language_loss": 0.74971151, + "learning_rate": 3.3141287562264232e-06, + "loss": 0.77088785, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.578125, + "step": 4522, + "time_per_iteration": 2.402076005935669 + }, + { + "auxiliary_loss_clip": 0.01084066, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.01403904, + "balance_loss_mlp": 1.02588773, + "epoch": 0.27193747181722533, + "flos": 21106566817920.0, + "grad_norm": 1.922816484571974, + "language_loss": 0.72921813, + "learning_rate": 3.3138439172864258e-06, + "loss": 0.75035608, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.58203125, + "step": 4523, + "time_per_iteration": 2.3831393718719482 + }, + { + "auxiliary_loss_clip": 0.01078028, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.01927781, + "balance_loss_mlp": 1.02376866, + "epoch": 0.2719975950698933, + "flos": 19681761432960.0, + "grad_norm": 1.4698126783393182, + "language_loss": 0.74400955, + "learning_rate": 3.313559031458718e-06, + "loss": 0.76514089, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.54296875, + "step": 4524, + "time_per_iteration": 3.7996976375579834 + }, + { + "auxiliary_loss_clip": 0.01080931, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.01647043, + "balance_loss_mlp": 1.02484608, + "epoch": 0.27205771832256126, + "flos": 24753314816640.0, + "grad_norm": 2.4126873356979854, + "language_loss": 0.76205564, + "learning_rate": 3.313274098753467e-06, + "loss": 0.78318995, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5625, + "step": 4525, + "time_per_iteration": 2.404662609100342 + }, + { + "auxiliary_loss_clip": 0.01080402, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.02060056, + "balance_loss_mlp": 1.02542555, + "epoch": 0.2721178415752292, + "flos": 21754694897280.0, + "grad_norm": 2.314706947911429, + "language_loss": 0.81361967, + "learning_rate": 3.3129891191808423e-06, + "loss": 0.83478296, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.546875, + "step": 4526, + "time_per_iteration": 2.4147696495056152 + }, + { + "auxiliary_loss_clip": 0.01084144, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.01467907, + "balance_loss_mlp": 1.0242039, + "epoch": 0.2721779648278972, + "flos": 12676019794560.0, + "grad_norm": 1.9890912207025393, + "language_loss": 0.76664495, + "learning_rate": 3.312704092751013e-06, + "loss": 0.78781366, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.6015625, + "step": 4527, + "time_per_iteration": 3.844364643096924 + }, + { + "auxiliary_loss_clip": 0.01079931, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.01782107, + "balance_loss_mlp": 1.02274859, + "epoch": 0.27223808808056515, + "flos": 16252278024960.0, + "grad_norm": 1.9572044950173368, + "language_loss": 0.82010406, + "learning_rate": 3.312419019474151e-06, + "loss": 0.84125781, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5703125, + "step": 4528, + "time_per_iteration": 2.352949380874634 + }, + { + "auxiliary_loss_clip": 0.01079986, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.02316439, + "balance_loss_mlp": 1.02441454, + "epoch": 0.2722982113332331, + "flos": 27744568439040.0, + "grad_norm": 2.253802296169956, + "language_loss": 0.77230215, + "learning_rate": 3.3121338993604306e-06, + "loss": 0.79349053, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5546875, + "step": 4529, + "time_per_iteration": 2.4298012256622314 + }, + { + "auxiliary_loss_clip": 0.0108186, + "auxiliary_loss_mlp": 0.01037919, + "balance_loss_clip": 1.02209949, + "balance_loss_mlp": 1.02347469, + "epoch": 0.2723583345859011, + "flos": 21725158020480.0, + "grad_norm": 1.8764821701819914, + "language_loss": 0.82439351, + "learning_rate": 3.3118487324200267e-06, + "loss": 0.84559131, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5859375, + "step": 4530, + "time_per_iteration": 2.3779683113098145 + }, + { + "auxiliary_loss_clip": 0.01079985, + "auxiliary_loss_mlp": 0.01034443, + "balance_loss_clip": 1.01832604, + "balance_loss_mlp": 1.02333164, + "epoch": 0.27241845783856905, + "flos": 17346316389120.0, + "grad_norm": 2.0206716892452983, + "language_loss": 0.91379881, + "learning_rate": 3.3115635186631156e-06, + "loss": 0.93494308, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.56640625, + "step": 4531, + "time_per_iteration": 2.35880708694458 + }, + { + "auxiliary_loss_clip": 0.01081303, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.02195787, + "balance_loss_mlp": 1.02361417, + "epoch": 0.272478581091237, + "flos": 24753140259840.0, + "grad_norm": 2.0681426646192302, + "language_loss": 0.76825052, + "learning_rate": 3.3112782580998767e-06, + "loss": 0.78944653, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.578125, + "step": 4532, + "time_per_iteration": 2.401804208755493 + }, + { + "auxiliary_loss_clip": 0.01077176, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.01507747, + "balance_loss_mlp": 1.02413034, + "epoch": 0.272538704343905, + "flos": 17889774612480.0, + "grad_norm": 2.8403586954987157, + "language_loss": 0.8898015, + "learning_rate": 3.3109929507404895e-06, + "loss": 0.91087973, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 4533, + "time_per_iteration": 2.3338263034820557 + }, + { + "auxiliary_loss_clip": 0.01078865, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.0114255, + "balance_loss_mlp": 1.02390444, + "epoch": 0.272598827596573, + "flos": 22930848512640.0, + "grad_norm": 1.933835864049127, + "language_loss": 0.71540821, + "learning_rate": 3.3107075965951355e-06, + "loss": 0.73647046, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.55078125, + "step": 4534, + "time_per_iteration": 2.399620771408081 + }, + { + "auxiliary_loss_clip": 0.01080491, + "auxiliary_loss_mlp": 0.01036193, + "balance_loss_clip": 1.01826429, + "balance_loss_mlp": 1.02281284, + "epoch": 0.27265895084924097, + "flos": 24237403522560.0, + "grad_norm": 2.4880431962628156, + "language_loss": 0.91023898, + "learning_rate": 3.3104221956739996e-06, + "loss": 0.93140578, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.578125, + "step": 4535, + "time_per_iteration": 2.3948521614074707 + }, + { + "auxiliary_loss_clip": 0.01083294, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.01386571, + "balance_loss_mlp": 1.02550101, + "epoch": 0.27271907410190893, + "flos": 27012020958720.0, + "grad_norm": 1.7963651630385258, + "language_loss": 0.73164904, + "learning_rate": 3.3101367479872667e-06, + "loss": 0.75279337, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.578125, + "step": 4536, + "time_per_iteration": 2.4268381595611572 + }, + { + "auxiliary_loss_clip": 0.01079914, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.01486695, + "balance_loss_mlp": 1.02373004, + "epoch": 0.2727791973545769, + "flos": 34451349171840.0, + "grad_norm": 1.8991001165122825, + "language_loss": 0.71817946, + "learning_rate": 3.309851253545123e-06, + "loss": 0.73928285, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5625, + "step": 4537, + "time_per_iteration": 2.4885199069976807 + }, + { + "auxiliary_loss_clip": 0.01080491, + "auxiliary_loss_mlp": 0.01026808, + "balance_loss_clip": 1.01038146, + "balance_loss_mlp": 1.02329206, + "epoch": 0.27283932060724486, + "flos": 15041036626560.0, + "grad_norm": 4.403359888192737, + "language_loss": 0.78374529, + "learning_rate": 3.3095657123577572e-06, + "loss": 0.80481833, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4538, + "time_per_iteration": 2.3258514404296875 + }, + { + "auxiliary_loss_clip": 0.01081866, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.01869297, + "balance_loss_mlp": 1.02523971, + "epoch": 0.2728994438599128, + "flos": 21651351672960.0, + "grad_norm": 1.466428437806416, + "language_loss": 0.90227783, + "learning_rate": 3.30928012443536e-06, + "loss": 0.9234457, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.56640625, + "step": 4539, + "time_per_iteration": 2.377931833267212 + }, + { + "auxiliary_loss_clip": 0.0107986, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.01235127, + "balance_loss_mlp": 1.02367878, + "epoch": 0.2729595671125808, + "flos": 17487610128000.0, + "grad_norm": 1.7861445235460929, + "language_loss": 0.88047725, + "learning_rate": 3.308994489788123e-06, + "loss": 0.90157354, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 4540, + "time_per_iteration": 2.359163999557495 + }, + { + "auxiliary_loss_clip": 0.01080049, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.01464987, + "balance_loss_mlp": 1.02274692, + "epoch": 0.27301969036524876, + "flos": 19317128526720.0, + "grad_norm": 1.914978820946021, + "language_loss": 0.81810862, + "learning_rate": 3.308708808426239e-06, + "loss": 0.83920884, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5703125, + "step": 4541, + "time_per_iteration": 2.3712704181671143 + }, + { + "auxiliary_loss_clip": 0.01080326, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.01346445, + "balance_loss_mlp": 1.02396894, + "epoch": 0.2730798136179167, + "flos": 21064706231040.0, + "grad_norm": 3.455597355351001, + "language_loss": 0.79502952, + "learning_rate": 3.308423080359905e-06, + "loss": 0.81613779, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5625, + "step": 4542, + "time_per_iteration": 2.3875794410705566 + }, + { + "auxiliary_loss_clip": 0.01082546, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.0145514, + "balance_loss_mlp": 1.02544093, + "epoch": 0.2731399368705847, + "flos": 19170737729280.0, + "grad_norm": 2.2151386232309314, + "language_loss": 0.89530146, + "learning_rate": 3.3081373055993167e-06, + "loss": 0.91643178, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5703125, + "step": 4543, + "time_per_iteration": 2.3860435485839844 + }, + { + "auxiliary_loss_clip": 0.0108106, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.01604807, + "balance_loss_mlp": 1.02252424, + "epoch": 0.27320006012325265, + "flos": 18289320744960.0, + "grad_norm": 1.7617949379491604, + "language_loss": 0.63285214, + "learning_rate": 3.3078514841546728e-06, + "loss": 0.65399659, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5859375, + "step": 4544, + "time_per_iteration": 2.3588955402374268 + }, + { + "auxiliary_loss_clip": 0.01082279, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.01612341, + "balance_loss_mlp": 1.02477002, + "epoch": 0.2732601833759206, + "flos": 34859483498880.0, + "grad_norm": 1.7956993323709336, + "language_loss": 0.69581962, + "learning_rate": 3.307565616036174e-06, + "loss": 0.71698326, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.57421875, + "step": 4545, + "time_per_iteration": 2.503817558288574 + }, + { + "auxiliary_loss_clip": 0.01019235, + "auxiliary_loss_mlp": 0.01003612, + "balance_loss_clip": 1.00216949, + "balance_loss_mlp": 1.00478387, + "epoch": 0.2733203066285886, + "flos": 53907709800960.0, + "grad_norm": 0.7189251829199559, + "language_loss": 0.61637843, + "learning_rate": 3.3072797012540214e-06, + "loss": 0.63660687, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.14453125, + "step": 4546, + "time_per_iteration": 3.0726189613342285 + }, + { + "auxiliary_loss_clip": 0.010849, + "auxiliary_loss_mlp": 0.01036062, + "balance_loss_clip": 1.01928902, + "balance_loss_mlp": 1.02839208, + "epoch": 0.2733804298812566, + "flos": 20659539369600.0, + "grad_norm": 1.8703870942387204, + "language_loss": 0.6477145, + "learning_rate": 3.306993739818419e-06, + "loss": 0.66892421, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 4547, + "time_per_iteration": 2.3983898162841797 + }, + { + "auxiliary_loss_clip": 0.01076253, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.01643991, + "balance_loss_mlp": 1.0233655, + "epoch": 0.27344055313392457, + "flos": 25883174102400.0, + "grad_norm": 1.9474178713735677, + "language_loss": 0.77696288, + "learning_rate": 3.3067077317395722e-06, + "loss": 0.79804575, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.52734375, + "step": 4548, + "time_per_iteration": 2.4211511611938477 + }, + { + "auxiliary_loss_clip": 0.01080083, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.01860762, + "balance_loss_mlp": 1.02451921, + "epoch": 0.27350067638659253, + "flos": 22928649096960.0, + "grad_norm": 1.8938285538899577, + "language_loss": 0.83013201, + "learning_rate": 3.3064216770276874e-06, + "loss": 0.85126305, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5546875, + "step": 4549, + "time_per_iteration": 2.4124035835266113 + }, + { + "auxiliary_loss_clip": 0.01081643, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.01699853, + "balance_loss_mlp": 1.02370894, + "epoch": 0.2735607996392605, + "flos": 16574072826240.0, + "grad_norm": 2.3222990019488106, + "language_loss": 0.71106243, + "learning_rate": 3.3061355756929733e-06, + "loss": 0.73222363, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.578125, + "step": 4550, + "time_per_iteration": 2.3535232543945312 + }, + { + "auxiliary_loss_clip": 0.01079898, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.01433134, + "balance_loss_mlp": 1.02467299, + "epoch": 0.27362092289192846, + "flos": 19644299677440.0, + "grad_norm": 5.854621502080208, + "language_loss": 0.78400576, + "learning_rate": 3.305849427745641e-06, + "loss": 0.805094, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.5546875, + "step": 4551, + "time_per_iteration": 2.374575614929199 + }, + { + "auxiliary_loss_clip": 0.01081366, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.02400029, + "epoch": 0.27368104614459643, + "flos": 17638190288640.0, + "grad_norm": 2.1163440576704122, + "language_loss": 0.70711285, + "learning_rate": 3.305563233195901e-06, + "loss": 0.72830129, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.57421875, + "step": 4552, + "time_per_iteration": 2.3439207077026367 + }, + { + "auxiliary_loss_clip": 0.01082133, + "auxiliary_loss_mlp": 0.01033672, + "balance_loss_clip": 1.01633906, + "balance_loss_mlp": 1.02481949, + "epoch": 0.2737411693972644, + "flos": 21578941779840.0, + "grad_norm": 1.8348327293437081, + "language_loss": 0.71781379, + "learning_rate": 3.305276992053968e-06, + "loss": 0.73897183, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5703125, + "step": 4553, + "time_per_iteration": 2.3851184844970703 + }, + { + "auxiliary_loss_clip": 0.01080432, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.01634884, + "balance_loss_mlp": 1.02447724, + "epoch": 0.27380129264993236, + "flos": 25482859920000.0, + "grad_norm": 1.6884843676486623, + "language_loss": 0.59042692, + "learning_rate": 3.304990704330057e-06, + "loss": 0.61156213, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.55859375, + "step": 4554, + "time_per_iteration": 2.436256170272827 + }, + { + "auxiliary_loss_clip": 0.01084195, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.01349711, + "balance_loss_mlp": 1.02560186, + "epoch": 0.2738614159026003, + "flos": 18660202784640.0, + "grad_norm": 1.6264472836470485, + "language_loss": 0.73733985, + "learning_rate": 3.304704370034384e-06, + "loss": 0.75848675, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5859375, + "step": 4555, + "time_per_iteration": 2.3795883655548096 + }, + { + "auxiliary_loss_clip": 0.01082826, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01437676, + "balance_loss_mlp": 1.02564645, + "epoch": 0.2739215391552683, + "flos": 23476017392640.0, + "grad_norm": 7.1475439908342855, + "language_loss": 0.77335113, + "learning_rate": 3.3044179891771684e-06, + "loss": 0.79449153, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5703125, + "step": 4556, + "time_per_iteration": 2.4172799587249756 + }, + { + "auxiliary_loss_clip": 0.0108907, + "auxiliary_loss_mlp": 0.01039534, + "balance_loss_clip": 1.0221293, + "balance_loss_mlp": 1.02616084, + "epoch": 0.27398166240793626, + "flos": 17127690255360.0, + "grad_norm": 2.317930043852284, + "language_loss": 0.82052898, + "learning_rate": 3.3041315617686298e-06, + "loss": 0.84181505, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.62890625, + "step": 4557, + "time_per_iteration": 3.7227389812469482 + }, + { + "auxiliary_loss_clip": 0.01080511, + "auxiliary_loss_mlp": 0.01035487, + "balance_loss_clip": 1.0191437, + "balance_loss_mlp": 1.02469683, + "epoch": 0.2740417856606042, + "flos": 23403607499520.0, + "grad_norm": 1.7172891543294506, + "language_loss": 0.72389257, + "learning_rate": 3.303845087818991e-06, + "loss": 0.74505258, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.55859375, + "step": 4558, + "time_per_iteration": 2.3939926624298096 + }, + { + "auxiliary_loss_clip": 0.01080288, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.01655221, + "balance_loss_mlp": 1.02412319, + "epoch": 0.2741019089132722, + "flos": 12779781955200.0, + "grad_norm": 2.5956017130927895, + "language_loss": 0.68328184, + "learning_rate": 3.3035585673384745e-06, + "loss": 0.70440602, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5625, + "step": 4559, + "time_per_iteration": 2.37117862701416 + }, + { + "auxiliary_loss_clip": 0.01079388, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.01826513, + "balance_loss_mlp": 1.02354228, + "epoch": 0.27416203216594015, + "flos": 20630491251840.0, + "grad_norm": 1.769169887381498, + "language_loss": 0.72384578, + "learning_rate": 3.3032720003373057e-06, + "loss": 0.74498415, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55859375, + "step": 4560, + "time_per_iteration": 3.7601819038391113 + }, + { + "auxiliary_loss_clip": 0.01081038, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.01945376, + "balance_loss_mlp": 1.02398014, + "epoch": 0.27422215541860817, + "flos": 26540379135360.0, + "grad_norm": 2.2048692146699955, + "language_loss": 0.81552273, + "learning_rate": 3.302985386825712e-06, + "loss": 0.83668756, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5703125, + "step": 4561, + "time_per_iteration": 2.4507904052734375 + }, + { + "auxiliary_loss_clip": 0.01082992, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.0165652, + "balance_loss_mlp": 1.0246489, + "epoch": 0.27428227867127614, + "flos": 23330045531520.0, + "grad_norm": 2.4658243450454993, + "language_loss": 0.754637, + "learning_rate": 3.302698726813921e-06, + "loss": 0.77579594, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5859375, + "step": 4562, + "time_per_iteration": 2.3994991779327393 + }, + { + "auxiliary_loss_clip": 0.01081196, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.01691628, + "balance_loss_mlp": 1.02525234, + "epoch": 0.2743424019239441, + "flos": 23034121914240.0, + "grad_norm": 1.7169728534065862, + "language_loss": 0.74742758, + "learning_rate": 3.3024120203121637e-06, + "loss": 0.76856327, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5625, + "step": 4563, + "time_per_iteration": 2.418084144592285 + }, + { + "auxiliary_loss_clip": 0.0108587, + "auxiliary_loss_mlp": 0.01046262, + "balance_loss_clip": 1.02741539, + "balance_loss_mlp": 1.02481794, + "epoch": 0.27440252517661207, + "flos": 21980024012160.0, + "grad_norm": 1.5924422647558862, + "language_loss": 0.6261344, + "learning_rate": 3.302125267330672e-06, + "loss": 0.64745569, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.609375, + "step": 4564, + "time_per_iteration": 3.791637420654297 + }, + { + "auxiliary_loss_clip": 0.01082194, + "auxiliary_loss_mlp": 0.01033087, + "balance_loss_clip": 1.0160042, + "balance_loss_mlp": 1.02397561, + "epoch": 0.27446264842928003, + "flos": 40185867962880.0, + "grad_norm": 1.8725739588961885, + "language_loss": 0.78442985, + "learning_rate": 3.3018384678796786e-06, + "loss": 0.80558264, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.58203125, + "step": 4565, + "time_per_iteration": 2.54919695854187 + }, + { + "auxiliary_loss_clip": 0.01081763, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.01489639, + "balance_loss_mlp": 1.02574587, + "epoch": 0.274522771681948, + "flos": 13478847575040.0, + "grad_norm": 1.8001092575692066, + "language_loss": 0.68084419, + "learning_rate": 3.3015516219694186e-06, + "loss": 0.70197183, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.55859375, + "step": 4566, + "time_per_iteration": 3.755009412765503 + }, + { + "auxiliary_loss_clip": 0.01080487, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.02044201, + "balance_loss_mlp": 1.02552259, + "epoch": 0.27458289493461596, + "flos": 28620853453440.0, + "grad_norm": 1.875365240259041, + "language_loss": 0.61288488, + "learning_rate": 3.3012647296101296e-06, + "loss": 0.63404095, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.55078125, + "step": 4567, + "time_per_iteration": 2.4268686771392822 + }, + { + "auxiliary_loss_clip": 0.01082299, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.02608514, + "epoch": 0.2746430181872839, + "flos": 20118804232320.0, + "grad_norm": 1.6513416938406327, + "language_loss": 0.82164323, + "learning_rate": 3.30097779081205e-06, + "loss": 0.84283352, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5625, + "step": 4568, + "time_per_iteration": 2.390467882156372 + }, + { + "auxiliary_loss_clip": 0.01082325, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.01339316, + "balance_loss_mlp": 1.02532458, + "epoch": 0.2747031414399519, + "flos": 20192436023040.0, + "grad_norm": 1.8725538972554452, + "language_loss": 0.68479341, + "learning_rate": 3.300690805585419e-06, + "loss": 0.70591617, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5703125, + "step": 4569, + "time_per_iteration": 2.3657889366149902 + }, + { + "auxiliary_loss_clip": 0.01083034, + "auxiliary_loss_mlp": 0.0103448, + "balance_loss_clip": 1.01734972, + "balance_loss_mlp": 1.02505016, + "epoch": 0.27476326469261986, + "flos": 13515506369280.0, + "grad_norm": 2.6925017276632617, + "language_loss": 0.70550632, + "learning_rate": 3.300403773940479e-06, + "loss": 0.72668153, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58203125, + "step": 4570, + "time_per_iteration": 2.341416597366333 + }, + { + "auxiliary_loss_clip": 0.01017436, + "auxiliary_loss_mlp": 0.01010343, + "balance_loss_clip": 1.008793, + "balance_loss_mlp": 1.00421321, + "epoch": 0.2748233879452878, + "flos": 65934067282560.0, + "grad_norm": 0.7544978614366927, + "language_loss": 0.55759937, + "learning_rate": 3.3001166958874738e-06, + "loss": 0.5778771, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.1328125, + "step": 4571, + "time_per_iteration": 3.134333372116089 + }, + { + "auxiliary_loss_clip": 0.01085919, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.01726198, + "balance_loss_mlp": 1.02698457, + "epoch": 0.2748835111979558, + "flos": 17383254474240.0, + "grad_norm": 2.4238304663184067, + "language_loss": 0.7786752, + "learning_rate": 3.299829571436648e-06, + "loss": 0.7998786, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58984375, + "step": 4572, + "time_per_iteration": 2.3765413761138916 + }, + { + "auxiliary_loss_clip": 0.01076712, + "auxiliary_loss_mlp": 0.01028726, + "balance_loss_clip": 1.01426625, + "balance_loss_mlp": 1.02383137, + "epoch": 0.27494363445062375, + "flos": 23586412711680.0, + "grad_norm": 1.6327844311762805, + "language_loss": 0.81324852, + "learning_rate": 3.2995424005982475e-06, + "loss": 0.8343029, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.52734375, + "step": 4573, + "time_per_iteration": 2.4041430950164795 + }, + { + "auxiliary_loss_clip": 0.01079209, + "auxiliary_loss_mlp": 0.01028755, + "balance_loss_clip": 1.01344848, + "balance_loss_mlp": 1.02476406, + "epoch": 0.2750037577032918, + "flos": 17163650822400.0, + "grad_norm": 2.0728941071216362, + "language_loss": 0.7886256, + "learning_rate": 3.299255183382522e-06, + "loss": 0.80970526, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.54296875, + "step": 4574, + "time_per_iteration": 2.368095636367798 + }, + { + "auxiliary_loss_clip": 0.01082773, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.01949668, + "balance_loss_mlp": 1.02525806, + "epoch": 0.27506388095595974, + "flos": 24490942882560.0, + "grad_norm": 2.0706887074553095, + "language_loss": 0.74229103, + "learning_rate": 3.298967919799722e-06, + "loss": 0.76347232, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.57421875, + "step": 4575, + "time_per_iteration": 2.3982925415039062 + }, + { + "auxiliary_loss_clip": 0.01081555, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01481473, + "balance_loss_mlp": 1.02672315, + "epoch": 0.2751240042086277, + "flos": 38763157259520.0, + "grad_norm": 1.6528136680417422, + "language_loss": 0.717233, + "learning_rate": 3.2986806098600973e-06, + "loss": 0.73835313, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.546875, + "step": 4576, + "time_per_iteration": 2.5271897315979004 + }, + { + "auxiliary_loss_clip": 0.01082386, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.01652813, + "balance_loss_mlp": 1.02639341, + "epoch": 0.27518412746129567, + "flos": 26905815002880.0, + "grad_norm": 1.6121338137589687, + "language_loss": 0.73308748, + "learning_rate": 3.298393253573902e-06, + "loss": 0.75423419, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5625, + "step": 4577, + "time_per_iteration": 2.4411284923553467 + }, + { + "auxiliary_loss_clip": 0.01082051, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.01599896, + "balance_loss_mlp": 1.02485323, + "epoch": 0.27524425071396363, + "flos": 24899356500480.0, + "grad_norm": 2.3698060249909685, + "language_loss": 0.76228505, + "learning_rate": 3.298105850951392e-06, + "loss": 0.78344053, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.57421875, + "step": 4578, + "time_per_iteration": 2.4060704708099365 + }, + { + "auxiliary_loss_clip": 0.01083281, + "auxiliary_loss_mlp": 0.01034075, + "balance_loss_clip": 1.01386905, + "balance_loss_mlp": 1.02491999, + "epoch": 0.2753043739666316, + "flos": 26286804864000.0, + "grad_norm": 1.3655607706506157, + "language_loss": 0.65976298, + "learning_rate": 3.2978184020028232e-06, + "loss": 0.68093652, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.5859375, + "step": 4579, + "time_per_iteration": 2.453016519546509 + }, + { + "auxiliary_loss_clip": 0.0108413, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.01864028, + "balance_loss_mlp": 1.02587795, + "epoch": 0.27536449721929956, + "flos": 24205632318720.0, + "grad_norm": 1.7724725800381609, + "language_loss": 0.79881054, + "learning_rate": 3.297530906738454e-06, + "loss": 0.8200205, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.5859375, + "step": 4580, + "time_per_iteration": 2.41353440284729 + }, + { + "auxiliary_loss_clip": 0.01081912, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.018543, + "balance_loss_mlp": 1.02512968, + "epoch": 0.27542462047196753, + "flos": 19536243419520.0, + "grad_norm": 1.5336696001801486, + "language_loss": 0.67433882, + "learning_rate": 3.297243365168544e-06, + "loss": 0.69551861, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.56640625, + "step": 4581, + "time_per_iteration": 2.403851270675659 + }, + { + "auxiliary_loss_clip": 0.01082081, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.01818204, + "balance_loss_mlp": 1.02589679, + "epoch": 0.2754847437246355, + "flos": 14318299238400.0, + "grad_norm": 1.6953735686706208, + "language_loss": 0.78049088, + "learning_rate": 3.2969557773033555e-06, + "loss": 0.80164862, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5625, + "step": 4582, + "time_per_iteration": 2.355501890182495 + }, + { + "auxiliary_loss_clip": 0.01081738, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01573801, + "balance_loss_mlp": 1.02502, + "epoch": 0.27554486697730346, + "flos": 18837910938240.0, + "grad_norm": 1.6126989313064397, + "language_loss": 0.84509051, + "learning_rate": 3.296668143153152e-06, + "loss": 0.86621881, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.56640625, + "step": 4583, + "time_per_iteration": 2.3900318145751953 + }, + { + "auxiliary_loss_clip": 0.01083269, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.01138651, + "balance_loss_mlp": 1.02407956, + "epoch": 0.2756049902299714, + "flos": 22381210978560.0, + "grad_norm": 2.3227855332646983, + "language_loss": 0.66627789, + "learning_rate": 3.296380462728197e-06, + "loss": 0.68738925, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.59375, + "step": 4584, + "time_per_iteration": 2.3854594230651855 + }, + { + "auxiliary_loss_clip": 0.01079057, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.01950383, + "balance_loss_mlp": 1.02439451, + "epoch": 0.2756651134826394, + "flos": 19572867302400.0, + "grad_norm": 2.8000583162095687, + "language_loss": 0.77081859, + "learning_rate": 3.2960927360387585e-06, + "loss": 0.79196298, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 4585, + "time_per_iteration": 2.4002299308776855 + }, + { + "auxiliary_loss_clip": 0.01085499, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.01256418, + "balance_loss_mlp": 1.02719843, + "epoch": 0.27572523673530736, + "flos": 23585435193600.0, + "grad_norm": 1.5410654888681528, + "language_loss": 0.71680582, + "learning_rate": 3.2958049630951038e-06, + "loss": 0.73796368, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5859375, + "step": 4586, + "time_per_iteration": 2.405465602874756 + }, + { + "auxiliary_loss_clip": 0.0108218, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.01612413, + "balance_loss_mlp": 1.02582371, + "epoch": 0.2757853599879754, + "flos": 22819021827840.0, + "grad_norm": 1.4985826599983278, + "language_loss": 0.80514759, + "learning_rate": 3.295517143907504e-06, + "loss": 0.82629573, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5625, + "step": 4587, + "time_per_iteration": 2.390324592590332 + }, + { + "auxiliary_loss_clip": 0.01079336, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.01436579, + "balance_loss_mlp": 1.02460158, + "epoch": 0.27584548324064334, + "flos": 18550715160960.0, + "grad_norm": 1.9644734697829243, + "language_loss": 0.83078039, + "learning_rate": 3.2952292784862286e-06, + "loss": 0.85188687, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.546875, + "step": 4588, + "time_per_iteration": 2.3554866313934326 + }, + { + "auxiliary_loss_clip": 0.01080198, + "auxiliary_loss_mlp": 0.01031529, + "balance_loss_clip": 1.01501894, + "balance_loss_mlp": 1.02497363, + "epoch": 0.2759056064933113, + "flos": 23768729164800.0, + "grad_norm": 1.5054866330696839, + "language_loss": 0.75276405, + "learning_rate": 3.2949413668415526e-06, + "loss": 0.77388132, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5546875, + "step": 4589, + "time_per_iteration": 2.415968656539917 + }, + { + "auxiliary_loss_clip": 0.01080159, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.01684308, + "balance_loss_mlp": 1.02463949, + "epoch": 0.27596572974597927, + "flos": 24280695475200.0, + "grad_norm": 1.6133244888946692, + "language_loss": 0.87966681, + "learning_rate": 3.29465340898375e-06, + "loss": 0.90080535, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5546875, + "step": 4590, + "time_per_iteration": 2.4039664268493652 + }, + { + "auxiliary_loss_clip": 0.01081942, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.01848912, + "balance_loss_mlp": 1.0249052, + "epoch": 0.27602585299864724, + "flos": 35039600536320.0, + "grad_norm": 1.5111284181607159, + "language_loss": 0.71183312, + "learning_rate": 3.2943654049230982e-06, + "loss": 0.73301208, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5703125, + "step": 4591, + "time_per_iteration": 2.498314619064331 + }, + { + "auxiliary_loss_clip": 0.01084116, + "auxiliary_loss_mlp": 0.01037484, + "balance_loss_clip": 1.0199604, + "balance_loss_mlp": 1.02584445, + "epoch": 0.2760859762513152, + "flos": 24308451872640.0, + "grad_norm": 2.921450069477129, + "language_loss": 0.79913783, + "learning_rate": 3.2940773546698745e-06, + "loss": 0.82035381, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.58203125, + "step": 4592, + "time_per_iteration": 2.396172046661377 + }, + { + "auxiliary_loss_clip": 0.01017458, + "auxiliary_loss_mlp": 0.01001094, + "balance_loss_clip": 0.99965775, + "balance_loss_mlp": 1.00372052, + "epoch": 0.27614609950398317, + "flos": 71257623926400.0, + "grad_norm": 0.7090126224094614, + "language_loss": 0.6163981, + "learning_rate": 3.2937892582343574e-06, + "loss": 0.63658363, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.13769531, + "step": 4593, + "time_per_iteration": 2.9940025806427 + }, + { + "auxiliary_loss_clip": 0.01082116, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.01617348, + "balance_loss_mlp": 1.02496839, + "epoch": 0.27620622275665113, + "flos": 29673694546560.0, + "grad_norm": 1.8497206346215327, + "language_loss": 0.77398038, + "learning_rate": 3.2935011156268313e-06, + "loss": 0.7951417, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5703125, + "step": 4594, + "time_per_iteration": 2.437809944152832 + }, + { + "auxiliary_loss_clip": 0.01082449, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01533985, + "balance_loss_mlp": 1.02691197, + "epoch": 0.2762663460093191, + "flos": 15377145085440.0, + "grad_norm": 1.4000886716608236, + "language_loss": 0.91553831, + "learning_rate": 3.293212926857577e-06, + "loss": 0.93667102, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5546875, + "step": 4595, + "time_per_iteration": 2.369683027267456 + }, + { + "auxiliary_loss_clip": 0.01084055, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.02439547, + "balance_loss_mlp": 1.02569568, + "epoch": 0.27632646926198706, + "flos": 20703040790400.0, + "grad_norm": 2.0167650406613666, + "language_loss": 0.69258326, + "learning_rate": 3.2929246919368796e-06, + "loss": 0.71384656, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5859375, + "step": 4596, + "time_per_iteration": 3.773935317993164 + }, + { + "auxiliary_loss_clip": 0.01088814, + "auxiliary_loss_mlp": 0.01036872, + "balance_loss_clip": 1.01926494, + "balance_loss_mlp": 1.02745521, + "epoch": 0.276386592514655, + "flos": 32812107016320.0, + "grad_norm": 7.64244018348157, + "language_loss": 0.63830537, + "learning_rate": 3.2926364108750263e-06, + "loss": 0.65956223, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.61328125, + "step": 4597, + "time_per_iteration": 2.4807288646698 + }, + { + "auxiliary_loss_clip": 0.01084371, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.02322757, + "balance_loss_mlp": 1.02855742, + "epoch": 0.276446715767323, + "flos": 18550715160960.0, + "grad_norm": 1.9211654847174242, + "language_loss": 0.86382246, + "learning_rate": 3.292348083682304e-06, + "loss": 0.88505822, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55859375, + "step": 4598, + "time_per_iteration": 2.3577988147735596 + }, + { + "auxiliary_loss_clip": 0.01084663, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.01526618, + "balance_loss_mlp": 1.02585006, + "epoch": 0.27650683901999096, + "flos": 22818533068800.0, + "grad_norm": 3.161751698695146, + "language_loss": 0.80081058, + "learning_rate": 3.2920597103690035e-06, + "loss": 0.82199037, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.58984375, + "step": 4599, + "time_per_iteration": 2.3980748653411865 + }, + { + "auxiliary_loss_clip": 0.01085815, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.01992202, + "balance_loss_mlp": 1.02654839, + "epoch": 0.276566962272659, + "flos": 21360455291520.0, + "grad_norm": 1.668511014208387, + "language_loss": 0.78368044, + "learning_rate": 3.2917712909454148e-06, + "loss": 0.80491167, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59375, + "step": 4600, + "time_per_iteration": 3.7771215438842773 + }, + { + "auxiliary_loss_clip": 0.01087953, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.02378857, + "balance_loss_mlp": 1.02644753, + "epoch": 0.27662708552532694, + "flos": 17709692486400.0, + "grad_norm": 1.6785573351567948, + "language_loss": 0.73203355, + "learning_rate": 3.291482825421832e-06, + "loss": 0.75333202, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.61328125, + "step": 4601, + "time_per_iteration": 2.3417866230010986 + }, + { + "auxiliary_loss_clip": 0.01081014, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.01808619, + "balance_loss_mlp": 1.02512193, + "epoch": 0.2766872087779949, + "flos": 21251630983680.0, + "grad_norm": 1.4752810883295984, + "language_loss": 0.79993773, + "learning_rate": 3.2911943138085496e-06, + "loss": 0.82110578, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.55859375, + "step": 4602, + "time_per_iteration": 2.441012144088745 + }, + { + "auxiliary_loss_clip": 0.01086108, + "auxiliary_loss_mlp": 0.01044316, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.02552462, + "epoch": 0.2767473320306629, + "flos": 12931095254400.0, + "grad_norm": 2.008895937106637, + "language_loss": 0.77057678, + "learning_rate": 3.290905756115863e-06, + "loss": 0.79188102, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.60546875, + "step": 4603, + "time_per_iteration": 3.690415143966675 + }, + { + "auxiliary_loss_clip": 0.01081639, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.01981592, + "balance_loss_mlp": 1.02666855, + "epoch": 0.27680745528333084, + "flos": 15011953597440.0, + "grad_norm": 1.5138019901121496, + "language_loss": 0.81632638, + "learning_rate": 3.2906171523540706e-06, + "loss": 0.83749896, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55078125, + "step": 4604, + "time_per_iteration": 2.365110397338867 + }, + { + "auxiliary_loss_clip": 0.01083097, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.01291013, + "balance_loss_mlp": 1.0243206, + "epoch": 0.2768675785359988, + "flos": 22636740286080.0, + "grad_norm": 1.790714838560572, + "language_loss": 0.69702685, + "learning_rate": 3.2903285025334723e-06, + "loss": 0.71816272, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5859375, + "step": 4605, + "time_per_iteration": 2.386167287826538 + }, + { + "auxiliary_loss_clip": 0.01016614, + "auxiliary_loss_mlp": 0.01006461, + "balance_loss_clip": 1.00462472, + "balance_loss_mlp": 1.00287592, + "epoch": 0.27692770178866677, + "flos": 66127171345920.0, + "grad_norm": 0.7063068457666765, + "language_loss": 0.57166123, + "learning_rate": 3.290039806664368e-06, + "loss": 0.591892, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.13769531, + "step": 4606, + "time_per_iteration": 4.494596719741821 + }, + { + "auxiliary_loss_clip": 0.01085999, + "auxiliary_loss_mlp": 0.01035442, + "balance_loss_clip": 1.01853848, + "balance_loss_mlp": 1.02714872, + "epoch": 0.27698782504133473, + "flos": 26463884613120.0, + "grad_norm": 1.8228375418366305, + "language_loss": 0.691333, + "learning_rate": 3.2897510647570626e-06, + "loss": 0.71254742, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5859375, + "step": 4607, + "time_per_iteration": 2.4294941425323486 + }, + { + "auxiliary_loss_clip": 0.01080386, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.01674294, + "balance_loss_mlp": 1.02615881, + "epoch": 0.2770479482940027, + "flos": 25883627950080.0, + "grad_norm": 1.8111466225444839, + "language_loss": 0.69376171, + "learning_rate": 3.2894622768218587e-06, + "loss": 0.71489131, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.54296875, + "step": 4608, + "time_per_iteration": 2.4185144901275635 + }, + { + "auxiliary_loss_clip": 0.01084521, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.01412845, + "balance_loss_mlp": 1.02504349, + "epoch": 0.27710807154667066, + "flos": 22856134469760.0, + "grad_norm": 1.7370462806151117, + "language_loss": 0.72688055, + "learning_rate": 3.289173442869063e-06, + "loss": 0.74803215, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.59375, + "step": 4609, + "time_per_iteration": 2.387050151824951 + }, + { + "auxiliary_loss_clip": 0.01082892, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.02035236, + "balance_loss_mlp": 1.02384937, + "epoch": 0.27716819479933863, + "flos": 17710146334080.0, + "grad_norm": 2.3648961972190876, + "language_loss": 0.83644879, + "learning_rate": 3.2888845629089833e-06, + "loss": 0.85765946, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58984375, + "step": 4610, + "time_per_iteration": 2.3297476768493652 + }, + { + "auxiliary_loss_clip": 0.01088104, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.02184236, + "balance_loss_mlp": 1.02676606, + "epoch": 0.2772283180520066, + "flos": 19645032816000.0, + "grad_norm": 2.0264394387545672, + "language_loss": 0.68972641, + "learning_rate": 3.2885956369519287e-06, + "loss": 0.71101815, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.61328125, + "step": 4611, + "time_per_iteration": 2.359511137008667 + }, + { + "auxiliary_loss_clip": 0.01080434, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.01537228, + "balance_loss_mlp": 1.02680779, + "epoch": 0.27728844130467456, + "flos": 21031573484160.0, + "grad_norm": 2.072752939070887, + "language_loss": 0.73418701, + "learning_rate": 3.2883066650082106e-06, + "loss": 0.75531048, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5390625, + "step": 4612, + "time_per_iteration": 2.380183696746826 + }, + { + "auxiliary_loss_clip": 0.01084216, + "auxiliary_loss_mlp": 0.01039433, + "balance_loss_clip": 1.02108717, + "balance_loss_mlp": 1.02541828, + "epoch": 0.2773485645573425, + "flos": 18988211808000.0, + "grad_norm": 2.4242154777305873, + "language_loss": 0.66694748, + "learning_rate": 3.288017647088142e-06, + "loss": 0.6881839, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.5859375, + "step": 4613, + "time_per_iteration": 2.379488468170166 + }, + { + "auxiliary_loss_clip": 0.01082104, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.01737177, + "balance_loss_mlp": 1.02516592, + "epoch": 0.27740868781001055, + "flos": 21467429297280.0, + "grad_norm": 1.653807165990663, + "language_loss": 0.7916218, + "learning_rate": 3.2877285832020363e-06, + "loss": 0.81278467, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 4614, + "time_per_iteration": 2.3730251789093018 + }, + { + "auxiliary_loss_clip": 0.0108533, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.01401019, + "balance_loss_mlp": 1.02670074, + "epoch": 0.2774688110626785, + "flos": 19826825598720.0, + "grad_norm": 6.762787683563088, + "language_loss": 0.80093497, + "learning_rate": 3.28743947336021e-06, + "loss": 0.82210934, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.5859375, + "step": 4615, + "time_per_iteration": 2.3554940223693848 + }, + { + "auxiliary_loss_clip": 0.01085945, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.01949966, + "balance_loss_mlp": 1.02532148, + "epoch": 0.2775289343153465, + "flos": 18215444574720.0, + "grad_norm": 2.480429147021416, + "language_loss": 0.64004666, + "learning_rate": 3.2871503175729807e-06, + "loss": 0.66127825, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.60546875, + "step": 4616, + "time_per_iteration": 2.3342959880828857 + }, + { + "auxiliary_loss_clip": 0.01081778, + "auxiliary_loss_mlp": 0.01033609, + "balance_loss_clip": 1.01616848, + "balance_loss_mlp": 1.02426696, + "epoch": 0.27758905756801444, + "flos": 16471532563200.0, + "grad_norm": 2.0166782763790176, + "language_loss": 0.82064974, + "learning_rate": 3.286861115850667e-06, + "loss": 0.84180367, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.57421875, + "step": 4617, + "time_per_iteration": 2.3683419227600098 + }, + { + "auxiliary_loss_clip": 0.01082022, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.01902628, + "balance_loss_mlp": 1.02380896, + "epoch": 0.2776491808206824, + "flos": 18727410885120.0, + "grad_norm": 2.993876892819792, + "language_loss": 0.76747072, + "learning_rate": 3.286571868203591e-06, + "loss": 0.7886523, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.58203125, + "step": 4618, + "time_per_iteration": 2.369157552719116 + }, + { + "auxiliary_loss_clip": 0.01085982, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01619971, + "balance_loss_mlp": 1.02738285, + "epoch": 0.27770930407335037, + "flos": 28036931097600.0, + "grad_norm": 2.1734029909292523, + "language_loss": 0.84983563, + "learning_rate": 3.286282574642074e-06, + "loss": 0.87101841, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5859375, + "step": 4619, + "time_per_iteration": 2.4373459815979004 + }, + { + "auxiliary_loss_clip": 0.01080802, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.0183804, + "balance_loss_mlp": 1.02558863, + "epoch": 0.27776942732601834, + "flos": 23548706576640.0, + "grad_norm": 1.916943280515146, + "language_loss": 0.76711893, + "learning_rate": 3.2859932351764413e-06, + "loss": 0.78825879, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5546875, + "step": 4620, + "time_per_iteration": 2.4141008853912354 + }, + { + "auxiliary_loss_clip": 0.01080938, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.0185225, + "balance_loss_mlp": 1.02503717, + "epoch": 0.2778295505786863, + "flos": 23907753665280.0, + "grad_norm": 2.5857560160133137, + "language_loss": 0.78369844, + "learning_rate": 3.2857038498170175e-06, + "loss": 0.80486083, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.55859375, + "step": 4621, + "time_per_iteration": 2.418139696121216 + }, + { + "auxiliary_loss_clip": 0.01085422, + "auxiliary_loss_mlp": 0.0103963, + "balance_loss_clip": 1.02109277, + "balance_loss_mlp": 1.02669621, + "epoch": 0.27788967383135427, + "flos": 25553454422400.0, + "grad_norm": 26.338067880850264, + "language_loss": 0.87562191, + "learning_rate": 3.2854144185741303e-06, + "loss": 0.89687252, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.5859375, + "step": 4622, + "time_per_iteration": 2.416123151779175 + }, + { + "auxiliary_loss_clip": 0.01084512, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02359343, + "balance_loss_mlp": 1.02586424, + "epoch": 0.27794979708402223, + "flos": 16251719443200.0, + "grad_norm": 2.663149024933452, + "language_loss": 0.81161809, + "learning_rate": 3.285124941458109e-06, + "loss": 0.8328771, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.5859375, + "step": 4623, + "time_per_iteration": 2.3581767082214355 + }, + { + "auxiliary_loss_clip": 0.01085203, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.01748776, + "balance_loss_mlp": 1.02682495, + "epoch": 0.2780099203366902, + "flos": 20666591464320.0, + "grad_norm": 2.4735003687486343, + "language_loss": 0.81395102, + "learning_rate": 3.2848354184792845e-06, + "loss": 0.8351509, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.58203125, + "step": 4624, + "time_per_iteration": 2.3813822269439697 + }, + { + "auxiliary_loss_clip": 0.01082105, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.01976693, + "balance_loss_mlp": 1.02624857, + "epoch": 0.27807004358935816, + "flos": 17738880249600.0, + "grad_norm": 2.8422111400850256, + "language_loss": 0.7708801, + "learning_rate": 3.284545849647989e-06, + "loss": 0.79207218, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5546875, + "step": 4625, + "time_per_iteration": 2.3180911540985107 + }, + { + "auxiliary_loss_clip": 0.01082543, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.01828074, + "balance_loss_mlp": 1.02491689, + "epoch": 0.2781301668420261, + "flos": 16726189086720.0, + "grad_norm": 1.9770042224717426, + "language_loss": 0.69873786, + "learning_rate": 3.284256234974556e-06, + "loss": 0.71992254, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.578125, + "step": 4626, + "time_per_iteration": 2.353443145751953 + }, + { + "auxiliary_loss_clip": 0.0108804, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.01935506, + "balance_loss_mlp": 1.02624512, + "epoch": 0.27819029009469415, + "flos": 13843899417600.0, + "grad_norm": 2.3771990740611533, + "language_loss": 0.92214501, + "learning_rate": 3.2839665744693222e-06, + "loss": 0.94340682, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.6171875, + "step": 4627, + "time_per_iteration": 2.3378424644470215 + }, + { + "auxiliary_loss_clip": 0.01085744, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.01732719, + "balance_loss_mlp": 1.02700353, + "epoch": 0.2782504133473621, + "flos": 27088061633280.0, + "grad_norm": 1.7595041837186949, + "language_loss": 0.85137182, + "learning_rate": 3.2836768681426234e-06, + "loss": 0.87257147, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5859375, + "step": 4628, + "time_per_iteration": 2.420797348022461 + }, + { + "auxiliary_loss_clip": 0.01078953, + "auxiliary_loss_mlp": 0.01035346, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.02317333, + "epoch": 0.2783105366000301, + "flos": 21067778430720.0, + "grad_norm": 1.535526018524169, + "language_loss": 0.79222208, + "learning_rate": 3.2833871160047998e-06, + "loss": 0.81336504, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5546875, + "step": 4629, + "time_per_iteration": 2.3647055625915527 + }, + { + "auxiliary_loss_clip": 0.01079858, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.01469588, + "balance_loss_mlp": 1.02572298, + "epoch": 0.27837065985269804, + "flos": 26500717964160.0, + "grad_norm": 1.469135987762367, + "language_loss": 0.84519124, + "learning_rate": 3.2830973180661907e-06, + "loss": 0.86628604, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.54296875, + "step": 4630, + "time_per_iteration": 2.460904121398926 + }, + { + "auxiliary_loss_clip": 0.01082379, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.01239622, + "balance_loss_mlp": 1.02568364, + "epoch": 0.278430783105366, + "flos": 20222356924800.0, + "grad_norm": 1.8684967864062993, + "language_loss": 0.80862486, + "learning_rate": 3.2828074743371394e-06, + "loss": 0.82974571, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.56640625, + "step": 4631, + "time_per_iteration": 2.3690006732940674 + }, + { + "auxiliary_loss_clip": 0.01088817, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.01936841, + "balance_loss_mlp": 1.02829027, + "epoch": 0.278490906358034, + "flos": 25591719139200.0, + "grad_norm": 1.7264174442040423, + "language_loss": 0.70928347, + "learning_rate": 3.2825175848279884e-06, + "loss": 0.73053855, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.60546875, + "step": 4632, + "time_per_iteration": 2.4467008113861084 + }, + { + "auxiliary_loss_clip": 0.01081805, + "auxiliary_loss_mlp": 0.01033646, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.02730489, + "epoch": 0.27855102961070194, + "flos": 16170861000960.0, + "grad_norm": 1.83610757588639, + "language_loss": 0.81952357, + "learning_rate": 3.2822276495490844e-06, + "loss": 0.8406781, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.546875, + "step": 4633, + "time_per_iteration": 2.3516693115234375 + }, + { + "auxiliary_loss_clip": 0.01083561, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.01302898, + "balance_loss_mlp": 1.02532363, + "epoch": 0.2786111528633699, + "flos": 22926554415360.0, + "grad_norm": 1.6017873903105209, + "language_loss": 0.87525678, + "learning_rate": 3.2819376685107733e-06, + "loss": 0.89640629, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.58203125, + "step": 4634, + "time_per_iteration": 2.414952039718628 + }, + { + "auxiliary_loss_clip": 0.010871, + "auxiliary_loss_mlp": 0.01040336, + "balance_loss_clip": 1.02274013, + "balance_loss_mlp": 1.02746367, + "epoch": 0.27867127611603787, + "flos": 23403083829120.0, + "grad_norm": 1.561803190148723, + "language_loss": 0.76841503, + "learning_rate": 3.281647641723405e-06, + "loss": 0.78968936, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 4635, + "time_per_iteration": 3.816922903060913 + }, + { + "auxiliary_loss_clip": 0.01083017, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.01747417, + "balance_loss_mlp": 1.02431464, + "epoch": 0.27873139936870583, + "flos": 19827977673600.0, + "grad_norm": 1.6173373765387882, + "language_loss": 0.646577, + "learning_rate": 3.2813575691973288e-06, + "loss": 0.66775656, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5859375, + "step": 4636, + "time_per_iteration": 2.3943421840667725 + }, + { + "auxiliary_loss_clip": 0.01086065, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.01816511, + "balance_loss_mlp": 1.02606583, + "epoch": 0.2787915226213738, + "flos": 17706829754880.0, + "grad_norm": 4.326128545341542, + "language_loss": 0.83744383, + "learning_rate": 3.2810674509428973e-06, + "loss": 0.85866612, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.6015625, + "step": 4637, + "time_per_iteration": 2.3751144409179688 + }, + { + "auxiliary_loss_clip": 0.01080055, + "auxiliary_loss_mlp": 0.01039026, + "balance_loss_clip": 1.02281356, + "balance_loss_mlp": 1.02487409, + "epoch": 0.27885164587404176, + "flos": 22089476724480.0, + "grad_norm": 1.4454892374667667, + "language_loss": 0.75655055, + "learning_rate": 3.2807772869704634e-06, + "loss": 0.77774143, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55078125, + "step": 4638, + "time_per_iteration": 2.3966667652130127 + }, + { + "auxiliary_loss_clip": 0.01084943, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.02051592, + "balance_loss_mlp": 1.02643597, + "epoch": 0.27891176912670973, + "flos": 19206698296320.0, + "grad_norm": 1.7643012345139144, + "language_loss": 0.71589458, + "learning_rate": 3.2804870772903826e-06, + "loss": 0.73712677, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.5859375, + "step": 4639, + "time_per_iteration": 3.7666375637054443 + }, + { + "auxiliary_loss_clip": 0.01082703, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.0154686, + "balance_loss_mlp": 1.02486873, + "epoch": 0.27897189237937775, + "flos": 27598771134720.0, + "grad_norm": 1.6964359593390061, + "language_loss": 0.65952086, + "learning_rate": 3.2801968219130123e-06, + "loss": 0.68067074, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.578125, + "step": 4640, + "time_per_iteration": 2.4182565212249756 + }, + { + "auxiliary_loss_clip": 0.01085352, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.01653075, + "balance_loss_mlp": 1.02631187, + "epoch": 0.2790320156320457, + "flos": 21177161320320.0, + "grad_norm": 1.7423823552122508, + "language_loss": 0.71854401, + "learning_rate": 3.27990652084871e-06, + "loss": 0.73973787, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.58984375, + "step": 4641, + "time_per_iteration": 2.3853704929351807 + }, + { + "auxiliary_loss_clip": 0.01084863, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.01801729, + "balance_loss_mlp": 1.02554667, + "epoch": 0.2790921388847137, + "flos": 22782816881280.0, + "grad_norm": 2.0332748604370914, + "language_loss": 0.74730569, + "learning_rate": 3.279616174107837e-06, + "loss": 0.76852047, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.59375, + "step": 4642, + "time_per_iteration": 2.3755061626434326 + }, + { + "auxiliary_loss_clip": 0.01086728, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.01653767, + "balance_loss_mlp": 1.02767062, + "epoch": 0.27915226213738165, + "flos": 23399627604480.0, + "grad_norm": 1.8367534855506094, + "language_loss": 0.8581357, + "learning_rate": 3.2793257817007537e-06, + "loss": 0.87934798, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.59375, + "step": 4643, + "time_per_iteration": 3.7561748027801514 + }, + { + "auxiliary_loss_clip": 0.01085337, + "auxiliary_loss_mlp": 0.01034537, + "balance_loss_clip": 1.01799083, + "balance_loss_mlp": 1.02705395, + "epoch": 0.2792123853900496, + "flos": 22746681757440.0, + "grad_norm": 1.6889448295685452, + "language_loss": 0.83135402, + "learning_rate": 3.279035343637824e-06, + "loss": 0.85255277, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.58203125, + "step": 4644, + "time_per_iteration": 2.422675132751465 + }, + { + "auxiliary_loss_clip": 0.01085248, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.01912344, + "balance_loss_mlp": 1.02601528, + "epoch": 0.2792725086427176, + "flos": 15048472746240.0, + "grad_norm": 2.105528629850968, + "language_loss": 0.78621614, + "learning_rate": 3.2787448599294135e-06, + "loss": 0.80743003, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59375, + "step": 4645, + "time_per_iteration": 3.7727739810943604 + }, + { + "auxiliary_loss_clip": 0.01021076, + "auxiliary_loss_mlp": 0.01004018, + "balance_loss_clip": 1.00238454, + "balance_loss_mlp": 1.00637841, + "epoch": 0.27933263189538554, + "flos": 62541871073280.0, + "grad_norm": 0.7735840675798167, + "language_loss": 0.62300205, + "learning_rate": 3.2784543305858878e-06, + "loss": 0.64325297, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.14648438, + "step": 4646, + "time_per_iteration": 3.005046844482422 + }, + { + "auxiliary_loss_clip": 0.01079181, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.01285076, + "balance_loss_mlp": 1.02494645, + "epoch": 0.2793927551480535, + "flos": 25117214584320.0, + "grad_norm": 1.5647752319544206, + "language_loss": 0.89292717, + "learning_rate": 3.2781637556176155e-06, + "loss": 0.91400021, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.54296875, + "step": 4647, + "time_per_iteration": 2.407451629638672 + }, + { + "auxiliary_loss_clip": 0.01084096, + "auxiliary_loss_mlp": 0.01035985, + "balance_loss_clip": 1.01748323, + "balance_loss_mlp": 1.02522612, + "epoch": 0.27945287840072147, + "flos": 21323517206400.0, + "grad_norm": 1.5465899249731474, + "language_loss": 0.87006688, + "learning_rate": 3.2778731350349673e-06, + "loss": 0.89126772, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.5859375, + "step": 4648, + "time_per_iteration": 2.3792061805725098 + }, + { + "auxiliary_loss_clip": 0.01084972, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.01506186, + "balance_loss_mlp": 1.02492583, + "epoch": 0.27951300165338944, + "flos": 27449412871680.0, + "grad_norm": 2.5723668293156186, + "language_loss": 0.73237026, + "learning_rate": 3.2775824688483138e-06, + "loss": 0.75355369, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.6015625, + "step": 4649, + "time_per_iteration": 2.4175124168395996 + }, + { + "auxiliary_loss_clip": 0.01082694, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.01552844, + "balance_loss_mlp": 1.02423, + "epoch": 0.2795731249060574, + "flos": 15158100015360.0, + "grad_norm": 3.53622186829596, + "language_loss": 0.79836136, + "learning_rate": 3.2772917570680278e-06, + "loss": 0.81953508, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.5859375, + "step": 4650, + "time_per_iteration": 2.35884165763855 + }, + { + "auxiliary_loss_clip": 0.01020244, + "auxiliary_loss_mlp": 0.01001616, + "balance_loss_clip": 0.99998331, + "balance_loss_mlp": 1.00546682, + "epoch": 0.27963324815872537, + "flos": 60116628412800.0, + "grad_norm": 0.8185614181317514, + "language_loss": 0.58871585, + "learning_rate": 3.2770009997044846e-06, + "loss": 0.6089344, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.14746094, + "step": 4651, + "time_per_iteration": 3.0610158443450928 + }, + { + "auxiliary_loss_clip": 0.01085743, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.01916051, + "balance_loss_mlp": 1.02432191, + "epoch": 0.27969337141139333, + "flos": 21764784280320.0, + "grad_norm": 1.580454444405715, + "language_loss": 0.73771393, + "learning_rate": 3.2767101967680607e-06, + "loss": 0.75896013, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.61328125, + "step": 4652, + "time_per_iteration": 2.41622257232666 + }, + { + "auxiliary_loss_clip": 0.01087177, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.01547742, + "balance_loss_mlp": 1.02712393, + "epoch": 0.27975349466406135, + "flos": 39850038794880.0, + "grad_norm": 2.315524986794949, + "language_loss": 0.79685175, + "learning_rate": 3.276419348269134e-06, + "loss": 0.81805372, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.6015625, + "step": 4653, + "time_per_iteration": 2.511756420135498 + }, + { + "auxiliary_loss_clip": 0.0108185, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.01760054, + "balance_loss_mlp": 1.02443719, + "epoch": 0.2798136179167293, + "flos": 21578732311680.0, + "grad_norm": 2.169439957846448, + "language_loss": 0.78860891, + "learning_rate": 3.2761284542180842e-06, + "loss": 0.80977416, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.57421875, + "step": 4654, + "time_per_iteration": 2.377237558364868 + }, + { + "auxiliary_loss_clip": 0.01086783, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.02271509, + "balance_loss_mlp": 1.02596807, + "epoch": 0.2798737411693973, + "flos": 21536766990720.0, + "grad_norm": 2.0206426507677584, + "language_loss": 0.81036353, + "learning_rate": 3.2758375146252924e-06, + "loss": 0.83164567, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.609375, + "step": 4655, + "time_per_iteration": 2.3665342330932617 + }, + { + "auxiliary_loss_clip": 0.01083031, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.02021265, + "balance_loss_mlp": 1.02424467, + "epoch": 0.27993386442206525, + "flos": 26979795907200.0, + "grad_norm": 1.612782826834512, + "language_loss": 0.8088764, + "learning_rate": 3.275546529501142e-06, + "loss": 0.83008605, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58984375, + "step": 4656, + "time_per_iteration": 2.428345203399658 + }, + { + "auxiliary_loss_clip": 0.01083125, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.0142529, + "balance_loss_mlp": 1.0257895, + "epoch": 0.2799939876747332, + "flos": 24348811271040.0, + "grad_norm": 1.5835376969658335, + "language_loss": 0.83297378, + "learning_rate": 3.2752554988560165e-06, + "loss": 0.85412127, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.57421875, + "step": 4657, + "time_per_iteration": 2.4023072719573975 + }, + { + "auxiliary_loss_clip": 0.0108508, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.01986456, + "balance_loss_mlp": 1.0267309, + "epoch": 0.2800541109274012, + "flos": 33655573486080.0, + "grad_norm": 1.9253757388017114, + "language_loss": 0.63597363, + "learning_rate": 3.274964422700303e-06, + "loss": 0.65719628, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5859375, + "step": 4658, + "time_per_iteration": 2.4912872314453125 + }, + { + "auxiliary_loss_clip": 0.01084028, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.01922905, + "balance_loss_mlp": 1.02554047, + "epoch": 0.28011423418006914, + "flos": 21613401158400.0, + "grad_norm": 3.6337546554885405, + "language_loss": 0.78746295, + "learning_rate": 3.274673301044388e-06, + "loss": 0.8086735, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.5859375, + "step": 4659, + "time_per_iteration": 2.400369644165039 + }, + { + "auxiliary_loss_clip": 0.01087056, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02405405, + "balance_loss_mlp": 1.027619, + "epoch": 0.2801743574327371, + "flos": 23111314663680.0, + "grad_norm": 1.7404912596301556, + "language_loss": 0.78804803, + "learning_rate": 3.274382133898663e-06, + "loss": 0.80932927, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59375, + "step": 4660, + "time_per_iteration": 2.39731764793396 + }, + { + "auxiliary_loss_clip": 0.01081603, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.01719964, + "balance_loss_mlp": 1.02556157, + "epoch": 0.2802344806854051, + "flos": 12640582897920.0, + "grad_norm": 1.7660425944191298, + "language_loss": 0.80277127, + "learning_rate": 3.2740909212735172e-06, + "loss": 0.82391596, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5625, + "step": 4661, + "time_per_iteration": 2.3529486656188965 + }, + { + "auxiliary_loss_clip": 0.01085122, + "auxiliary_loss_mlp": 0.01045281, + "balance_loss_clip": 1.02856791, + "balance_loss_mlp": 1.02765727, + "epoch": 0.28029460393807304, + "flos": 37266395829120.0, + "grad_norm": 1.504622092235081, + "language_loss": 0.78868937, + "learning_rate": 3.273799663179343e-06, + "loss": 0.80999339, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.57421875, + "step": 4662, + "time_per_iteration": 2.497027635574341 + }, + { + "auxiliary_loss_clip": 0.01086981, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.01740074, + "balance_loss_mlp": 1.02619553, + "epoch": 0.280354727190741, + "flos": 20740048698240.0, + "grad_norm": 1.933075453578351, + "language_loss": 0.69929224, + "learning_rate": 3.273508359626536e-06, + "loss": 0.72053194, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.609375, + "step": 4663, + "time_per_iteration": 2.3991286754608154 + }, + { + "auxiliary_loss_clip": 0.01086619, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.01793909, + "balance_loss_mlp": 1.02787316, + "epoch": 0.28041485044340897, + "flos": 21469942915200.0, + "grad_norm": 1.6952368274520868, + "language_loss": 0.77922308, + "learning_rate": 3.2732170106254923e-06, + "loss": 0.80046165, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.58984375, + "step": 4664, + "time_per_iteration": 2.4059791564941406 + }, + { + "auxiliary_loss_clip": 0.01079312, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.01836514, + "balance_loss_mlp": 1.02439642, + "epoch": 0.28047497369607693, + "flos": 14793362375040.0, + "grad_norm": 1.9302255295511632, + "language_loss": 0.79399538, + "learning_rate": 3.272925616186607e-06, + "loss": 0.81514096, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.546875, + "step": 4665, + "time_per_iteration": 2.3554060459136963 + }, + { + "auxiliary_loss_clip": 0.01080987, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01413536, + "balance_loss_mlp": 1.0249579, + "epoch": 0.2805350969487449, + "flos": 23069768279040.0, + "grad_norm": 1.7143730966235324, + "language_loss": 0.80576044, + "learning_rate": 3.2726341763202823e-06, + "loss": 0.82687855, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5625, + "step": 4666, + "time_per_iteration": 2.378121852874756 + }, + { + "auxiliary_loss_clip": 0.01085558, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.02313566, + "balance_loss_mlp": 1.02600992, + "epoch": 0.2805952202014129, + "flos": 20478968484480.0, + "grad_norm": 1.9741226959435492, + "language_loss": 0.8464638, + "learning_rate": 3.2723426910369166e-06, + "loss": 0.86772609, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59375, + "step": 4667, + "time_per_iteration": 2.4156687259674072 + }, + { + "auxiliary_loss_clip": 0.01085877, + "auxiliary_loss_mlp": 0.01037155, + "balance_loss_clip": 1.0192256, + "balance_loss_mlp": 1.02700794, + "epoch": 0.2806553434540809, + "flos": 27416105568000.0, + "grad_norm": 1.6805529541946873, + "language_loss": 0.79382908, + "learning_rate": 3.2720511603469136e-06, + "loss": 0.81505942, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.58984375, + "step": 4668, + "time_per_iteration": 2.422001838684082 + }, + { + "auxiliary_loss_clip": 0.01082204, + "auxiliary_loss_mlp": 0.01035529, + "balance_loss_clip": 1.01775444, + "balance_loss_mlp": 1.02319944, + "epoch": 0.28071546670674885, + "flos": 26503825075200.0, + "grad_norm": 1.4097722041710656, + "language_loss": 0.76093447, + "learning_rate": 3.2717595842606766e-06, + "loss": 0.78211176, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58984375, + "step": 4669, + "time_per_iteration": 2.439584970474243 + }, + { + "auxiliary_loss_clip": 0.01082773, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.02528119, + "epoch": 0.2807755899594168, + "flos": 20557627511040.0, + "grad_norm": 1.9977654540266632, + "language_loss": 0.78902614, + "learning_rate": 3.271467962788611e-06, + "loss": 0.81021422, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.57421875, + "step": 4670, + "time_per_iteration": 2.372771978378296 + }, + { + "auxiliary_loss_clip": 0.01086034, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.02133775, + "balance_loss_mlp": 1.02653217, + "epoch": 0.2808357132120848, + "flos": 24312257210880.0, + "grad_norm": 1.7770239131978183, + "language_loss": 0.79119527, + "learning_rate": 3.271176295941125e-06, + "loss": 0.81245852, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.59375, + "step": 4671, + "time_per_iteration": 2.4365363121032715 + }, + { + "auxiliary_loss_clip": 0.01080077, + "auxiliary_loss_mlp": 0.01030405, + "balance_loss_clip": 1.01521778, + "balance_loss_mlp": 1.02597547, + "epoch": 0.28089583646475275, + "flos": 26431205713920.0, + "grad_norm": 1.897473217997013, + "language_loss": 0.75169575, + "learning_rate": 3.270884583728626e-06, + "loss": 0.77280051, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.54296875, + "step": 4672, + "time_per_iteration": 2.426450252532959 + }, + { + "auxiliary_loss_clip": 0.0108321, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.01870811, + "balance_loss_mlp": 1.02466989, + "epoch": 0.2809559597174207, + "flos": 23110721170560.0, + "grad_norm": 2.6686100036483578, + "language_loss": 0.75073838, + "learning_rate": 3.2705928261615263e-06, + "loss": 0.77194774, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.5859375, + "step": 4673, + "time_per_iteration": 2.4145619869232178 + }, + { + "auxiliary_loss_clip": 0.01082577, + "auxiliary_loss_mlp": 0.01034431, + "balance_loss_clip": 1.01656151, + "balance_loss_mlp": 1.02480292, + "epoch": 0.2810160829700887, + "flos": 20922434974080.0, + "grad_norm": 4.5962322931246815, + "language_loss": 0.72032297, + "learning_rate": 3.270301023250237e-06, + "loss": 0.74149305, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.578125, + "step": 4674, + "time_per_iteration": 2.38694429397583 + }, + { + "auxiliary_loss_clip": 0.01085738, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.0167371, + "balance_loss_mlp": 1.02691054, + "epoch": 0.28107620622275664, + "flos": 14355027855360.0, + "grad_norm": 2.0331109639680824, + "language_loss": 0.76845366, + "learning_rate": 3.270009175005171e-06, + "loss": 0.78966129, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.58984375, + "step": 4675, + "time_per_iteration": 3.7418601512908936 + }, + { + "auxiliary_loss_clip": 0.01084226, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.01752877, + "balance_loss_mlp": 1.02490258, + "epoch": 0.2811363294754246, + "flos": 20918140876800.0, + "grad_norm": 2.2300754752588623, + "language_loss": 0.70094705, + "learning_rate": 3.2697172814367447e-06, + "loss": 0.72214234, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.59375, + "step": 4676, + "time_per_iteration": 2.360639810562134 + }, + { + "auxiliary_loss_clip": 0.01077331, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.01862073, + "balance_loss_mlp": 1.02451169, + "epoch": 0.28119645272809257, + "flos": 20593797546240.0, + "grad_norm": 1.5952549700482799, + "language_loss": 0.70463085, + "learning_rate": 3.269425342555375e-06, + "loss": 0.72575355, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.52734375, + "step": 4677, + "time_per_iteration": 2.3933629989624023 + }, + { + "auxiliary_loss_clip": 0.01084524, + "auxiliary_loss_mlp": 0.01034786, + "balance_loss_clip": 1.01754856, + "balance_loss_mlp": 1.02574372, + "epoch": 0.28125657598076054, + "flos": 25336259654400.0, + "grad_norm": 1.5735103055016368, + "language_loss": 0.63790667, + "learning_rate": 3.26913335837148e-06, + "loss": 0.65909982, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5859375, + "step": 4678, + "time_per_iteration": 3.7785065174102783 + }, + { + "auxiliary_loss_clip": 0.01080307, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.01508796, + "balance_loss_mlp": 1.02366781, + "epoch": 0.2813166992334285, + "flos": 24825934177920.0, + "grad_norm": 9.072963622735532, + "language_loss": 0.70800877, + "learning_rate": 3.26884132889548e-06, + "loss": 0.72912794, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.56640625, + "step": 4679, + "time_per_iteration": 2.4279818534851074 + }, + { + "auxiliary_loss_clip": 0.01081878, + "auxiliary_loss_mlp": 0.01036609, + "balance_loss_clip": 1.0186677, + "balance_loss_mlp": 1.02452493, + "epoch": 0.2813768224860965, + "flos": 21759722133120.0, + "grad_norm": 1.8891183685317214, + "language_loss": 0.7466377, + "learning_rate": 3.268549254137797e-06, + "loss": 0.7678225, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5703125, + "step": 4680, + "time_per_iteration": 2.3850209712982178 + }, + { + "auxiliary_loss_clip": 0.01084906, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.01566339, + "balance_loss_mlp": 1.02604866, + "epoch": 0.2814369457387645, + "flos": 24315643612800.0, + "grad_norm": 1.6765047915602025, + "language_loss": 0.77050579, + "learning_rate": 3.2682571341088537e-06, + "loss": 0.79167211, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5859375, + "step": 4681, + "time_per_iteration": 2.4106409549713135 + }, + { + "auxiliary_loss_clip": 0.01083417, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.01258218, + "balance_loss_mlp": 1.02506196, + "epoch": 0.28149706899143245, + "flos": 18514335657600.0, + "grad_norm": 1.9595536419474688, + "language_loss": 0.73747474, + "learning_rate": 3.2679649688190765e-06, + "loss": 0.75860685, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58203125, + "step": 4682, + "time_per_iteration": 3.7066774368286133 + }, + { + "auxiliary_loss_clip": 0.01081102, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.01382756, + "balance_loss_mlp": 1.02451968, + "epoch": 0.2815571922441004, + "flos": 24862104213120.0, + "grad_norm": 1.6084029234979913, + "language_loss": 0.80664718, + "learning_rate": 3.2676727582788904e-06, + "loss": 0.82776105, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.56640625, + "step": 4683, + "time_per_iteration": 2.3952901363372803 + }, + { + "auxiliary_loss_clip": 0.01084205, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01508546, + "balance_loss_mlp": 1.02594912, + "epoch": 0.2816173154967684, + "flos": 19900597034880.0, + "grad_norm": 1.6779577758032087, + "language_loss": 0.76085913, + "learning_rate": 3.2673805024987246e-06, + "loss": 0.78202188, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.58203125, + "step": 4684, + "time_per_iteration": 2.3700644969940186 + }, + { + "auxiliary_loss_clip": 0.01080426, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.01786816, + "balance_loss_mlp": 1.02411342, + "epoch": 0.28167743874943635, + "flos": 17490437948160.0, + "grad_norm": 1.8118662902140592, + "language_loss": 0.76061994, + "learning_rate": 3.2670882014890085e-06, + "loss": 0.7817657, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5625, + "step": 4685, + "time_per_iteration": 3.7422001361846924 + }, + { + "auxiliary_loss_clip": 0.0107851, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.01596391, + "balance_loss_mlp": 1.02470148, + "epoch": 0.2817375620021043, + "flos": 25300927491840.0, + "grad_norm": 1.4273435716694678, + "language_loss": 0.75569367, + "learning_rate": 3.2667958552601747e-06, + "loss": 0.77679753, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5390625, + "step": 4686, + "time_per_iteration": 2.422046422958374 + }, + { + "auxiliary_loss_clip": 0.01086003, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.018682, + "balance_loss_mlp": 1.02572322, + "epoch": 0.2817976852547723, + "flos": 18692358013440.0, + "grad_norm": 2.7393855498237363, + "language_loss": 0.63064349, + "learning_rate": 3.266503463822655e-06, + "loss": 0.65188217, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6015625, + "step": 4687, + "time_per_iteration": 2.3531594276428223 + }, + { + "auxiliary_loss_clip": 0.01083639, + "auxiliary_loss_mlp": 0.01037333, + "balance_loss_clip": 1.02036977, + "balance_loss_mlp": 1.02557552, + "epoch": 0.28185780850744024, + "flos": 22741305408000.0, + "grad_norm": 2.474246556760466, + "language_loss": 0.7603215, + "learning_rate": 3.266211027186884e-06, + "loss": 0.78153127, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.578125, + "step": 4688, + "time_per_iteration": 2.3718762397766113 + }, + { + "auxiliary_loss_clip": 0.0108024, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.01889277, + "balance_loss_mlp": 1.02497983, + "epoch": 0.2819179317601082, + "flos": 14933189836800.0, + "grad_norm": 2.014022123426822, + "language_loss": 0.78435707, + "learning_rate": 3.265918545363299e-06, + "loss": 0.80550891, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5546875, + "step": 4689, + "time_per_iteration": 2.3210413455963135 + }, + { + "auxiliary_loss_clip": 0.010826, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.01487827, + "balance_loss_mlp": 1.02557874, + "epoch": 0.2819780550127762, + "flos": 23144307765120.0, + "grad_norm": 1.8974032460474843, + "language_loss": 0.78310406, + "learning_rate": 3.2656260183623373e-06, + "loss": 0.80425143, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5703125, + "step": 4690, + "time_per_iteration": 2.3867547512054443 + }, + { + "auxiliary_loss_clip": 0.01082281, + "auxiliary_loss_mlp": 0.0103524, + "balance_loss_clip": 1.0188365, + "balance_loss_mlp": 1.02418804, + "epoch": 0.28203817826544414, + "flos": 21615286371840.0, + "grad_norm": 2.967278175848558, + "language_loss": 0.88040459, + "learning_rate": 3.265333446194439e-06, + "loss": 0.90157986, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.58203125, + "step": 4691, + "time_per_iteration": 2.373430013656616 + }, + { + "auxiliary_loss_clip": 0.010855, + "auxiliary_loss_mlp": 0.01038103, + "balance_loss_clip": 1.0203526, + "balance_loss_mlp": 1.02565289, + "epoch": 0.2820983015181121, + "flos": 24025585104000.0, + "grad_norm": 1.602629082049501, + "language_loss": 0.81951904, + "learning_rate": 3.2650408288700442e-06, + "loss": 0.84075511, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.59765625, + "step": 4692, + "time_per_iteration": 2.4163925647735596 + }, + { + "auxiliary_loss_clip": 0.01082316, + "auxiliary_loss_mlp": 0.01037073, + "balance_loss_clip": 1.01933467, + "balance_loss_mlp": 1.02429128, + "epoch": 0.2821584247707801, + "flos": 30006626071680.0, + "grad_norm": 1.530449377087271, + "language_loss": 0.74562776, + "learning_rate": 3.264748166399596e-06, + "loss": 0.76682162, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.578125, + "step": 4693, + "time_per_iteration": 2.4468326568603516 + }, + { + "auxiliary_loss_clip": 0.01083424, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.01282704, + "balance_loss_mlp": 1.02475452, + "epoch": 0.2822185480234481, + "flos": 21395717631360.0, + "grad_norm": 1.5564974454766827, + "language_loss": 0.77406073, + "learning_rate": 3.2644554587935397e-06, + "loss": 0.79519486, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 4694, + "time_per_iteration": 2.374176502227783 + }, + { + "auxiliary_loss_clip": 0.01080679, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.0154531, + "balance_loss_mlp": 1.02528489, + "epoch": 0.28227867127611606, + "flos": 27451612287360.0, + "grad_norm": 2.4200896891866197, + "language_loss": 0.66699767, + "learning_rate": 3.2641627060623205e-06, + "loss": 0.68812329, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5546875, + "step": 4695, + "time_per_iteration": 2.4323978424072266 + }, + { + "auxiliary_loss_clip": 0.010852, + "auxiliary_loss_mlp": 0.01038959, + "balance_loss_clip": 1.02069628, + "balance_loss_mlp": 1.02541971, + "epoch": 0.282338794528784, + "flos": 22592854840320.0, + "grad_norm": 1.9522033232109488, + "language_loss": 0.75408053, + "learning_rate": 3.263869908216387e-06, + "loss": 0.7753222, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.59765625, + "step": 4696, + "time_per_iteration": 2.363227367401123 + }, + { + "auxiliary_loss_clip": 0.01084991, + "auxiliary_loss_mlp": 0.01038945, + "balance_loss_clip": 1.02125454, + "balance_loss_mlp": 1.02661145, + "epoch": 0.282398917781452, + "flos": 42009311784960.0, + "grad_norm": 1.8101574755389118, + "language_loss": 0.62200546, + "learning_rate": 3.2635770652661866e-06, + "loss": 0.64324486, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.58203125, + "step": 4697, + "time_per_iteration": 2.547203540802002 + }, + { + "auxiliary_loss_clip": 0.01077729, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.01658988, + "balance_loss_mlp": 1.02276564, + "epoch": 0.28245904103411995, + "flos": 23223525373440.0, + "grad_norm": 1.526081382041632, + "language_loss": 0.73026216, + "learning_rate": 3.263284177222171e-06, + "loss": 0.75135601, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.55078125, + "step": 4698, + "time_per_iteration": 2.438342809677124 + }, + { + "auxiliary_loss_clip": 0.0107942, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.01820874, + "balance_loss_mlp": 1.02436626, + "epoch": 0.2825191642867879, + "flos": 25373442119040.0, + "grad_norm": 2.5472219262396782, + "language_loss": 0.74780232, + "learning_rate": 3.2629912440947927e-06, + "loss": 0.76893842, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55078125, + "step": 4699, + "time_per_iteration": 2.4066805839538574 + }, + { + "auxiliary_loss_clip": 0.01084279, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.01485324, + "balance_loss_mlp": 1.02545559, + "epoch": 0.2825792875394559, + "flos": 17235886158720.0, + "grad_norm": 3.0409712150223758, + "language_loss": 0.79499203, + "learning_rate": 3.262698265894506e-06, + "loss": 0.81617349, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.58984375, + "step": 4700, + "time_per_iteration": 2.3409626483917236 + }, + { + "auxiliary_loss_clip": 0.01076736, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.01746178, + "balance_loss_mlp": 1.02353406, + "epoch": 0.28263941079212385, + "flos": 26722765411200.0, + "grad_norm": 1.7282565623755062, + "language_loss": 0.81723988, + "learning_rate": 3.2624052426317664e-06, + "loss": 0.83833241, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53125, + "step": 4701, + "time_per_iteration": 2.4084768295288086 + }, + { + "auxiliary_loss_clip": 0.01081669, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.01359963, + "balance_loss_mlp": 1.02464962, + "epoch": 0.2826995340447918, + "flos": 25920147098880.0, + "grad_norm": 2.3292537520678533, + "language_loss": 0.73631829, + "learning_rate": 3.26211217431703e-06, + "loss": 0.75743312, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5703125, + "step": 4702, + "time_per_iteration": 2.4375617504119873 + }, + { + "auxiliary_loss_clip": 0.01081064, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.01566291, + "balance_loss_mlp": 1.02562058, + "epoch": 0.2827596572974598, + "flos": 22378697360640.0, + "grad_norm": 6.575249261037677, + "language_loss": 0.78327727, + "learning_rate": 3.2618190609607577e-06, + "loss": 0.80441177, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5546875, + "step": 4703, + "time_per_iteration": 2.3829078674316406 + }, + { + "auxiliary_loss_clip": 0.01078783, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.01760912, + "balance_loss_mlp": 1.02337193, + "epoch": 0.28281978055012774, + "flos": 33545736748800.0, + "grad_norm": 1.6192427527875588, + "language_loss": 0.69177246, + "learning_rate": 3.2615259025734085e-06, + "loss": 0.71291065, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5546875, + "step": 4704, + "time_per_iteration": 2.495262861251831 + }, + { + "auxiliary_loss_clip": 0.0108131, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.0182128, + "balance_loss_mlp": 1.02644086, + "epoch": 0.2828799038027957, + "flos": 23439742623360.0, + "grad_norm": 3.3627548595005825, + "language_loss": 0.69745326, + "learning_rate": 3.261232699165445e-06, + "loss": 0.71861094, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.546875, + "step": 4705, + "time_per_iteration": 2.387277603149414 + }, + { + "auxiliary_loss_clip": 0.01016274, + "auxiliary_loss_mlp": 0.01002163, + "balance_loss_clip": 1.00050628, + "balance_loss_mlp": 1.00312138, + "epoch": 0.2829400270554637, + "flos": 69870629410560.0, + "grad_norm": 0.7218649370869256, + "language_loss": 0.59269124, + "learning_rate": 3.2609394507473305e-06, + "loss": 0.61287564, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.01660156, + "router_z_loss_mlp": 0.13183594, + "step": 4706, + "time_per_iteration": 3.0769612789154053 + }, + { + "auxiliary_loss_clip": 0.01077743, + "auxiliary_loss_mlp": 0.01027441, + "balance_loss_clip": 1.01323092, + "balance_loss_mlp": 1.02472639, + "epoch": 0.2830001503081317, + "flos": 24787913840640.0, + "grad_norm": 2.506359337605025, + "language_loss": 0.82300955, + "learning_rate": 3.2606461573295303e-06, + "loss": 0.84406137, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.53125, + "step": 4707, + "time_per_iteration": 2.4101438522338867 + }, + { + "auxiliary_loss_clip": 0.01084681, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.01473212, + "balance_loss_mlp": 1.02583933, + "epoch": 0.28306027356079966, + "flos": 27668248473600.0, + "grad_norm": 1.639820805608009, + "language_loss": 0.80289948, + "learning_rate": 3.260352818922512e-06, + "loss": 0.82407683, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.5859375, + "step": 4708, + "time_per_iteration": 2.4202804565429688 + }, + { + "auxiliary_loss_clip": 0.01015369, + "auxiliary_loss_mlp": 0.01001945, + "balance_loss_clip": 1.00024033, + "balance_loss_mlp": 1.002424, + "epoch": 0.2831203968134676, + "flos": 60525288276480.0, + "grad_norm": 0.9240460877029365, + "language_loss": 0.62830722, + "learning_rate": 3.2600594355367434e-06, + "loss": 0.64848042, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.12890625, + "step": 4709, + "time_per_iteration": 2.9079742431640625 + }, + { + "auxiliary_loss_clip": 0.01079717, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.01475906, + "balance_loss_mlp": 1.02518725, + "epoch": 0.2831805200661356, + "flos": 22053690714240.0, + "grad_norm": 1.3603003737777664, + "language_loss": 0.73955834, + "learning_rate": 3.259766007182695e-06, + "loss": 0.76066661, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.54296875, + "step": 4710, + "time_per_iteration": 2.3806557655334473 + }, + { + "auxiliary_loss_clip": 0.01081722, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_clip": 1.02480066, + "balance_loss_mlp": 1.0248003, + "epoch": 0.28324064331880355, + "flos": 22599592732800.0, + "grad_norm": 1.7428122146726683, + "language_loss": 0.79544723, + "learning_rate": 3.259472533870838e-06, + "loss": 0.81669199, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5703125, + "step": 4711, + "time_per_iteration": 2.3633670806884766 + }, + { + "auxiliary_loss_clip": 0.01082922, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.01463687, + "balance_loss_mlp": 1.02554917, + "epoch": 0.2833007665714715, + "flos": 30402960359040.0, + "grad_norm": 4.880572078214499, + "language_loss": 0.70977688, + "learning_rate": 3.2591790156116466e-06, + "loss": 0.73092556, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.57421875, + "step": 4712, + "time_per_iteration": 2.441821575164795 + }, + { + "auxiliary_loss_clip": 0.01082213, + "auxiliary_loss_mlp": 0.01039373, + "balance_loss_clip": 1.0224216, + "balance_loss_mlp": 1.02480936, + "epoch": 0.2833608898241395, + "flos": 23548392374400.0, + "grad_norm": 1.8208215553892926, + "language_loss": 0.81730092, + "learning_rate": 3.258885452415595e-06, + "loss": 0.83851677, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.57421875, + "step": 4713, + "time_per_iteration": 2.383963108062744 + }, + { + "auxiliary_loss_clip": 0.01079658, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.01531672, + "balance_loss_mlp": 1.02457273, + "epoch": 0.28342101307680745, + "flos": 20265683788800.0, + "grad_norm": 1.8461358395380656, + "language_loss": 0.75691658, + "learning_rate": 3.2585918442931595e-06, + "loss": 0.77802593, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55078125, + "step": 4714, + "time_per_iteration": 2.373971462249756 + }, + { + "auxiliary_loss_clip": 0.01080895, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.02190018, + "balance_loss_mlp": 1.02488649, + "epoch": 0.2834811363294754, + "flos": 30845728621440.0, + "grad_norm": 1.3725109146054504, + "language_loss": 0.7819891, + "learning_rate": 3.258298191254818e-06, + "loss": 0.80317628, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5625, + "step": 4715, + "time_per_iteration": 3.8670871257781982 + }, + { + "auxiliary_loss_clip": 0.0108006, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.01266277, + "balance_loss_mlp": 1.02458525, + "epoch": 0.2835412595821434, + "flos": 22709918229120.0, + "grad_norm": 2.1575345564238635, + "language_loss": 0.73499966, + "learning_rate": 3.2580044933110513e-06, + "loss": 0.75609374, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5546875, + "step": 4716, + "time_per_iteration": 2.385983467102051 + }, + { + "auxiliary_loss_clip": 0.01083392, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.01724267, + "balance_loss_mlp": 1.02334297, + "epoch": 0.28360138283481134, + "flos": 18076734276480.0, + "grad_norm": 2.7059929247232484, + "language_loss": 0.74639773, + "learning_rate": 3.2577107504723403e-06, + "loss": 0.76758242, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6015625, + "step": 4717, + "time_per_iteration": 2.3544490337371826 + }, + { + "auxiliary_loss_clip": 0.01081762, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.02196097, + "balance_loss_mlp": 1.02436793, + "epoch": 0.2836615060874793, + "flos": 17853918779520.0, + "grad_norm": 1.5746816927948468, + "language_loss": 0.66727597, + "learning_rate": 3.2574169627491683e-06, + "loss": 0.68848491, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.57421875, + "step": 4718, + "time_per_iteration": 3.727241039276123 + }, + { + "auxiliary_loss_clip": 0.01082786, + "auxiliary_loss_mlp": 0.01035875, + "balance_loss_clip": 1.01866162, + "balance_loss_mlp": 1.02434754, + "epoch": 0.2837216293401473, + "flos": 25739087454720.0, + "grad_norm": 1.7899510086008625, + "language_loss": 0.71910429, + "learning_rate": 3.2571231301520187e-06, + "loss": 0.74029094, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5859375, + "step": 4719, + "time_per_iteration": 2.4185657501220703 + }, + { + "auxiliary_loss_clip": 0.01081545, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.01876903, + "balance_loss_mlp": 1.02574909, + "epoch": 0.2837817525928153, + "flos": 20922469885440.0, + "grad_norm": 1.6717654750465205, + "language_loss": 0.80072182, + "learning_rate": 3.2568292526913785e-06, + "loss": 0.82189548, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.55859375, + "step": 4720, + "time_per_iteration": 2.3804264068603516 + }, + { + "auxiliary_loss_clip": 0.01082047, + "auxiliary_loss_mlp": 0.01031439, + "balance_loss_clip": 1.01422524, + "balance_loss_mlp": 1.02523673, + "epoch": 0.28384187584548326, + "flos": 18915697180800.0, + "grad_norm": 1.9430720799553722, + "language_loss": 0.79105258, + "learning_rate": 3.2565353303777353e-06, + "loss": 0.81218743, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.56640625, + "step": 4721, + "time_per_iteration": 2.3740267753601074 + }, + { + "auxiliary_loss_clip": 0.01080163, + "auxiliary_loss_mlp": 0.01032042, + "balance_loss_clip": 1.01484036, + "balance_loss_mlp": 1.0245564, + "epoch": 0.2839019990981512, + "flos": 27342753068160.0, + "grad_norm": 3.506974492931057, + "language_loss": 0.82741618, + "learning_rate": 3.256241363221578e-06, + "loss": 0.84853828, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5546875, + "step": 4722, + "time_per_iteration": 3.805864095687866 + }, + { + "auxiliary_loss_clip": 0.0108301, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.01716208, + "balance_loss_mlp": 1.02534068, + "epoch": 0.2839621223508192, + "flos": 18113323248000.0, + "grad_norm": 1.5526628097344308, + "language_loss": 0.69962299, + "learning_rate": 3.2559473512333986e-06, + "loss": 0.72078663, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.57421875, + "step": 4723, + "time_per_iteration": 2.376183032989502 + }, + { + "auxiliary_loss_clip": 0.01081209, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.01126075, + "balance_loss_mlp": 1.02463102, + "epoch": 0.28402224560348716, + "flos": 26357189898240.0, + "grad_norm": 5.87236341881161, + "language_loss": 0.78273642, + "learning_rate": 3.2556532944236886e-06, + "loss": 0.80383372, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.56640625, + "step": 4724, + "time_per_iteration": 2.416632890701294 + }, + { + "auxiliary_loss_clip": 0.01083681, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.01827192, + "balance_loss_mlp": 1.02569687, + "epoch": 0.2840823688561551, + "flos": 24059660457600.0, + "grad_norm": 3.129995104210972, + "language_loss": 0.92471337, + "learning_rate": 3.2553591928029423e-06, + "loss": 0.94590056, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.578125, + "step": 4725, + "time_per_iteration": 3.7411210536956787 + }, + { + "auxiliary_loss_clip": 0.0108079, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.0165149, + "balance_loss_mlp": 1.0248549, + "epoch": 0.2841424921088231, + "flos": 29458559548800.0, + "grad_norm": 1.5321762783084771, + "language_loss": 0.88700426, + "learning_rate": 3.2550650463816557e-06, + "loss": 0.90814114, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.55859375, + "step": 4726, + "time_per_iteration": 2.452873945236206 + }, + { + "auxiliary_loss_clip": 0.01084567, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.01754165, + "balance_loss_mlp": 1.02653563, + "epoch": 0.28420261536149105, + "flos": 48098688301440.0, + "grad_norm": 2.085717828183791, + "language_loss": 0.78557849, + "learning_rate": 3.2547708551703256e-06, + "loss": 0.80676812, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.58203125, + "step": 4727, + "time_per_iteration": 2.6010992527008057 + }, + { + "auxiliary_loss_clip": 0.01078461, + "auxiliary_loss_mlp": 0.01027081, + "balance_loss_clip": 1.01164365, + "balance_loss_mlp": 1.02441645, + "epoch": 0.284262738614159, + "flos": 25664966904960.0, + "grad_norm": 2.181236525354293, + "language_loss": 0.6623621, + "learning_rate": 3.254476619179452e-06, + "loss": 0.68341756, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5390625, + "step": 4728, + "time_per_iteration": 2.397697687149048 + }, + { + "auxiliary_loss_clip": 0.01078698, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.01803684, + "balance_loss_mlp": 1.022825, + "epoch": 0.284322861866827, + "flos": 19717966379520.0, + "grad_norm": 2.0880613096562373, + "language_loss": 0.76450562, + "learning_rate": 3.2541823384195344e-06, + "loss": 0.78564513, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.55859375, + "step": 4729, + "time_per_iteration": 2.364349126815796 + }, + { + "auxiliary_loss_clip": 0.01085252, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.01569974, + "balance_loss_mlp": 1.02589154, + "epoch": 0.28438298511949495, + "flos": 23914107532800.0, + "grad_norm": 2.1033505537452823, + "language_loss": 0.66414917, + "learning_rate": 3.253888012901075e-06, + "loss": 0.6853444, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.59375, + "step": 4730, + "time_per_iteration": 2.398761034011841 + }, + { + "auxiliary_loss_clip": 0.01080779, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.01216245, + "balance_loss_mlp": 1.02532625, + "epoch": 0.2844431083721629, + "flos": 26066153871360.0, + "grad_norm": 1.6757021520393514, + "language_loss": 0.7444194, + "learning_rate": 3.253593642634578e-06, + "loss": 0.76552504, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5546875, + "step": 4731, + "time_per_iteration": 2.4312949180603027 + }, + { + "auxiliary_loss_clip": 0.01079736, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.01224649, + "balance_loss_mlp": 1.02410972, + "epoch": 0.2845032316248309, + "flos": 25809297932160.0, + "grad_norm": 1.3945684550033184, + "language_loss": 0.8347922, + "learning_rate": 3.2532992276305492e-06, + "loss": 0.85586947, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5546875, + "step": 4732, + "time_per_iteration": 2.4226808547973633 + }, + { + "auxiliary_loss_clip": 0.01083088, + "auxiliary_loss_mlp": 0.01041881, + "balance_loss_clip": 1.02299786, + "balance_loss_mlp": 1.02479708, + "epoch": 0.2845633548774989, + "flos": 19822322033280.0, + "grad_norm": 1.603730583896771, + "language_loss": 0.78541684, + "learning_rate": 3.253004767899494e-06, + "loss": 0.80666649, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.5859375, + "step": 4733, + "time_per_iteration": 2.3851988315582275 + }, + { + "auxiliary_loss_clip": 0.01086631, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.02204967, + "balance_loss_mlp": 1.02597797, + "epoch": 0.28462347813016686, + "flos": 23181769520640.0, + "grad_norm": 3.6474735654880055, + "language_loss": 0.71310365, + "learning_rate": 3.252710263451922e-06, + "loss": 0.73437977, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.609375, + "step": 4734, + "time_per_iteration": 2.4062321186065674 + }, + { + "auxiliary_loss_clip": 0.01078318, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.01302624, + "balance_loss_mlp": 1.02225661, + "epoch": 0.2846836013828348, + "flos": 18659504557440.0, + "grad_norm": 1.7286046932569703, + "language_loss": 0.74091792, + "learning_rate": 3.2524157142983432e-06, + "loss": 0.7620036, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.55859375, + "step": 4735, + "time_per_iteration": 2.3742308616638184 + }, + { + "auxiliary_loss_clip": 0.01081223, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.02143192, + "balance_loss_mlp": 1.02526176, + "epoch": 0.2847437246355028, + "flos": 14172641579520.0, + "grad_norm": 1.6945225941925561, + "language_loss": 0.79613048, + "learning_rate": 3.252121120449269e-06, + "loss": 0.817312, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.55859375, + "step": 4736, + "time_per_iteration": 2.3473095893859863 + }, + { + "auxiliary_loss_clip": 0.0108318, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.01933074, + "balance_loss_mlp": 1.02610743, + "epoch": 0.28480384788817076, + "flos": 29277080968320.0, + "grad_norm": 2.2565295857640995, + "language_loss": 0.59148276, + "learning_rate": 3.251826481915213e-06, + "loss": 0.61267751, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5703125, + "step": 4737, + "time_per_iteration": 2.468946695327759 + }, + { + "auxiliary_loss_clip": 0.01078242, + "auxiliary_loss_mlp": 0.01032174, + "balance_loss_clip": 1.0163312, + "balance_loss_mlp": 1.02363682, + "epoch": 0.2848639711408387, + "flos": 22600221137280.0, + "grad_norm": 2.4602628885469082, + "language_loss": 0.86252427, + "learning_rate": 3.2515317987066894e-06, + "loss": 0.88362849, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 4738, + "time_per_iteration": 2.3778250217437744 + }, + { + "auxiliary_loss_clip": 0.01086371, + "auxiliary_loss_mlp": 0.01039284, + "balance_loss_clip": 1.021415, + "balance_loss_mlp": 1.02600527, + "epoch": 0.2849240943935067, + "flos": 17598598940160.0, + "grad_norm": 2.3619050421880208, + "language_loss": 0.89823699, + "learning_rate": 3.2512370708342155e-06, + "loss": 0.91949356, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.60546875, + "step": 4739, + "time_per_iteration": 2.3856139183044434 + }, + { + "auxiliary_loss_clip": 0.01084186, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.0193857, + "balance_loss_mlp": 1.02761984, + "epoch": 0.28498421764617465, + "flos": 24861440897280.0, + "grad_norm": 1.3593881426134444, + "language_loss": 0.72196352, + "learning_rate": 3.25094229830831e-06, + "loss": 0.74315643, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.56640625, + "step": 4740, + "time_per_iteration": 2.405941963195801 + }, + { + "auxiliary_loss_clip": 0.01079864, + "auxiliary_loss_mlp": 0.01027855, + "balance_loss_clip": 1.0124476, + "balance_loss_mlp": 1.02569914, + "epoch": 0.2850443408988426, + "flos": 22781490249600.0, + "grad_norm": 1.5660180677894475, + "language_loss": 0.77726334, + "learning_rate": 3.2506474811394907e-06, + "loss": 0.79834056, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5390625, + "step": 4741, + "time_per_iteration": 2.409609079360962 + }, + { + "auxiliary_loss_clip": 0.01082562, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.01353347, + "balance_loss_mlp": 1.02279437, + "epoch": 0.2851044641515106, + "flos": 18843042908160.0, + "grad_norm": 1.938301208297884, + "language_loss": 0.75858092, + "learning_rate": 3.2503526193382796e-06, + "loss": 0.77971804, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.59765625, + "step": 4742, + "time_per_iteration": 2.355731725692749 + }, + { + "auxiliary_loss_clip": 0.01087653, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.02131653, + "balance_loss_mlp": 1.02617121, + "epoch": 0.28516458740417855, + "flos": 18879492234240.0, + "grad_norm": 2.874478916625614, + "language_loss": 0.57906497, + "learning_rate": 3.2500577129152004e-06, + "loss": 0.60034823, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.6171875, + "step": 4743, + "time_per_iteration": 2.367086172103882 + }, + { + "auxiliary_loss_clip": 0.01086354, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.0167799, + "balance_loss_mlp": 1.02716076, + "epoch": 0.2852247106568465, + "flos": 25298693164800.0, + "grad_norm": 1.6172933456088672, + "language_loss": 0.73978841, + "learning_rate": 3.2497627618807767e-06, + "loss": 0.76099747, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58984375, + "step": 4744, + "time_per_iteration": 2.4027342796325684 + }, + { + "auxiliary_loss_clip": 0.01080479, + "auxiliary_loss_mlp": 0.01033434, + "balance_loss_clip": 1.01743662, + "balance_loss_mlp": 1.02479768, + "epoch": 0.2852848339095145, + "flos": 11654600791680.0, + "grad_norm": 2.9323421196817407, + "language_loss": 0.77296233, + "learning_rate": 3.2494677662455355e-06, + "loss": 0.79410142, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5546875, + "step": 4745, + "time_per_iteration": 2.360199213027954 + }, + { + "auxiliary_loss_clip": 0.01078276, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.01620817, + "balance_loss_mlp": 1.0251931, + "epoch": 0.2853449571621825, + "flos": 12932386974720.0, + "grad_norm": 1.674431880746482, + "language_loss": 0.82517254, + "learning_rate": 3.249172726020003e-06, + "loss": 0.84626806, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53125, + "step": 4746, + "time_per_iteration": 2.323788642883301 + }, + { + "auxiliary_loss_clip": 0.01084347, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.01456237, + "balance_loss_mlp": 1.02449775, + "epoch": 0.28540508041485046, + "flos": 20009560988160.0, + "grad_norm": 1.7130432783013392, + "language_loss": 0.79733384, + "learning_rate": 3.248877641214709e-06, + "loss": 0.81850219, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.6015625, + "step": 4747, + "time_per_iteration": 2.3920280933380127 + }, + { + "auxiliary_loss_clip": 0.01015087, + "auxiliary_loss_mlp": 0.01007947, + "balance_loss_clip": 1.00650477, + "balance_loss_mlp": 1.00216842, + "epoch": 0.28546520366751843, + "flos": 68135864175360.0, + "grad_norm": 0.7816728800379726, + "language_loss": 0.60464311, + "learning_rate": 3.248582511840185e-06, + "loss": 0.6248734, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.12890625, + "step": 4748, + "time_per_iteration": 2.9779348373413086 + }, + { + "auxiliary_loss_clip": 0.01081255, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.01980829, + "balance_loss_mlp": 1.02390397, + "epoch": 0.2855253269201864, + "flos": 13250969930880.0, + "grad_norm": 3.095614863227557, + "language_loss": 0.8050099, + "learning_rate": 3.2482873379069627e-06, + "loss": 0.82620031, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5703125, + "step": 4749, + "time_per_iteration": 2.3364624977111816 + }, + { + "auxiliary_loss_clip": 0.01081563, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.01670456, + "balance_loss_mlp": 1.02490294, + "epoch": 0.28558545017285436, + "flos": 28619631555840.0, + "grad_norm": 1.907682446751592, + "language_loss": 0.70536107, + "learning_rate": 3.2479921194255764e-06, + "loss": 0.72650194, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.56640625, + "step": 4750, + "time_per_iteration": 2.4354352951049805 + }, + { + "auxiliary_loss_clip": 0.0108143, + "auxiliary_loss_mlp": 0.01031612, + "balance_loss_clip": 1.01534581, + "balance_loss_mlp": 1.02665794, + "epoch": 0.2856455734255223, + "flos": 34129065611520.0, + "grad_norm": 2.2333435843809397, + "language_loss": 0.81769609, + "learning_rate": 3.2476968564065613e-06, + "loss": 0.83882642, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.546875, + "step": 4751, + "time_per_iteration": 2.516223907470703 + }, + { + "auxiliary_loss_clip": 0.01080285, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01438582, + "balance_loss_mlp": 1.02485085, + "epoch": 0.2857056966781903, + "flos": 39784576262400.0, + "grad_norm": 1.9672147354921232, + "language_loss": 0.79674977, + "learning_rate": 3.247401548860455e-06, + "loss": 0.81785846, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5546875, + "step": 4752, + "time_per_iteration": 2.5313777923583984 + }, + { + "auxiliary_loss_clip": 0.01081205, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.01484227, + "balance_loss_mlp": 1.02471709, + "epoch": 0.28576581993085826, + "flos": 21871199704320.0, + "grad_norm": 1.7690911339342643, + "language_loss": 0.78648049, + "learning_rate": 3.247106196797796e-06, + "loss": 0.8076151, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 4753, + "time_per_iteration": 2.3985986709594727 + }, + { + "auxiliary_loss_clip": 0.01085125, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.0188477, + "balance_loss_mlp": 1.02637315, + "epoch": 0.2858259431835262, + "flos": 19090856805120.0, + "grad_norm": 2.0545765545697146, + "language_loss": 0.89546752, + "learning_rate": 3.2468108002291256e-06, + "loss": 0.91669428, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.5859375, + "step": 4754, + "time_per_iteration": 3.737626791000366 + }, + { + "auxiliary_loss_clip": 0.01080394, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.02078843, + "balance_loss_mlp": 1.02513647, + "epoch": 0.2858860664361942, + "flos": 20333415559680.0, + "grad_norm": 2.178307506385036, + "language_loss": 0.82149816, + "learning_rate": 3.2465153591649835e-06, + "loss": 0.84267181, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55078125, + "step": 4755, + "time_per_iteration": 2.388486623764038 + }, + { + "auxiliary_loss_clip": 0.01081748, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.01823783, + "balance_loss_mlp": 1.02429199, + "epoch": 0.28594618968886215, + "flos": 24460603044480.0, + "grad_norm": 1.4870991408952334, + "language_loss": 0.83555698, + "learning_rate": 3.2462198736159157e-06, + "loss": 0.85672116, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.57421875, + "step": 4756, + "time_per_iteration": 2.4245755672454834 + }, + { + "auxiliary_loss_clip": 0.01016326, + "auxiliary_loss_mlp": 0.01005963, + "balance_loss_clip": 1.00446093, + "balance_loss_mlp": 1.00319099, + "epoch": 0.2860063129415301, + "flos": 71648964023040.0, + "grad_norm": 0.8636197065724464, + "language_loss": 0.61008024, + "learning_rate": 3.245924343592466e-06, + "loss": 0.63030314, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.13085938, + "step": 4757, + "time_per_iteration": 3.183072566986084 + }, + { + "auxiliary_loss_clip": 0.01081288, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.01418066, + "balance_loss_mlp": 1.02508998, + "epoch": 0.2860664361941981, + "flos": 20557627511040.0, + "grad_norm": 1.9371765635545932, + "language_loss": 0.65740705, + "learning_rate": 3.2456287691051815e-06, + "loss": 0.67852324, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5625, + "step": 4758, + "time_per_iteration": 3.7661352157592773 + }, + { + "auxiliary_loss_clip": 0.01083954, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.01677227, + "balance_loss_mlp": 1.02530575, + "epoch": 0.2861265594468661, + "flos": 35994788956800.0, + "grad_norm": 1.368761470475045, + "language_loss": 0.62679696, + "learning_rate": 3.24533315016461e-06, + "loss": 0.64798141, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5859375, + "step": 4759, + "time_per_iteration": 2.5463781356811523 + }, + { + "auxiliary_loss_clip": 0.01080846, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.01798511, + "balance_loss_mlp": 1.02552783, + "epoch": 0.28618668269953407, + "flos": 20046394339200.0, + "grad_norm": 2.008926840236646, + "language_loss": 0.73942149, + "learning_rate": 3.245037486781302e-06, + "loss": 0.76057804, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5546875, + "step": 4760, + "time_per_iteration": 2.407135009765625 + }, + { + "auxiliary_loss_clip": 0.01081765, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.01808345, + "balance_loss_mlp": 1.02776599, + "epoch": 0.28624680595220203, + "flos": 24970719052800.0, + "grad_norm": 1.9409226081942277, + "language_loss": 0.7247535, + "learning_rate": 3.2447417789658083e-06, + "loss": 0.74590206, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5390625, + "step": 4761, + "time_per_iteration": 3.785780429840088 + }, + { + "auxiliary_loss_clip": 0.010811, + "auxiliary_loss_mlp": 0.0103727, + "balance_loss_clip": 1.02072382, + "balance_loss_mlp": 1.02470469, + "epoch": 0.28630692920487, + "flos": 22491152449920.0, + "grad_norm": 1.8461289578296582, + "language_loss": 0.7393499, + "learning_rate": 3.244446026728683e-06, + "loss": 0.76053357, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5625, + "step": 4762, + "time_per_iteration": 2.3713278770446777 + }, + { + "auxiliary_loss_clip": 0.010833, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02136397, + "balance_loss_mlp": 1.02807164, + "epoch": 0.28636705245753796, + "flos": 21248872986240.0, + "grad_norm": 1.4879677319151083, + "language_loss": 0.7620669, + "learning_rate": 3.2441502300804803e-06, + "loss": 0.78326958, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5546875, + "step": 4763, + "time_per_iteration": 2.400841474533081 + }, + { + "auxiliary_loss_clip": 0.01081427, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.0176661, + "balance_loss_mlp": 1.02453864, + "epoch": 0.28642717571020593, + "flos": 24094678417920.0, + "grad_norm": 1.8148746077923816, + "language_loss": 0.76722366, + "learning_rate": 3.2438543890317557e-06, + "loss": 0.78839087, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5703125, + "step": 4764, + "time_per_iteration": 3.7409722805023193 + }, + { + "auxiliary_loss_clip": 0.01084945, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.01578689, + "balance_loss_mlp": 1.02711725, + "epoch": 0.2864872989628739, + "flos": 22600290960000.0, + "grad_norm": 1.8969712200285918, + "language_loss": 0.70780838, + "learning_rate": 3.2435585035930676e-06, + "loss": 0.72901499, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.578125, + "step": 4765, + "time_per_iteration": 2.411360740661621 + }, + { + "auxiliary_loss_clip": 0.01078868, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.01191878, + "balance_loss_mlp": 1.02442336, + "epoch": 0.28654742221554186, + "flos": 32743677018240.0, + "grad_norm": 2.4557909303536225, + "language_loss": 0.76024294, + "learning_rate": 3.2432625737749754e-06, + "loss": 0.78130865, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.54296875, + "step": 4766, + "time_per_iteration": 2.459298849105835 + }, + { + "auxiliary_loss_clip": 0.01081553, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.01739168, + "balance_loss_mlp": 1.02570593, + "epoch": 0.2866075454682098, + "flos": 26980354488960.0, + "grad_norm": 2.0039010229122143, + "language_loss": 0.77118593, + "learning_rate": 3.2429665995880397e-06, + "loss": 0.79234087, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55859375, + "step": 4767, + "time_per_iteration": 2.478315830230713 + }, + { + "auxiliary_loss_clip": 0.01082231, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.01984024, + "balance_loss_mlp": 1.02580881, + "epoch": 0.2866676687208778, + "flos": 23252852782080.0, + "grad_norm": 2.5076167194422685, + "language_loss": 0.65450412, + "learning_rate": 3.242670581042824e-06, + "loss": 0.67569774, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5625, + "step": 4768, + "time_per_iteration": 2.3790221214294434 + }, + { + "auxiliary_loss_clip": 0.01084885, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02176225, + "balance_loss_mlp": 1.02677703, + "epoch": 0.28672779197354575, + "flos": 21578662488960.0, + "grad_norm": 1.9179452799176788, + "language_loss": 0.70463544, + "learning_rate": 3.2423745181498907e-06, + "loss": 0.72587061, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.58203125, + "step": 4769, + "time_per_iteration": 2.3950836658477783 + }, + { + "auxiliary_loss_clip": 0.01082123, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.0129323, + "balance_loss_mlp": 1.02455568, + "epoch": 0.2867879152262137, + "flos": 19864531733760.0, + "grad_norm": 1.7190212108122154, + "language_loss": 0.74997473, + "learning_rate": 3.2420784109198076e-06, + "loss": 0.77109134, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.57421875, + "step": 4770, + "time_per_iteration": 2.3697502613067627 + }, + { + "auxiliary_loss_clip": 0.01085625, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.01197648, + "balance_loss_mlp": 1.02735996, + "epoch": 0.2868480384788817, + "flos": 28212265278720.0, + "grad_norm": 2.1302557520463994, + "language_loss": 0.67051756, + "learning_rate": 3.241782259363141e-06, + "loss": 0.69166529, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58203125, + "step": 4771, + "time_per_iteration": 2.4191195964813232 + }, + { + "auxiliary_loss_clip": 0.01018615, + "auxiliary_loss_mlp": 0.01018355, + "balance_loss_clip": 1.01679361, + "balance_loss_mlp": 1.00506353, + "epoch": 0.2869081617315497, + "flos": 65421298010880.0, + "grad_norm": 0.7819715610590118, + "language_loss": 0.56847906, + "learning_rate": 3.241486063490459e-06, + "loss": 0.58884883, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.13476562, + "step": 4772, + "time_per_iteration": 2.961507558822632 + }, + { + "auxiliary_loss_clip": 0.01083547, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.01487398, + "balance_loss_mlp": 1.02495086, + "epoch": 0.28696828498421767, + "flos": 18659748936960.0, + "grad_norm": 3.2771513745431937, + "language_loss": 0.83096749, + "learning_rate": 3.241189823312334e-06, + "loss": 0.85212982, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.5859375, + "step": 4773, + "time_per_iteration": 2.374417304992676 + }, + { + "auxiliary_loss_clip": 0.01080541, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.01841056, + "balance_loss_mlp": 1.02297866, + "epoch": 0.28702840823688563, + "flos": 23658613136640.0, + "grad_norm": 2.0784744402834887, + "language_loss": 0.77495211, + "learning_rate": 3.2408935388393358e-06, + "loss": 0.79612702, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.57421875, + "step": 4774, + "time_per_iteration": 2.3903603553771973 + }, + { + "auxiliary_loss_clip": 0.01080459, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.01757431, + "balance_loss_mlp": 1.02470946, + "epoch": 0.2870885314895536, + "flos": 13803993866880.0, + "grad_norm": 2.618471709702865, + "language_loss": 0.73552686, + "learning_rate": 3.24059721008204e-06, + "loss": 0.75667977, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5546875, + "step": 4775, + "time_per_iteration": 2.3653087615966797 + }, + { + "auxiliary_loss_clip": 0.01086239, + "auxiliary_loss_mlp": 0.01032819, + "balance_loss_clip": 1.01587892, + "balance_loss_mlp": 1.02752268, + "epoch": 0.28714865474222157, + "flos": 17785768072320.0, + "grad_norm": 1.634049834127892, + "language_loss": 0.8820895, + "learning_rate": 3.2403008370510207e-06, + "loss": 0.90328014, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5859375, + "step": 4776, + "time_per_iteration": 2.352720260620117 + }, + { + "auxiliary_loss_clip": 0.01082094, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.01487613, + "balance_loss_mlp": 1.02560639, + "epoch": 0.28720877799488953, + "flos": 15996574160640.0, + "grad_norm": 1.644757821155631, + "language_loss": 0.7422418, + "learning_rate": 3.240004419756855e-06, + "loss": 0.76338118, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5625, + "step": 4777, + "time_per_iteration": 2.40632700920105 + }, + { + "auxiliary_loss_clip": 0.01088839, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_clip": 1.02309251, + "balance_loss_mlp": 1.02881432, + "epoch": 0.2872689012475575, + "flos": 20922085860480.0, + "grad_norm": 5.515723857910451, + "language_loss": 0.80392218, + "learning_rate": 3.239707958210121e-06, + "loss": 0.82523352, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6015625, + "step": 4778, + "time_per_iteration": 2.372300624847412 + }, + { + "auxiliary_loss_clip": 0.01081172, + "auxiliary_loss_mlp": 0.01039664, + "balance_loss_clip": 1.02213979, + "balance_loss_mlp": 1.02499616, + "epoch": 0.28732902450022546, + "flos": 21324040876800.0, + "grad_norm": 1.4811275860184796, + "language_loss": 0.74068809, + "learning_rate": 3.239411452421399e-06, + "loss": 0.76189649, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5625, + "step": 4779, + "time_per_iteration": 2.4285295009613037 + }, + { + "auxiliary_loss_clip": 0.01084694, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.01655579, + "balance_loss_mlp": 1.027174, + "epoch": 0.2873891477528934, + "flos": 20849326853760.0, + "grad_norm": 1.5715025523207715, + "language_loss": 0.75808293, + "learning_rate": 3.2391149024012705e-06, + "loss": 0.77927923, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.57421875, + "step": 4780, + "time_per_iteration": 2.375492572784424 + }, + { + "auxiliary_loss_clip": 0.01081962, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.0255785, + "epoch": 0.2874492710055614, + "flos": 17419110307200.0, + "grad_norm": 1.6819876732505463, + "language_loss": 0.8117137, + "learning_rate": 3.238818308160318e-06, + "loss": 0.83295095, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.56640625, + "step": 4781, + "time_per_iteration": 2.373920440673828 + }, + { + "auxiliary_loss_clip": 0.01085892, + "auxiliary_loss_mlp": 0.01037457, + "balance_loss_clip": 1.01897919, + "balance_loss_mlp": 1.02688193, + "epoch": 0.28750939425822936, + "flos": 13405983834240.0, + "grad_norm": 1.9601288546687496, + "language_loss": 0.86021197, + "learning_rate": 3.2385216697091277e-06, + "loss": 0.88144541, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.58984375, + "step": 4782, + "time_per_iteration": 2.3698599338531494 + }, + { + "auxiliary_loss_clip": 0.01084486, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.01567316, + "balance_loss_mlp": 1.02744699, + "epoch": 0.2875695175108973, + "flos": 21869000288640.0, + "grad_norm": 1.4303310648446173, + "language_loss": 0.71079791, + "learning_rate": 3.238224987058284e-06, + "loss": 0.73197877, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5703125, + "step": 4783, + "time_per_iteration": 2.4121170043945312 + }, + { + "auxiliary_loss_clip": 0.01083458, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.01886714, + "balance_loss_mlp": 1.0266844, + "epoch": 0.2876296407635653, + "flos": 26244385695360.0, + "grad_norm": 1.5075065675431283, + "language_loss": 0.76762807, + "learning_rate": 3.2379282602183757e-06, + "loss": 0.78881812, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.56640625, + "step": 4784, + "time_per_iteration": 2.4134573936462402 + }, + { + "auxiliary_loss_clip": 0.01080009, + "auxiliary_loss_mlp": 0.01033253, + "balance_loss_clip": 1.01619458, + "balance_loss_mlp": 1.02529049, + "epoch": 0.28768976401623325, + "flos": 25372499512320.0, + "grad_norm": 1.5139826738496214, + "language_loss": 0.75993824, + "learning_rate": 3.237631489199993e-06, + "loss": 0.78107083, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.546875, + "step": 4785, + "time_per_iteration": 2.434781789779663 + }, + { + "auxiliary_loss_clip": 0.01083871, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.02426863, + "balance_loss_mlp": 1.02640808, + "epoch": 0.28774988726890127, + "flos": 30663063054720.0, + "grad_norm": 1.9019191870517647, + "language_loss": 0.82909286, + "learning_rate": 3.2373346740137254e-06, + "loss": 0.85034847, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.57421875, + "step": 4786, + "time_per_iteration": 2.4442851543426514 + }, + { + "auxiliary_loss_clip": 0.01084181, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.01757026, + "balance_loss_mlp": 1.0267905, + "epoch": 0.28781001052156924, + "flos": 20594391039360.0, + "grad_norm": 1.7124788550907577, + "language_loss": 0.79891634, + "learning_rate": 3.237037814670166e-06, + "loss": 0.82009393, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5703125, + "step": 4787, + "time_per_iteration": 2.3960533142089844 + }, + { + "auxiliary_loss_clip": 0.01082349, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.01644707, + "balance_loss_mlp": 1.02483702, + "epoch": 0.2878701337742372, + "flos": 26541112273920.0, + "grad_norm": 2.5294664105344555, + "language_loss": 0.83474612, + "learning_rate": 3.2367409111799082e-06, + "loss": 0.85590804, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.57421875, + "step": 4788, + "time_per_iteration": 2.4305059909820557 + }, + { + "auxiliary_loss_clip": 0.01085826, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.02094471, + "balance_loss_mlp": 1.02760315, + "epoch": 0.28793025702690517, + "flos": 28145615760000.0, + "grad_norm": 2.4678657642206128, + "language_loss": 0.74098521, + "learning_rate": 3.23644396355355e-06, + "loss": 0.76222229, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.58203125, + "step": 4789, + "time_per_iteration": 2.4473536014556885 + }, + { + "auxiliary_loss_clip": 0.01079558, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.02096558, + "balance_loss_mlp": 1.0238378, + "epoch": 0.28799038027957313, + "flos": 23804340618240.0, + "grad_norm": 1.7057604328566531, + "language_loss": 0.83811307, + "learning_rate": 3.2361469718016867e-06, + "loss": 0.859285, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55859375, + "step": 4790, + "time_per_iteration": 2.4123950004577637 + }, + { + "auxiliary_loss_clip": 0.01084175, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.01851463, + "balance_loss_mlp": 1.02673221, + "epoch": 0.2880505035322411, + "flos": 22343085907200.0, + "grad_norm": 1.6504365710813103, + "language_loss": 0.77442652, + "learning_rate": 3.2358499359349177e-06, + "loss": 0.79562414, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.57421875, + "step": 4791, + "time_per_iteration": 2.412493944168091 + }, + { + "auxiliary_loss_clip": 0.01084831, + "auxiliary_loss_mlp": 0.01033791, + "balance_loss_clip": 1.01723278, + "balance_loss_mlp": 1.02510417, + "epoch": 0.28811062678490906, + "flos": 18003277042560.0, + "grad_norm": 1.7058950260499903, + "language_loss": 0.7090838, + "learning_rate": 3.2355528559638436e-06, + "loss": 0.73027009, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.59765625, + "step": 4792, + "time_per_iteration": 2.3605997562408447 + }, + { + "auxiliary_loss_clip": 0.01085408, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02001834, + "balance_loss_mlp": 1.02661896, + "epoch": 0.28817075003757703, + "flos": 22089790926720.0, + "grad_norm": 1.8470566608293222, + "language_loss": 0.78890079, + "learning_rate": 3.235255731899066e-06, + "loss": 0.81013101, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5859375, + "step": 4793, + "time_per_iteration": 2.398627519607544 + }, + { + "auxiliary_loss_clip": 0.01081445, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.01828766, + "balance_loss_mlp": 1.02635789, + "epoch": 0.288230873290245, + "flos": 41681512229760.0, + "grad_norm": 1.5947705443077573, + "language_loss": 0.78010929, + "learning_rate": 3.2349585637511896e-06, + "loss": 0.80126941, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.55078125, + "step": 4794, + "time_per_iteration": 3.9291160106658936 + }, + { + "auxiliary_loss_clip": 0.01083122, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_clip": 1.02718687, + "balance_loss_mlp": 1.02604651, + "epoch": 0.28829099654291296, + "flos": 18623439256320.0, + "grad_norm": 1.9674666181075553, + "language_loss": 0.82565165, + "learning_rate": 3.2346613515308176e-06, + "loss": 0.84692788, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5703125, + "step": 4795, + "time_per_iteration": 2.3600006103515625 + }, + { + "auxiliary_loss_clip": 0.0108034, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.01966619, + "balance_loss_mlp": 1.0264374, + "epoch": 0.2883511197955809, + "flos": 24673852828800.0, + "grad_norm": 1.9384913000177635, + "language_loss": 0.74597645, + "learning_rate": 3.2343640952485586e-06, + "loss": 0.76713276, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5390625, + "step": 4796, + "time_per_iteration": 2.3823251724243164 + }, + { + "auxiliary_loss_clip": 0.01086263, + "auxiliary_loss_mlp": 0.01036379, + "balance_loss_clip": 1.01729381, + "balance_loss_mlp": 1.0253582, + "epoch": 0.2884112430482489, + "flos": 23111035372800.0, + "grad_norm": 2.4303103980644996, + "language_loss": 0.72752434, + "learning_rate": 3.23406679491502e-06, + "loss": 0.74875081, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.609375, + "step": 4797, + "time_per_iteration": 3.788723945617676 + }, + { + "auxiliary_loss_clip": 0.01081171, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.01852298, + "balance_loss_mlp": 1.02505696, + "epoch": 0.28847136630091685, + "flos": 16872405327360.0, + "grad_norm": 2.0830388666046225, + "language_loss": 0.74375105, + "learning_rate": 3.2337694505408117e-06, + "loss": 0.76491398, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5625, + "step": 4798, + "time_per_iteration": 2.3544442653656006 + }, + { + "auxiliary_loss_clip": 0.01083517, + "auxiliary_loss_mlp": 0.01040551, + "balance_loss_clip": 1.02148914, + "balance_loss_mlp": 1.02441955, + "epoch": 0.2885314895535849, + "flos": 25656588178560.0, + "grad_norm": 3.58061133885997, + "language_loss": 0.65353239, + "learning_rate": 3.2334720621365457e-06, + "loss": 0.67477304, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.59375, + "step": 4799, + "time_per_iteration": 2.393225908279419 + }, + { + "auxiliary_loss_clip": 0.01082389, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.02271986, + "balance_loss_mlp": 1.02558613, + "epoch": 0.28859161280625284, + "flos": 21106147881600.0, + "grad_norm": 2.0266151342269616, + "language_loss": 0.82235265, + "learning_rate": 3.2331746297128345e-06, + "loss": 0.84357375, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5703125, + "step": 4800, + "time_per_iteration": 2.398787260055542 + }, + { + "auxiliary_loss_clip": 0.01079156, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.01694703, + "balance_loss_mlp": 1.02517045, + "epoch": 0.2886517360589208, + "flos": 26468318355840.0, + "grad_norm": 2.205300128290066, + "language_loss": 0.90891337, + "learning_rate": 3.2328771532802934e-06, + "loss": 0.93003696, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5390625, + "step": 4801, + "time_per_iteration": 3.7470779418945312 + }, + { + "auxiliary_loss_clip": 0.01082675, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.01758766, + "balance_loss_mlp": 1.02651453, + "epoch": 0.28871185931158877, + "flos": 25264094140800.0, + "grad_norm": 5.102993571991013, + "language_loss": 0.73611045, + "learning_rate": 3.232579632849537e-06, + "loss": 0.75728464, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5625, + "step": 4802, + "time_per_iteration": 2.443101406097412 + }, + { + "auxiliary_loss_clip": 0.01020999, + "auxiliary_loss_mlp": 0.01012151, + "balance_loss_clip": 1.0105896, + "balance_loss_mlp": 1.00733137, + "epoch": 0.28877198256425674, + "flos": 66662390488320.0, + "grad_norm": 0.7838931010390363, + "language_loss": 0.63035232, + "learning_rate": 3.232282068431185e-06, + "loss": 0.65068382, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.13671875, + "step": 4803, + "time_per_iteration": 2.984715700149536 + }, + { + "auxiliary_loss_clip": 0.01080544, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.02194643, + "balance_loss_mlp": 1.02412534, + "epoch": 0.2888321058169247, + "flos": 20301993469440.0, + "grad_norm": 1.7477692837910497, + "language_loss": 0.70655793, + "learning_rate": 3.2319844600358554e-06, + "loss": 0.72774214, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.56640625, + "step": 4804, + "time_per_iteration": 3.8123722076416016 + }, + { + "auxiliary_loss_clip": 0.01083416, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.01964021, + "balance_loss_mlp": 1.02464998, + "epoch": 0.28889222906959267, + "flos": 25515643553280.0, + "grad_norm": 2.1381345292100202, + "language_loss": 0.68044317, + "learning_rate": 3.231686807674169e-06, + "loss": 0.70165426, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.5859375, + "step": 4805, + "time_per_iteration": 2.402233839035034 + }, + { + "auxiliary_loss_clip": 0.01081487, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.01514745, + "balance_loss_mlp": 1.02434111, + "epoch": 0.28895235232226063, + "flos": 32669940493440.0, + "grad_norm": 1.3754379833743893, + "language_loss": 0.69341135, + "learning_rate": 3.2313891113567496e-06, + "loss": 0.71454567, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 4806, + "time_per_iteration": 2.4646334648132324 + }, + { + "auxiliary_loss_clip": 0.01079249, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02004588, + "balance_loss_mlp": 1.02471232, + "epoch": 0.2890124755749286, + "flos": 29713425540480.0, + "grad_norm": 1.5752426751402027, + "language_loss": 0.71645749, + "learning_rate": 3.2310913710942193e-06, + "loss": 0.73761404, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 4807, + "time_per_iteration": 2.4353389739990234 + }, + { + "auxiliary_loss_clip": 0.01081557, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01435733, + "balance_loss_mlp": 1.02440739, + "epoch": 0.28907259882759656, + "flos": 22673364168960.0, + "grad_norm": 1.82325769433667, + "language_loss": 0.81497622, + "learning_rate": 3.2307935868972055e-06, + "loss": 0.83609927, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4808, + "time_per_iteration": 2.391050100326538 + }, + { + "auxiliary_loss_clip": 0.01078893, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.02381706, + "balance_loss_mlp": 1.02490616, + "epoch": 0.2891327220802645, + "flos": 22564923886080.0, + "grad_norm": 1.4461880787855734, + "language_loss": 0.78218162, + "learning_rate": 3.2304957587763344e-06, + "loss": 0.80337512, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5390625, + "step": 4809, + "time_per_iteration": 2.385453701019287 + }, + { + "auxiliary_loss_clip": 0.01084992, + "auxiliary_loss_mlp": 0.01040933, + "balance_loss_clip": 1.02265799, + "balance_loss_mlp": 1.0242722, + "epoch": 0.2891928453329325, + "flos": 21651735697920.0, + "grad_norm": 1.7445743669993674, + "language_loss": 0.7866075, + "learning_rate": 3.2301978867422352e-06, + "loss": 0.80786681, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.609375, + "step": 4810, + "time_per_iteration": 2.392239570617676 + }, + { + "auxiliary_loss_clip": 0.01080455, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.01421237, + "balance_loss_mlp": 1.02487373, + "epoch": 0.28925296858560046, + "flos": 23220976844160.0, + "grad_norm": 1.727744313539586, + "language_loss": 0.76764137, + "learning_rate": 3.2298999708055375e-06, + "loss": 0.78874707, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5546875, + "step": 4811, + "time_per_iteration": 2.3839938640594482 + }, + { + "auxiliary_loss_clip": 0.01079242, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.02209604, + "balance_loss_mlp": 1.02362239, + "epoch": 0.2893130918382685, + "flos": 28620399605760.0, + "grad_norm": 1.4053197116203917, + "language_loss": 0.77415568, + "learning_rate": 3.229602010976873e-06, + "loss": 0.79533941, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5546875, + "step": 4812, + "time_per_iteration": 2.4538538455963135 + }, + { + "auxiliary_loss_clip": 0.0101508, + "auxiliary_loss_mlp": 0.01020411, + "balance_loss_clip": 1.01896822, + "balance_loss_mlp": 1.00233889, + "epoch": 0.28937321509093644, + "flos": 72297615772800.0, + "grad_norm": 0.8436769741053833, + "language_loss": 0.60269272, + "learning_rate": 3.2293040072668768e-06, + "loss": 0.62304771, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.12695312, + "step": 4813, + "time_per_iteration": 3.167846441268921 + }, + { + "auxiliary_loss_clip": 0.0107974, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.01550364, + "balance_loss_mlp": 1.02399731, + "epoch": 0.2894333383436044, + "flos": 16215479585280.0, + "grad_norm": 2.7462003041272225, + "language_loss": 0.82181168, + "learning_rate": 3.229005959686182e-06, + "loss": 0.84292722, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.55859375, + "step": 4814, + "time_per_iteration": 2.3672327995300293 + }, + { + "auxiliary_loss_clip": 0.01085028, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.02534032, + "balance_loss_mlp": 1.02587438, + "epoch": 0.2894934615962724, + "flos": 24827086252800.0, + "grad_norm": 1.5383767770677812, + "language_loss": 0.72341979, + "learning_rate": 3.2287078682454255e-06, + "loss": 0.74469769, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59375, + "step": 4815, + "time_per_iteration": 2.4240670204162598 + }, + { + "auxiliary_loss_clip": 0.01078624, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.02207923, + "balance_loss_mlp": 1.02578044, + "epoch": 0.28955358484894034, + "flos": 20448907937280.0, + "grad_norm": 1.3841100290299935, + "language_loss": 0.70090729, + "learning_rate": 3.2284097329552465e-06, + "loss": 0.72206384, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.52734375, + "step": 4816, + "time_per_iteration": 2.4304616451263428 + }, + { + "auxiliary_loss_clip": 0.01085138, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.01666856, + "balance_loss_mlp": 1.02484918, + "epoch": 0.2896137081016083, + "flos": 22564086013440.0, + "grad_norm": 2.0496427179592764, + "language_loss": 0.73847157, + "learning_rate": 3.2281115538262844e-06, + "loss": 0.75967181, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.6015625, + "step": 4817, + "time_per_iteration": 2.4246084690093994 + }, + { + "auxiliary_loss_clip": 0.01085699, + "auxiliary_loss_mlp": 0.01038941, + "balance_loss_clip": 1.02074957, + "balance_loss_mlp": 1.02650654, + "epoch": 0.28967383135427627, + "flos": 26686735021440.0, + "grad_norm": 1.6915612583310022, + "language_loss": 0.76172686, + "learning_rate": 3.227813330869179e-06, + "loss": 0.78297329, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 4818, + "time_per_iteration": 2.442655324935913 + }, + { + "auxiliary_loss_clip": 0.01083231, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.01321888, + "balance_loss_mlp": 1.0253706, + "epoch": 0.28973395460694423, + "flos": 15557401768320.0, + "grad_norm": 1.8303846110340454, + "language_loss": 0.79493523, + "learning_rate": 3.2275150640945742e-06, + "loss": 0.81607044, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.578125, + "step": 4819, + "time_per_iteration": 2.3795225620269775 + }, + { + "auxiliary_loss_clip": 0.01086843, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.01699209, + "balance_loss_mlp": 1.02553666, + "epoch": 0.2897940778596122, + "flos": 18696477553920.0, + "grad_norm": 1.896152999150647, + "language_loss": 0.78887463, + "learning_rate": 3.227216753513115e-06, + "loss": 0.81011081, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.609375, + "step": 4820, + "time_per_iteration": 2.36979603767395 + }, + { + "auxiliary_loss_clip": 0.01086203, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.01493227, + "balance_loss_mlp": 1.02753949, + "epoch": 0.28985420111228016, + "flos": 18769306383360.0, + "grad_norm": 2.1469533656701505, + "language_loss": 0.7271353, + "learning_rate": 3.2269183991354464e-06, + "loss": 0.74831295, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5859375, + "step": 4821, + "time_per_iteration": 2.354018211364746 + }, + { + "auxiliary_loss_clip": 0.0108335, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.01625037, + "balance_loss_mlp": 1.02552605, + "epoch": 0.28991432436494813, + "flos": 23068895495040.0, + "grad_norm": 1.7789812989965386, + "language_loss": 0.72078979, + "learning_rate": 3.226620000972216e-06, + "loss": 0.74194992, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.578125, + "step": 4822, + "time_per_iteration": 2.399756669998169 + }, + { + "auxiliary_loss_clip": 0.01082567, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.02093506, + "balance_loss_mlp": 1.02557909, + "epoch": 0.2899744476176161, + "flos": 17602229721600.0, + "grad_norm": 1.6526940471716602, + "language_loss": 0.66189766, + "learning_rate": 3.2263215590340726e-06, + "loss": 0.68309653, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4823, + "time_per_iteration": 2.3464319705963135 + }, + { + "auxiliary_loss_clip": 0.01081034, + "auxiliary_loss_mlp": 0.01032701, + "balance_loss_clip": 1.01583338, + "balance_loss_mlp": 1.02479911, + "epoch": 0.29003457087028406, + "flos": 22308277415040.0, + "grad_norm": 2.2258522926954902, + "language_loss": 0.81182373, + "learning_rate": 3.2260230733316683e-06, + "loss": 0.83296108, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5625, + "step": 4824, + "time_per_iteration": 2.3960704803466797 + }, + { + "auxiliary_loss_clip": 0.0108277, + "auxiliary_loss_mlp": 0.01038686, + "balance_loss_clip": 1.02033961, + "balance_loss_mlp": 1.02461863, + "epoch": 0.2900946941229521, + "flos": 21943888888320.0, + "grad_norm": 2.0599624089569186, + "language_loss": 0.7224611, + "learning_rate": 3.2257245438756534e-06, + "loss": 0.74367565, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.58203125, + "step": 4825, + "time_per_iteration": 2.3737518787384033 + }, + { + "auxiliary_loss_clip": 0.01085591, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.01271844, + "balance_loss_mlp": 1.02707791, + "epoch": 0.29015481737562004, + "flos": 17931181351680.0, + "grad_norm": 2.2898350457481857, + "language_loss": 0.73085475, + "learning_rate": 3.2254259706766824e-06, + "loss": 0.75201225, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5859375, + "step": 4826, + "time_per_iteration": 2.3637855052948 + }, + { + "auxiliary_loss_clip": 0.01080506, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.01483226, + "balance_loss_mlp": 1.02390599, + "epoch": 0.290214940628288, + "flos": 22782432856320.0, + "grad_norm": 3.359252595343814, + "language_loss": 0.68858981, + "learning_rate": 3.2251273537454113e-06, + "loss": 0.70971298, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.56640625, + "step": 4827, + "time_per_iteration": 2.3695380687713623 + }, + { + "auxiliary_loss_clip": 0.01084042, + "auxiliary_loss_mlp": 0.01035897, + "balance_loss_clip": 1.01819396, + "balance_loss_mlp": 1.0265789, + "epoch": 0.290275063880956, + "flos": 20005581093120.0, + "grad_norm": 1.824959284257807, + "language_loss": 0.80134833, + "learning_rate": 3.224828693092496e-06, + "loss": 0.82254779, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.57421875, + "step": 4828, + "time_per_iteration": 2.381049871444702 + }, + { + "auxiliary_loss_clip": 0.01082451, + "auxiliary_loss_mlp": 0.0104034, + "balance_loss_clip": 1.02346587, + "balance_loss_mlp": 1.02582574, + "epoch": 0.29033518713362394, + "flos": 22052538639360.0, + "grad_norm": 1.859359626107, + "language_loss": 0.80581927, + "learning_rate": 3.2245299887285954e-06, + "loss": 0.82704711, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.56640625, + "step": 4829, + "time_per_iteration": 2.3636674880981445 + }, + { + "auxiliary_loss_clip": 0.01082061, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.01545668, + "balance_loss_mlp": 1.02618432, + "epoch": 0.2903953103862919, + "flos": 25628866692480.0, + "grad_norm": 1.6939863258155763, + "language_loss": 0.82723534, + "learning_rate": 3.224231240664369e-06, + "loss": 0.84837615, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55859375, + "step": 4830, + "time_per_iteration": 2.428407907485962 + }, + { + "auxiliary_loss_clip": 0.01083135, + "auxiliary_loss_mlp": 0.010374, + "balance_loss_clip": 1.02028179, + "balance_loss_mlp": 1.02500689, + "epoch": 0.29045543363895987, + "flos": 16944919954560.0, + "grad_norm": 2.582446349843684, + "language_loss": 0.79058111, + "learning_rate": 3.223932448910479e-06, + "loss": 0.81178641, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58203125, + "step": 4831, + "time_per_iteration": 2.3361928462982178 + }, + { + "auxiliary_loss_clip": 0.01078225, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.01888967, + "balance_loss_mlp": 1.02458739, + "epoch": 0.29051555689162784, + "flos": 26394302540160.0, + "grad_norm": 1.6636560669260019, + "language_loss": 0.74149847, + "learning_rate": 3.2236336134775883e-06, + "loss": 0.76262271, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53515625, + "step": 4832, + "time_per_iteration": 2.414759874343872 + }, + { + "auxiliary_loss_clip": 0.01080248, + "auxiliary_loss_mlp": 0.01028424, + "balance_loss_clip": 1.01357675, + "balance_loss_mlp": 1.02451015, + "epoch": 0.2905756801442958, + "flos": 21102866213760.0, + "grad_norm": 1.6261987899159107, + "language_loss": 0.76170707, + "learning_rate": 3.2233347343763614e-06, + "loss": 0.78279382, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5546875, + "step": 4833, + "time_per_iteration": 3.8914058208465576 + }, + { + "auxiliary_loss_clip": 0.01078607, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.01640415, + "balance_loss_mlp": 1.02417588, + "epoch": 0.29063580339696377, + "flos": 15705154108800.0, + "grad_norm": 1.7451189603754897, + "language_loss": 0.6479373, + "learning_rate": 3.2230358116174645e-06, + "loss": 0.66904545, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 4834, + "time_per_iteration": 2.359287977218628 + }, + { + "auxiliary_loss_clip": 0.01077039, + "auxiliary_loss_mlp": 0.01025505, + "balance_loss_clip": 1.00991201, + "balance_loss_mlp": 1.02370596, + "epoch": 0.29069592664963173, + "flos": 24643827192960.0, + "grad_norm": 1.6971943668901526, + "language_loss": 0.71625978, + "learning_rate": 3.2227368452115658e-06, + "loss": 0.73728526, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 4835, + "time_per_iteration": 2.399034023284912 + }, + { + "auxiliary_loss_clip": 0.01080264, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.01929474, + "balance_loss_mlp": 1.02554822, + "epoch": 0.2907560499022997, + "flos": 24972569354880.0, + "grad_norm": 1.5955799720644368, + "language_loss": 0.79172188, + "learning_rate": 3.2224378351693337e-06, + "loss": 0.8128677, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.546875, + "step": 4836, + "time_per_iteration": 3.7992281913757324 + }, + { + "auxiliary_loss_clip": 0.01078332, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.01762795, + "balance_loss_mlp": 1.02478957, + "epoch": 0.29081617315496766, + "flos": 18656606914560.0, + "grad_norm": 1.6570337145681453, + "language_loss": 0.80692613, + "learning_rate": 3.2221387815014405e-06, + "loss": 0.82804132, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 4837, + "time_per_iteration": 2.3469960689544678 + }, + { + "auxiliary_loss_clip": 0.01079001, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.01747775, + "balance_loss_mlp": 1.0222944, + "epoch": 0.2908762964076356, + "flos": 35329693956480.0, + "grad_norm": 1.8333048825527871, + "language_loss": 0.6711567, + "learning_rate": 3.2218396842185576e-06, + "loss": 0.69228733, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5703125, + "step": 4838, + "time_per_iteration": 2.5223610401153564 + }, + { + "auxiliary_loss_clip": 0.01084103, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.01902723, + "balance_loss_mlp": 1.02553546, + "epoch": 0.29093641966030365, + "flos": 23075179539840.0, + "grad_norm": 1.6165504959692143, + "language_loss": 0.71655321, + "learning_rate": 3.2215405433313595e-06, + "loss": 0.73776925, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.5859375, + "step": 4839, + "time_per_iteration": 2.3829758167266846 + }, + { + "auxiliary_loss_clip": 0.01079707, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.01615405, + "balance_loss_mlp": 1.02491629, + "epoch": 0.2909965429129716, + "flos": 35953940799360.0, + "grad_norm": 1.795211431657631, + "language_loss": 0.76032734, + "learning_rate": 3.221241358850521e-06, + "loss": 0.78143358, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.546875, + "step": 4840, + "time_per_iteration": 3.8981010913848877 + }, + { + "auxiliary_loss_clip": 0.01081852, + "auxiliary_loss_mlp": 0.01038773, + "balance_loss_clip": 1.02273917, + "balance_loss_mlp": 1.02586091, + "epoch": 0.2910566661656396, + "flos": 30879001013760.0, + "grad_norm": 1.7548457654858056, + "language_loss": 0.60732472, + "learning_rate": 3.2209421307867205e-06, + "loss": 0.62853098, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55859375, + "step": 4841, + "time_per_iteration": 2.4533472061157227 + }, + { + "auxiliary_loss_clip": 0.01080434, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.0171665, + "balance_loss_mlp": 1.0246284, + "epoch": 0.29111678941830754, + "flos": 30008825487360.0, + "grad_norm": 1.4024801716579982, + "language_loss": 0.71260989, + "learning_rate": 3.2206428591506358e-06, + "loss": 0.73375505, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.55859375, + "step": 4842, + "time_per_iteration": 2.4548144340515137 + }, + { + "auxiliary_loss_clip": 0.01078416, + "auxiliary_loss_mlp": 0.01034786, + "balance_loss_clip": 1.01953864, + "balance_loss_mlp": 1.02398825, + "epoch": 0.2911769126709755, + "flos": 22856274115200.0, + "grad_norm": 1.6124706220855658, + "language_loss": 0.72992384, + "learning_rate": 3.220343543952947e-06, + "loss": 0.75105584, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.546875, + "step": 4843, + "time_per_iteration": 3.7670974731445312 + }, + { + "auxiliary_loss_clip": 0.01078737, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.01528931, + "balance_loss_mlp": 1.02261329, + "epoch": 0.2912370359236435, + "flos": 21649501370880.0, + "grad_norm": 2.592727729122953, + "language_loss": 0.58053476, + "learning_rate": 3.2200441852043367e-06, + "loss": 0.60164273, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 4844, + "time_per_iteration": 2.3729407787323 + }, + { + "auxiliary_loss_clip": 0.01085042, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.01942754, + "balance_loss_mlp": 1.02686727, + "epoch": 0.29129715917631144, + "flos": 22892234682240.0, + "grad_norm": 2.1923817293059846, + "language_loss": 0.86279273, + "learning_rate": 3.2197447829154875e-06, + "loss": 0.88400924, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58203125, + "step": 4845, + "time_per_iteration": 2.4020376205444336 + }, + { + "auxiliary_loss_clip": 0.01081456, + "auxiliary_loss_mlp": 0.01039662, + "balance_loss_clip": 1.02263904, + "balance_loss_mlp": 1.02454257, + "epoch": 0.2913572824289794, + "flos": 22673364168960.0, + "grad_norm": 1.8233020624281395, + "language_loss": 0.6660794, + "learning_rate": 3.2194453370970844e-06, + "loss": 0.68729067, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5703125, + "step": 4846, + "time_per_iteration": 2.383287191390991 + }, + { + "auxiliary_loss_clip": 0.01080792, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.0143373, + "balance_loss_mlp": 1.02626097, + "epoch": 0.29141740568164737, + "flos": 23106427073280.0, + "grad_norm": 2.9322882141661952, + "language_loss": 0.70153689, + "learning_rate": 3.219145847759814e-06, + "loss": 0.72264624, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.54296875, + "step": 4847, + "time_per_iteration": 2.4253220558166504 + }, + { + "auxiliary_loss_clip": 0.01080213, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.01851881, + "balance_loss_mlp": 1.02504373, + "epoch": 0.29147752893431533, + "flos": 23585889041280.0, + "grad_norm": 1.5276903295987276, + "language_loss": 0.74004263, + "learning_rate": 3.218846314914365e-06, + "loss": 0.76119471, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.55078125, + "step": 4848, + "time_per_iteration": 2.3964364528656006 + }, + { + "auxiliary_loss_clip": 0.01080892, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.01500416, + "balance_loss_mlp": 1.02441859, + "epoch": 0.2915376521869833, + "flos": 20591004637440.0, + "grad_norm": 2.0042220062771046, + "language_loss": 0.76787412, + "learning_rate": 3.218546738571425e-06, + "loss": 0.78900695, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 4849, + "time_per_iteration": 2.3720860481262207 + }, + { + "auxiliary_loss_clip": 0.01083458, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.01480842, + "balance_loss_mlp": 1.02582693, + "epoch": 0.29159777543965126, + "flos": 20810503555200.0, + "grad_norm": 1.745789091985758, + "language_loss": 0.78179145, + "learning_rate": 3.2182471187416874e-06, + "loss": 0.80294496, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.578125, + "step": 4850, + "time_per_iteration": 2.3725900650024414 + }, + { + "auxiliary_loss_clip": 0.01081255, + "auxiliary_loss_mlp": 0.010333, + "balance_loss_clip": 1.01680112, + "balance_loss_mlp": 1.02514887, + "epoch": 0.29165789869231923, + "flos": 24242989340160.0, + "grad_norm": 1.9998012325708516, + "language_loss": 0.75585085, + "learning_rate": 3.2179474554358438e-06, + "loss": 0.77699637, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5625, + "step": 4851, + "time_per_iteration": 2.413799285888672 + }, + { + "auxiliary_loss_clip": 0.01079551, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.02114487, + "balance_loss_mlp": 1.02531195, + "epoch": 0.29171802194498725, + "flos": 28948653008640.0, + "grad_norm": 1.390250495806322, + "language_loss": 0.77273381, + "learning_rate": 3.2176477486645883e-06, + "loss": 0.79389066, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.54296875, + "step": 4852, + "time_per_iteration": 2.4361729621887207 + }, + { + "auxiliary_loss_clip": 0.01079641, + "auxiliary_loss_mlp": 0.01039729, + "balance_loss_clip": 1.0230999, + "balance_loss_mlp": 1.02406645, + "epoch": 0.2917781451976552, + "flos": 22597218760320.0, + "grad_norm": 1.530315160990392, + "language_loss": 0.77832162, + "learning_rate": 3.2173479984386165e-06, + "loss": 0.79951537, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5546875, + "step": 4853, + "time_per_iteration": 2.4366989135742188 + }, + { + "auxiliary_loss_clip": 0.0108095, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.01707482, + "balance_loss_mlp": 1.02429318, + "epoch": 0.2918382684503232, + "flos": 21573530519040.0, + "grad_norm": 2.513343578178358, + "language_loss": 0.88420606, + "learning_rate": 3.217048204768626e-06, + "loss": 0.90535486, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.56640625, + "step": 4854, + "time_per_iteration": 2.348270893096924 + }, + { + "auxiliary_loss_clip": 0.01082857, + "auxiliary_loss_mlp": 0.01041459, + "balance_loss_clip": 1.02329183, + "balance_loss_mlp": 1.02627993, + "epoch": 0.29189839170299114, + "flos": 24352337318400.0, + "grad_norm": 1.7781811804924719, + "language_loss": 0.86646366, + "learning_rate": 3.2167483676653167e-06, + "loss": 0.88770688, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.56640625, + "step": 4855, + "time_per_iteration": 2.4230921268463135 + }, + { + "auxiliary_loss_clip": 0.0102273, + "auxiliary_loss_mlp": 0.01013921, + "balance_loss_clip": 1.01213312, + "balance_loss_mlp": 1.01027477, + "epoch": 0.2919585149556591, + "flos": 71313065032320.0, + "grad_norm": 0.8010861439518471, + "language_loss": 0.60175705, + "learning_rate": 3.216448487139387e-06, + "loss": 0.6221236, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.12451172, + "step": 4856, + "time_per_iteration": 3.1278514862060547 + }, + { + "auxiliary_loss_clip": 0.01079198, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.01648211, + "balance_loss_mlp": 1.02405477, + "epoch": 0.2920186382083271, + "flos": 15632290368000.0, + "grad_norm": 2.1411161978615634, + "language_loss": 0.68328226, + "learning_rate": 3.2161485632015397e-06, + "loss": 0.70438963, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.55078125, + "step": 4857, + "time_per_iteration": 2.403705358505249 + }, + { + "auxiliary_loss_clip": 0.01081421, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.01780152, + "balance_loss_mlp": 1.02615559, + "epoch": 0.29207876146099504, + "flos": 28364765564160.0, + "grad_norm": 1.9635585952318306, + "language_loss": 0.84004754, + "learning_rate": 3.2158485958624794e-06, + "loss": 0.86119872, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5546875, + "step": 4858, + "time_per_iteration": 2.444014549255371 + }, + { + "auxiliary_loss_clip": 0.01082481, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.01531112, + "balance_loss_mlp": 1.02627563, + "epoch": 0.292138884713663, + "flos": 21869907984000.0, + "grad_norm": 1.8162958687013442, + "language_loss": 0.77836138, + "learning_rate": 3.2155485851329095e-06, + "loss": 0.79950237, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5625, + "step": 4859, + "time_per_iteration": 2.398045539855957 + }, + { + "auxiliary_loss_clip": 0.01084891, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.01970196, + "balance_loss_mlp": 1.02530432, + "epoch": 0.29219900796633097, + "flos": 20991598110720.0, + "grad_norm": 3.12915696884728, + "language_loss": 0.68915278, + "learning_rate": 3.215248531023538e-06, + "loss": 0.71036941, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.59765625, + "step": 4860, + "time_per_iteration": 2.3619866371154785 + }, + { + "auxiliary_loss_clip": 0.01081391, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.0205512, + "balance_loss_mlp": 1.02690506, + "epoch": 0.29225913121899894, + "flos": 35003221032960.0, + "grad_norm": 2.084397699268929, + "language_loss": 0.75611734, + "learning_rate": 3.2149484335450722e-06, + "loss": 0.7772944, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 4861, + "time_per_iteration": 2.5306544303894043 + }, + { + "auxiliary_loss_clip": 0.01079733, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.02440238, + "balance_loss_mlp": 1.02536917, + "epoch": 0.2923192544716669, + "flos": 13514843053440.0, + "grad_norm": 1.6225731800692211, + "language_loss": 0.90992594, + "learning_rate": 3.2146482927082216e-06, + "loss": 0.93112034, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.54296875, + "step": 4862, + "time_per_iteration": 2.3501853942871094 + }, + { + "auxiliary_loss_clip": 0.01081327, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.01696754, + "balance_loss_mlp": 1.02506638, + "epoch": 0.29237937772433487, + "flos": 19462506894720.0, + "grad_norm": 2.25288590517054, + "language_loss": 0.73321408, + "learning_rate": 3.214348108523698e-06, + "loss": 0.75434983, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5625, + "step": 4863, + "time_per_iteration": 2.378408908843994 + }, + { + "auxiliary_loss_clip": 0.01077311, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.01527655, + "balance_loss_mlp": 1.02555752, + "epoch": 0.29243950097700283, + "flos": 20849536321920.0, + "grad_norm": 1.7460244344959828, + "language_loss": 0.7778933, + "learning_rate": 3.214047881002214e-06, + "loss": 0.79897034, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.515625, + "step": 4864, + "time_per_iteration": 2.360659599304199 + }, + { + "auxiliary_loss_clip": 0.01083728, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.01857841, + "balance_loss_mlp": 1.02650046, + "epoch": 0.29249962422967085, + "flos": 23583165955200.0, + "grad_norm": 5.269798209451896, + "language_loss": 0.8133713, + "learning_rate": 3.2137476101544848e-06, + "loss": 0.83457494, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.57421875, + "step": 4865, + "time_per_iteration": 2.4074978828430176 + }, + { + "auxiliary_loss_clip": 0.01082623, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.01410413, + "balance_loss_mlp": 1.02608252, + "epoch": 0.2925597474823388, + "flos": 22272247025280.0, + "grad_norm": 1.7860263103481007, + "language_loss": 0.85093796, + "learning_rate": 3.213447295991225e-06, + "loss": 0.87208021, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5625, + "step": 4866, + "time_per_iteration": 2.3721225261688232 + }, + { + "auxiliary_loss_clip": 0.01077431, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.01423407, + "balance_loss_mlp": 1.02409363, + "epoch": 0.2926198707350068, + "flos": 34454770485120.0, + "grad_norm": 1.8350695441760498, + "language_loss": 0.75842911, + "learning_rate": 3.2131469385231525e-06, + "loss": 0.77949834, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53515625, + "step": 4867, + "time_per_iteration": 2.498326539993286 + }, + { + "auxiliary_loss_clip": 0.01082298, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.02264643, + "balance_loss_mlp": 1.02520621, + "epoch": 0.29267999398767475, + "flos": 20703110613120.0, + "grad_norm": 1.8880790595271757, + "language_loss": 0.73381352, + "learning_rate": 3.212846537760986e-06, + "loss": 0.75502706, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 4868, + "time_per_iteration": 2.372096300125122 + }, + { + "auxiliary_loss_clip": 0.01080101, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.01271343, + "balance_loss_mlp": 1.02560258, + "epoch": 0.2927401172403427, + "flos": 18367700480640.0, + "grad_norm": 1.4892387846479225, + "language_loss": 0.74384058, + "learning_rate": 3.212546093715447e-06, + "loss": 0.76492941, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.546875, + "step": 4869, + "time_per_iteration": 2.385347366333008 + }, + { + "auxiliary_loss_clip": 0.01081931, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.01398015, + "balance_loss_mlp": 1.02586901, + "epoch": 0.2928002404930107, + "flos": 26102847576960.0, + "grad_norm": 1.514749176376369, + "language_loss": 0.76660168, + "learning_rate": 3.2122456063972567e-06, + "loss": 0.78772056, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5625, + "step": 4870, + "time_per_iteration": 2.4170315265655518 + }, + { + "auxiliary_loss_clip": 0.01085061, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02070534, + "balance_loss_mlp": 1.02652979, + "epoch": 0.29286036374567864, + "flos": 21323656851840.0, + "grad_norm": 1.9955823115410365, + "language_loss": 0.67672682, + "learning_rate": 3.2119450758171393e-06, + "loss": 0.69796467, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.5859375, + "step": 4871, + "time_per_iteration": 2.3917009830474854 + }, + { + "auxiliary_loss_clip": 0.0107787, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.01746511, + "balance_loss_mlp": 1.02306676, + "epoch": 0.2929204869983466, + "flos": 29568221729280.0, + "grad_norm": 1.8462850604659118, + "language_loss": 0.77092016, + "learning_rate": 3.2116445019858196e-06, + "loss": 0.79202926, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.546875, + "step": 4872, + "time_per_iteration": 2.4423136711120605 + }, + { + "auxiliary_loss_clip": 0.01084541, + "auxiliary_loss_mlp": 0.01037598, + "balance_loss_clip": 1.01919222, + "balance_loss_mlp": 1.02650368, + "epoch": 0.2929806102510146, + "flos": 19057374944640.0, + "grad_norm": 1.8819621063513206, + "language_loss": 0.72737408, + "learning_rate": 3.211343884914024e-06, + "loss": 0.74859548, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.58203125, + "step": 4873, + "time_per_iteration": 3.804901599884033 + }, + { + "auxiliary_loss_clip": 0.01080528, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.01541924, + "balance_loss_mlp": 1.02258682, + "epoch": 0.29304073350368254, + "flos": 21943155749760.0, + "grad_norm": 3.6052179327819207, + "language_loss": 0.78284812, + "learning_rate": 3.211043224612481e-06, + "loss": 0.80398273, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.578125, + "step": 4874, + "time_per_iteration": 2.3735947608947754 + }, + { + "auxiliary_loss_clip": 0.01084608, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.01491666, + "balance_loss_mlp": 1.02578545, + "epoch": 0.2931008567563505, + "flos": 15449904092160.0, + "grad_norm": 23.99480383454982, + "language_loss": 0.77402413, + "learning_rate": 3.2107425210919204e-06, + "loss": 0.79519665, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58984375, + "step": 4875, + "time_per_iteration": 2.3539669513702393 + }, + { + "auxiliary_loss_clip": 0.01084258, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.01494789, + "balance_loss_mlp": 1.0282594, + "epoch": 0.29316098000901847, + "flos": 16982207153280.0, + "grad_norm": 1.8286495504906035, + "language_loss": 0.69097143, + "learning_rate": 3.2104417743630742e-06, + "loss": 0.71213758, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 4876, + "time_per_iteration": 3.7695279121398926 + }, + { + "auxiliary_loss_clip": 0.01080079, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.01652789, + "balance_loss_mlp": 1.02518678, + "epoch": 0.29322110326168643, + "flos": 16356912969600.0, + "grad_norm": 2.231555279326235, + "language_loss": 0.82430893, + "learning_rate": 3.2101409844366743e-06, + "loss": 0.84543246, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.546875, + "step": 4877, + "time_per_iteration": 2.3721048831939697 + }, + { + "auxiliary_loss_clip": 0.01084541, + "auxiliary_loss_mlp": 0.01034515, + "balance_loss_clip": 1.01792073, + "balance_loss_mlp": 1.02639866, + "epoch": 0.29328122651435445, + "flos": 13990010924160.0, + "grad_norm": 2.421226651180059, + "language_loss": 0.68025893, + "learning_rate": 3.209840151323456e-06, + "loss": 0.70144951, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.58203125, + "step": 4878, + "time_per_iteration": 2.3418586254119873 + }, + { + "auxiliary_loss_clip": 0.01081009, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.02038836, + "balance_loss_mlp": 1.02516055, + "epoch": 0.2933413497670224, + "flos": 25263430824960.0, + "grad_norm": 2.170631108356211, + "language_loss": 0.73805404, + "learning_rate": 3.2095392750341543e-06, + "loss": 0.75924385, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.55859375, + "step": 4879, + "time_per_iteration": 2.4113054275512695 + }, + { + "auxiliary_loss_clip": 0.01086212, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.0185858, + "balance_loss_mlp": 1.02735305, + "epoch": 0.2934014730196904, + "flos": 32122397640960.0, + "grad_norm": 1.835718001825746, + "language_loss": 0.66645366, + "learning_rate": 3.209238355579507e-06, + "loss": 0.68768674, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.5859375, + "step": 4880, + "time_per_iteration": 3.8453097343444824 + }, + { + "auxiliary_loss_clip": 0.01081036, + "auxiliary_loss_mlp": 0.01039601, + "balance_loss_clip": 1.023067, + "balance_loss_mlp": 1.02461982, + "epoch": 0.29346159627235835, + "flos": 24351359800320.0, + "grad_norm": 2.006945749269124, + "language_loss": 0.69953066, + "learning_rate": 3.2089373929702542e-06, + "loss": 0.7207371, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.56640625, + "step": 4881, + "time_per_iteration": 2.3981196880340576 + }, + { + "auxiliary_loss_clip": 0.01081542, + "auxiliary_loss_mlp": 0.01039028, + "balance_loss_clip": 1.02225518, + "balance_loss_mlp": 1.02529192, + "epoch": 0.2935217195250263, + "flos": 22745669328000.0, + "grad_norm": 1.553599614974391, + "language_loss": 0.83585513, + "learning_rate": 3.2086363872171344e-06, + "loss": 0.85706079, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 4882, + "time_per_iteration": 2.4001901149749756 + }, + { + "auxiliary_loss_clip": 0.0108388, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.0185287, + "balance_loss_mlp": 1.0260725, + "epoch": 0.2935818427776943, + "flos": 21724494704640.0, + "grad_norm": 2.67324288917706, + "language_loss": 0.71457648, + "learning_rate": 3.208335338330892e-06, + "loss": 0.73577839, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.578125, + "step": 4883, + "time_per_iteration": 3.7564988136291504 + }, + { + "auxiliary_loss_clip": 0.01082902, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.01577342, + "balance_loss_mlp": 1.02634728, + "epoch": 0.29364196603036224, + "flos": 23803851859200.0, + "grad_norm": 3.0229490266153656, + "language_loss": 0.92722136, + "learning_rate": 3.2080342463222693e-06, + "loss": 0.9483794, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.56640625, + "step": 4884, + "time_per_iteration": 2.3886795043945312 + }, + { + "auxiliary_loss_clip": 0.0108505, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.01994467, + "balance_loss_mlp": 1.02779257, + "epoch": 0.2937020892830302, + "flos": 23469139854720.0, + "grad_norm": 2.392878217485229, + "language_loss": 0.73708129, + "learning_rate": 3.207733111202011e-06, + "loss": 0.75829715, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.57421875, + "step": 4885, + "time_per_iteration": 2.3783321380615234 + }, + { + "auxiliary_loss_clip": 0.01080522, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.0132817, + "balance_loss_mlp": 1.02522457, + "epoch": 0.2937622125356982, + "flos": 24271793078400.0, + "grad_norm": 1.8573990103585152, + "language_loss": 0.85225159, + "learning_rate": 3.2074319329808656e-06, + "loss": 0.87335759, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5546875, + "step": 4886, + "time_per_iteration": 2.3859682083129883 + }, + { + "auxiliary_loss_clip": 0.01081257, + "auxiliary_loss_mlp": 0.01034765, + "balance_loss_clip": 1.01786137, + "balance_loss_mlp": 1.02385736, + "epoch": 0.29382233578836614, + "flos": 20661564228480.0, + "grad_norm": 2.32258663063069, + "language_loss": 0.76938081, + "learning_rate": 3.2071307116695803e-06, + "loss": 0.79054105, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.57421875, + "step": 4887, + "time_per_iteration": 2.353215456008911 + }, + { + "auxiliary_loss_clip": 0.01084492, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.01702142, + "balance_loss_mlp": 1.02638662, + "epoch": 0.2938824590410341, + "flos": 16544117013120.0, + "grad_norm": 2.8613383860906425, + "language_loss": 0.79698789, + "learning_rate": 3.2068294472789044e-06, + "loss": 0.81816536, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.58203125, + "step": 4888, + "time_per_iteration": 2.3500609397888184 + }, + { + "auxiliary_loss_clip": 0.01078894, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.01561737, + "balance_loss_mlp": 1.02397156, + "epoch": 0.29394258229370207, + "flos": 37923949975680.0, + "grad_norm": 1.3871355726088788, + "language_loss": 0.55150604, + "learning_rate": 3.20652813981959e-06, + "loss": 0.57261878, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.546875, + "step": 4889, + "time_per_iteration": 2.516327381134033 + }, + { + "auxiliary_loss_clip": 0.0108457, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.01814413, + "balance_loss_mlp": 1.02591062, + "epoch": 0.29400270554637004, + "flos": 20043741075840.0, + "grad_norm": 1.572949638975355, + "language_loss": 0.8448022, + "learning_rate": 3.2062267893023903e-06, + "loss": 0.86601299, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.5859375, + "step": 4890, + "time_per_iteration": 2.388153314590454 + }, + { + "auxiliary_loss_clip": 0.0108447, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.01847041, + "balance_loss_mlp": 1.02654934, + "epoch": 0.294062828799038, + "flos": 15265527868800.0, + "grad_norm": 1.7813438056417468, + "language_loss": 0.72199506, + "learning_rate": 3.205925395738059e-06, + "loss": 0.74319386, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.578125, + "step": 4891, + "time_per_iteration": 2.3356659412384033 + }, + { + "auxiliary_loss_clip": 0.0108306, + "auxiliary_loss_mlp": 0.01035055, + "balance_loss_clip": 1.01757908, + "balance_loss_mlp": 1.02644444, + "epoch": 0.294122952051706, + "flos": 22746053352960.0, + "grad_norm": 1.7578053358519676, + "language_loss": 0.77017832, + "learning_rate": 3.205623959137353e-06, + "loss": 0.79135942, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.56640625, + "step": 4892, + "time_per_iteration": 2.372314691543579 + }, + { + "auxiliary_loss_clip": 0.01080153, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.01487303, + "balance_loss_mlp": 1.02538633, + "epoch": 0.294183075304374, + "flos": 24971731482240.0, + "grad_norm": 1.6844281457659027, + "language_loss": 0.78581607, + "learning_rate": 3.205322479511028e-06, + "loss": 0.8069278, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.546875, + "step": 4893, + "time_per_iteration": 2.3993079662323 + }, + { + "auxiliary_loss_clip": 0.01083619, + "auxiliary_loss_mlp": 0.01038484, + "balance_loss_clip": 1.02188969, + "balance_loss_mlp": 1.02579999, + "epoch": 0.29424319855704195, + "flos": 30951760020480.0, + "grad_norm": 2.0692127805239866, + "language_loss": 0.84711272, + "learning_rate": 3.205020956869845e-06, + "loss": 0.86833376, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.578125, + "step": 4894, + "time_per_iteration": 2.4666411876678467 + }, + { + "auxiliary_loss_clip": 0.01081782, + "auxiliary_loss_mlp": 0.01026289, + "balance_loss_clip": 1.0100764, + "balance_loss_mlp": 1.02415371, + "epoch": 0.2943033218097099, + "flos": 15230684465280.0, + "grad_norm": 2.3165119224286936, + "language_loss": 0.9101001, + "learning_rate": 3.204719391224563e-06, + "loss": 0.93118083, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.578125, + "step": 4895, + "time_per_iteration": 2.3518359661102295 + }, + { + "auxiliary_loss_clip": 0.01083902, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.02205598, + "balance_loss_mlp": 1.02557957, + "epoch": 0.2943634450623779, + "flos": 21724808906880.0, + "grad_norm": 2.2359087926147607, + "language_loss": 0.86197579, + "learning_rate": 3.2044177825859457e-06, + "loss": 0.88321245, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5859375, + "step": 4896, + "time_per_iteration": 2.3923354148864746 + }, + { + "auxiliary_loss_clip": 0.01085484, + "auxiliary_loss_mlp": 0.01041251, + "balance_loss_clip": 1.02265477, + "balance_loss_mlp": 1.02733314, + "epoch": 0.29442356831504585, + "flos": 22600989187200.0, + "grad_norm": 1.688096675929124, + "language_loss": 0.73318756, + "learning_rate": 3.2041161309647555e-06, + "loss": 0.75445491, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.58203125, + "step": 4897, + "time_per_iteration": 2.3941261768341064 + }, + { + "auxiliary_loss_clip": 0.01086413, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.01761472, + "balance_loss_mlp": 1.02478504, + "epoch": 0.2944836915677138, + "flos": 20010363949440.0, + "grad_norm": 1.9960406042886065, + "language_loss": 0.73861003, + "learning_rate": 3.2038144363717572e-06, + "loss": 0.7598418, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.6171875, + "step": 4898, + "time_per_iteration": 2.369438409805298 + }, + { + "auxiliary_loss_clip": 0.01089625, + "auxiliary_loss_mlp": 0.01042093, + "balance_loss_clip": 1.0222919, + "balance_loss_mlp": 1.02782011, + "epoch": 0.2945438148203818, + "flos": 20044893150720.0, + "grad_norm": 3.701299878786074, + "language_loss": 0.70102954, + "learning_rate": 3.203512698817719e-06, + "loss": 0.72234678, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.6171875, + "step": 4899, + "time_per_iteration": 2.359187364578247 + }, + { + "auxiliary_loss_clip": 0.01084016, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.02347994, + "balance_loss_mlp": 1.02632058, + "epoch": 0.29460393807304974, + "flos": 23732384572800.0, + "grad_norm": 1.9553479039142885, + "language_loss": 0.78914893, + "learning_rate": 3.2032109183134086e-06, + "loss": 0.81040412, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.578125, + "step": 4900, + "time_per_iteration": 2.406459093093872 + }, + { + "auxiliary_loss_clip": 0.01081344, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.01802444, + "balance_loss_mlp": 1.02383006, + "epoch": 0.2946640613257177, + "flos": 14975190069120.0, + "grad_norm": 1.6438964898485167, + "language_loss": 0.80501366, + "learning_rate": 3.202909094869595e-06, + "loss": 0.82618064, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.57421875, + "step": 4901, + "time_per_iteration": 2.340542793273926 + }, + { + "auxiliary_loss_clip": 0.01076804, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.01307261, + "balance_loss_mlp": 1.02381194, + "epoch": 0.2947241845783857, + "flos": 24242744960640.0, + "grad_norm": 2.422611514284392, + "language_loss": 0.57843292, + "learning_rate": 3.2026072284970504e-06, + "loss": 0.59949273, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53125, + "step": 4902, + "time_per_iteration": 2.4023821353912354 + }, + { + "auxiliary_loss_clip": 0.01083378, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.0166508, + "balance_loss_mlp": 1.02534986, + "epoch": 0.29478430783105364, + "flos": 19937360563200.0, + "grad_norm": 1.7167352084112582, + "language_loss": 0.75626671, + "learning_rate": 3.202305319206547e-06, + "loss": 0.77742743, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.578125, + "step": 4903, + "time_per_iteration": 2.391801595687866 + }, + { + "auxiliary_loss_clip": 0.01084565, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.02301753, + "balance_loss_mlp": 1.02632689, + "epoch": 0.2948444310837216, + "flos": 27380110089600.0, + "grad_norm": 2.8099633961081496, + "language_loss": 0.61930472, + "learning_rate": 3.20200336700886e-06, + "loss": 0.64055777, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.58203125, + "step": 4904, + "time_per_iteration": 2.4181010723114014 + }, + { + "auxiliary_loss_clip": 0.01083676, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.01679766, + "balance_loss_mlp": 1.02543235, + "epoch": 0.2949045543363896, + "flos": 23404305726720.0, + "grad_norm": 1.8502228578174662, + "language_loss": 0.73049009, + "learning_rate": 3.2017013719147644e-06, + "loss": 0.75166583, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.58203125, + "step": 4905, + "time_per_iteration": 2.398667812347412 + }, + { + "auxiliary_loss_clip": 0.01081329, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.01653099, + "balance_loss_mlp": 1.02478731, + "epoch": 0.2949646775890576, + "flos": 23950347390720.0, + "grad_norm": 1.7390085340468906, + "language_loss": 0.81068105, + "learning_rate": 3.201399333935038e-06, + "loss": 0.831833, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.56640625, + "step": 4906, + "time_per_iteration": 2.3977715969085693 + }, + { + "auxiliary_loss_clip": 0.01078128, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.01258135, + "balance_loss_mlp": 1.02443981, + "epoch": 0.29502480084172555, + "flos": 22783200906240.0, + "grad_norm": 2.603466052185471, + "language_loss": 0.87530965, + "learning_rate": 3.2010972530804595e-06, + "loss": 0.89637673, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 4907, + "time_per_iteration": 2.409848690032959 + }, + { + "auxiliary_loss_clip": 0.0108497, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01394343, + "balance_loss_mlp": 1.02614701, + "epoch": 0.2950849240943935, + "flos": 19645626309120.0, + "grad_norm": 2.102808847733714, + "language_loss": 0.82179803, + "learning_rate": 3.20079512936181e-06, + "loss": 0.84297311, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.5859375, + "step": 4908, + "time_per_iteration": 2.368774890899658 + }, + { + "auxiliary_loss_clip": 0.0101795, + "auxiliary_loss_mlp": 0.01013677, + "balance_loss_clip": 1.01192486, + "balance_loss_mlp": 1.00435793, + "epoch": 0.2951450473470615, + "flos": 70999790469120.0, + "grad_norm": 0.7751992544529333, + "language_loss": 0.57288802, + "learning_rate": 3.2004929627898707e-06, + "loss": 0.59320438, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.13574219, + "step": 4909, + "time_per_iteration": 2.909693479537964 + }, + { + "auxiliary_loss_clip": 0.01083434, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.01845741, + "balance_loss_mlp": 1.0268054, + "epoch": 0.29520517059972945, + "flos": 22965203157120.0, + "grad_norm": 1.615802224826704, + "language_loss": 0.86681747, + "learning_rate": 3.200190753375426e-06, + "loss": 0.88799345, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.56640625, + "step": 4910, + "time_per_iteration": 2.388585090637207 + }, + { + "auxiliary_loss_clip": 0.01077096, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.01917529, + "balance_loss_mlp": 1.02380323, + "epoch": 0.2952652938523974, + "flos": 20484624124800.0, + "grad_norm": 1.951268358274521, + "language_loss": 0.72797281, + "learning_rate": 3.1998885011292604e-06, + "loss": 0.74908936, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53515625, + "step": 4911, + "time_per_iteration": 2.3944482803344727 + }, + { + "auxiliary_loss_clip": 0.01082167, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.01252866, + "balance_loss_mlp": 1.02685213, + "epoch": 0.2953254171050654, + "flos": 19645556486400.0, + "grad_norm": 1.6875099426125348, + "language_loss": 0.90211958, + "learning_rate": 3.199586206062161e-06, + "loss": 0.9232285, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5546875, + "step": 4912, + "time_per_iteration": 2.3709516525268555 + }, + { + "auxiliary_loss_clip": 0.01082796, + "auxiliary_loss_mlp": 0.01041035, + "balance_loss_clip": 1.02349925, + "balance_loss_mlp": 1.02643323, + "epoch": 0.29538554035773334, + "flos": 22746856314240.0, + "grad_norm": 1.3142472597057169, + "language_loss": 0.83268452, + "learning_rate": 3.1992838681849153e-06, + "loss": 0.85392284, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5625, + "step": 4913, + "time_per_iteration": 3.7645251750946045 + }, + { + "auxiliary_loss_clip": 0.01083181, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.01692128, + "balance_loss_mlp": 1.02522612, + "epoch": 0.2954456636104013, + "flos": 21870780768000.0, + "grad_norm": 1.6816325869538873, + "language_loss": 0.76519728, + "learning_rate": 3.1989814875083134e-06, + "loss": 0.7863636, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.578125, + "step": 4914, + "time_per_iteration": 2.385406017303467 + }, + { + "auxiliary_loss_clip": 0.01080662, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.01528645, + "balance_loss_mlp": 1.02527452, + "epoch": 0.2955057868630693, + "flos": 40440978334080.0, + "grad_norm": 1.7104443636612858, + "language_loss": 0.8157649, + "learning_rate": 3.198679064043146e-06, + "loss": 0.83689535, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5546875, + "step": 4915, + "time_per_iteration": 2.5420663356781006 + }, + { + "auxiliary_loss_clip": 0.01083894, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.01720047, + "balance_loss_mlp": 1.02655172, + "epoch": 0.29556591011573724, + "flos": 22563422697600.0, + "grad_norm": 1.9577696244712204, + "language_loss": 0.84652781, + "learning_rate": 3.1983765978002067e-06, + "loss": 0.86770213, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.57421875, + "step": 4916, + "time_per_iteration": 3.798715114593506 + }, + { + "auxiliary_loss_clip": 0.01076681, + "auxiliary_loss_mlp": 0.01027232, + "balance_loss_clip": 1.01163971, + "balance_loss_mlp": 1.023875, + "epoch": 0.2956260333684052, + "flos": 22088254826880.0, + "grad_norm": 2.022261172999719, + "language_loss": 0.70051736, + "learning_rate": 3.198074088790289e-06, + "loss": 0.72155643, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 4917, + "time_per_iteration": 2.3779242038726807 + }, + { + "auxiliary_loss_clip": 0.01083227, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.01427794, + "balance_loss_mlp": 1.02686203, + "epoch": 0.2956861566210732, + "flos": 16434559566720.0, + "grad_norm": 2.12124537751327, + "language_loss": 0.90761769, + "learning_rate": 3.197771537024189e-06, + "loss": 0.92876041, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 4918, + "time_per_iteration": 2.363657236099243 + }, + { + "auxiliary_loss_clip": 0.01082568, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.0171864, + "balance_loss_mlp": 1.02558792, + "epoch": 0.2957462798737412, + "flos": 25810903854720.0, + "grad_norm": 1.9382986504832316, + "language_loss": 0.72297627, + "learning_rate": 3.197468942512703e-06, + "loss": 0.74414045, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5703125, + "step": 4919, + "time_per_iteration": 2.4097585678100586 + }, + { + "auxiliary_loss_clip": 0.01079496, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.0193845, + "balance_loss_mlp": 1.02410746, + "epoch": 0.29580640312640916, + "flos": 16689914317440.0, + "grad_norm": 2.188192651828881, + "language_loss": 0.75942761, + "learning_rate": 3.1971663052666317e-06, + "loss": 0.78057921, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5546875, + "step": 4920, + "time_per_iteration": 3.7057127952575684 + }, + { + "auxiliary_loss_clip": 0.01083026, + "auxiliary_loss_mlp": 0.01038267, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.02663875, + "epoch": 0.2958665263790771, + "flos": 23944621927680.0, + "grad_norm": 2.154003675044161, + "language_loss": 0.68290514, + "learning_rate": 3.196863625296775e-06, + "loss": 0.70411807, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 4921, + "time_per_iteration": 2.4013686180114746 + }, + { + "auxiliary_loss_clip": 0.01084788, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.0195272, + "balance_loss_mlp": 1.02576828, + "epoch": 0.2959266496317451, + "flos": 18477432483840.0, + "grad_norm": 2.072372345309396, + "language_loss": 0.74815679, + "learning_rate": 3.1965609026139327e-06, + "loss": 0.76937425, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.58984375, + "step": 4922, + "time_per_iteration": 2.362406015396118 + }, + { + "auxiliary_loss_clip": 0.01084972, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.01662803, + "balance_loss_mlp": 1.02432787, + "epoch": 0.29598677288441305, + "flos": 25956317134080.0, + "grad_norm": 2.026167419747763, + "language_loss": 0.76809484, + "learning_rate": 3.1962581372289105e-06, + "loss": 0.78929377, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.60546875, + "step": 4923, + "time_per_iteration": 3.867509365081787 + }, + { + "auxiliary_loss_clip": 0.01083088, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.01955867, + "balance_loss_mlp": 1.02602458, + "epoch": 0.296046896137081, + "flos": 25154815985280.0, + "grad_norm": 2.5533045242335186, + "language_loss": 0.8641305, + "learning_rate": 3.195955329152512e-06, + "loss": 0.88532495, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 4924, + "time_per_iteration": 2.3980345726013184 + }, + { + "auxiliary_loss_clip": 0.01081054, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.01644504, + "balance_loss_mlp": 1.02528882, + "epoch": 0.296107019389749, + "flos": 21760106158080.0, + "grad_norm": 1.65516296503997, + "language_loss": 0.81541371, + "learning_rate": 3.1956524783955453e-06, + "loss": 0.8365618, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.55859375, + "step": 4925, + "time_per_iteration": 2.4050099849700928 + }, + { + "auxiliary_loss_clip": 0.01079202, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.01807833, + "balance_loss_mlp": 1.02453804, + "epoch": 0.29616714264241695, + "flos": 17959286862720.0, + "grad_norm": 2.4937253093920844, + "language_loss": 0.85965389, + "learning_rate": 3.195349584968816e-06, + "loss": 0.88078582, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.546875, + "step": 4926, + "time_per_iteration": 2.3361332416534424 + }, + { + "auxiliary_loss_clip": 0.01080665, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.01423955, + "balance_loss_mlp": 1.02393651, + "epoch": 0.2962272658950849, + "flos": 15011883774720.0, + "grad_norm": 1.7826865633422024, + "language_loss": 0.85789901, + "learning_rate": 3.1950466488831357e-06, + "loss": 0.87901199, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.56640625, + "step": 4927, + "time_per_iteration": 2.355961322784424 + }, + { + "auxiliary_loss_clip": 0.01079876, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.01846313, + "balance_loss_mlp": 1.02535462, + "epoch": 0.2962873891477529, + "flos": 14719974963840.0, + "grad_norm": 1.7001593708258933, + "language_loss": 0.80577832, + "learning_rate": 3.194743670149314e-06, + "loss": 0.82692057, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.546875, + "step": 4928, + "time_per_iteration": 2.3352510929107666 + }, + { + "auxiliary_loss_clip": 0.01088138, + "auxiliary_loss_mlp": 0.01038856, + "balance_loss_clip": 1.0184716, + "balance_loss_mlp": 1.02639675, + "epoch": 0.29634751240042084, + "flos": 26722590854400.0, + "grad_norm": 2.3744074087895477, + "language_loss": 0.72309142, + "learning_rate": 3.194440648778164e-06, + "loss": 0.7443614, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.6171875, + "step": 4929, + "time_per_iteration": 2.4388341903686523 + }, + { + "auxiliary_loss_clip": 0.0108588, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.01866865, + "balance_loss_mlp": 1.02645302, + "epoch": 0.2964076356530888, + "flos": 14570511966720.0, + "grad_norm": 4.212360454262512, + "language_loss": 0.71932477, + "learning_rate": 3.1941375847805e-06, + "loss": 0.74055111, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.59375, + "step": 4930, + "time_per_iteration": 2.3521056175231934 + }, + { + "auxiliary_loss_clip": 0.01016475, + "auxiliary_loss_mlp": 0.01000342, + "balance_loss_clip": 0.99885148, + "balance_loss_mlp": 1.00428259, + "epoch": 0.29646775890575683, + "flos": 63100969585920.0, + "grad_norm": 1.3993840343409092, + "language_loss": 0.60672355, + "learning_rate": 3.193834478167137e-06, + "loss": 0.62689161, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.01489258, + "router_z_loss_mlp": 0.12207031, + "step": 4931, + "time_per_iteration": 2.992666482925415 + }, + { + "auxiliary_loss_clip": 0.0101594, + "auxiliary_loss_mlp": 0.0100598, + "balance_loss_clip": 1.00443029, + "balance_loss_mlp": 1.00372422, + "epoch": 0.2965278821584248, + "flos": 63064345703040.0, + "grad_norm": 0.735726169862356, + "language_loss": 0.52304494, + "learning_rate": 3.1935313289488926e-06, + "loss": 0.54326415, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.12207031, + "step": 4932, + "time_per_iteration": 2.917015552520752 + }, + { + "auxiliary_loss_clip": 0.01081801, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.01920736, + "balance_loss_mlp": 1.02534056, + "epoch": 0.29658800541109276, + "flos": 23767612001280.0, + "grad_norm": 1.6280007898936617, + "language_loss": 0.81764573, + "learning_rate": 3.193228137136585e-06, + "loss": 0.83882314, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 4933, + "time_per_iteration": 2.432817220687866 + }, + { + "auxiliary_loss_clip": 0.01080823, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.01380575, + "balance_loss_mlp": 1.02674389, + "epoch": 0.2966481286637607, + "flos": 23987390209920.0, + "grad_norm": 1.66283188878693, + "language_loss": 0.75357807, + "learning_rate": 3.1929249027410347e-06, + "loss": 0.77468657, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5390625, + "step": 4934, + "time_per_iteration": 2.425346851348877 + }, + { + "auxiliary_loss_clip": 0.0108314, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.02037764, + "balance_loss_mlp": 1.02552247, + "epoch": 0.2967082519164287, + "flos": 17164209404160.0, + "grad_norm": 1.893474981864153, + "language_loss": 0.62201482, + "learning_rate": 3.1926216257730634e-06, + "loss": 0.64322567, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.578125, + "step": 4935, + "time_per_iteration": 2.40461802482605 + }, + { + "auxiliary_loss_clip": 0.01081253, + "auxiliary_loss_mlp": 0.0103651, + "balance_loss_clip": 1.01964164, + "balance_loss_mlp": 1.02624869, + "epoch": 0.29676837516909665, + "flos": 29386428946560.0, + "grad_norm": 1.4360867525035652, + "language_loss": 0.79570103, + "learning_rate": 3.1923183062434936e-06, + "loss": 0.81687868, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.55078125, + "step": 4936, + "time_per_iteration": 2.450254440307617 + }, + { + "auxiliary_loss_clip": 0.01085813, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.0218811, + "balance_loss_mlp": 1.02724814, + "epoch": 0.2968284984217646, + "flos": 34749786407040.0, + "grad_norm": 1.6828314150332218, + "language_loss": 0.74293697, + "learning_rate": 3.1920149441631505e-06, + "loss": 0.76418757, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5859375, + "step": 4937, + "time_per_iteration": 2.522477388381958 + }, + { + "auxiliary_loss_clip": 0.01081524, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.01664257, + "balance_loss_mlp": 1.02645683, + "epoch": 0.2968886216744326, + "flos": 21543016124160.0, + "grad_norm": 1.5303565576356226, + "language_loss": 0.78013259, + "learning_rate": 3.1917115395428608e-06, + "loss": 0.80128628, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.55078125, + "step": 4938, + "time_per_iteration": 2.3948261737823486 + }, + { + "auxiliary_loss_clip": 0.01087227, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.02297115, + "balance_loss_mlp": 1.028741, + "epoch": 0.29694874492710055, + "flos": 12786484936320.0, + "grad_norm": 2.4509817098247697, + "language_loss": 0.76497996, + "learning_rate": 3.191408092393451e-06, + "loss": 0.78625906, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.58203125, + "step": 4939, + "time_per_iteration": 2.3792197704315186 + }, + { + "auxiliary_loss_clip": 0.01087285, + "auxiliary_loss_mlp": 0.0103921, + "balance_loss_clip": 1.02199006, + "balance_loss_mlp": 1.02810097, + "epoch": 0.2970088681797685, + "flos": 24568868770560.0, + "grad_norm": 1.482221350598616, + "language_loss": 0.77775824, + "learning_rate": 3.1911046027257516e-06, + "loss": 0.79902315, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.59375, + "step": 4940, + "time_per_iteration": 2.4061083793640137 + }, + { + "auxiliary_loss_clip": 0.01084323, + "auxiliary_loss_mlp": 0.01033726, + "balance_loss_clip": 1.01455736, + "balance_loss_mlp": 1.02702928, + "epoch": 0.2970689914324365, + "flos": 23658054554880.0, + "grad_norm": 1.5153718585456948, + "language_loss": 0.75121921, + "learning_rate": 3.1908010705505925e-06, + "loss": 0.77239972, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.57421875, + "step": 4941, + "time_per_iteration": 2.4436583518981934 + }, + { + "auxiliary_loss_clip": 0.01086834, + "auxiliary_loss_mlp": 0.01039896, + "balance_loss_clip": 1.02140713, + "balance_loss_mlp": 1.02710521, + "epoch": 0.29712911468510445, + "flos": 39668909328000.0, + "grad_norm": 24.137535795859353, + "language_loss": 0.74060488, + "learning_rate": 3.1904974958788065e-06, + "loss": 0.76187223, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.59765625, + "step": 4942, + "time_per_iteration": 2.5331871509552 + }, + { + "auxiliary_loss_clip": 0.01086437, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.02030301, + "balance_loss_mlp": 1.02802205, + "epoch": 0.2971892379377724, + "flos": 26394127983360.0, + "grad_norm": 2.4533612845299944, + "language_loss": 0.70337939, + "learning_rate": 3.190193878721227e-06, + "loss": 0.72463441, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.5859375, + "step": 4943, + "time_per_iteration": 2.4478604793548584 + }, + { + "auxiliary_loss_clip": 0.01087369, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.01417589, + "balance_loss_mlp": 1.02815938, + "epoch": 0.2972493611904404, + "flos": 17602229721600.0, + "grad_norm": 2.1677638237566024, + "language_loss": 0.79639876, + "learning_rate": 3.1898902190886898e-06, + "loss": 0.81759542, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.59375, + "step": 4944, + "time_per_iteration": 2.354905128479004 + }, + { + "auxiliary_loss_clip": 0.01079669, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.01638031, + "balance_loss_mlp": 1.02533865, + "epoch": 0.2973094844431084, + "flos": 20411725472640.0, + "grad_norm": 1.9256585896864484, + "language_loss": 0.82982606, + "learning_rate": 3.1895865169920316e-06, + "loss": 0.85095286, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.54296875, + "step": 4945, + "time_per_iteration": 2.3946311473846436 + }, + { + "auxiliary_loss_clip": 0.0107914, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.01939237, + "balance_loss_mlp": 1.02460909, + "epoch": 0.29736960769577636, + "flos": 17492532629760.0, + "grad_norm": 1.7483272929882534, + "language_loss": 0.66412324, + "learning_rate": 3.18928277244209e-06, + "loss": 0.68527257, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 4946, + "time_per_iteration": 2.355820655822754 + }, + { + "auxiliary_loss_clip": 0.01083842, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.01539731, + "balance_loss_mlp": 1.02757418, + "epoch": 0.2974297309484443, + "flos": 26102777754240.0, + "grad_norm": 1.6484372382581192, + "language_loss": 0.73916656, + "learning_rate": 3.1889789854497052e-06, + "loss": 0.76032609, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5625, + "step": 4947, + "time_per_iteration": 2.455073118209839 + }, + { + "auxiliary_loss_clip": 0.01085889, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.0185616, + "balance_loss_mlp": 1.02581787, + "epoch": 0.2974898542011123, + "flos": 25665246195840.0, + "grad_norm": 2.4214897349304167, + "language_loss": 0.7344296, + "learning_rate": 3.188675156025719e-06, + "loss": 0.75564855, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.6015625, + "step": 4948, + "time_per_iteration": 2.4111666679382324 + }, + { + "auxiliary_loss_clip": 0.01080541, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.01465368, + "balance_loss_mlp": 1.02533805, + "epoch": 0.29754997745378026, + "flos": 18660342430080.0, + "grad_norm": 2.0166661377262405, + "language_loss": 0.83489668, + "learning_rate": 3.1883712841809752e-06, + "loss": 0.85600924, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55078125, + "step": 4949, + "time_per_iteration": 2.40474009513855 + }, + { + "auxiliary_loss_clip": 0.01079151, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.01442552, + "balance_loss_mlp": 1.02464867, + "epoch": 0.2976101007064482, + "flos": 22273468922880.0, + "grad_norm": 2.039484776956203, + "language_loss": 0.71246374, + "learning_rate": 3.188067369926316e-06, + "loss": 0.73357439, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.54296875, + "step": 4950, + "time_per_iteration": 2.399895429611206 + }, + { + "auxiliary_loss_clip": 0.01079742, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.01941013, + "balance_loss_mlp": 1.02599514, + "epoch": 0.2976702239591162, + "flos": 21944552204160.0, + "grad_norm": 1.875904429409681, + "language_loss": 0.82162273, + "learning_rate": 3.1877634132725887e-06, + "loss": 0.84276927, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53515625, + "step": 4951, + "time_per_iteration": 2.405880928039551 + }, + { + "auxiliary_loss_clip": 0.01079854, + "auxiliary_loss_mlp": 0.01030158, + "balance_loss_clip": 1.01366544, + "balance_loss_mlp": 1.02419114, + "epoch": 0.29773034721178415, + "flos": 24636251427840.0, + "grad_norm": 2.4846012778229496, + "language_loss": 0.74077445, + "learning_rate": 3.187459414230641e-06, + "loss": 0.76187456, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5546875, + "step": 4952, + "time_per_iteration": 2.4007833003997803 + }, + { + "auxiliary_loss_clip": 0.01082459, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.01856911, + "balance_loss_mlp": 1.02654946, + "epoch": 0.2977904704644521, + "flos": 20556545258880.0, + "grad_norm": 1.8650635284884682, + "language_loss": 0.84202546, + "learning_rate": 3.187155372811321e-06, + "loss": 0.86321777, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.55859375, + "step": 4953, + "time_per_iteration": 3.8723080158233643 + }, + { + "auxiliary_loss_clip": 0.01079685, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.01896596, + "balance_loss_mlp": 1.0242126, + "epoch": 0.2978505937171201, + "flos": 18915452801280.0, + "grad_norm": 1.9214609579710038, + "language_loss": 0.73884964, + "learning_rate": 3.186851289025479e-06, + "loss": 0.75999683, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5546875, + "step": 4954, + "time_per_iteration": 2.381962776184082 + }, + { + "auxiliary_loss_clip": 0.01079855, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.0141294, + "balance_loss_mlp": 1.02533495, + "epoch": 0.29791071696978805, + "flos": 19316744501760.0, + "grad_norm": 2.031766591644584, + "language_loss": 0.75790274, + "learning_rate": 3.186547162883968e-06, + "loss": 0.77900517, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.546875, + "step": 4955, + "time_per_iteration": 2.3933565616607666 + }, + { + "auxiliary_loss_clip": 0.0108229, + "auxiliary_loss_mlp": 0.01033677, + "balance_loss_clip": 1.01619482, + "balance_loss_mlp": 1.02537096, + "epoch": 0.297970840222456, + "flos": 18805825532160.0, + "grad_norm": 1.6318311261163168, + "language_loss": 0.72451949, + "learning_rate": 3.1862429943976404e-06, + "loss": 0.74567914, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5703125, + "step": 4956, + "time_per_iteration": 3.7838122844696045 + }, + { + "auxiliary_loss_clip": 0.01083506, + "auxiliary_loss_mlp": 0.01043598, + "balance_loss_clip": 1.02627647, + "balance_loss_mlp": 1.02395868, + "epoch": 0.298030963475124, + "flos": 22851770549760.0, + "grad_norm": 4.2951227468985165, + "language_loss": 0.7899521, + "learning_rate": 3.1859387835773525e-06, + "loss": 0.81122315, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.59765625, + "step": 4957, + "time_per_iteration": 2.4262795448303223 + }, + { + "auxiliary_loss_clip": 0.01080574, + "auxiliary_loss_mlp": 0.01036749, + "balance_loss_clip": 1.01974988, + "balance_loss_mlp": 1.02406991, + "epoch": 0.298091086727792, + "flos": 21867499100160.0, + "grad_norm": 1.5023480559888165, + "language_loss": 0.70402986, + "learning_rate": 3.1856345304339593e-06, + "loss": 0.72520304, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5625, + "step": 4958, + "time_per_iteration": 2.400470733642578 + }, + { + "auxiliary_loss_clip": 0.01082169, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.01923442, + "balance_loss_mlp": 1.02732944, + "epoch": 0.29815120998045996, + "flos": 21174054209280.0, + "grad_norm": 1.6193798431206679, + "language_loss": 0.78985393, + "learning_rate": 3.1853302349783197e-06, + "loss": 0.81102812, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55078125, + "step": 4959, + "time_per_iteration": 3.7395496368408203 + }, + { + "auxiliary_loss_clip": 0.01077656, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.01747179, + "balance_loss_mlp": 1.02345788, + "epoch": 0.29821133323312793, + "flos": 19895395242240.0, + "grad_norm": 1.8592494653388847, + "language_loss": 0.79433644, + "learning_rate": 3.185025897221293e-06, + "loss": 0.81544769, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.54296875, + "step": 4960, + "time_per_iteration": 2.3856394290924072 + }, + { + "auxiliary_loss_clip": 0.01081901, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.01484823, + "balance_loss_mlp": 1.02502084, + "epoch": 0.2982714564857959, + "flos": 12749930876160.0, + "grad_norm": 2.561922803602244, + "language_loss": 0.74007982, + "learning_rate": 3.1847215171737406e-06, + "loss": 0.76122361, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5703125, + "step": 4961, + "time_per_iteration": 2.34979510307312 + }, + { + "auxiliary_loss_clip": 0.01079736, + "auxiliary_loss_mlp": 0.01027939, + "balance_loss_clip": 1.01259637, + "balance_loss_mlp": 1.02538598, + "epoch": 0.29833157973846386, + "flos": 22270850570880.0, + "grad_norm": 1.7412902746459413, + "language_loss": 0.62228787, + "learning_rate": 3.1844170948465246e-06, + "loss": 0.64336461, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.54296875, + "step": 4962, + "time_per_iteration": 3.761460542678833 + }, + { + "auxiliary_loss_clip": 0.01080534, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02429366, + "balance_loss_mlp": 1.02536082, + "epoch": 0.2983917029911318, + "flos": 15372222583680.0, + "grad_norm": 1.8940844573085047, + "language_loss": 0.83450472, + "learning_rate": 3.184112630250509e-06, + "loss": 0.85573542, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.55078125, + "step": 4963, + "time_per_iteration": 2.3452987670898438 + }, + { + "auxiliary_loss_clip": 0.01081522, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.01291764, + "balance_loss_mlp": 1.02615285, + "epoch": 0.2984518262437998, + "flos": 15376726149120.0, + "grad_norm": 2.2477371080255995, + "language_loss": 0.66339022, + "learning_rate": 3.1838081233965595e-06, + "loss": 0.68450201, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5546875, + "step": 4964, + "time_per_iteration": 2.354438543319702 + }, + { + "auxiliary_loss_clip": 0.01078215, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.01223254, + "balance_loss_mlp": 1.0245465, + "epoch": 0.29851194949646775, + "flos": 18107632696320.0, + "grad_norm": 1.7220333178811484, + "language_loss": 0.71495241, + "learning_rate": 3.1835035742955435e-06, + "loss": 0.73601019, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.53515625, + "step": 4965, + "time_per_iteration": 2.3654918670654297 + }, + { + "auxiliary_loss_clip": 0.01084846, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.01536155, + "balance_loss_mlp": 1.02781796, + "epoch": 0.2985720727491357, + "flos": 22017136654080.0, + "grad_norm": 1.788455229308673, + "language_loss": 0.66098297, + "learning_rate": 3.1831989829583286e-06, + "loss": 0.68215483, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5703125, + "step": 4966, + "time_per_iteration": 2.3892927169799805 + }, + { + "auxiliary_loss_clip": 0.01085005, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.01940417, + "balance_loss_mlp": 1.0272212, + "epoch": 0.2986321960018037, + "flos": 13040547966720.0, + "grad_norm": 2.4757151727883597, + "language_loss": 0.74111062, + "learning_rate": 3.182894349395787e-06, + "loss": 0.76232123, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.578125, + "step": 4967, + "time_per_iteration": 2.3399319648742676 + }, + { + "auxiliary_loss_clip": 0.01079334, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.01384878, + "balance_loss_mlp": 1.02384973, + "epoch": 0.29869231925447165, + "flos": 14464166365440.0, + "grad_norm": 1.9589136430162541, + "language_loss": 0.71396685, + "learning_rate": 3.1825896736187876e-06, + "loss": 0.73505759, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5546875, + "step": 4968, + "time_per_iteration": 2.347165107727051 + }, + { + "auxiliary_loss_clip": 0.01080485, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.01346445, + "balance_loss_mlp": 1.02313113, + "epoch": 0.2987524425071396, + "flos": 31648870604160.0, + "grad_norm": 1.7095628584243794, + "language_loss": 0.76583636, + "learning_rate": 3.182284955638205e-06, + "loss": 0.78694969, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.57421875, + "step": 4969, + "time_per_iteration": 2.4576480388641357 + }, + { + "auxiliary_loss_clip": 0.01081117, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01413393, + "balance_loss_mlp": 1.02602363, + "epoch": 0.2988125657598076, + "flos": 21432376425600.0, + "grad_norm": 1.7152626049217727, + "language_loss": 0.6997999, + "learning_rate": 3.181980195464913e-06, + "loss": 0.72090966, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.55078125, + "step": 4970, + "time_per_iteration": 2.4025802612304688 + }, + { + "auxiliary_loss_clip": 0.01082983, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.01765001, + "balance_loss_mlp": 1.02405429, + "epoch": 0.2988726890124756, + "flos": 18076001137920.0, + "grad_norm": 2.1080221485636597, + "language_loss": 0.85170591, + "learning_rate": 3.1816753931097894e-06, + "loss": 0.87289882, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.58984375, + "step": 4971, + "time_per_iteration": 2.345571756362915 + }, + { + "auxiliary_loss_clip": 0.01077978, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.0189805, + "balance_loss_mlp": 1.02405381, + "epoch": 0.29893281226514357, + "flos": 21754764720000.0, + "grad_norm": 2.1463429817387962, + "language_loss": 0.79577583, + "learning_rate": 3.1813705485837095e-06, + "loss": 0.81691372, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5390625, + "step": 4972, + "time_per_iteration": 2.3881871700286865 + }, + { + "auxiliary_loss_clip": 0.01081858, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.01960325, + "balance_loss_mlp": 1.02570879, + "epoch": 0.29899293551781153, + "flos": 16835781444480.0, + "grad_norm": 1.9887407082693163, + "language_loss": 0.85172081, + "learning_rate": 3.1810656618975544e-06, + "loss": 0.87290978, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 4973, + "time_per_iteration": 2.343738555908203 + }, + { + "auxiliary_loss_clip": 0.01078965, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.02296674, + "balance_loss_mlp": 1.02659976, + "epoch": 0.2990530587704795, + "flos": 11728407139200.0, + "grad_norm": 1.5730562905210672, + "language_loss": 0.77422863, + "learning_rate": 3.180760733062204e-06, + "loss": 0.7954067, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5234375, + "step": 4974, + "time_per_iteration": 2.3843629360198975 + }, + { + "auxiliary_loss_clip": 0.01082426, + "auxiliary_loss_mlp": 0.01035694, + "balance_loss_clip": 1.01911223, + "balance_loss_mlp": 1.0247128, + "epoch": 0.29911318202314746, + "flos": 28038571931520.0, + "grad_norm": 1.7122826057419647, + "language_loss": 0.7228446, + "learning_rate": 3.1804557620885396e-06, + "loss": 0.74402583, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.578125, + "step": 4975, + "time_per_iteration": 2.4561450481414795 + }, + { + "auxiliary_loss_clip": 0.01081795, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.0148015, + "balance_loss_mlp": 1.02685761, + "epoch": 0.2991733052758154, + "flos": 18732577766400.0, + "grad_norm": 2.1044543390603736, + "language_loss": 0.75725859, + "learning_rate": 3.1801507489874453e-06, + "loss": 0.778395, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.55078125, + "step": 4976, + "time_per_iteration": 2.365079879760742 + }, + { + "auxiliary_loss_clip": 0.01080687, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.01796412, + "balance_loss_mlp": 1.02608037, + "epoch": 0.2992334285284834, + "flos": 15558274552320.0, + "grad_norm": 2.142849305216903, + "language_loss": 0.86118251, + "learning_rate": 3.1798456937698073e-06, + "loss": 0.88232917, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 4977, + "time_per_iteration": 2.33827805519104 + }, + { + "auxiliary_loss_clip": 0.01082456, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.01756597, + "balance_loss_mlp": 1.02657115, + "epoch": 0.29929355178115136, + "flos": 21796520572800.0, + "grad_norm": 1.6709021316852026, + "language_loss": 0.68248498, + "learning_rate": 3.1795405964465114e-06, + "loss": 0.70365196, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.55859375, + "step": 4978, + "time_per_iteration": 2.3888306617736816 + }, + { + "auxiliary_loss_clip": 0.01081182, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.01507092, + "balance_loss_mlp": 1.02576137, + "epoch": 0.2993536750338193, + "flos": 21177475522560.0, + "grad_norm": 2.7338189545992795, + "language_loss": 0.75924754, + "learning_rate": 3.1792354570284452e-06, + "loss": 0.78038073, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5546875, + "step": 4979, + "time_per_iteration": 2.379936695098877 + }, + { + "auxiliary_loss_clip": 0.01080065, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.01705599, + "balance_loss_mlp": 1.02382302, + "epoch": 0.2994137982864873, + "flos": 32120826629760.0, + "grad_norm": 1.655006270921756, + "language_loss": 0.68303317, + "learning_rate": 3.1789302755264996e-06, + "loss": 0.70417726, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.5625, + "step": 4980, + "time_per_iteration": 2.459005832672119 + }, + { + "auxiliary_loss_clip": 0.01079715, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.01988077, + "balance_loss_mlp": 1.02612805, + "epoch": 0.29947392153915525, + "flos": 21104367402240.0, + "grad_norm": 1.773695016630351, + "language_loss": 0.73461616, + "learning_rate": 3.178625051951564e-06, + "loss": 0.75577152, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.53515625, + "step": 4981, + "time_per_iteration": 2.4080123901367188 + }, + { + "auxiliary_loss_clip": 0.01080974, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.01629722, + "balance_loss_mlp": 1.02344227, + "epoch": 0.2995340447918232, + "flos": 21541584758400.0, + "grad_norm": 1.5953135557033733, + "language_loss": 0.86637998, + "learning_rate": 3.1783197863145335e-06, + "loss": 0.88751757, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.57421875, + "step": 4982, + "time_per_iteration": 2.385634183883667 + }, + { + "auxiliary_loss_clip": 0.01083326, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.01855969, + "balance_loss_mlp": 1.02523935, + "epoch": 0.2995941680444912, + "flos": 16724268961920.0, + "grad_norm": 2.064618365489174, + "language_loss": 0.86887771, + "learning_rate": 3.1780144786262997e-06, + "loss": 0.89007992, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.58203125, + "step": 4983, + "time_per_iteration": 2.349159002304077 + }, + { + "auxiliary_loss_clip": 0.01082124, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.01704073, + "balance_loss_mlp": 1.02556515, + "epoch": 0.2996542912971592, + "flos": 20922434974080.0, + "grad_norm": 2.6680200032774133, + "language_loss": 0.78614646, + "learning_rate": 3.17770912889776e-06, + "loss": 0.80730087, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5625, + "step": 4984, + "time_per_iteration": 2.3706724643707275 + }, + { + "auxiliary_loss_clip": 0.0108325, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.01866663, + "balance_loss_mlp": 1.02616, + "epoch": 0.29971441454982717, + "flos": 25078775310720.0, + "grad_norm": 1.5860478498349024, + "language_loss": 0.78228557, + "learning_rate": 3.17740373713981e-06, + "loss": 0.80347353, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5703125, + "step": 4985, + "time_per_iteration": 2.4217989444732666 + }, + { + "auxiliary_loss_clip": 0.01084551, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.01721382, + "balance_loss_mlp": 1.02559328, + "epoch": 0.29977453780249513, + "flos": 52553989543680.0, + "grad_norm": 1.9692385439801579, + "language_loss": 0.71667582, + "learning_rate": 3.1770983033633504e-06, + "loss": 0.73788095, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.58984375, + "step": 4986, + "time_per_iteration": 2.653592348098755 + }, + { + "auxiliary_loss_clip": 0.01084353, + "auxiliary_loss_mlp": 0.01040687, + "balance_loss_clip": 1.02284098, + "balance_loss_mlp": 1.02591658, + "epoch": 0.2998346610551631, + "flos": 22236042078720.0, + "grad_norm": 2.0502182867088186, + "language_loss": 0.73531449, + "learning_rate": 3.1767928275792796e-06, + "loss": 0.75656486, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.58203125, + "step": 4987, + "time_per_iteration": 2.391425609588623 + }, + { + "auxiliary_loss_clip": 0.01079211, + "auxiliary_loss_mlp": 0.01032974, + "balance_loss_clip": 1.01742935, + "balance_loss_mlp": 1.02483892, + "epoch": 0.29989478430783106, + "flos": 16872265681920.0, + "grad_norm": 2.0424406972309375, + "language_loss": 0.80119443, + "learning_rate": 3.1764873097984997e-06, + "loss": 0.82231629, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.54296875, + "step": 4988, + "time_per_iteration": 2.345507860183716 + }, + { + "auxiliary_loss_clip": 0.01080255, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.020926, + "balance_loss_mlp": 1.024441, + "epoch": 0.29995490756049903, + "flos": 23767751646720.0, + "grad_norm": 2.341136429212972, + "language_loss": 0.70591819, + "learning_rate": 3.1761817500319143e-06, + "loss": 0.72709942, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.55859375, + "step": 4989, + "time_per_iteration": 2.406156301498413 + }, + { + "auxiliary_loss_clip": 0.01085645, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.01725078, + "balance_loss_mlp": 1.02865005, + "epoch": 0.300015030813167, + "flos": 14464445656320.0, + "grad_norm": 2.1163616930816005, + "language_loss": 0.85251993, + "learning_rate": 3.175876148290428e-06, + "loss": 0.87372243, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5703125, + "step": 4990, + "time_per_iteration": 2.3781583309173584 + }, + { + "auxiliary_loss_clip": 0.0108342, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.01616633, + "balance_loss_mlp": 1.02503562, + "epoch": 0.30007515406583496, + "flos": 25190811463680.0, + "grad_norm": 1.7912572462240683, + "language_loss": 0.67249948, + "learning_rate": 3.175570504584946e-06, + "loss": 0.69367695, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.58203125, + "step": 4991, + "time_per_iteration": 2.4635307788848877 + }, + { + "auxiliary_loss_clip": 0.01083613, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.02029645, + "balance_loss_mlp": 1.02456439, + "epoch": 0.3001352773185029, + "flos": 19390166824320.0, + "grad_norm": 1.8452541622253724, + "language_loss": 0.78739929, + "learning_rate": 3.175264818926377e-06, + "loss": 0.80863643, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.58984375, + "step": 4992, + "time_per_iteration": 3.789320945739746 + }, + { + "auxiliary_loss_clip": 0.01079713, + "auxiliary_loss_mlp": 0.01036541, + "balance_loss_clip": 1.01939869, + "balance_loss_mlp": 1.02497733, + "epoch": 0.3001954005711709, + "flos": 21542771744640.0, + "grad_norm": 1.7661513954177344, + "language_loss": 0.82482982, + "learning_rate": 3.17495909132563e-06, + "loss": 0.84599233, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.546875, + "step": 4993, + "time_per_iteration": 2.4323110580444336 + }, + { + "auxiliary_loss_clip": 0.01085163, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02097487, + "balance_loss_mlp": 1.02445126, + "epoch": 0.30025552382383885, + "flos": 17383359208320.0, + "grad_norm": 2.2549975862711196, + "language_loss": 0.85495508, + "learning_rate": 3.174653321793615e-06, + "loss": 0.8762151, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.60546875, + "step": 4994, + "time_per_iteration": 2.373474359512329 + }, + { + "auxiliary_loss_clip": 0.01083051, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.01686478, + "balance_loss_mlp": 1.02654529, + "epoch": 0.3003156470765068, + "flos": 29532051694080.0, + "grad_norm": 2.008891863343585, + "language_loss": 0.81047344, + "learning_rate": 3.1743475103412446e-06, + "loss": 0.83163607, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.56640625, + "step": 4995, + "time_per_iteration": 2.477457046508789 + }, + { + "auxiliary_loss_clip": 0.010792, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.01655102, + "balance_loss_mlp": 1.02507055, + "epoch": 0.3003757703291748, + "flos": 43644923159040.0, + "grad_norm": 1.6878801284506146, + "language_loss": 0.67329788, + "learning_rate": 3.174041656979432e-06, + "loss": 0.69443214, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5390625, + "step": 4996, + "time_per_iteration": 4.041480541229248 + }, + { + "auxiliary_loss_clip": 0.01081071, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.01979363, + "balance_loss_mlp": 1.02459562, + "epoch": 0.30043589358184275, + "flos": 22527287573760.0, + "grad_norm": 2.1129496227938844, + "language_loss": 0.75430369, + "learning_rate": 3.1737357617190935e-06, + "loss": 0.77548176, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5625, + "step": 4997, + "time_per_iteration": 2.545781135559082 + }, + { + "auxiliary_loss_clip": 0.01077258, + "auxiliary_loss_mlp": 0.01026782, + "balance_loss_clip": 1.01080751, + "balance_loss_mlp": 1.02360058, + "epoch": 0.30049601683451077, + "flos": 20994844867200.0, + "grad_norm": 1.6812323361446717, + "language_loss": 0.77757078, + "learning_rate": 3.1734298245711443e-06, + "loss": 0.79861116, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53515625, + "step": 4998, + "time_per_iteration": 4.003045082092285 + }, + { + "auxiliary_loss_clip": 0.01077762, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.01793289, + "balance_loss_mlp": 1.02437735, + "epoch": 0.30055614008717874, + "flos": 23914840671360.0, + "grad_norm": 1.5437849077527765, + "language_loss": 0.72840375, + "learning_rate": 3.1731238455465033e-06, + "loss": 0.74951768, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.53515625, + "step": 4999, + "time_per_iteration": 2.5262911319732666 + }, + { + "auxiliary_loss_clip": 0.01080357, + "auxiliary_loss_mlp": 0.01035271, + "balance_loss_clip": 1.01815248, + "balance_loss_mlp": 1.0255444, + "epoch": 0.3006162633398467, + "flos": 19168852515840.0, + "grad_norm": 1.5843387602958585, + "language_loss": 0.75770509, + "learning_rate": 3.1728178246560903e-06, + "loss": 0.7788614, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.546875, + "step": 5000, + "time_per_iteration": 2.4891839027404785 + }, + { + "auxiliary_loss_clip": 0.01079471, + "auxiliary_loss_mlp": 0.0103528, + "balance_loss_clip": 1.01942515, + "balance_loss_mlp": 1.02659953, + "epoch": 0.30067638659251467, + "flos": 14678498401920.0, + "grad_norm": 2.726883484417335, + "language_loss": 0.81758177, + "learning_rate": 3.172511761910825e-06, + "loss": 0.83872926, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53125, + "step": 5001, + "time_per_iteration": 2.4570488929748535 + }, + { + "auxiliary_loss_clip": 0.01081517, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.01971793, + "balance_loss_mlp": 1.0257709, + "epoch": 0.30073650984518263, + "flos": 23366878882560.0, + "grad_norm": 2.1044792838413735, + "language_loss": 0.80636716, + "learning_rate": 3.1722056573216315e-06, + "loss": 0.82755268, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5546875, + "step": 5002, + "time_per_iteration": 3.8591198921203613 + }, + { + "auxiliary_loss_clip": 0.0108517, + "auxiliary_loss_mlp": 0.01032671, + "balance_loss_clip": 1.01599324, + "balance_loss_mlp": 1.0265274, + "epoch": 0.3007966330978506, + "flos": 22965517359360.0, + "grad_norm": 1.8714947113016105, + "language_loss": 0.71259362, + "learning_rate": 3.1718995108994336e-06, + "loss": 0.73377204, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5859375, + "step": 5003, + "time_per_iteration": 2.400160551071167 + }, + { + "auxiliary_loss_clip": 0.01082306, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.0175848, + "balance_loss_mlp": 1.02557886, + "epoch": 0.30085675635051856, + "flos": 27817222711680.0, + "grad_norm": 1.7953384054079804, + "language_loss": 0.70450509, + "learning_rate": 3.1715933226551562e-06, + "loss": 0.72567403, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.56640625, + "step": 5004, + "time_per_iteration": 2.440431594848633 + }, + { + "auxiliary_loss_clip": 0.01083846, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.01845884, + "balance_loss_mlp": 1.02555871, + "epoch": 0.3009168796031865, + "flos": 10882147760640.0, + "grad_norm": 2.6380312247766984, + "language_loss": 0.81586128, + "learning_rate": 3.171287092599727e-06, + "loss": 0.83706534, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.58203125, + "step": 5005, + "time_per_iteration": 2.3403513431549072 + }, + { + "auxiliary_loss_clip": 0.01080729, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.01595902, + "balance_loss_mlp": 1.02547896, + "epoch": 0.3009770028558545, + "flos": 23804270795520.0, + "grad_norm": 2.448149402759662, + "language_loss": 0.74657762, + "learning_rate": 3.1709808207440745e-06, + "loss": 0.76770413, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.55078125, + "step": 5006, + "time_per_iteration": 2.3962249755859375 + }, + { + "auxiliary_loss_clip": 0.01080192, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.01655793, + "balance_loss_mlp": 1.02567017, + "epoch": 0.30103712610852246, + "flos": 26467026635520.0, + "grad_norm": 1.7758321801712973, + "language_loss": 0.71093178, + "learning_rate": 3.170674507099128e-06, + "loss": 0.73205829, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.546875, + "step": 5007, + "time_per_iteration": 2.432056188583374 + }, + { + "auxiliary_loss_clip": 0.01080575, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.01070428, + "balance_loss_mlp": 1.02538824, + "epoch": 0.3010972493611904, + "flos": 22855366419840.0, + "grad_norm": 2.3228593027480535, + "language_loss": 0.76148784, + "learning_rate": 3.17036815167582e-06, + "loss": 0.78257668, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.55078125, + "step": 5008, + "time_per_iteration": 2.384437084197998 + }, + { + "auxiliary_loss_clip": 0.01080406, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.01582277, + "balance_loss_mlp": 1.02531934, + "epoch": 0.3011573726138584, + "flos": 24052748008320.0, + "grad_norm": 2.308683541342158, + "language_loss": 0.84400833, + "learning_rate": 3.170061754485084e-06, + "loss": 0.86513948, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.55078125, + "step": 5009, + "time_per_iteration": 2.4246981143951416 + }, + { + "auxiliary_loss_clip": 0.01082541, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.0166595, + "balance_loss_mlp": 1.02468181, + "epoch": 0.30121749586652635, + "flos": 20258841162240.0, + "grad_norm": 1.9321459927068347, + "language_loss": 0.83378142, + "learning_rate": 3.1697553155378527e-06, + "loss": 0.85496426, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.578125, + "step": 5010, + "time_per_iteration": 2.3626821041107178 + }, + { + "auxiliary_loss_clip": 0.01081279, + "auxiliary_loss_mlp": 0.01025883, + "balance_loss_clip": 1.00958765, + "balance_loss_mlp": 1.02533436, + "epoch": 0.3012776191191944, + "flos": 26941845392640.0, + "grad_norm": 2.27009907400155, + "language_loss": 0.8497237, + "learning_rate": 3.1694488348450636e-06, + "loss": 0.87079531, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.55859375, + "step": 5011, + "time_per_iteration": 2.488070011138916 + }, + { + "auxiliary_loss_clip": 0.01081505, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.01608098, + "balance_loss_mlp": 1.02349353, + "epoch": 0.30133774237186234, + "flos": 20411271624960.0, + "grad_norm": 1.9796811445131552, + "language_loss": 0.72257864, + "learning_rate": 3.169142312417654e-06, + "loss": 0.74373329, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.578125, + "step": 5012, + "time_per_iteration": 2.381861686706543 + }, + { + "auxiliary_loss_clip": 0.01080649, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01197577, + "balance_loss_mlp": 1.02380049, + "epoch": 0.3013978656245303, + "flos": 19791423613440.0, + "grad_norm": 2.4962543051630366, + "language_loss": 0.87719458, + "learning_rate": 3.1688357482665622e-06, + "loss": 0.89828885, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5703125, + "step": 5013, + "time_per_iteration": 2.4279744625091553 + }, + { + "auxiliary_loss_clip": 0.01083778, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01557589, + "balance_loss_mlp": 1.02564931, + "epoch": 0.30145798887719827, + "flos": 16248821800320.0, + "grad_norm": 1.9163758464873473, + "language_loss": 0.83223724, + "learning_rate": 3.1685291424027293e-06, + "loss": 0.85341853, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.578125, + "step": 5014, + "time_per_iteration": 2.4155304431915283 + }, + { + "auxiliary_loss_clip": 0.01076556, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01521778, + "balance_loss_mlp": 1.02403033, + "epoch": 0.30151811212986623, + "flos": 24570579427200.0, + "grad_norm": 1.5817098941484453, + "language_loss": 0.68388069, + "learning_rate": 3.1682224948370973e-06, + "loss": 0.70496058, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5234375, + "step": 5015, + "time_per_iteration": 2.418494462966919 + }, + { + "auxiliary_loss_clip": 0.01081318, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.01878095, + "balance_loss_mlp": 1.02516627, + "epoch": 0.3015782353825342, + "flos": 21870990236160.0, + "grad_norm": 2.1210306427495307, + "language_loss": 0.74649143, + "learning_rate": 3.1679158055806096e-06, + "loss": 0.76765931, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5625, + "step": 5016, + "time_per_iteration": 2.376962184906006 + }, + { + "auxiliary_loss_clip": 0.01083765, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.01298952, + "balance_loss_mlp": 1.02632689, + "epoch": 0.30163835863520216, + "flos": 28768012300800.0, + "grad_norm": 1.5494112497778512, + "language_loss": 0.65576136, + "learning_rate": 3.1676090746442105e-06, + "loss": 0.67690539, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.57421875, + "step": 5017, + "time_per_iteration": 2.461987257003784 + }, + { + "auxiliary_loss_clip": 0.01081179, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.01597965, + "balance_loss_mlp": 1.02568746, + "epoch": 0.30169848188787013, + "flos": 22965098423040.0, + "grad_norm": 2.3028788019207505, + "language_loss": 0.68410343, + "learning_rate": 3.1673023020388473e-06, + "loss": 0.70524549, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5546875, + "step": 5018, + "time_per_iteration": 2.3817498683929443 + }, + { + "auxiliary_loss_clip": 0.01075965, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.02125061, + "balance_loss_mlp": 1.02357864, + "epoch": 0.3017586051405381, + "flos": 21834191796480.0, + "grad_norm": 2.172286130774456, + "language_loss": 0.7911346, + "learning_rate": 3.1669954877754677e-06, + "loss": 0.81225014, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.5234375, + "step": 5019, + "time_per_iteration": 2.382206439971924 + }, + { + "auxiliary_loss_clip": 0.01081606, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.02221918, + "balance_loss_mlp": 1.0261519, + "epoch": 0.30181872839320606, + "flos": 22159407911040.0, + "grad_norm": 2.5271579119668712, + "language_loss": 0.69893324, + "learning_rate": 3.1666886318650206e-06, + "loss": 0.72013378, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5546875, + "step": 5020, + "time_per_iteration": 2.383802890777588 + }, + { + "auxiliary_loss_clip": 0.01082946, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.02095699, + "balance_loss_mlp": 1.02527726, + "epoch": 0.301878851645874, + "flos": 18113183602560.0, + "grad_norm": 1.9021581714737108, + "language_loss": 0.78609538, + "learning_rate": 3.1663817343184576e-06, + "loss": 0.80730963, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.578125, + "step": 5021, + "time_per_iteration": 2.3886027336120605 + }, + { + "auxiliary_loss_clip": 0.01077947, + "auxiliary_loss_mlp": 0.01025726, + "balance_loss_clip": 1.01038384, + "balance_loss_mlp": 1.02288127, + "epoch": 0.301938974898542, + "flos": 17601287114880.0, + "grad_norm": 2.2355073510638843, + "language_loss": 0.75984716, + "learning_rate": 3.166074795146731e-06, + "loss": 0.78088391, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.55078125, + "step": 5022, + "time_per_iteration": 2.3653833866119385 + }, + { + "auxiliary_loss_clip": 0.01079659, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.0213654, + "balance_loss_mlp": 1.02455592, + "epoch": 0.30199909815120995, + "flos": 11180445350400.0, + "grad_norm": 1.7892522293300075, + "language_loss": 0.69580376, + "learning_rate": 3.1657678143607943e-06, + "loss": 0.71698534, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.55078125, + "step": 5023, + "time_per_iteration": 2.3766841888427734 + }, + { + "auxiliary_loss_clip": 0.01082345, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.01438963, + "balance_loss_mlp": 1.02483201, + "epoch": 0.302059221403878, + "flos": 21906776246400.0, + "grad_norm": 1.9249216083430312, + "language_loss": 0.71643651, + "learning_rate": 3.165460791971603e-06, + "loss": 0.73756611, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.57421875, + "step": 5024, + "time_per_iteration": 2.4116077423095703 + }, + { + "auxiliary_loss_clip": 0.01079141, + "auxiliary_loss_mlp": 0.01034678, + "balance_loss_clip": 1.01878715, + "balance_loss_mlp": 1.02567625, + "epoch": 0.30211934465654594, + "flos": 26395175324160.0, + "grad_norm": 1.7100408121435982, + "language_loss": 0.74213982, + "learning_rate": 3.1651537279901135e-06, + "loss": 0.76327801, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.53515625, + "step": 5025, + "time_per_iteration": 2.4468863010406494 + }, + { + "auxiliary_loss_clip": 0.01077206, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.01613474, + "balance_loss_mlp": 1.02416253, + "epoch": 0.3021794679092139, + "flos": 23399453047680.0, + "grad_norm": 1.6971143565945075, + "language_loss": 0.67114282, + "learning_rate": 3.1648466224272854e-06, + "loss": 0.69222462, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.53125, + "step": 5026, + "time_per_iteration": 2.3966238498687744 + }, + { + "auxiliary_loss_clip": 0.01079268, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.01547635, + "balance_loss_mlp": 1.02524018, + "epoch": 0.30223959116188187, + "flos": 20260097971200.0, + "grad_norm": 1.7340964198932574, + "language_loss": 0.85200661, + "learning_rate": 3.1645394752940772e-06, + "loss": 0.87311411, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5027, + "time_per_iteration": 2.395587921142578 + }, + { + "auxiliary_loss_clip": 0.01075981, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.01678801, + "balance_loss_mlp": 1.02292156, + "epoch": 0.30229971441454984, + "flos": 26686630287360.0, + "grad_norm": 1.591699639119321, + "language_loss": 0.7846691, + "learning_rate": 3.164232286601451e-06, + "loss": 0.80574709, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53125, + "step": 5028, + "time_per_iteration": 2.462153196334839 + }, + { + "auxiliary_loss_clip": 0.01081643, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.01566148, + "balance_loss_mlp": 1.02629507, + "epoch": 0.3023598376672178, + "flos": 34344026052480.0, + "grad_norm": 1.73540489182245, + "language_loss": 0.77354872, + "learning_rate": 3.1639250563603686e-06, + "loss": 0.79469311, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5546875, + "step": 5029, + "time_per_iteration": 2.4995901584625244 + }, + { + "auxiliary_loss_clip": 0.0108169, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.01339316, + "balance_loss_mlp": 1.02492809, + "epoch": 0.30241996091988577, + "flos": 23111035372800.0, + "grad_norm": 2.2703102274881046, + "language_loss": 0.82117456, + "learning_rate": 3.1636177845817954e-06, + "loss": 0.84230042, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5703125, + "step": 5030, + "time_per_iteration": 2.385838031768799 + }, + { + "auxiliary_loss_clip": 0.01080079, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.01592588, + "balance_loss_mlp": 1.0241152, + "epoch": 0.30248008417255373, + "flos": 19389014749440.0, + "grad_norm": 1.65919876176385, + "language_loss": 0.69756085, + "learning_rate": 3.1633104712766967e-06, + "loss": 0.71868753, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55859375, + "step": 5031, + "time_per_iteration": 2.3964173793792725 + }, + { + "auxiliary_loss_clip": 0.01078944, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.01582026, + "balance_loss_mlp": 1.02500558, + "epoch": 0.3025402074252217, + "flos": 23768554608000.0, + "grad_norm": 1.70541486126613, + "language_loss": 0.82383776, + "learning_rate": 3.1630031164560395e-06, + "loss": 0.84494501, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5032, + "time_per_iteration": 3.774395227432251 + }, + { + "auxiliary_loss_clip": 0.01088144, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.0226146, + "balance_loss_mlp": 1.0278399, + "epoch": 0.30260033067788966, + "flos": 25992941016960.0, + "grad_norm": 2.3848223321526363, + "language_loss": 0.73535711, + "learning_rate": 3.162695720130793e-06, + "loss": 0.75665057, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.6015625, + "step": 5033, + "time_per_iteration": 2.437077045440674 + }, + { + "auxiliary_loss_clip": 0.01078681, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.02126586, + "balance_loss_mlp": 1.02415955, + "epoch": 0.3026604539305576, + "flos": 25373372296320.0, + "grad_norm": 2.0424456423279036, + "language_loss": 0.73978257, + "learning_rate": 3.1623882823119267e-06, + "loss": 0.76095498, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.546875, + "step": 5034, + "time_per_iteration": 2.4297237396240234 + }, + { + "auxiliary_loss_clip": 0.0107787, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.01390791, + "balance_loss_mlp": 1.02372336, + "epoch": 0.3027205771832256, + "flos": 25811532259200.0, + "grad_norm": 2.0084966306521923, + "language_loss": 0.72237194, + "learning_rate": 3.1620808030104127e-06, + "loss": 0.74345273, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5390625, + "step": 5035, + "time_per_iteration": 3.821385383605957 + }, + { + "auxiliary_loss_clip": 0.01078289, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.01765025, + "balance_loss_mlp": 1.02332306, + "epoch": 0.30278070043589356, + "flos": 27343311649920.0, + "grad_norm": 1.9518835134075647, + "language_loss": 0.8750509, + "learning_rate": 3.1617732822372237e-06, + "loss": 0.89616603, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.55078125, + "step": 5036, + "time_per_iteration": 2.4213244915008545 + }, + { + "auxiliary_loss_clip": 0.01078881, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.01733446, + "balance_loss_mlp": 1.02524662, + "epoch": 0.3028408236885616, + "flos": 24785190754560.0, + "grad_norm": 1.4363712775314645, + "language_loss": 0.77089381, + "learning_rate": 3.1614657200033355e-06, + "loss": 0.79203027, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.53515625, + "step": 5037, + "time_per_iteration": 2.412492513656616 + }, + { + "auxiliary_loss_clip": 0.01082724, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.01900804, + "balance_loss_mlp": 1.02593899, + "epoch": 0.30290094694122954, + "flos": 12931653836160.0, + "grad_norm": 2.685255131767566, + "language_loss": 0.77590311, + "learning_rate": 3.1611581163197228e-06, + "loss": 0.79709518, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.56640625, + "step": 5038, + "time_per_iteration": 3.7611536979675293 + }, + { + "auxiliary_loss_clip": 0.01078642, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01459527, + "balance_loss_mlp": 1.02628255, + "epoch": 0.3029610701938975, + "flos": 25915399153920.0, + "grad_norm": 1.8560384166378767, + "language_loss": 0.73723853, + "learning_rate": 3.160850471197364e-06, + "loss": 0.75832093, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5234375, + "step": 5039, + "time_per_iteration": 2.4268336296081543 + }, + { + "auxiliary_loss_clip": 0.01075924, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.01786065, + "balance_loss_mlp": 1.02448809, + "epoch": 0.3030211934465655, + "flos": 21979919278080.0, + "grad_norm": 1.857112120650593, + "language_loss": 0.8008182, + "learning_rate": 3.160542784647238e-06, + "loss": 0.82189566, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.515625, + "step": 5040, + "time_per_iteration": 2.3754000663757324 + }, + { + "auxiliary_loss_clip": 0.01079966, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.01614738, + "balance_loss_mlp": 1.0265739, + "epoch": 0.30308131669923344, + "flos": 20991039528960.0, + "grad_norm": 1.54185585551736, + "language_loss": 0.81566, + "learning_rate": 3.1602350566803254e-06, + "loss": 0.836775, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53125, + "step": 5041, + "time_per_iteration": 2.394514322280884 + }, + { + "auxiliary_loss_clip": 0.01017723, + "auxiliary_loss_mlp": 0.01017415, + "balance_loss_clip": 1.01566231, + "balance_loss_mlp": 1.00561452, + "epoch": 0.3031414399519014, + "flos": 60545641599360.0, + "grad_norm": 0.7632627054647441, + "language_loss": 0.59425414, + "learning_rate": 3.1599272873076076e-06, + "loss": 0.61460555, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.12109375, + "step": 5042, + "time_per_iteration": 4.497772932052612 + }, + { + "auxiliary_loss_clip": 0.01081884, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01265407, + "balance_loss_mlp": 1.02627993, + "epoch": 0.30320156320456937, + "flos": 21906601689600.0, + "grad_norm": 1.6305844740537034, + "language_loss": 0.7135089, + "learning_rate": 3.159619476540069e-06, + "loss": 0.73461759, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5546875, + "step": 5043, + "time_per_iteration": 2.3825836181640625 + }, + { + "auxiliary_loss_clip": 0.01077251, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.01621795, + "balance_loss_mlp": 1.02385306, + "epoch": 0.30326168645723733, + "flos": 21651700786560.0, + "grad_norm": 2.111412253450013, + "language_loss": 0.69311726, + "learning_rate": 3.1593116243886943e-06, + "loss": 0.71420753, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 5044, + "time_per_iteration": 2.3893320560455322 + }, + { + "auxiliary_loss_clip": 0.01076543, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.01541722, + "balance_loss_mlp": 1.02340984, + "epoch": 0.3033218097099053, + "flos": 21870222186240.0, + "grad_norm": 1.3166370881357141, + "language_loss": 0.77194965, + "learning_rate": 3.1590037308644695e-06, + "loss": 0.79302281, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53125, + "step": 5045, + "time_per_iteration": 2.3869130611419678 + }, + { + "auxiliary_loss_clip": 0.01078538, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.0203464, + "balance_loss_mlp": 1.02383971, + "epoch": 0.30338193296257326, + "flos": 27088480569600.0, + "grad_norm": 1.8553203716814064, + "language_loss": 0.72408873, + "learning_rate": 3.158695795978383e-06, + "loss": 0.74524617, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.546875, + "step": 5046, + "time_per_iteration": 2.440122127532959 + }, + { + "auxiliary_loss_clip": 0.01081281, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.01970959, + "balance_loss_mlp": 1.02511406, + "epoch": 0.30344205621524123, + "flos": 19533415599360.0, + "grad_norm": 3.2137425197330827, + "language_loss": 0.80936623, + "learning_rate": 3.1583878197414237e-06, + "loss": 0.83055288, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.5625, + "step": 5047, + "time_per_iteration": 2.4542839527130127 + }, + { + "auxiliary_loss_clip": 0.01077701, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.01958573, + "balance_loss_mlp": 1.02419674, + "epoch": 0.3035021794679092, + "flos": 23909953080960.0, + "grad_norm": 1.716016772938192, + "language_loss": 0.72124553, + "learning_rate": 3.1580798021645833e-06, + "loss": 0.74236959, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.53515625, + "step": 5048, + "time_per_iteration": 2.417043924331665 + }, + { + "auxiliary_loss_clip": 0.01078642, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01817906, + "balance_loss_mlp": 1.02458811, + "epoch": 0.30356230272057716, + "flos": 16142685667200.0, + "grad_norm": 1.66838836362413, + "language_loss": 0.75226957, + "learning_rate": 3.157771743258854e-06, + "loss": 0.77339017, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.54296875, + "step": 5049, + "time_per_iteration": 2.376274585723877 + }, + { + "auxiliary_loss_clip": 0.0108108, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.02032328, + "balance_loss_mlp": 1.02525365, + "epoch": 0.3036224259732452, + "flos": 28913390668800.0, + "grad_norm": 1.582238548426699, + "language_loss": 0.81829733, + "learning_rate": 3.1574636430352287e-06, + "loss": 0.83946419, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.55859375, + "step": 5050, + "time_per_iteration": 2.4505910873413086 + }, + { + "auxiliary_loss_clip": 0.01080029, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.02098894, + "balance_loss_mlp": 1.02524436, + "epoch": 0.30368254922591315, + "flos": 21104541959040.0, + "grad_norm": 2.6646821233862417, + "language_loss": 0.74094534, + "learning_rate": 3.1571555015047036e-06, + "loss": 0.76211441, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 5051, + "time_per_iteration": 2.3911256790161133 + }, + { + "auxiliary_loss_clip": 0.01079081, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.01973069, + "balance_loss_mlp": 1.02367949, + "epoch": 0.3037426724785811, + "flos": 23001198635520.0, + "grad_norm": 1.579753752100834, + "language_loss": 0.76945961, + "learning_rate": 3.156847318678275e-06, + "loss": 0.7905978, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5546875, + "step": 5052, + "time_per_iteration": 2.4134862422943115 + }, + { + "auxiliary_loss_clip": 0.01081814, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.01846075, + "balance_loss_mlp": 1.02560735, + "epoch": 0.3038027957312491, + "flos": 15631801608960.0, + "grad_norm": 1.9089504797738561, + "language_loss": 0.82618737, + "learning_rate": 3.156539094566941e-06, + "loss": 0.84735525, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5625, + "step": 5053, + "time_per_iteration": 2.367910623550415 + }, + { + "auxiliary_loss_clip": 0.01079904, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.01688755, + "balance_loss_mlp": 1.02545893, + "epoch": 0.30386291898391704, + "flos": 12713167347840.0, + "grad_norm": 1.540145886429998, + "language_loss": 0.71253401, + "learning_rate": 3.1562308291817024e-06, + "loss": 0.73365796, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.54296875, + "step": 5054, + "time_per_iteration": 2.375654935836792 + }, + { + "auxiliary_loss_clip": 0.01080617, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.01582265, + "balance_loss_mlp": 1.0258913, + "epoch": 0.303923042236585, + "flos": 26358237239040.0, + "grad_norm": 1.85087735015146, + "language_loss": 0.69822037, + "learning_rate": 3.15592252253356e-06, + "loss": 0.71934509, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5055, + "time_per_iteration": 2.4142305850982666 + }, + { + "auxiliary_loss_clip": 0.01080944, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.01447487, + "balance_loss_mlp": 1.02540255, + "epoch": 0.30398316548925297, + "flos": 19718210759040.0, + "grad_norm": 1.6823764906449163, + "language_loss": 0.67452139, + "learning_rate": 3.1556141746335153e-06, + "loss": 0.69564271, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5546875, + "step": 5056, + "time_per_iteration": 2.3767662048339844 + }, + { + "auxiliary_loss_clip": 0.01079828, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.0155319, + "balance_loss_mlp": 1.02489007, + "epoch": 0.30404328874192094, + "flos": 24238799976960.0, + "grad_norm": 1.6025181443183105, + "language_loss": 0.82537723, + "learning_rate": 3.155305785492574e-06, + "loss": 0.8464902, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.55078125, + "step": 5057, + "time_per_iteration": 2.3970909118652344 + }, + { + "auxiliary_loss_clip": 0.01077873, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.01315892, + "balance_loss_mlp": 1.0239892, + "epoch": 0.3041034119945889, + "flos": 24497785509120.0, + "grad_norm": 1.6665320149532368, + "language_loss": 0.88529772, + "learning_rate": 3.1549973551217408e-06, + "loss": 0.90637207, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5390625, + "step": 5058, + "time_per_iteration": 2.4081411361694336 + }, + { + "auxiliary_loss_clip": 0.01080462, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.01594806, + "balance_loss_mlp": 1.02553666, + "epoch": 0.30416353524725687, + "flos": 28287747371520.0, + "grad_norm": 2.058088554296903, + "language_loss": 0.71862209, + "learning_rate": 3.1546888835320227e-06, + "loss": 0.73975563, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.55078125, + "step": 5059, + "time_per_iteration": 2.4287495613098145 + }, + { + "auxiliary_loss_clip": 0.01079415, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.01785302, + "balance_loss_mlp": 1.02515721, + "epoch": 0.30422365849992483, + "flos": 23659241541120.0, + "grad_norm": 1.5643683043373413, + "language_loss": 0.72890973, + "learning_rate": 3.1543803707344284e-06, + "loss": 0.7500397, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5390625, + "step": 5060, + "time_per_iteration": 2.5202770233154297 + }, + { + "auxiliary_loss_clip": 0.01079598, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.01508582, + "balance_loss_mlp": 1.02489781, + "epoch": 0.3042837817525928, + "flos": 22997777322240.0, + "grad_norm": 1.9526716156198396, + "language_loss": 0.769113, + "learning_rate": 3.154071816739969e-06, + "loss": 0.79022038, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5061, + "time_per_iteration": 2.3840816020965576 + }, + { + "auxiliary_loss_clip": 0.0108126, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.01638198, + "balance_loss_mlp": 1.02594531, + "epoch": 0.30434390500526076, + "flos": 22081482023040.0, + "grad_norm": 3.0975895023617093, + "language_loss": 0.81807518, + "learning_rate": 3.1537632215596542e-06, + "loss": 0.83922112, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5546875, + "step": 5062, + "time_per_iteration": 2.3743505477905273 + }, + { + "auxiliary_loss_clip": 0.01075614, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01559782, + "balance_loss_mlp": 1.02252674, + "epoch": 0.3044040282579287, + "flos": 25482336249600.0, + "grad_norm": 1.8621036098082937, + "language_loss": 0.74603331, + "learning_rate": 3.153454585204498e-06, + "loss": 0.76710081, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.53125, + "step": 5063, + "time_per_iteration": 2.4113378524780273 + }, + { + "auxiliary_loss_clip": 0.01078599, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.01596713, + "balance_loss_mlp": 1.02499104, + "epoch": 0.30446415151059675, + "flos": 21944377647360.0, + "grad_norm": 2.0935570233285534, + "language_loss": 0.84081364, + "learning_rate": 3.153145907685515e-06, + "loss": 0.86193573, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.53515625, + "step": 5064, + "time_per_iteration": 2.3820180892944336 + }, + { + "auxiliary_loss_clip": 0.010801, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.01652265, + "balance_loss_mlp": 1.02495968, + "epoch": 0.3045242747632647, + "flos": 16434489744000.0, + "grad_norm": 2.1758007655381744, + "language_loss": 0.75466955, + "learning_rate": 3.152837189013721e-06, + "loss": 0.77580249, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5546875, + "step": 5065, + "time_per_iteration": 2.3753256797790527 + }, + { + "auxiliary_loss_clip": 0.01081433, + "auxiliary_loss_mlp": 0.01033266, + "balance_loss_clip": 1.016541, + "balance_loss_mlp": 1.02405083, + "epoch": 0.3045843980159327, + "flos": 31538998955520.0, + "grad_norm": 2.3023132129367916, + "language_loss": 0.61482322, + "learning_rate": 3.1525284292001323e-06, + "loss": 0.63597018, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.57421875, + "step": 5066, + "time_per_iteration": 2.466453790664673 + }, + { + "auxiliary_loss_clip": 0.01086782, + "auxiliary_loss_mlp": 0.01036319, + "balance_loss_clip": 1.01843762, + "balance_loss_mlp": 1.02784896, + "epoch": 0.30464452126860064, + "flos": 17852801616000.0, + "grad_norm": 2.304756466861215, + "language_loss": 0.82674682, + "learning_rate": 3.1522196282557698e-06, + "loss": 0.84797782, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.58984375, + "step": 5067, + "time_per_iteration": 2.3969404697418213 + }, + { + "auxiliary_loss_clip": 0.01078082, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.01597667, + "balance_loss_mlp": 1.02457166, + "epoch": 0.3047046445212686, + "flos": 20630351606400.0, + "grad_norm": 1.9227187056768802, + "language_loss": 0.63247436, + "learning_rate": 3.1519107861916516e-06, + "loss": 0.6535666, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53515625, + "step": 5068, + "time_per_iteration": 2.4060232639312744 + }, + { + "auxiliary_loss_clip": 0.01077721, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.01345301, + "balance_loss_mlp": 1.02360404, + "epoch": 0.3047647677739366, + "flos": 21286544209920.0, + "grad_norm": 1.8648866684163907, + "language_loss": 0.75149035, + "learning_rate": 3.151601903018801e-06, + "loss": 0.77255321, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5390625, + "step": 5069, + "time_per_iteration": 2.385392665863037 + }, + { + "auxiliary_loss_clip": 0.01076768, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.01986074, + "balance_loss_mlp": 1.02529395, + "epoch": 0.30482489102660454, + "flos": 20994879778560.0, + "grad_norm": 1.9300514163373805, + "language_loss": 0.7524488, + "learning_rate": 3.1512929787482405e-06, + "loss": 0.77357066, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.515625, + "step": 5070, + "time_per_iteration": 2.3724310398101807 + }, + { + "auxiliary_loss_clip": 0.01082619, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.01584578, + "balance_loss_mlp": 1.02601182, + "epoch": 0.3048850142792725, + "flos": 26289493038720.0, + "grad_norm": 1.8750056174865726, + "language_loss": 0.81683183, + "learning_rate": 3.150984013390995e-06, + "loss": 0.83799082, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.56640625, + "step": 5071, + "time_per_iteration": 2.447324275970459 + }, + { + "auxiliary_loss_clip": 0.01080769, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.01735187, + "balance_loss_mlp": 1.02393794, + "epoch": 0.30494513753194047, + "flos": 22345145677440.0, + "grad_norm": 4.689007473164586, + "language_loss": 0.75940239, + "learning_rate": 3.1506750069580916e-06, + "loss": 0.78055316, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5703125, + "step": 5072, + "time_per_iteration": 3.8788299560546875 + }, + { + "auxiliary_loss_clip": 0.01078904, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.02013671, + "balance_loss_mlp": 1.02535915, + "epoch": 0.30500526078460843, + "flos": 19536627444480.0, + "grad_norm": 1.8412488787990688, + "language_loss": 0.71410644, + "learning_rate": 3.150365959460556e-06, + "loss": 0.7352612, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53515625, + "step": 5073, + "time_per_iteration": 2.433459997177124 + }, + { + "auxiliary_loss_clip": 0.01083685, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.01834738, + "balance_loss_mlp": 1.02588344, + "epoch": 0.3050653840372764, + "flos": 14464445656320.0, + "grad_norm": 2.3076210393906336, + "language_loss": 0.76898336, + "learning_rate": 3.150056870909419e-06, + "loss": 0.79017282, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.578125, + "step": 5074, + "time_per_iteration": 2.36505126953125 + }, + { + "auxiliary_loss_clip": 0.01079156, + "auxiliary_loss_mlp": 0.01035008, + "balance_loss_clip": 1.01960599, + "balance_loss_mlp": 1.02561998, + "epoch": 0.30512550728994436, + "flos": 24242640226560.0, + "grad_norm": 1.8174893898110396, + "language_loss": 0.74389237, + "learning_rate": 3.1497477413157107e-06, + "loss": 0.76503408, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53515625, + "step": 5075, + "time_per_iteration": 3.7757163047790527 + }, + { + "auxiliary_loss_clip": 0.01082864, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.01577902, + "balance_loss_mlp": 1.02590251, + "epoch": 0.30518563054261233, + "flos": 16359670967040.0, + "grad_norm": 1.9048589054866303, + "language_loss": 0.75819647, + "learning_rate": 3.1494385706904625e-06, + "loss": 0.77936113, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5703125, + "step": 5076, + "time_per_iteration": 2.3512556552886963 + }, + { + "auxiliary_loss_clip": 0.01082248, + "auxiliary_loss_mlp": 0.01034118, + "balance_loss_clip": 1.01760733, + "balance_loss_mlp": 1.02683091, + "epoch": 0.30524575379528035, + "flos": 21578522843520.0, + "grad_norm": 2.3187040054924184, + "language_loss": 0.79411626, + "learning_rate": 3.149129359044709e-06, + "loss": 0.8152799, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5546875, + "step": 5077, + "time_per_iteration": 3.752131462097168 + }, + { + "auxiliary_loss_clip": 0.01076754, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.01142287, + "balance_loss_mlp": 1.02505755, + "epoch": 0.3053058770479483, + "flos": 16544291569920.0, + "grad_norm": 1.8015275940502062, + "language_loss": 0.74522299, + "learning_rate": 3.148820106389484e-06, + "loss": 0.76626241, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.515625, + "step": 5078, + "time_per_iteration": 2.3557209968566895 + }, + { + "auxiliary_loss_clip": 0.01075746, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.01947069, + "balance_loss_mlp": 1.02324128, + "epoch": 0.3053660003006163, + "flos": 12312085115520.0, + "grad_norm": 2.0557977823221494, + "language_loss": 0.66679752, + "learning_rate": 3.1485108127358246e-06, + "loss": 0.68790823, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5234375, + "step": 5079, + "time_per_iteration": 2.3630056381225586 + }, + { + "auxiliary_loss_clip": 0.01079402, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.01822281, + "balance_loss_mlp": 1.0249331, + "epoch": 0.30542612355328425, + "flos": 23111175018240.0, + "grad_norm": 2.3169130925395263, + "language_loss": 0.79645264, + "learning_rate": 3.1482014780947693e-06, + "loss": 0.81758577, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.546875, + "step": 5080, + "time_per_iteration": 2.3906824588775635 + }, + { + "auxiliary_loss_clip": 0.01078595, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.02320099, + "balance_loss_mlp": 1.02523494, + "epoch": 0.3054862468059522, + "flos": 24388297885440.0, + "grad_norm": 2.1723114008830637, + "language_loss": 0.80782568, + "learning_rate": 3.147892102477356e-06, + "loss": 0.82899398, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53125, + "step": 5081, + "time_per_iteration": 3.762012004852295 + }, + { + "auxiliary_loss_clip": 0.01077588, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.01792085, + "balance_loss_mlp": 1.0235467, + "epoch": 0.3055463700586202, + "flos": 29384857935360.0, + "grad_norm": 1.8994625477612541, + "language_loss": 0.72201818, + "learning_rate": 3.147582685894627e-06, + "loss": 0.74313563, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5390625, + "step": 5082, + "time_per_iteration": 2.432049512863159 + }, + { + "auxiliary_loss_clip": 0.010815, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01512098, + "balance_loss_mlp": 1.02457976, + "epoch": 0.30560649331128814, + "flos": 25590636887040.0, + "grad_norm": 1.869577299281497, + "language_loss": 0.73351568, + "learning_rate": 3.1472732283576226e-06, + "loss": 0.75465655, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.56640625, + "step": 5083, + "time_per_iteration": 2.410402297973633 + }, + { + "auxiliary_loss_clip": 0.01079138, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.01243687, + "balance_loss_mlp": 1.0233649, + "epoch": 0.3056666165639561, + "flos": 19127515599360.0, + "grad_norm": 1.8073700407937934, + "language_loss": 0.71143746, + "learning_rate": 3.146963729877389e-06, + "loss": 0.73251641, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.55859375, + "step": 5084, + "time_per_iteration": 2.3652381896972656 + }, + { + "auxiliary_loss_clip": 0.01080707, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01336694, + "balance_loss_mlp": 1.02467549, + "epoch": 0.30572673981662407, + "flos": 15522942389760.0, + "grad_norm": 1.742273901997806, + "language_loss": 0.78633082, + "learning_rate": 3.1466541904649698e-06, + "loss": 0.80743825, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5625, + "step": 5085, + "time_per_iteration": 2.380459785461426 + }, + { + "auxiliary_loss_clip": 0.01075721, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01443315, + "balance_loss_mlp": 1.02337861, + "epoch": 0.30578686306929204, + "flos": 21505484545920.0, + "grad_norm": 2.047968027875394, + "language_loss": 0.78112531, + "learning_rate": 3.1463446101314118e-06, + "loss": 0.80218637, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5234375, + "step": 5086, + "time_per_iteration": 2.3850786685943604 + }, + { + "auxiliary_loss_clip": 0.01078415, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.01820946, + "balance_loss_mlp": 1.02396393, + "epoch": 0.30584698632196, + "flos": 20953368305280.0, + "grad_norm": 1.8084602686651856, + "language_loss": 0.76741385, + "learning_rate": 3.1460349888877645e-06, + "loss": 0.78854293, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.546875, + "step": 5087, + "time_per_iteration": 2.3894331455230713 + }, + { + "auxiliary_loss_clip": 0.01080381, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.01379085, + "balance_loss_mlp": 1.0240941, + "epoch": 0.30590710957462797, + "flos": 24679962316800.0, + "grad_norm": 2.484657609352947, + "language_loss": 0.72525501, + "learning_rate": 3.1457253267450756e-06, + "loss": 0.74637347, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.5625, + "step": 5088, + "time_per_iteration": 2.4178121089935303 + }, + { + "auxiliary_loss_clip": 0.01081955, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.01588249, + "balance_loss_mlp": 1.02458477, + "epoch": 0.30596723282729593, + "flos": 17086108959360.0, + "grad_norm": 2.226747048899952, + "language_loss": 0.79293025, + "learning_rate": 3.145415623714397e-06, + "loss": 0.81407434, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5703125, + "step": 5089, + "time_per_iteration": 2.373013496398926 + }, + { + "auxiliary_loss_clip": 0.01078708, + "auxiliary_loss_mlp": 0.01035366, + "balance_loss_clip": 1.01964855, + "balance_loss_mlp": 1.02459717, + "epoch": 0.30602735607996395, + "flos": 22855994824320.0, + "grad_norm": 1.6334420631479762, + "language_loss": 0.76835668, + "learning_rate": 3.145105879806781e-06, + "loss": 0.78949744, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5390625, + "step": 5090, + "time_per_iteration": 2.3933351039886475 + }, + { + "auxiliary_loss_clip": 0.01084782, + "auxiliary_loss_mlp": 0.01038957, + "balance_loss_clip": 1.0215404, + "balance_loss_mlp": 1.02664232, + "epoch": 0.3060874793326319, + "flos": 29860200362880.0, + "grad_norm": 1.789059172379312, + "language_loss": 0.76110858, + "learning_rate": 3.144796095033282e-06, + "loss": 0.78234595, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.58203125, + "step": 5091, + "time_per_iteration": 2.446603775024414 + }, + { + "auxiliary_loss_clip": 0.01082083, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.01585746, + "balance_loss_mlp": 1.02568531, + "epoch": 0.3061476025852999, + "flos": 20447546394240.0, + "grad_norm": 4.361862532620491, + "language_loss": 0.71736872, + "learning_rate": 3.1444862694049548e-06, + "loss": 0.73850983, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.56640625, + "step": 5092, + "time_per_iteration": 2.392932891845703 + }, + { + "auxiliary_loss_clip": 0.01077351, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.01486683, + "balance_loss_mlp": 1.02375352, + "epoch": 0.30620772583796785, + "flos": 19390446115200.0, + "grad_norm": 2.0449167330303504, + "language_loss": 0.82672793, + "learning_rate": 3.144176402932857e-06, + "loss": 0.84780663, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5390625, + "step": 5093, + "time_per_iteration": 2.3958449363708496 + }, + { + "auxiliary_loss_clip": 0.01079465, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.01837611, + "balance_loss_mlp": 1.02562737, + "epoch": 0.3062678490906358, + "flos": 24023420599680.0, + "grad_norm": 1.7334995034031653, + "language_loss": 0.74549633, + "learning_rate": 3.143866495628046e-06, + "loss": 0.76662397, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5390625, + "step": 5094, + "time_per_iteration": 2.4137465953826904 + }, + { + "auxiliary_loss_clip": 0.0107656, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.01611078, + "balance_loss_mlp": 1.02357149, + "epoch": 0.3063279723433038, + "flos": 19753647655680.0, + "grad_norm": 1.8931123445568476, + "language_loss": 0.75921643, + "learning_rate": 3.1435565475015827e-06, + "loss": 0.78029549, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53125, + "step": 5095, + "time_per_iteration": 2.3703372478485107 + }, + { + "auxiliary_loss_clip": 0.01076011, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.01668847, + "balance_loss_mlp": 1.02395868, + "epoch": 0.30638809559597174, + "flos": 22449082394880.0, + "grad_norm": 2.143575018854079, + "language_loss": 0.80498981, + "learning_rate": 3.143246558564528e-06, + "loss": 0.82607299, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51953125, + "step": 5096, + "time_per_iteration": 2.387840509414673 + }, + { + "auxiliary_loss_clip": 0.01076872, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.0137099, + "balance_loss_mlp": 1.02281272, + "epoch": 0.3064482188486397, + "flos": 17164209404160.0, + "grad_norm": 3.3072272142460304, + "language_loss": 0.8172816, + "learning_rate": 3.1429365288279437e-06, + "loss": 0.83834636, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5390625, + "step": 5097, + "time_per_iteration": 2.352224826812744 + }, + { + "auxiliary_loss_clip": 0.0107899, + "auxiliary_loss_mlp": 0.01035665, + "balance_loss_clip": 1.01889229, + "balance_loss_mlp": 1.02406096, + "epoch": 0.3065083421013077, + "flos": 23767367621760.0, + "grad_norm": 2.10298827540818, + "language_loss": 0.78092313, + "learning_rate": 3.142626458302895e-06, + "loss": 0.80206966, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.546875, + "step": 5098, + "time_per_iteration": 2.3999123573303223 + }, + { + "auxiliary_loss_clip": 0.01077848, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.01379156, + "balance_loss_mlp": 1.02454019, + "epoch": 0.30656846535397564, + "flos": 26430647132160.0, + "grad_norm": 1.7533503981649752, + "language_loss": 0.84115088, + "learning_rate": 3.1423163470004473e-06, + "loss": 0.86222291, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.53125, + "step": 5099, + "time_per_iteration": 2.4063992500305176 + }, + { + "auxiliary_loss_clip": 0.01079286, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02019453, + "balance_loss_mlp": 1.02415967, + "epoch": 0.3066285886066436, + "flos": 26650564986240.0, + "grad_norm": 1.5899167638724356, + "language_loss": 0.8568939, + "learning_rate": 3.1420061949316676e-06, + "loss": 0.87805367, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.55078125, + "step": 5100, + "time_per_iteration": 2.4158122539520264 + }, + { + "auxiliary_loss_clip": 0.01076121, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.01454246, + "balance_loss_mlp": 1.02231455, + "epoch": 0.30668871185931157, + "flos": 15049031328000.0, + "grad_norm": 1.888937394141001, + "language_loss": 0.79053181, + "learning_rate": 3.141696002107624e-06, + "loss": 0.81160194, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5390625, + "step": 5101, + "time_per_iteration": 2.359696626663208 + }, + { + "auxiliary_loss_clip": 0.01079749, + "auxiliary_loss_mlp": 0.01038396, + "balance_loss_clip": 1.02103889, + "balance_loss_mlp": 1.0245868, + "epoch": 0.30674883511197953, + "flos": 20081133008640.0, + "grad_norm": 1.6623602981726566, + "language_loss": 0.80277586, + "learning_rate": 3.1413857685393873e-06, + "loss": 0.82395732, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.55078125, + "step": 5102, + "time_per_iteration": 2.3827927112579346 + }, + { + "auxiliary_loss_clip": 0.01081269, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.01548934, + "balance_loss_mlp": 1.02632546, + "epoch": 0.30680895836464755, + "flos": 22892688529920.0, + "grad_norm": 1.9847136357214878, + "language_loss": 0.88668692, + "learning_rate": 3.1410754942380287e-06, + "loss": 0.90782547, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.55078125, + "step": 5103, + "time_per_iteration": 2.3850743770599365 + }, + { + "auxiliary_loss_clip": 0.01077684, + "auxiliary_loss_mlp": 0.01033489, + "balance_loss_clip": 1.01766443, + "balance_loss_mlp": 1.02336264, + "epoch": 0.3068690816173155, + "flos": 23695027551360.0, + "grad_norm": 7.294134692163468, + "language_loss": 0.7403397, + "learning_rate": 3.1407651792146204e-06, + "loss": 0.76145148, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5390625, + "step": 5104, + "time_per_iteration": 2.390892505645752 + }, + { + "auxiliary_loss_clip": 0.01079635, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.01887, + "balance_loss_mlp": 1.02392173, + "epoch": 0.3069292048699835, + "flos": 23549893562880.0, + "grad_norm": 2.122087033959047, + "language_loss": 0.72853553, + "learning_rate": 3.1404548234802376e-06, + "loss": 0.74968719, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.55859375, + "step": 5105, + "time_per_iteration": 2.3908679485321045 + }, + { + "auxiliary_loss_clip": 0.01081192, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.01637292, + "balance_loss_mlp": 1.02452767, + "epoch": 0.30698932812265145, + "flos": 24530604053760.0, + "grad_norm": 1.7685048479415502, + "language_loss": 0.8725996, + "learning_rate": 3.140144427045955e-06, + "loss": 0.89374506, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.56640625, + "step": 5106, + "time_per_iteration": 2.426076650619507 + }, + { + "auxiliary_loss_clip": 0.01081552, + "auxiliary_loss_mlp": 0.01040172, + "balance_loss_clip": 1.02134919, + "balance_loss_mlp": 1.02422607, + "epoch": 0.3070494513753194, + "flos": 20995368537600.0, + "grad_norm": 2.4336028654453337, + "language_loss": 0.71578121, + "learning_rate": 3.1398339899228512e-06, + "loss": 0.73699844, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.5703125, + "step": 5107, + "time_per_iteration": 2.3681674003601074 + }, + { + "auxiliary_loss_clip": 0.01077889, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.02060497, + "balance_loss_mlp": 1.02384579, + "epoch": 0.3071095746279874, + "flos": 19024940424960.0, + "grad_norm": 2.2126667843796572, + "language_loss": 0.72114658, + "learning_rate": 3.139523512122005e-06, + "loss": 0.74228561, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5390625, + "step": 5108, + "time_per_iteration": 2.372995615005493 + }, + { + "auxiliary_loss_clip": 0.01077898, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.01489043, + "balance_loss_mlp": 1.02421427, + "epoch": 0.30716969788065535, + "flos": 21214448519040.0, + "grad_norm": 2.075612682080276, + "language_loss": 0.85682523, + "learning_rate": 3.1392129936544947e-06, + "loss": 0.87790871, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5390625, + "step": 5109, + "time_per_iteration": 2.3912501335144043 + }, + { + "auxiliary_loss_clip": 0.01077607, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01360869, + "balance_loss_mlp": 1.02277243, + "epoch": 0.3072298211333233, + "flos": 25771661619840.0, + "grad_norm": 1.6016735539815112, + "language_loss": 0.6779108, + "learning_rate": 3.1389024345314033e-06, + "loss": 0.69897318, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.546875, + "step": 5110, + "time_per_iteration": 2.429046392440796 + }, + { + "auxiliary_loss_clip": 0.01076413, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01363373, + "balance_loss_mlp": 1.02355778, + "epoch": 0.3072899443859913, + "flos": 25847737205760.0, + "grad_norm": 1.4264155370175726, + "language_loss": 0.7684719, + "learning_rate": 3.1385918347638142e-06, + "loss": 0.78952235, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.52734375, + "step": 5111, + "time_per_iteration": 2.4254255294799805 + }, + { + "auxiliary_loss_clip": 0.01078433, + "auxiliary_loss_mlp": 0.01031635, + "balance_loss_clip": 1.01573277, + "balance_loss_mlp": 1.02413774, + "epoch": 0.30735006763865924, + "flos": 25921578464640.0, + "grad_norm": 2.7580918109065435, + "language_loss": 0.66969657, + "learning_rate": 3.1382811943628107e-06, + "loss": 0.69079721, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.54296875, + "step": 5112, + "time_per_iteration": 3.832089424133301 + }, + { + "auxiliary_loss_clip": 0.01081321, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.02080607, + "balance_loss_mlp": 1.02410853, + "epoch": 0.3074101908913272, + "flos": 30915764542080.0, + "grad_norm": 3.249391851251878, + "language_loss": 0.79351687, + "learning_rate": 3.1379705133394793e-06, + "loss": 0.81470919, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5703125, + "step": 5113, + "time_per_iteration": 2.468846082687378 + }, + { + "auxiliary_loss_clip": 0.01077078, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.02079749, + "balance_loss_mlp": 1.02224755, + "epoch": 0.30747031414399517, + "flos": 18400204823040.0, + "grad_norm": 2.412408745969876, + "language_loss": 0.76614761, + "learning_rate": 3.1376597917049084e-06, + "loss": 0.7872777, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.546875, + "step": 5114, + "time_per_iteration": 3.7922019958496094 + }, + { + "auxiliary_loss_clip": 0.01080715, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.01314354, + "balance_loss_mlp": 1.02512193, + "epoch": 0.30753043739666314, + "flos": 22632201809280.0, + "grad_norm": 1.7376376609500717, + "language_loss": 0.62275064, + "learning_rate": 3.1373490294701853e-06, + "loss": 0.64385897, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5546875, + "step": 5115, + "time_per_iteration": 2.385173797607422 + }, + { + "auxiliary_loss_clip": 0.01080656, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.01310694, + "balance_loss_mlp": 1.02506423, + "epoch": 0.3075905606493311, + "flos": 27342857802240.0, + "grad_norm": 1.7905529204636104, + "language_loss": 0.83087099, + "learning_rate": 3.1370382266464007e-06, + "loss": 0.85196483, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5546875, + "step": 5116, + "time_per_iteration": 2.42085862159729 + }, + { + "auxiliary_loss_clip": 0.01076563, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.02163982, + "balance_loss_mlp": 1.02344823, + "epoch": 0.3076506839019991, + "flos": 22089721104000.0, + "grad_norm": 1.9990810770592031, + "language_loss": 0.75865328, + "learning_rate": 3.136727383244647e-06, + "loss": 0.77979636, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.53125, + "step": 5117, + "time_per_iteration": 3.773503541946411 + }, + { + "auxiliary_loss_clip": 0.01080605, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.01538754, + "balance_loss_mlp": 1.02455497, + "epoch": 0.3077108071546671, + "flos": 21288429423360.0, + "grad_norm": 2.2087428475027737, + "language_loss": 0.71562529, + "learning_rate": 3.136416499276017e-06, + "loss": 0.73674774, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55859375, + "step": 5118, + "time_per_iteration": 2.3758347034454346 + }, + { + "auxiliary_loss_clip": 0.01080015, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.01220727, + "balance_loss_mlp": 1.02541113, + "epoch": 0.30777093040733505, + "flos": 21430002453120.0, + "grad_norm": 1.5744646214494442, + "language_loss": 0.74403101, + "learning_rate": 3.136105574751605e-06, + "loss": 0.76511252, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.546875, + "step": 5119, + "time_per_iteration": 2.3935563564300537 + }, + { + "auxiliary_loss_clip": 0.01082104, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.0266521, + "balance_loss_mlp": 1.025195, + "epoch": 0.307831053660003, + "flos": 23148148014720.0, + "grad_norm": 9.304577538731047, + "language_loss": 0.80417204, + "learning_rate": 3.135794609682508e-06, + "loss": 0.82543057, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5703125, + "step": 5120, + "time_per_iteration": 3.8386504650115967 + }, + { + "auxiliary_loss_clip": 0.01076532, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.02130711, + "balance_loss_mlp": 1.02279353, + "epoch": 0.307891176912671, + "flos": 17018796124800.0, + "grad_norm": 2.262056870766989, + "language_loss": 0.80122209, + "learning_rate": 3.135483604079823e-06, + "loss": 0.8223536, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5390625, + "step": 5121, + "time_per_iteration": 2.3527040481567383 + }, + { + "auxiliary_loss_clip": 0.0107719, + "auxiliary_loss_mlp": 0.01027376, + "balance_loss_clip": 1.01165867, + "balance_loss_mlp": 1.02393687, + "epoch": 0.30795130016533895, + "flos": 27703929749760.0, + "grad_norm": 1.4726564421949742, + "language_loss": 0.8263222, + "learning_rate": 3.1351725579546484e-06, + "loss": 0.84736788, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 5122, + "time_per_iteration": 2.42997407913208 + }, + { + "auxiliary_loss_clip": 0.01019117, + "auxiliary_loss_mlp": 0.0101245, + "balance_loss_clip": 1.01079261, + "balance_loss_mlp": 1.00643384, + "epoch": 0.3080114234180069, + "flos": 69054987294720.0, + "grad_norm": 0.8950464042382977, + "language_loss": 0.58587706, + "learning_rate": 3.134861471318086e-06, + "loss": 0.60619271, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.01660156, + "router_z_loss_mlp": 0.12695312, + "step": 5123, + "time_per_iteration": 3.202012062072754 + }, + { + "auxiliary_loss_clip": 0.01079796, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.01499224, + "balance_loss_mlp": 1.02367806, + "epoch": 0.3080715466706749, + "flos": 24059101875840.0, + "grad_norm": 2.1068432353962487, + "language_loss": 0.83148974, + "learning_rate": 3.1345503441812357e-06, + "loss": 0.85261643, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5625, + "step": 5124, + "time_per_iteration": 2.4288156032562256 + }, + { + "auxiliary_loss_clip": 0.01082191, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.01534677, + "balance_loss_mlp": 1.02525115, + "epoch": 0.30813166992334284, + "flos": 25847492826240.0, + "grad_norm": 1.8736936207917514, + "language_loss": 0.79029876, + "learning_rate": 3.1342391765552032e-06, + "loss": 0.81143409, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5703125, + "step": 5125, + "time_per_iteration": 2.4141275882720947 + }, + { + "auxiliary_loss_clip": 0.01081219, + "auxiliary_loss_mlp": 0.01035264, + "balance_loss_clip": 1.01808608, + "balance_loss_mlp": 1.02440333, + "epoch": 0.3081917931760108, + "flos": 20448558823680.0, + "grad_norm": 1.8872369700136113, + "language_loss": 0.64191848, + "learning_rate": 3.1339279684510916e-06, + "loss": 0.66308331, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5703125, + "step": 5126, + "time_per_iteration": 2.366394519805908 + }, + { + "auxiliary_loss_clip": 0.01077885, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.0156908, + "balance_loss_mlp": 1.02465594, + "epoch": 0.3082519164286788, + "flos": 22165098462720.0, + "grad_norm": 2.7576994911302997, + "language_loss": 0.86689198, + "learning_rate": 3.1336167198800072e-06, + "loss": 0.88797724, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.53125, + "step": 5127, + "time_per_iteration": 2.3918304443359375 + }, + { + "auxiliary_loss_clip": 0.01077635, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.01923609, + "balance_loss_mlp": 1.02379274, + "epoch": 0.30831203968134674, + "flos": 28912133859840.0, + "grad_norm": 1.92067605870235, + "language_loss": 0.68613291, + "learning_rate": 3.133305430853059e-06, + "loss": 0.707268, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5390625, + "step": 5128, + "time_per_iteration": 2.4436962604522705 + }, + { + "auxiliary_loss_clip": 0.01080492, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.01841998, + "balance_loss_mlp": 1.02417898, + "epoch": 0.3083721629340147, + "flos": 25666503004800.0, + "grad_norm": 1.751935711431384, + "language_loss": 0.71321845, + "learning_rate": 3.132994101381354e-06, + "loss": 0.73437083, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5625, + "step": 5129, + "time_per_iteration": 2.4182329177856445 + }, + { + "auxiliary_loss_clip": 0.01017501, + "auxiliary_loss_mlp": 0.0100063, + "balance_loss_clip": 0.99912214, + "balance_loss_mlp": 1.00459743, + "epoch": 0.3084322861866827, + "flos": 68209181763840.0, + "grad_norm": 0.8376185662338888, + "language_loss": 0.59226048, + "learning_rate": 3.132682731476005e-06, + "loss": 0.61244178, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.12890625, + "step": 5130, + "time_per_iteration": 2.9858717918395996 + }, + { + "auxiliary_loss_clip": 0.01080878, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.01668143, + "balance_loss_mlp": 1.02439547, + "epoch": 0.3084924094393507, + "flos": 20295639601920.0, + "grad_norm": 2.6875767153783348, + "language_loss": 0.72677922, + "learning_rate": 3.1323713211481227e-06, + "loss": 0.74792731, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5625, + "step": 5131, + "time_per_iteration": 2.373452663421631 + }, + { + "auxiliary_loss_clip": 0.01079323, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.01579165, + "balance_loss_mlp": 1.02553153, + "epoch": 0.30855253269201866, + "flos": 23948741468160.0, + "grad_norm": 5.220404308141966, + "language_loss": 0.71094596, + "learning_rate": 3.1320598704088204e-06, + "loss": 0.73204744, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5390625, + "step": 5132, + "time_per_iteration": 2.4195947647094727 + }, + { + "auxiliary_loss_clip": 0.01076501, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.01643753, + "balance_loss_mlp": 1.02521384, + "epoch": 0.3086126559446866, + "flos": 19280853757440.0, + "grad_norm": 1.9599700961515136, + "language_loss": 0.90192401, + "learning_rate": 3.1317483792692136e-06, + "loss": 0.92299753, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.51171875, + "step": 5133, + "time_per_iteration": 2.385739326477051 + }, + { + "auxiliary_loss_clip": 0.01081413, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.0222894, + "balance_loss_mlp": 1.02509832, + "epoch": 0.3086727791973546, + "flos": 33759510203520.0, + "grad_norm": 1.7427786961028637, + "language_loss": 0.67729735, + "learning_rate": 3.131436847740418e-06, + "loss": 0.69851029, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5625, + "step": 5134, + "time_per_iteration": 2.5005271434783936 + }, + { + "auxiliary_loss_clip": 0.01082007, + "auxiliary_loss_mlp": 0.01029308, + "balance_loss_clip": 1.01270247, + "balance_loss_mlp": 1.02506828, + "epoch": 0.30873290245002255, + "flos": 16033232954880.0, + "grad_norm": 1.975231147235075, + "language_loss": 0.82983732, + "learning_rate": 3.1311252758335523e-06, + "loss": 0.85095048, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5703125, + "step": 5135, + "time_per_iteration": 2.3567006587982178 + }, + { + "auxiliary_loss_clip": 0.0101515, + "auxiliary_loss_mlp": 0.01000613, + "balance_loss_clip": 0.99915224, + "balance_loss_mlp": 1.00262022, + "epoch": 0.3087930257026905, + "flos": 65044618819200.0, + "grad_norm": 0.7082331396063813, + "language_loss": 0.5525611, + "learning_rate": 3.130813663559735e-06, + "loss": 0.57271874, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.125, + "step": 5136, + "time_per_iteration": 3.045793056488037 + }, + { + "auxiliary_loss_clip": 0.01078779, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.01882315, + "balance_loss_mlp": 1.02354622, + "epoch": 0.3088531489553585, + "flos": 74736301173120.0, + "grad_norm": 2.646118738623355, + "language_loss": 0.7619133, + "learning_rate": 3.130502010930087e-06, + "loss": 0.78305262, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5546875, + "step": 5137, + "time_per_iteration": 2.768284797668457 + }, + { + "auxiliary_loss_clip": 0.01077417, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.01478219, + "balance_loss_mlp": 1.02478099, + "epoch": 0.30891327220802645, + "flos": 21141235664640.0, + "grad_norm": 1.9320321602761175, + "language_loss": 0.79881501, + "learning_rate": 3.1301903179557293e-06, + "loss": 0.81988001, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.52734375, + "step": 5138, + "time_per_iteration": 2.3786003589630127 + }, + { + "auxiliary_loss_clip": 0.01078554, + "auxiliary_loss_mlp": 0.01027531, + "balance_loss_clip": 1.01010287, + "balance_loss_mlp": 1.0240829, + "epoch": 0.3089733954606944, + "flos": 25663360982400.0, + "grad_norm": 1.844342245454365, + "language_loss": 0.81375891, + "learning_rate": 3.1298785846477868e-06, + "loss": 0.83481979, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.546875, + "step": 5139, + "time_per_iteration": 2.419050693511963 + }, + { + "auxiliary_loss_clip": 0.01083194, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.0163784, + "balance_loss_mlp": 1.02489758, + "epoch": 0.3090335187133624, + "flos": 19426336859520.0, + "grad_norm": 2.0333308420387244, + "language_loss": 0.77431548, + "learning_rate": 3.129566811017384e-06, + "loss": 0.79549295, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.58203125, + "step": 5140, + "time_per_iteration": 2.3688924312591553 + }, + { + "auxiliary_loss_clip": 0.0107667, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.01797056, + "balance_loss_mlp": 1.02471864, + "epoch": 0.30909364196603034, + "flos": 20010294126720.0, + "grad_norm": 1.6614967973781085, + "language_loss": 0.78510714, + "learning_rate": 3.1292549970756476e-06, + "loss": 0.80620718, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.51953125, + "step": 5141, + "time_per_iteration": 2.3845062255859375 + }, + { + "auxiliary_loss_clip": 0.01080276, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.01422119, + "balance_loss_mlp": 1.02444828, + "epoch": 0.3091537652186983, + "flos": 19676699285760.0, + "grad_norm": 2.058116365027092, + "language_loss": 0.86921352, + "learning_rate": 3.128943142833705e-06, + "loss": 0.89032161, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.55859375, + "step": 5142, + "time_per_iteration": 2.3750782012939453 + }, + { + "auxiliary_loss_clip": 0.01079277, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.01983297, + "balance_loss_mlp": 1.02361512, + "epoch": 0.3092138884713663, + "flos": 17019075415680.0, + "grad_norm": 1.8896357536378163, + "language_loss": 0.78801435, + "learning_rate": 3.128631248302686e-06, + "loss": 0.80916965, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5546875, + "step": 5143, + "time_per_iteration": 2.369886636734009 + }, + { + "auxiliary_loss_clip": 0.01077859, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.01674676, + "balance_loss_mlp": 1.02326035, + "epoch": 0.3092740117240343, + "flos": 25008809212800.0, + "grad_norm": 1.7947530757302053, + "language_loss": 0.72192693, + "learning_rate": 3.12831931349372e-06, + "loss": 0.7430414, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.546875, + "step": 5144, + "time_per_iteration": 2.4250173568725586 + }, + { + "auxiliary_loss_clip": 0.01080226, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.01883709, + "balance_loss_mlp": 1.02430296, + "epoch": 0.30933413497670226, + "flos": 25589310255360.0, + "grad_norm": 5.496147825587017, + "language_loss": 0.73840511, + "learning_rate": 3.128007338417941e-06, + "loss": 0.75955832, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.55859375, + "step": 5145, + "time_per_iteration": 2.4251694679260254 + }, + { + "auxiliary_loss_clip": 0.01080089, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.01983047, + "balance_loss_mlp": 1.02473593, + "epoch": 0.3093942582293702, + "flos": 24388507353600.0, + "grad_norm": 1.7758187332346287, + "language_loss": 0.75967741, + "learning_rate": 3.127695323086481e-06, + "loss": 0.78083897, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5546875, + "step": 5146, + "time_per_iteration": 2.4361259937286377 + }, + { + "auxiliary_loss_clip": 0.01076616, + "auxiliary_loss_mlp": 0.0102819, + "balance_loss_clip": 1.01258004, + "balance_loss_mlp": 1.02296495, + "epoch": 0.3094543814820382, + "flos": 19645416840960.0, + "grad_norm": 1.8186994298447199, + "language_loss": 0.66443276, + "learning_rate": 3.1273832675104766e-06, + "loss": 0.68548083, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 5147, + "time_per_iteration": 2.3907363414764404 + }, + { + "auxiliary_loss_clip": 0.01081732, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.02046752, + "balance_loss_mlp": 1.02510178, + "epoch": 0.30951450473470615, + "flos": 25662697666560.0, + "grad_norm": 1.750283828070761, + "language_loss": 0.74403429, + "learning_rate": 3.1270711717010623e-06, + "loss": 0.76520926, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.56640625, + "step": 5148, + "time_per_iteration": 2.4220266342163086 + }, + { + "auxiliary_loss_clip": 0.01084041, + "auxiliary_loss_mlp": 0.01040437, + "balance_loss_clip": 1.02317524, + "balance_loss_mlp": 1.02491403, + "epoch": 0.3095746279873741, + "flos": 12019617722880.0, + "grad_norm": 6.035270152315335, + "language_loss": 0.72642064, + "learning_rate": 3.126759035669378e-06, + "loss": 0.74766546, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.59375, + "step": 5149, + "time_per_iteration": 2.322463035583496 + }, + { + "auxiliary_loss_clip": 0.01079395, + "auxiliary_loss_mlp": 0.01039494, + "balance_loss_clip": 1.02232814, + "balance_loss_mlp": 1.02319908, + "epoch": 0.3096347512400421, + "flos": 23621919431040.0, + "grad_norm": 1.7128054683454688, + "language_loss": 0.84999681, + "learning_rate": 3.1264468594265612e-06, + "loss": 0.87118566, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5625, + "step": 5150, + "time_per_iteration": 2.3952243328094482 + }, + { + "auxiliary_loss_clip": 0.0107901, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.01827121, + "balance_loss_mlp": 1.02312183, + "epoch": 0.30969487449271005, + "flos": 22528195269120.0, + "grad_norm": 1.8247627657633652, + "language_loss": 0.78886694, + "learning_rate": 3.126134642983754e-06, + "loss": 0.81001163, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.55859375, + "step": 5151, + "time_per_iteration": 3.798525333404541 + }, + { + "auxiliary_loss_clip": 0.01082234, + "auxiliary_loss_mlp": 0.01036797, + "balance_loss_clip": 1.02034593, + "balance_loss_mlp": 1.02549887, + "epoch": 0.309754997745378, + "flos": 15267029057280.0, + "grad_norm": 1.886276959709869, + "language_loss": 0.66999632, + "learning_rate": 3.125822386352098e-06, + "loss": 0.69118667, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.56640625, + "step": 5152, + "time_per_iteration": 2.3564295768737793 + }, + { + "auxiliary_loss_clip": 0.01078455, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.01430285, + "balance_loss_mlp": 1.02357638, + "epoch": 0.309815120998046, + "flos": 26978085250560.0, + "grad_norm": 2.0609214740170243, + "language_loss": 0.82165974, + "learning_rate": 3.1255100895427373e-06, + "loss": 0.84275174, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 5153, + "time_per_iteration": 2.414320468902588 + }, + { + "auxiliary_loss_clip": 0.01081617, + "auxiliary_loss_mlp": 0.0103522, + "balance_loss_clip": 1.01829267, + "balance_loss_mlp": 1.02683783, + "epoch": 0.30987524425071394, + "flos": 21142073537280.0, + "grad_norm": 1.7577864193391401, + "language_loss": 0.73961419, + "learning_rate": 3.1251977525668167e-06, + "loss": 0.76078254, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.546875, + "step": 5154, + "time_per_iteration": 3.8006789684295654 + }, + { + "auxiliary_loss_clip": 0.01080085, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.01333094, + "balance_loss_mlp": 1.02546859, + "epoch": 0.3099353675033819, + "flos": 15267378170880.0, + "grad_norm": 2.0899740514145404, + "language_loss": 0.75891566, + "learning_rate": 3.1248853754354824e-06, + "loss": 0.78000993, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5155, + "time_per_iteration": 2.361358642578125 + }, + { + "auxiliary_loss_clip": 0.01082867, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.01556349, + "balance_loss_mlp": 1.02651596, + "epoch": 0.30999549075604993, + "flos": 15412896184320.0, + "grad_norm": 1.7411115666002182, + "language_loss": 0.78394848, + "learning_rate": 3.1245729581598826e-06, + "loss": 0.80510342, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5625, + "step": 5156, + "time_per_iteration": 3.7453432083129883 + }, + { + "auxiliary_loss_clip": 0.01081073, + "auxiliary_loss_mlp": 0.01038307, + "balance_loss_clip": 1.02029407, + "balance_loss_mlp": 1.02402627, + "epoch": 0.3100556140087179, + "flos": 23183445265920.0, + "grad_norm": 2.132798793292484, + "language_loss": 0.75284863, + "learning_rate": 3.1242605007511664e-06, + "loss": 0.77404249, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5703125, + "step": 5157, + "time_per_iteration": 2.4060237407684326 + }, + { + "auxiliary_loss_clip": 0.01078916, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.01668811, + "balance_loss_mlp": 1.02377439, + "epoch": 0.31011573726138586, + "flos": 25740902845440.0, + "grad_norm": 1.5550964610521982, + "language_loss": 0.69124174, + "learning_rate": 3.1239480032204857e-06, + "loss": 0.71235549, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55078125, + "step": 5158, + "time_per_iteration": 2.426074504852295 + }, + { + "auxiliary_loss_clip": 0.01076075, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.01689982, + "balance_loss_mlp": 1.02291238, + "epoch": 0.3101758605140538, + "flos": 20010294126720.0, + "grad_norm": 2.0654928431009396, + "language_loss": 0.85362601, + "learning_rate": 3.123635465578991e-06, + "loss": 0.87471652, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.53125, + "step": 5159, + "time_per_iteration": 2.3895745277404785 + }, + { + "auxiliary_loss_clip": 0.01078274, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01403642, + "balance_loss_mlp": 1.02348506, + "epoch": 0.3102359837667218, + "flos": 19134672428160.0, + "grad_norm": 2.5436504189724385, + "language_loss": 0.84694105, + "learning_rate": 3.123322887837837e-06, + "loss": 0.86803138, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.546875, + "step": 5160, + "time_per_iteration": 3.718170642852783 + }, + { + "auxiliary_loss_clip": 0.01079781, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.01805365, + "balance_loss_mlp": 1.02562118, + "epoch": 0.31029610701938976, + "flos": 22264531614720.0, + "grad_norm": 4.096411625206555, + "language_loss": 0.7556901, + "learning_rate": 3.123010270008179e-06, + "loss": 0.77682686, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5390625, + "step": 5161, + "time_per_iteration": 2.3770389556884766 + }, + { + "auxiliary_loss_clip": 0.01080668, + "auxiliary_loss_mlp": 0.01036132, + "balance_loss_clip": 1.01968145, + "balance_loss_mlp": 1.02384794, + "epoch": 0.3103562302720577, + "flos": 20804533712640.0, + "grad_norm": 2.2722921416374873, + "language_loss": 0.81461251, + "learning_rate": 3.1226976121011734e-06, + "loss": 0.8357805, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5703125, + "step": 5162, + "time_per_iteration": 2.3431122303009033 + }, + { + "auxiliary_loss_clip": 0.01076793, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.0169183, + "balance_loss_mlp": 1.02361786, + "epoch": 0.3104163535247257, + "flos": 22343120818560.0, + "grad_norm": 1.6402388280776106, + "language_loss": 0.82766771, + "learning_rate": 3.1223849141279774e-06, + "loss": 0.84874785, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.53125, + "step": 5163, + "time_per_iteration": 2.387047529220581 + }, + { + "auxiliary_loss_clip": 0.01081891, + "auxiliary_loss_mlp": 0.01036125, + "balance_loss_clip": 1.01897073, + "balance_loss_mlp": 1.02526426, + "epoch": 0.31047647677739365, + "flos": 21688289758080.0, + "grad_norm": 2.5654516636234286, + "language_loss": 0.7619108, + "learning_rate": 3.1220721760997517e-06, + "loss": 0.78309095, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.56640625, + "step": 5164, + "time_per_iteration": 2.3888611793518066 + }, + { + "auxiliary_loss_clip": 0.01082237, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.01800752, + "balance_loss_mlp": 1.02623677, + "epoch": 0.3105366000300616, + "flos": 18916255762560.0, + "grad_norm": 2.100125460507144, + "language_loss": 0.74655926, + "learning_rate": 3.1217593980276554e-06, + "loss": 0.76772702, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5625, + "step": 5165, + "time_per_iteration": 2.375763177871704 + }, + { + "auxiliary_loss_clip": 0.01077283, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.01592183, + "balance_loss_mlp": 1.02380311, + "epoch": 0.3105967232827296, + "flos": 18259399843200.0, + "grad_norm": 1.4971185350030323, + "language_loss": 0.75919765, + "learning_rate": 3.1214465799228525e-06, + "loss": 0.78028589, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 5166, + "time_per_iteration": 2.363426685333252 + }, + { + "auxiliary_loss_clip": 0.01079001, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.01888227, + "balance_loss_mlp": 1.02397704, + "epoch": 0.31065684653539755, + "flos": 17671288124160.0, + "grad_norm": 2.2446587201714228, + "language_loss": 0.73668718, + "learning_rate": 3.121133721796505e-06, + "loss": 0.75783396, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.55078125, + "step": 5167, + "time_per_iteration": 2.340365409851074 + }, + { + "auxiliary_loss_clip": 0.01014494, + "auxiliary_loss_mlp": 0.01002894, + "balance_loss_clip": 1.00134408, + "balance_loss_mlp": 1.00219703, + "epoch": 0.3107169697880655, + "flos": 68528742238080.0, + "grad_norm": 0.7114608222550712, + "language_loss": 0.52947611, + "learning_rate": 3.1208208236597795e-06, + "loss": 0.54965001, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.12304688, + "step": 5168, + "time_per_iteration": 3.0846521854400635 + }, + { + "auxiliary_loss_clip": 0.01081284, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.01458335, + "balance_loss_mlp": 1.0247972, + "epoch": 0.3107770930407335, + "flos": 13187881370880.0, + "grad_norm": 31.9167680073535, + "language_loss": 0.786448, + "learning_rate": 3.1205078855238417e-06, + "loss": 0.80757707, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5625, + "step": 5169, + "time_per_iteration": 2.398637056350708 + }, + { + "auxiliary_loss_clip": 0.0107965, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.01420891, + "balance_loss_mlp": 1.02494442, + "epoch": 0.3108372162934015, + "flos": 31579393265280.0, + "grad_norm": 1.5191279813242082, + "language_loss": 0.64471245, + "learning_rate": 3.12019490739986e-06, + "loss": 0.66580755, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.546875, + "step": 5170, + "time_per_iteration": 2.4840941429138184 + }, + { + "auxiliary_loss_clip": 0.01083673, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.01763117, + "balance_loss_mlp": 1.02708578, + "epoch": 0.31089733954606946, + "flos": 28728595509120.0, + "grad_norm": 3.0215369841663513, + "language_loss": 0.77810049, + "learning_rate": 3.1198818892990037e-06, + "loss": 0.79928815, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.56640625, + "step": 5171, + "time_per_iteration": 2.4291341304779053 + }, + { + "auxiliary_loss_clip": 0.01082376, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.01727653, + "balance_loss_mlp": 1.02589393, + "epoch": 0.3109574627987374, + "flos": 19682215280640.0, + "grad_norm": 1.932997151229255, + "language_loss": 0.83597481, + "learning_rate": 3.1195688312324426e-06, + "loss": 0.85713863, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 5172, + "time_per_iteration": 2.390674352645874 + }, + { + "auxiliary_loss_clip": 0.01081184, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.01885414, + "balance_loss_mlp": 1.02526832, + "epoch": 0.3110175860514054, + "flos": 14683106701440.0, + "grad_norm": 2.085447486912353, + "language_loss": 0.83641905, + "learning_rate": 3.11925573321135e-06, + "loss": 0.85759509, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.55859375, + "step": 5173, + "time_per_iteration": 2.370365619659424 + }, + { + "auxiliary_loss_clip": 0.01077795, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.01478052, + "balance_loss_mlp": 1.0254494, + "epoch": 0.31107770930407336, + "flos": 25738459050240.0, + "grad_norm": 2.2856778103464555, + "language_loss": 0.83201587, + "learning_rate": 3.1189425952469003e-06, + "loss": 0.85309815, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5234375, + "step": 5174, + "time_per_iteration": 2.4165408611297607 + }, + { + "auxiliary_loss_clip": 0.01080246, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.01610303, + "balance_loss_mlp": 1.02588403, + "epoch": 0.3111378325567413, + "flos": 19207256878080.0, + "grad_norm": 2.5154940620519377, + "language_loss": 0.85075682, + "learning_rate": 3.1186294173502667e-06, + "loss": 0.87188578, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.54296875, + "step": 5175, + "time_per_iteration": 2.373990535736084 + }, + { + "auxiliary_loss_clip": 0.01081859, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.02126956, + "balance_loss_mlp": 1.02669513, + "epoch": 0.3111979558094093, + "flos": 23695237019520.0, + "grad_norm": 1.582127669238255, + "language_loss": 0.83314329, + "learning_rate": 3.118316199532627e-06, + "loss": 0.85433906, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.55078125, + "step": 5176, + "time_per_iteration": 2.394024610519409 + }, + { + "auxiliary_loss_clip": 0.01077379, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.01453185, + "balance_loss_mlp": 1.02340221, + "epoch": 0.31125807906207725, + "flos": 21031957509120.0, + "grad_norm": 2.0081556761543493, + "language_loss": 0.80177754, + "learning_rate": 3.1180029418051586e-06, + "loss": 0.82286185, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5390625, + "step": 5177, + "time_per_iteration": 2.380166530609131 + }, + { + "auxiliary_loss_clip": 0.01080185, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.01688457, + "balance_loss_mlp": 1.0258652, + "epoch": 0.3113182023147452, + "flos": 23075493742080.0, + "grad_norm": 1.7207153332317755, + "language_loss": 0.80549353, + "learning_rate": 3.117689644179041e-06, + "loss": 0.82663304, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.54296875, + "step": 5178, + "time_per_iteration": 2.387977361679077 + }, + { + "auxiliary_loss_clip": 0.01081592, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.01745057, + "balance_loss_mlp": 1.02485847, + "epoch": 0.3113783255674132, + "flos": 11838174053760.0, + "grad_norm": 1.7749807277252454, + "language_loss": 0.81721008, + "learning_rate": 3.1173763066654556e-06, + "loss": 0.83836234, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5703125, + "step": 5179, + "time_per_iteration": 2.371904134750366 + }, + { + "auxiliary_loss_clip": 0.0108212, + "auxiliary_loss_mlp": 0.01040558, + "balance_loss_clip": 1.02355909, + "balance_loss_mlp": 1.02723241, + "epoch": 0.31143844882008115, + "flos": 16288622616960.0, + "grad_norm": 1.6678175791675147, + "language_loss": 0.78229654, + "learning_rate": 3.1170629292755837e-06, + "loss": 0.80352336, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.546875, + "step": 5180, + "time_per_iteration": 2.3581440448760986 + }, + { + "auxiliary_loss_clip": 0.01079154, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.01297402, + "balance_loss_mlp": 1.02463138, + "epoch": 0.3114985720727491, + "flos": 23216787480960.0, + "grad_norm": 1.716053471694723, + "language_loss": 0.8308934, + "learning_rate": 3.1167495120206094e-06, + "loss": 0.85197449, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5181, + "time_per_iteration": 2.4131224155426025 + }, + { + "auxiliary_loss_clip": 0.01075548, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.01744437, + "balance_loss_mlp": 1.02359533, + "epoch": 0.3115586953254171, + "flos": 30043319777280.0, + "grad_norm": 3.410956960308437, + "language_loss": 0.74611485, + "learning_rate": 3.116436054911717e-06, + "loss": 0.76719141, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.515625, + "step": 5182, + "time_per_iteration": 2.4406213760375977 + }, + { + "auxiliary_loss_clip": 0.01082012, + "auxiliary_loss_mlp": 0.01044019, + "balance_loss_clip": 1.02663863, + "balance_loss_mlp": 1.02548504, + "epoch": 0.3116188185780851, + "flos": 25665141461760.0, + "grad_norm": 1.827078533865724, + "language_loss": 0.70743579, + "learning_rate": 3.116122557960094e-06, + "loss": 0.72869617, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.56640625, + "step": 5183, + "time_per_iteration": 2.428618907928467 + }, + { + "auxiliary_loss_clip": 0.0101554, + "auxiliary_loss_mlp": 0.01005353, + "balance_loss_clip": 1.00384498, + "balance_loss_mlp": 1.0032835, + "epoch": 0.31167894183075306, + "flos": 69506974022400.0, + "grad_norm": 1.1213421774254597, + "language_loss": 0.59617829, + "learning_rate": 3.115809021176928e-06, + "loss": 0.61638725, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.12255859, + "step": 5184, + "time_per_iteration": 3.0369694232940674 + }, + { + "auxiliary_loss_clip": 0.01076911, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.01824319, + "balance_loss_mlp": 1.02336717, + "epoch": 0.31173906508342103, + "flos": 14938950211200.0, + "grad_norm": 2.0188060636368292, + "language_loss": 0.7018702, + "learning_rate": 3.1154954445734088e-06, + "loss": 0.7229737, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.53515625, + "step": 5185, + "time_per_iteration": 2.36911678314209 + }, + { + "auxiliary_loss_clip": 0.01081712, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.01922858, + "balance_loss_mlp": 1.02561331, + "epoch": 0.311799188336089, + "flos": 16175224920960.0, + "grad_norm": 2.3242265680003027, + "language_loss": 0.63079411, + "learning_rate": 3.115181828160726e-06, + "loss": 0.65195954, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5625, + "step": 5186, + "time_per_iteration": 2.345111131668091 + }, + { + "auxiliary_loss_clip": 0.01083555, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.02171898, + "balance_loss_mlp": 1.02592337, + "epoch": 0.31185931158875696, + "flos": 25008460099200.0, + "grad_norm": 2.375239376716589, + "language_loss": 0.86922002, + "learning_rate": 3.1148681719500723e-06, + "loss": 0.89044547, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.57421875, + "step": 5187, + "time_per_iteration": 2.4081404209136963 + }, + { + "auxiliary_loss_clip": 0.01078339, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.02109027, + "balance_loss_mlp": 1.02306581, + "epoch": 0.3119194348414249, + "flos": 37231377868800.0, + "grad_norm": 1.5291252460434375, + "language_loss": 0.62894654, + "learning_rate": 3.114554475952642e-06, + "loss": 0.65009546, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5546875, + "step": 5188, + "time_per_iteration": 2.513761043548584 + }, + { + "auxiliary_loss_clip": 0.01082265, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.01893842, + "balance_loss_mlp": 1.027493, + "epoch": 0.3119795580940929, + "flos": 15011883774720.0, + "grad_norm": 2.1436139170398505, + "language_loss": 0.8322295, + "learning_rate": 3.1142407401796283e-06, + "loss": 0.8534044, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.546875, + "step": 5189, + "time_per_iteration": 2.3524062633514404 + }, + { + "auxiliary_loss_clip": 0.0107796, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_clip": 1.01183438, + "balance_loss_mlp": 1.02331614, + "epoch": 0.31203968134676086, + "flos": 15997237476480.0, + "grad_norm": 2.0552027135418753, + "language_loss": 0.78954196, + "learning_rate": 3.113926964642229e-06, + "loss": 0.810601, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.546875, + "step": 5190, + "time_per_iteration": 2.3690733909606934 + }, + { + "auxiliary_loss_clip": 0.01081006, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.01239634, + "balance_loss_mlp": 1.02578008, + "epoch": 0.3120998045994288, + "flos": 23836356201600.0, + "grad_norm": 1.7876105766162835, + "language_loss": 0.75349545, + "learning_rate": 3.1136131493516426e-06, + "loss": 0.77458954, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55078125, + "step": 5191, + "time_per_iteration": 3.8508388996124268 + }, + { + "auxiliary_loss_clip": 0.01014703, + "auxiliary_loss_mlp": 0.01001387, + "balance_loss_clip": 0.9999088, + "balance_loss_mlp": 1.0020169, + "epoch": 0.3121599278520968, + "flos": 69181059680640.0, + "grad_norm": 0.8485211279840035, + "language_loss": 0.63893914, + "learning_rate": 3.1132992943190664e-06, + "loss": 0.65910006, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.01477051, + "router_z_loss_mlp": 0.12695312, + "step": 5192, + "time_per_iteration": 3.085897207260132 + }, + { + "auxiliary_loss_clip": 0.01079491, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.01410747, + "balance_loss_mlp": 1.02389359, + "epoch": 0.31222005110476475, + "flos": 23805213402240.0, + "grad_norm": 1.5507979407197972, + "language_loss": 0.79747391, + "learning_rate": 3.1129853995557033e-06, + "loss": 0.81857741, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5546875, + "step": 5193, + "time_per_iteration": 2.4205949306488037 + }, + { + "auxiliary_loss_clip": 0.01080126, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.01493776, + "balance_loss_mlp": 1.02426255, + "epoch": 0.3122801743574327, + "flos": 25225026462720.0, + "grad_norm": 1.9433182780917384, + "language_loss": 0.72491264, + "learning_rate": 3.1126714650727534e-06, + "loss": 0.74602962, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55859375, + "step": 5194, + "time_per_iteration": 2.40321683883667 + }, + { + "auxiliary_loss_clip": 0.01081824, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.015957, + "balance_loss_mlp": 1.02660155, + "epoch": 0.3123402976101007, + "flos": 22965377713920.0, + "grad_norm": 1.3980190269412873, + "language_loss": 0.76536357, + "learning_rate": 3.112357490881421e-06, + "loss": 0.78650016, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55078125, + "step": 5195, + "time_per_iteration": 3.8176259994506836 + }, + { + "auxiliary_loss_clip": 0.01077942, + "auxiliary_loss_mlp": 0.01039451, + "balance_loss_clip": 1.02213025, + "balance_loss_mlp": 1.02252913, + "epoch": 0.3124004208627687, + "flos": 25190916197760.0, + "grad_norm": 1.4224160415367675, + "language_loss": 0.77692068, + "learning_rate": 3.112043476992911e-06, + "loss": 0.79809463, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5546875, + "step": 5196, + "time_per_iteration": 3.7637298107147217 + }, + { + "auxiliary_loss_clip": 0.01013106, + "auxiliary_loss_mlp": 0.01002574, + "balance_loss_clip": 1.00095296, + "balance_loss_mlp": 1.00082779, + "epoch": 0.31246054411543667, + "flos": 67481626608000.0, + "grad_norm": 0.8141568095750946, + "language_loss": 0.54867059, + "learning_rate": 3.1117294234184304e-06, + "loss": 0.56882739, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.12304688, + "step": 5197, + "time_per_iteration": 3.0648508071899414 + }, + { + "auxiliary_loss_clip": 0.01079666, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.01822639, + "balance_loss_mlp": 1.02567363, + "epoch": 0.31252066736810463, + "flos": 17857549560960.0, + "grad_norm": 1.5630285299325366, + "language_loss": 0.66702014, + "learning_rate": 3.111415330169186e-06, + "loss": 0.6881544, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5390625, + "step": 5198, + "time_per_iteration": 2.367238759994507 + }, + { + "auxiliary_loss_clip": 0.01081461, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.01396847, + "balance_loss_mlp": 1.02380633, + "epoch": 0.3125807906207726, + "flos": 18474150816000.0, + "grad_norm": 1.9703807953105898, + "language_loss": 0.69121277, + "learning_rate": 3.111101197256387e-06, + "loss": 0.71232462, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.578125, + "step": 5199, + "time_per_iteration": 2.346186876296997 + }, + { + "auxiliary_loss_clip": 0.01080707, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.02091897, + "balance_loss_mlp": 1.0255847, + "epoch": 0.31264091387344056, + "flos": 18945722816640.0, + "grad_norm": 1.7182015056405442, + "language_loss": 0.78764206, + "learning_rate": 3.110787024691245e-06, + "loss": 0.80882394, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55078125, + "step": 5200, + "time_per_iteration": 3.7270419597625732 + }, + { + "auxiliary_loss_clip": 0.01078461, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01316404, + "balance_loss_mlp": 1.02528775, + "epoch": 0.3127010371261085, + "flos": 21467499120000.0, + "grad_norm": 1.9979917626069408, + "language_loss": 0.83699179, + "learning_rate": 3.1104728124849714e-06, + "loss": 0.85806423, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 5201, + "time_per_iteration": 2.3872454166412354 + }, + { + "auxiliary_loss_clip": 0.01081885, + "auxiliary_loss_mlp": 0.0103617, + "balance_loss_clip": 1.01945066, + "balance_loss_mlp": 1.02636862, + "epoch": 0.3127611603787765, + "flos": 15335284498560.0, + "grad_norm": 2.0687619785515077, + "language_loss": 0.6888448, + "learning_rate": 3.110158560648779e-06, + "loss": 0.71002531, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5546875, + "step": 5202, + "time_per_iteration": 2.368793487548828 + }, + { + "auxiliary_loss_clip": 0.01082165, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.0154382, + "balance_loss_mlp": 1.02622008, + "epoch": 0.31282128363144446, + "flos": 17602020253440.0, + "grad_norm": 1.9734822106395633, + "language_loss": 0.8388415, + "learning_rate": 3.109844269193884e-06, + "loss": 0.85996842, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.55859375, + "step": 5203, + "time_per_iteration": 2.370553731918335 + }, + { + "auxiliary_loss_clip": 0.01079535, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.01358581, + "balance_loss_mlp": 1.02512097, + "epoch": 0.3128814068841124, + "flos": 26755653778560.0, + "grad_norm": 2.27855496136418, + "language_loss": 0.60890663, + "learning_rate": 3.109529938131501e-06, + "loss": 0.62999564, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.54296875, + "step": 5204, + "time_per_iteration": 2.4368503093719482 + }, + { + "auxiliary_loss_clip": 0.01078105, + "auxiliary_loss_mlp": 0.01028143, + "balance_loss_clip": 1.01350415, + "balance_loss_mlp": 1.02503884, + "epoch": 0.3129415301367804, + "flos": 22271304418560.0, + "grad_norm": 1.7547907188661906, + "language_loss": 0.65913028, + "learning_rate": 3.109215567472849e-06, + "loss": 0.68019271, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.53125, + "step": 5205, + "time_per_iteration": 2.3928170204162598 + }, + { + "auxiliary_loss_clip": 0.01080375, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.01767445, + "balance_loss_mlp": 1.02590215, + "epoch": 0.31300165338944835, + "flos": 26463814790400.0, + "grad_norm": 1.5164144421712036, + "language_loss": 0.76598847, + "learning_rate": 3.1089011572291464e-06, + "loss": 0.78712916, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5206, + "time_per_iteration": 2.4176676273345947 + }, + { + "auxiliary_loss_clip": 0.01078435, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.01295102, + "balance_loss_mlp": 1.02383256, + "epoch": 0.3130617766421163, + "flos": 21943574686080.0, + "grad_norm": 2.53045503060247, + "language_loss": 0.8274411, + "learning_rate": 3.1085867074116143e-06, + "loss": 0.8485148, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5207, + "time_per_iteration": 2.390138626098633 + }, + { + "auxiliary_loss_clip": 0.01078352, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.01528835, + "balance_loss_mlp": 1.02652049, + "epoch": 0.3131218998947843, + "flos": 23291710992000.0, + "grad_norm": 1.5481103415319293, + "language_loss": 0.71460927, + "learning_rate": 3.108272218031475e-06, + "loss": 0.73569143, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.51953125, + "step": 5208, + "time_per_iteration": 2.401580572128296 + }, + { + "auxiliary_loss_clip": 0.01082266, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.01954424, + "balance_loss_mlp": 1.02604055, + "epoch": 0.3131820231474523, + "flos": 21138652224000.0, + "grad_norm": 1.7093167898150907, + "language_loss": 0.74616063, + "learning_rate": 3.1079576890999498e-06, + "loss": 0.76734626, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 5209, + "time_per_iteration": 2.4222571849823 + }, + { + "auxiliary_loss_clip": 0.01080924, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.01882601, + "balance_loss_mlp": 1.02518535, + "epoch": 0.31324214640012027, + "flos": 23908870828800.0, + "grad_norm": 1.6895503139107564, + "language_loss": 0.77537382, + "learning_rate": 3.107643120628265e-06, + "loss": 0.79652196, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5546875, + "step": 5210, + "time_per_iteration": 2.397287368774414 + }, + { + "auxiliary_loss_clip": 0.01075108, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01727307, + "balance_loss_mlp": 1.02325869, + "epoch": 0.31330226965278823, + "flos": 22235832610560.0, + "grad_norm": 1.8560947599935154, + "language_loss": 0.79085064, + "learning_rate": 3.1073285126276467e-06, + "loss": 0.8119247, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.51953125, + "step": 5211, + "time_per_iteration": 2.3730459213256836 + }, + { + "auxiliary_loss_clip": 0.01077211, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.01728463, + "balance_loss_mlp": 1.02407885, + "epoch": 0.3133623929054562, + "flos": 19753019251200.0, + "grad_norm": 1.9107587778144368, + "language_loss": 0.79605746, + "learning_rate": 3.1070138651093217e-06, + "loss": 0.81714725, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.53125, + "step": 5212, + "time_per_iteration": 2.385580539703369 + }, + { + "auxiliary_loss_clip": 0.01081485, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.02183723, + "balance_loss_mlp": 1.02487481, + "epoch": 0.31342251615812416, + "flos": 27161030108160.0, + "grad_norm": 2.5929896746124643, + "language_loss": 0.712565, + "learning_rate": 3.10669917808452e-06, + "loss": 0.73376536, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5625, + "step": 5213, + "time_per_iteration": 2.4182918071746826 + }, + { + "auxiliary_loss_clip": 0.0108239, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.01552844, + "balance_loss_mlp": 1.02738667, + "epoch": 0.31348263941079213, + "flos": 20228780615040.0, + "grad_norm": 24.25431217186947, + "language_loss": 0.77585387, + "learning_rate": 3.106384451564471e-06, + "loss": 0.79699421, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.55078125, + "step": 5214, + "time_per_iteration": 2.3801310062408447 + }, + { + "auxiliary_loss_clip": 0.01075621, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.01164854, + "balance_loss_mlp": 1.02380764, + "epoch": 0.3135427626634601, + "flos": 24606505082880.0, + "grad_norm": 1.6536746125233315, + "language_loss": 0.82472187, + "learning_rate": 3.106069685560407e-06, + "loss": 0.8457371, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.515625, + "step": 5215, + "time_per_iteration": 2.410188674926758 + }, + { + "auxiliary_loss_clip": 0.01079952, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.01605809, + "balance_loss_mlp": 1.02497745, + "epoch": 0.31360288591612806, + "flos": 20958814477440.0, + "grad_norm": 1.8239087292961935, + "language_loss": 0.7913394, + "learning_rate": 3.1057548800835613e-06, + "loss": 0.81245714, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.546875, + "step": 5216, + "time_per_iteration": 2.3760178089141846 + }, + { + "auxiliary_loss_clip": 0.01077709, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.01504672, + "balance_loss_mlp": 1.02292299, + "epoch": 0.313663009168796, + "flos": 26979272236800.0, + "grad_norm": 1.6802899189431002, + "language_loss": 0.68560529, + "learning_rate": 3.105440035145168e-06, + "loss": 0.70669973, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.546875, + "step": 5217, + "time_per_iteration": 2.4405322074890137 + }, + { + "auxiliary_loss_clip": 0.01081398, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.01719773, + "balance_loss_mlp": 1.02496004, + "epoch": 0.313723132421464, + "flos": 18039935836800.0, + "grad_norm": 1.585974064048292, + "language_loss": 0.81060404, + "learning_rate": 3.105125150756463e-06, + "loss": 0.8317489, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5625, + "step": 5218, + "time_per_iteration": 2.403529167175293 + }, + { + "auxiliary_loss_clip": 0.01081012, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02019131, + "balance_loss_mlp": 1.02569699, + "epoch": 0.31378325567413196, + "flos": 22487905693440.0, + "grad_norm": 3.4544843904123232, + "language_loss": 0.69337761, + "learning_rate": 3.1048102269286843e-06, + "loss": 0.71456367, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5546875, + "step": 5219, + "time_per_iteration": 2.359384059906006 + }, + { + "auxiliary_loss_clip": 0.01078975, + "auxiliary_loss_mlp": 0.01032719, + "balance_loss_clip": 1.01668501, + "balance_loss_mlp": 1.02443516, + "epoch": 0.3138433789267999, + "flos": 22418149063680.0, + "grad_norm": 2.361456868397611, + "language_loss": 0.79714119, + "learning_rate": 3.104495263673071e-06, + "loss": 0.81825817, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5220, + "time_per_iteration": 2.389528751373291 + }, + { + "auxiliary_loss_clip": 0.01079666, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.0140636, + "balance_loss_mlp": 1.02482939, + "epoch": 0.3139035021794679, + "flos": 13005076158720.0, + "grad_norm": 1.7582443202352755, + "language_loss": 0.73346162, + "learning_rate": 3.1041802610008624e-06, + "loss": 0.75454724, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.546875, + "step": 5221, + "time_per_iteration": 2.342547655105591 + }, + { + "auxiliary_loss_clip": 0.01076388, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.01732218, + "balance_loss_mlp": 1.02358842, + "epoch": 0.31396362543213585, + "flos": 16945059600000.0, + "grad_norm": 1.7344539838687305, + "language_loss": 0.81519318, + "learning_rate": 3.103865218923301e-06, + "loss": 0.83628392, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.52734375, + "step": 5222, + "time_per_iteration": 2.368912696838379 + }, + { + "auxiliary_loss_clip": 0.01079858, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.01858759, + "balance_loss_mlp": 1.0240047, + "epoch": 0.31402374868480387, + "flos": 20155707406080.0, + "grad_norm": 2.371335870930761, + "language_loss": 0.69601059, + "learning_rate": 3.103550137451629e-06, + "loss": 0.71717179, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.55859375, + "step": 5223, + "time_per_iteration": 2.3521111011505127 + }, + { + "auxiliary_loss_clip": 0.01076712, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.01721931, + "balance_loss_mlp": 1.02363098, + "epoch": 0.31408387193747184, + "flos": 21250025061120.0, + "grad_norm": 1.509765308651534, + "language_loss": 0.80232835, + "learning_rate": 3.1032350165970915e-06, + "loss": 0.82341254, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.53125, + "step": 5224, + "time_per_iteration": 2.3955767154693604 + }, + { + "auxiliary_loss_clip": 0.01084675, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.01818466, + "balance_loss_mlp": 1.02635026, + "epoch": 0.3141439951901398, + "flos": 27483208934400.0, + "grad_norm": 2.628676699413002, + "language_loss": 0.58446127, + "learning_rate": 3.102919856370934e-06, + "loss": 0.60566765, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.58203125, + "step": 5225, + "time_per_iteration": 2.4336049556732178 + }, + { + "auxiliary_loss_clip": 0.01073478, + "auxiliary_loss_mlp": 0.01026267, + "balance_loss_clip": 1.01224792, + "balance_loss_mlp": 1.02312064, + "epoch": 0.31420411844280777, + "flos": 17851440072960.0, + "grad_norm": 2.0848753295686944, + "language_loss": 0.81684405, + "learning_rate": 3.102604656784404e-06, + "loss": 0.83784151, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.50390625, + "step": 5226, + "time_per_iteration": 2.371347188949585 + }, + { + "auxiliary_loss_clip": 0.01080596, + "auxiliary_loss_mlp": 0.01034467, + "balance_loss_clip": 1.01756275, + "balance_loss_mlp": 1.025051, + "epoch": 0.31426424169547573, + "flos": 21615879864960.0, + "grad_norm": 1.750586808766826, + "language_loss": 0.74049574, + "learning_rate": 3.10228941784875e-06, + "loss": 0.76164633, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5546875, + "step": 5227, + "time_per_iteration": 2.3996050357818604 + }, + { + "auxiliary_loss_clip": 0.01081925, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.01865268, + "balance_loss_mlp": 1.0257802, + "epoch": 0.3143243649481437, + "flos": 30919290589440.0, + "grad_norm": 1.8413185551654894, + "language_loss": 0.6761961, + "learning_rate": 3.101974139575222e-06, + "loss": 0.69736111, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5625, + "step": 5228, + "time_per_iteration": 2.4649455547332764 + }, + { + "auxiliary_loss_clip": 0.01078865, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.01746917, + "balance_loss_mlp": 1.02348232, + "epoch": 0.31438448820081166, + "flos": 22820278636800.0, + "grad_norm": 1.7984687215982043, + "language_loss": 0.79878032, + "learning_rate": 3.1016588219750716e-06, + "loss": 0.81989688, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5546875, + "step": 5229, + "time_per_iteration": 2.399991750717163 + }, + { + "auxiliary_loss_clip": 0.01080643, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.01443267, + "balance_loss_mlp": 1.02655613, + "epoch": 0.3144446114534796, + "flos": 23291082587520.0, + "grad_norm": 1.8454262604987328, + "language_loss": 0.69932103, + "learning_rate": 3.1013434650595522e-06, + "loss": 0.7204352, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5390625, + "step": 5230, + "time_per_iteration": 2.433551788330078 + }, + { + "auxiliary_loss_clip": 0.0107981, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.01765549, + "balance_loss_mlp": 1.02458167, + "epoch": 0.3145047347061476, + "flos": 31354692554880.0, + "grad_norm": 1.582479321456293, + "language_loss": 0.79186082, + "learning_rate": 3.101028068839917e-06, + "loss": 0.81300199, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5546875, + "step": 5231, + "time_per_iteration": 3.946516752243042 + }, + { + "auxiliary_loss_clip": 0.0107863, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.02024388, + "balance_loss_mlp": 1.02544665, + "epoch": 0.31456485795881556, + "flos": 10888780919040.0, + "grad_norm": 2.0289127983457225, + "language_loss": 0.84659767, + "learning_rate": 3.100712633327422e-06, + "loss": 0.86775613, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.53125, + "step": 5232, + "time_per_iteration": 2.365459442138672 + }, + { + "auxiliary_loss_clip": 0.01079561, + "auxiliary_loss_mlp": 0.01035737, + "balance_loss_clip": 1.01959586, + "balance_loss_mlp": 1.02578616, + "epoch": 0.3146249812114835, + "flos": 17091485308800.0, + "grad_norm": 1.5416621934797834, + "language_loss": 0.79673326, + "learning_rate": 3.100397158533325e-06, + "loss": 0.81788617, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5390625, + "step": 5233, + "time_per_iteration": 2.371882677078247 + }, + { + "auxiliary_loss_clip": 0.01079983, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.01687813, + "balance_loss_mlp": 1.02624929, + "epoch": 0.3146851044641515, + "flos": 55289469479040.0, + "grad_norm": 1.6638065143373353, + "language_loss": 0.7100246, + "learning_rate": 3.100081644468883e-06, + "loss": 0.73115146, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53515625, + "step": 5234, + "time_per_iteration": 4.136966228485107 + }, + { + "auxiliary_loss_clip": 0.01080921, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.01270175, + "balance_loss_mlp": 1.02584028, + "epoch": 0.31474522771681945, + "flos": 27014674222080.0, + "grad_norm": 2.2056654656185146, + "language_loss": 0.72418338, + "learning_rate": 3.0997660911453575e-06, + "loss": 0.74528086, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.55078125, + "step": 5235, + "time_per_iteration": 3.772786855697632 + }, + { + "auxiliary_loss_clip": 0.0107908, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.01344013, + "balance_loss_mlp": 1.02459812, + "epoch": 0.3148053509694875, + "flos": 21250862933760.0, + "grad_norm": 1.8033134939001587, + "language_loss": 0.78739047, + "learning_rate": 3.0994504985740096e-06, + "loss": 0.8084721, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.54296875, + "step": 5236, + "time_per_iteration": 2.3732361793518066 + }, + { + "auxiliary_loss_clip": 0.01081067, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.01696825, + "balance_loss_mlp": 1.02453232, + "epoch": 0.31486547422215544, + "flos": 31247334524160.0, + "grad_norm": 1.6319074967202862, + "language_loss": 0.75643516, + "learning_rate": 3.099134866766101e-06, + "loss": 0.77758992, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.56640625, + "step": 5237, + "time_per_iteration": 2.448544979095459 + }, + { + "auxiliary_loss_clip": 0.01075025, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.02017581, + "balance_loss_mlp": 1.02330637, + "epoch": 0.3149255974748234, + "flos": 19827593648640.0, + "grad_norm": 2.0433828735521224, + "language_loss": 0.79406428, + "learning_rate": 3.0988191957328967e-06, + "loss": 0.81516439, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.51953125, + "step": 5238, + "time_per_iteration": 2.36114764213562 + }, + { + "auxiliary_loss_clip": 0.0101433, + "auxiliary_loss_mlp": 0.01001969, + "balance_loss_clip": 1.00028789, + "balance_loss_mlp": 1.00214195, + "epoch": 0.31498572072749137, + "flos": 67680981671040.0, + "grad_norm": 0.9531834974421111, + "language_loss": 0.67927349, + "learning_rate": 3.0985034854856615e-06, + "loss": 0.69943643, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.12207031, + "step": 5239, + "time_per_iteration": 4.398332834243774 + }, + { + "auxiliary_loss_clip": 0.01082341, + "auxiliary_loss_mlp": 0.01038409, + "balance_loss_clip": 1.02045596, + "balance_loss_mlp": 1.02554214, + "epoch": 0.31504584398015933, + "flos": 19792086929280.0, + "grad_norm": 2.0690547521273865, + "language_loss": 0.82568109, + "learning_rate": 3.098187736035663e-06, + "loss": 0.84688854, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.56640625, + "step": 5240, + "time_per_iteration": 2.376136064529419 + }, + { + "auxiliary_loss_clip": 0.01081439, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.01651359, + "balance_loss_mlp": 1.02748108, + "epoch": 0.3151059672328273, + "flos": 26614185482880.0, + "grad_norm": 1.6884027060147966, + "language_loss": 0.81342447, + "learning_rate": 3.097871947394168e-06, + "loss": 0.83456481, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5241, + "time_per_iteration": 2.44288969039917 + }, + { + "auxiliary_loss_clip": 0.01079048, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.0208745, + "balance_loss_mlp": 1.02578354, + "epoch": 0.31516609048549526, + "flos": 24203363080320.0, + "grad_norm": 1.7535267418641647, + "language_loss": 0.73701584, + "learning_rate": 3.0975561195724477e-06, + "loss": 0.7581718, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 5242, + "time_per_iteration": 2.4050450325012207 + }, + { + "auxiliary_loss_clip": 0.01079953, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.01420116, + "balance_loss_mlp": 1.02561021, + "epoch": 0.31522621373816323, + "flos": 25957504120320.0, + "grad_norm": 1.877418219253772, + "language_loss": 0.7359069, + "learning_rate": 3.0972402525817732e-06, + "loss": 0.75701892, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.54296875, + "step": 5243, + "time_per_iteration": 2.4118287563323975 + }, + { + "auxiliary_loss_clip": 0.01078961, + "auxiliary_loss_mlp": 0.01033932, + "balance_loss_clip": 1.01637232, + "balance_loss_mlp": 1.02380478, + "epoch": 0.3152863369908312, + "flos": 21907718853120.0, + "grad_norm": 1.7487849442454195, + "language_loss": 0.64450109, + "learning_rate": 3.0969243464334166e-06, + "loss": 0.66562998, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.55078125, + "step": 5244, + "time_per_iteration": 2.405380964279175 + }, + { + "auxiliary_loss_clip": 0.01084078, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.0166012, + "balance_loss_mlp": 1.02682877, + "epoch": 0.31534646024349916, + "flos": 16280383536000.0, + "grad_norm": 1.9654765948469832, + "language_loss": 0.91164446, + "learning_rate": 3.0966084011386517e-06, + "loss": 0.93281496, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.57421875, + "step": 5245, + "time_per_iteration": 2.3264918327331543 + }, + { + "auxiliary_loss_clip": 0.01081491, + "auxiliary_loss_mlp": 0.01041307, + "balance_loss_clip": 1.02439117, + "balance_loss_mlp": 1.02488089, + "epoch": 0.3154065834961671, + "flos": 24716097440640.0, + "grad_norm": 1.9188636766392912, + "language_loss": 0.75167406, + "learning_rate": 3.0962924167087526e-06, + "loss": 0.77290201, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5703125, + "step": 5246, + "time_per_iteration": 2.4234580993652344 + }, + { + "auxiliary_loss_clip": 0.01079419, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.01505113, + "balance_loss_mlp": 1.0246681, + "epoch": 0.3154667067488351, + "flos": 35369704241280.0, + "grad_norm": 1.5425169810359798, + "language_loss": 0.61345798, + "learning_rate": 3.0959763931549985e-06, + "loss": 0.63457149, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.546875, + "step": 5247, + "time_per_iteration": 2.493908166885376 + }, + { + "auxiliary_loss_clip": 0.01080289, + "auxiliary_loss_mlp": 0.01032763, + "balance_loss_clip": 1.0157752, + "balance_loss_mlp": 1.02422643, + "epoch": 0.31552683000150306, + "flos": 17455524721920.0, + "grad_norm": 2.4683848857149053, + "language_loss": 0.82290494, + "learning_rate": 3.0956603304886653e-06, + "loss": 0.84403551, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.55859375, + "step": 5248, + "time_per_iteration": 2.381251096725464 + }, + { + "auxiliary_loss_clip": 0.01078831, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.02718377, + "balance_loss_mlp": 1.02368248, + "epoch": 0.3155869532541711, + "flos": 18404778211200.0, + "grad_norm": 1.847536235910411, + "language_loss": 0.84778982, + "learning_rate": 3.095344228721034e-06, + "loss": 0.86903369, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.55078125, + "step": 5249, + "time_per_iteration": 2.3564109802246094 + }, + { + "auxiliary_loss_clip": 0.0108296, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.01699996, + "balance_loss_mlp": 1.02666736, + "epoch": 0.31564707650683904, + "flos": 21578697400320.0, + "grad_norm": 2.1545387065410253, + "language_loss": 0.85248566, + "learning_rate": 3.0950280878633844e-06, + "loss": 0.87365431, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5625, + "step": 5250, + "time_per_iteration": 2.385845422744751 + }, + { + "auxiliary_loss_clip": 0.01079234, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01895714, + "balance_loss_mlp": 1.0229063, + "epoch": 0.315707199759507, + "flos": 21029967561600.0, + "grad_norm": 2.386011396962339, + "language_loss": 0.68639684, + "learning_rate": 3.094711907926999e-06, + "loss": 0.70755243, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 5251, + "time_per_iteration": 2.3697569370269775 + }, + { + "auxiliary_loss_clip": 0.01079366, + "auxiliary_loss_mlp": 0.01033545, + "balance_loss_clip": 1.01743937, + "balance_loss_mlp": 1.02507663, + "epoch": 0.31576732301217497, + "flos": 26827784380800.0, + "grad_norm": 2.075415075517344, + "language_loss": 0.79322481, + "learning_rate": 3.0943956889231613e-06, + "loss": 0.81435394, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.54296875, + "step": 5252, + "time_per_iteration": 2.428051233291626 + }, + { + "auxiliary_loss_clip": 0.01079094, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.01830542, + "balance_loss_mlp": 1.02625227, + "epoch": 0.31582744626484294, + "flos": 22710057874560.0, + "grad_norm": 1.6049140984299957, + "language_loss": 0.7427122, + "learning_rate": 3.0940794308631574e-06, + "loss": 0.76384562, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.52734375, + "step": 5253, + "time_per_iteration": 2.412838935852051 + }, + { + "auxiliary_loss_clip": 0.01079843, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.01457703, + "balance_loss_mlp": 1.02441907, + "epoch": 0.3158875695175109, + "flos": 23950766327040.0, + "grad_norm": 1.6830502535164042, + "language_loss": 0.7342571, + "learning_rate": 3.0937631337582723e-06, + "loss": 0.75536537, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5546875, + "step": 5254, + "time_per_iteration": 2.3974599838256836 + }, + { + "auxiliary_loss_clip": 0.01079906, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.01619577, + "balance_loss_mlp": 1.02538848, + "epoch": 0.31594769277017887, + "flos": 13261024402560.0, + "grad_norm": 1.8619085734771856, + "language_loss": 0.77918929, + "learning_rate": 3.093446797619795e-06, + "loss": 0.80032402, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.546875, + "step": 5255, + "time_per_iteration": 2.3649168014526367 + }, + { + "auxiliary_loss_clip": 0.01076779, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.01565611, + "balance_loss_mlp": 1.02357996, + "epoch": 0.31600781602284683, + "flos": 23367123262080.0, + "grad_norm": 1.925964166170941, + "language_loss": 0.7776494, + "learning_rate": 3.093130422459013e-06, + "loss": 0.7987529, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.53125, + "step": 5256, + "time_per_iteration": 2.382408380508423 + }, + { + "auxiliary_loss_clip": 0.01074519, + "auxiliary_loss_mlp": 0.01037775, + "balance_loss_clip": 1.02236092, + "balance_loss_mlp": 1.02439499, + "epoch": 0.3160679392755148, + "flos": 19827558737280.0, + "grad_norm": 1.5606482545775244, + "language_loss": 0.71487117, + "learning_rate": 3.0928140082872194e-06, + "loss": 0.7359941, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5, + "step": 5257, + "time_per_iteration": 2.375041961669922 + }, + { + "auxiliary_loss_clip": 0.01078972, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.01505232, + "balance_loss_mlp": 1.02506304, + "epoch": 0.31612806252818276, + "flos": 20192191643520.0, + "grad_norm": 2.119419912510088, + "language_loss": 0.7674104, + "learning_rate": 3.092497555115704e-06, + "loss": 0.78852212, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5390625, + "step": 5258, + "time_per_iteration": 2.3770792484283447 + }, + { + "auxiliary_loss_clip": 0.01081201, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.0215013, + "balance_loss_mlp": 1.02563679, + "epoch": 0.31618818578085073, + "flos": 24235029550080.0, + "grad_norm": 3.273718426598076, + "language_loss": 0.72169727, + "learning_rate": 3.0921810629557614e-06, + "loss": 0.7428894, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5546875, + "step": 5259, + "time_per_iteration": 2.395052194595337 + }, + { + "auxiliary_loss_clip": 0.01083789, + "auxiliary_loss_mlp": 0.01037654, + "balance_loss_clip": 1.0212028, + "balance_loss_mlp": 1.02691913, + "epoch": 0.3162483090335187, + "flos": 25080695435520.0, + "grad_norm": 2.5263848532324014, + "language_loss": 0.66497993, + "learning_rate": 3.0918645318186863e-06, + "loss": 0.6861943, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.56640625, + "step": 5260, + "time_per_iteration": 2.4080142974853516 + }, + { + "auxiliary_loss_clip": 0.01078806, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.01394165, + "balance_loss_mlp": 1.02332568, + "epoch": 0.31630843228618666, + "flos": 26322171937920.0, + "grad_norm": 2.5167465326335106, + "language_loss": 0.72138435, + "learning_rate": 3.091547961715775e-06, + "loss": 0.74248111, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5546875, + "step": 5261, + "time_per_iteration": 2.4369382858276367 + }, + { + "auxiliary_loss_clip": 0.01015332, + "auxiliary_loss_mlp": 0.01006692, + "balance_loss_clip": 1.00507116, + "balance_loss_mlp": 1.00298214, + "epoch": 0.3163685555388547, + "flos": 66754839502080.0, + "grad_norm": 0.7596495394119944, + "language_loss": 0.50580609, + "learning_rate": 3.0912313526583237e-06, + "loss": 0.52602631, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.12353516, + "step": 5262, + "time_per_iteration": 3.0339443683624268 + }, + { + "auxiliary_loss_clip": 0.01082344, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.01476026, + "balance_loss_mlp": 1.02562666, + "epoch": 0.31642867879152264, + "flos": 25994442205440.0, + "grad_norm": 1.4456440908159507, + "language_loss": 0.85142934, + "learning_rate": 3.0909147046576333e-06, + "loss": 0.87256849, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.56640625, + "step": 5263, + "time_per_iteration": 2.4157674312591553 + }, + { + "auxiliary_loss_clip": 0.01076742, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.01944113, + "balance_loss_mlp": 1.02456498, + "epoch": 0.3164888020441906, + "flos": 25773791212800.0, + "grad_norm": 1.9295556132749418, + "language_loss": 0.82008076, + "learning_rate": 3.0905980177250026e-06, + "loss": 0.84119469, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5234375, + "step": 5264, + "time_per_iteration": 2.426569700241089 + }, + { + "auxiliary_loss_clip": 0.01084145, + "auxiliary_loss_mlp": 0.01033681, + "balance_loss_clip": 1.01701522, + "balance_loss_mlp": 1.02686477, + "epoch": 0.3165489252968586, + "flos": 19755183755520.0, + "grad_norm": 1.8006424878874159, + "language_loss": 0.79693788, + "learning_rate": 3.090281291871734e-06, + "loss": 0.81811619, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5703125, + "step": 5265, + "time_per_iteration": 2.392524480819702 + }, + { + "auxiliary_loss_clip": 0.01082602, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.01547682, + "balance_loss_mlp": 1.02509403, + "epoch": 0.31660904854952654, + "flos": 23182851772800.0, + "grad_norm": 1.5148958739692417, + "language_loss": 0.74676967, + "learning_rate": 3.089964527109131e-06, + "loss": 0.76793444, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.578125, + "step": 5266, + "time_per_iteration": 2.407978057861328 + }, + { + "auxiliary_loss_clip": 0.01079704, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.01959407, + "balance_loss_mlp": 1.02397108, + "epoch": 0.3166691718021945, + "flos": 20407571020800.0, + "grad_norm": 1.996167646815778, + "language_loss": 0.79371524, + "learning_rate": 3.0896477234484976e-06, + "loss": 0.81487888, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.55859375, + "step": 5267, + "time_per_iteration": 2.361037015914917 + }, + { + "auxiliary_loss_clip": 0.01013807, + "auxiliary_loss_mlp": 0.01000798, + "balance_loss_clip": 0.99931991, + "balance_loss_mlp": 1.00158882, + "epoch": 0.31672929505486247, + "flos": 70141275336960.0, + "grad_norm": 0.729997743554358, + "language_loss": 0.5780766, + "learning_rate": 3.08933088090114e-06, + "loss": 0.59822267, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.01477051, + "router_z_loss_mlp": 0.12207031, + "step": 5268, + "time_per_iteration": 3.0364322662353516 + }, + { + "auxiliary_loss_clip": 0.01080023, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.01247668, + "balance_loss_mlp": 1.02402115, + "epoch": 0.31678941830753043, + "flos": 14354888209920.0, + "grad_norm": 2.54358297012882, + "language_loss": 0.73169315, + "learning_rate": 3.0890139994783653e-06, + "loss": 0.75279784, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5625, + "step": 5269, + "time_per_iteration": 2.376150131225586 + }, + { + "auxiliary_loss_clip": 0.01080085, + "auxiliary_loss_mlp": 0.01040108, + "balance_loss_clip": 1.02297807, + "balance_loss_mlp": 1.02438664, + "epoch": 0.3168495415601984, + "flos": 22746611934720.0, + "grad_norm": 1.8357253850565918, + "language_loss": 0.79865623, + "learning_rate": 3.0886970791914822e-06, + "loss": 0.81985819, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.55859375, + "step": 5270, + "time_per_iteration": 3.775099515914917 + }, + { + "auxiliary_loss_clip": 0.01083041, + "auxiliary_loss_mlp": 0.01042041, + "balance_loss_clip": 1.02405214, + "balance_loss_mlp": 1.02541685, + "epoch": 0.31690966481286637, + "flos": 20114370489600.0, + "grad_norm": 2.166272483725908, + "language_loss": 0.79255253, + "learning_rate": 3.088380120051801e-06, + "loss": 0.81380343, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.578125, + "step": 5271, + "time_per_iteration": 2.378732442855835 + }, + { + "auxiliary_loss_clip": 0.01081302, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.0131495, + "balance_loss_mlp": 1.02506793, + "epoch": 0.31696978806553433, + "flos": 21177859547520.0, + "grad_norm": 1.7622399638827526, + "language_loss": 0.73040378, + "learning_rate": 3.088063122070633e-06, + "loss": 0.75151944, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.5625, + "step": 5272, + "time_per_iteration": 2.3977181911468506 + }, + { + "auxiliary_loss_clip": 0.01082461, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.01118922, + "balance_loss_mlp": 1.02434623, + "epoch": 0.3170299113182023, + "flos": 42995363713920.0, + "grad_norm": 2.223613280288384, + "language_loss": 0.69642627, + "learning_rate": 3.0877460852592902e-06, + "loss": 0.71755052, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.58203125, + "step": 5273, + "time_per_iteration": 3.9881303310394287 + }, + { + "auxiliary_loss_clip": 0.0107984, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.0169158, + "balance_loss_mlp": 1.02494097, + "epoch": 0.31709003457087026, + "flos": 24459066944640.0, + "grad_norm": 1.6830475698094653, + "language_loss": 0.67651677, + "learning_rate": 3.0874290096290888e-06, + "loss": 0.69765377, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.55078125, + "step": 5274, + "time_per_iteration": 2.4086549282073975 + }, + { + "auxiliary_loss_clip": 0.01076455, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.0220077, + "balance_loss_mlp": 1.023458, + "epoch": 0.3171501578235382, + "flos": 24134130120960.0, + "grad_norm": 1.711708657366332, + "language_loss": 0.80559027, + "learning_rate": 3.0871118951913423e-06, + "loss": 0.82673383, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.53125, + "step": 5275, + "time_per_iteration": 3.776034355163574 + }, + { + "auxiliary_loss_clip": 0.01080137, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.01566255, + "balance_loss_mlp": 1.02465034, + "epoch": 0.31721028107620625, + "flos": 18878724184320.0, + "grad_norm": 2.2230326829244618, + "language_loss": 0.72492123, + "learning_rate": 3.0867947419573693e-06, + "loss": 0.7460475, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5546875, + "step": 5276, + "time_per_iteration": 2.359403133392334 + }, + { + "auxiliary_loss_clip": 0.01076312, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_clip": 1.01140654, + "balance_loss_mlp": 1.0237422, + "epoch": 0.3172704043288742, + "flos": 23146786471680.0, + "grad_norm": 1.4496608106011593, + "language_loss": 0.78041995, + "learning_rate": 3.0864775499384873e-06, + "loss": 0.80145818, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5234375, + "step": 5277, + "time_per_iteration": 2.4496803283691406 + }, + { + "auxiliary_loss_clip": 0.01080901, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.01834309, + "balance_loss_mlp": 1.02437401, + "epoch": 0.3173305275815422, + "flos": 17857549560960.0, + "grad_norm": 1.7043471396870504, + "language_loss": 0.79108131, + "learning_rate": 3.086160319146016e-06, + "loss": 0.81225532, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.56640625, + "step": 5278, + "time_per_iteration": 3.7243001461029053 + }, + { + "auxiliary_loss_clip": 0.01013157, + "auxiliary_loss_mlp": 0.01002575, + "balance_loss_clip": 1.00128186, + "balance_loss_mlp": 1.00111103, + "epoch": 0.31739065083421014, + "flos": 59971042442880.0, + "grad_norm": 0.8719670676254162, + "language_loss": 0.62874103, + "learning_rate": 3.0858430495912772e-06, + "loss": 0.64889824, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.12011719, + "step": 5279, + "time_per_iteration": 2.790759801864624 + }, + { + "auxiliary_loss_clip": 0.01084219, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.02104592, + "balance_loss_mlp": 1.02513933, + "epoch": 0.3174507740868781, + "flos": 23799976698240.0, + "grad_norm": 1.67235337841515, + "language_loss": 0.81060869, + "learning_rate": 3.0855257412855933e-06, + "loss": 0.83185291, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.58984375, + "step": 5280, + "time_per_iteration": 2.4147047996520996 + }, + { + "auxiliary_loss_clip": 0.01081347, + "auxiliary_loss_mlp": 0.01042717, + "balance_loss_clip": 1.0264926, + "balance_loss_mlp": 1.02539849, + "epoch": 0.31751089733954607, + "flos": 27637594433280.0, + "grad_norm": 1.581023055194415, + "language_loss": 0.78002334, + "learning_rate": 3.0852083942402874e-06, + "loss": 0.80126405, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55859375, + "step": 5281, + "time_per_iteration": 2.471285104751587 + }, + { + "auxiliary_loss_clip": 0.01080128, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01379657, + "balance_loss_mlp": 1.02490568, + "epoch": 0.31757102059221404, + "flos": 23768135671680.0, + "grad_norm": 1.6327436154783515, + "language_loss": 0.78271222, + "learning_rate": 3.084891008466686e-06, + "loss": 0.8038218, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5546875, + "step": 5282, + "time_per_iteration": 2.39589524269104 + }, + { + "auxiliary_loss_clip": 0.01082877, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.02387059, + "balance_loss_mlp": 1.02483785, + "epoch": 0.317631143844882, + "flos": 25263361002240.0, + "grad_norm": 1.9959048398359513, + "language_loss": 0.67214715, + "learning_rate": 3.0845735839761145e-06, + "loss": 0.69338346, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.578125, + "step": 5283, + "time_per_iteration": 2.403817653656006 + }, + { + "auxiliary_loss_clip": 0.01013771, + "auxiliary_loss_mlp": 0.01001144, + "balance_loss_clip": 0.99983269, + "balance_loss_mlp": 1.00153661, + "epoch": 0.31769126709754997, + "flos": 55823290300800.0, + "grad_norm": 0.7362289128213836, + "language_loss": 0.52765673, + "learning_rate": 3.084256120779902e-06, + "loss": 0.54780585, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.12207031, + "step": 5284, + "time_per_iteration": 2.9949584007263184 + }, + { + "auxiliary_loss_clip": 0.01085186, + "auxiliary_loss_mlp": 0.010419, + "balance_loss_clip": 1.02481723, + "balance_loss_mlp": 1.02751923, + "epoch": 0.31775139035021793, + "flos": 16689635026560.0, + "grad_norm": 2.288646676387672, + "language_loss": 0.69851232, + "learning_rate": 3.0839386188893777e-06, + "loss": 0.71978313, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.578125, + "step": 5285, + "time_per_iteration": 2.3910608291625977 + }, + { + "auxiliary_loss_clip": 0.01013096, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.00102246, + "balance_loss_mlp": 1.00139141, + "epoch": 0.3178115136028859, + "flos": 64224090979200.0, + "grad_norm": 0.813032882130715, + "language_loss": 0.60546649, + "learning_rate": 3.083621078315872e-06, + "loss": 0.62562025, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.1171875, + "step": 5286, + "time_per_iteration": 3.0806162357330322 + }, + { + "auxiliary_loss_clip": 0.01082214, + "auxiliary_loss_mlp": 0.01037579, + "balance_loss_clip": 1.02079391, + "balance_loss_mlp": 1.0257957, + "epoch": 0.31787163685555386, + "flos": 18696477553920.0, + "grad_norm": 1.6527688333029478, + "language_loss": 0.71768641, + "learning_rate": 3.083303499070718e-06, + "loss": 0.73888433, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5625, + "step": 5287, + "time_per_iteration": 2.3819234371185303 + }, + { + "auxiliary_loss_clip": 0.01080167, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_clip": 1.01981616, + "balance_loss_mlp": 1.02329683, + "epoch": 0.31793176010822183, + "flos": 21323691763200.0, + "grad_norm": 1.8911901920569434, + "language_loss": 0.75618762, + "learning_rate": 3.082985881165248e-06, + "loss": 0.77736926, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.5703125, + "step": 5288, + "time_per_iteration": 2.3669891357421875 + }, + { + "auxiliary_loss_clip": 0.01075516, + "auxiliary_loss_mlp": 0.010258, + "balance_loss_clip": 1.01118445, + "balance_loss_mlp": 1.02206612, + "epoch": 0.31799188336088985, + "flos": 20957662402560.0, + "grad_norm": 1.646249100576446, + "language_loss": 0.87391573, + "learning_rate": 3.082668224610798e-06, + "loss": 0.89492893, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.53515625, + "step": 5289, + "time_per_iteration": 2.394063711166382 + }, + { + "auxiliary_loss_clip": 0.01078334, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.01497591, + "balance_loss_mlp": 1.02536571, + "epoch": 0.3180520066135578, + "flos": 22490838247680.0, + "grad_norm": 2.3050482578316975, + "language_loss": 0.67488748, + "learning_rate": 3.0823505294187044e-06, + "loss": 0.6959734, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53125, + "step": 5290, + "time_per_iteration": 2.390932321548462 + }, + { + "auxiliary_loss_clip": 0.01080873, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.01449621, + "balance_loss_mlp": 1.02368319, + "epoch": 0.3181121298662258, + "flos": 27234103317120.0, + "grad_norm": 2.62376350472759, + "language_loss": 0.80194283, + "learning_rate": 3.0820327956003045e-06, + "loss": 0.82307494, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5703125, + "step": 5291, + "time_per_iteration": 2.4348864555358887 + }, + { + "auxiliary_loss_clip": 0.01079954, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.01600647, + "balance_loss_mlp": 1.02360988, + "epoch": 0.31817225311889374, + "flos": 23179186080000.0, + "grad_norm": 2.0409961214154677, + "language_loss": 0.7972188, + "learning_rate": 3.0817150231669367e-06, + "loss": 0.81835687, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.5625, + "step": 5292, + "time_per_iteration": 2.3765571117401123 + }, + { + "auxiliary_loss_clip": 0.01074242, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.01445878, + "balance_loss_mlp": 1.02328396, + "epoch": 0.3182323763715617, + "flos": 23257670549760.0, + "grad_norm": 2.314515886392783, + "language_loss": 0.74406004, + "learning_rate": 3.081397212129943e-06, + "loss": 0.76509488, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 5293, + "time_per_iteration": 2.3932366371154785 + }, + { + "auxiliary_loss_clip": 0.01077713, + "auxiliary_loss_mlp": 0.01031669, + "balance_loss_clip": 1.01651788, + "balance_loss_mlp": 1.02501369, + "epoch": 0.3182924996242297, + "flos": 29015581818240.0, + "grad_norm": 6.755118345589732, + "language_loss": 0.73712504, + "learning_rate": 3.0810793625006637e-06, + "loss": 0.75821888, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5234375, + "step": 5294, + "time_per_iteration": 2.4285929203033447 + }, + { + "auxiliary_loss_clip": 0.01081725, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.01303184, + "balance_loss_mlp": 1.02475572, + "epoch": 0.31835262287689764, + "flos": 20448139887360.0, + "grad_norm": 4.095635023253408, + "language_loss": 0.76049137, + "learning_rate": 3.080761474290443e-06, + "loss": 0.78161466, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5703125, + "step": 5295, + "time_per_iteration": 2.374361276626587 + }, + { + "auxiliary_loss_clip": 0.01082446, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.02065003, + "balance_loss_mlp": 1.02512217, + "epoch": 0.3184127461295656, + "flos": 25118296836480.0, + "grad_norm": 1.6558944150019088, + "language_loss": 0.69456697, + "learning_rate": 3.0804435475106265e-06, + "loss": 0.71576238, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.57421875, + "step": 5296, + "time_per_iteration": 2.413593292236328 + }, + { + "auxiliary_loss_clip": 0.01079503, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.01682687, + "balance_loss_mlp": 1.02460003, + "epoch": 0.31847286938223357, + "flos": 25550207665920.0, + "grad_norm": 1.7147605209163255, + "language_loss": 0.77639103, + "learning_rate": 3.0801255821725578e-06, + "loss": 0.79751325, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 5297, + "time_per_iteration": 2.4264285564422607 + }, + { + "auxiliary_loss_clip": 0.01077453, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.01299191, + "balance_loss_mlp": 1.02425241, + "epoch": 0.31853299263490154, + "flos": 27781227233280.0, + "grad_norm": 2.6881575346058066, + "language_loss": 0.79487884, + "learning_rate": 3.0798075782875854e-06, + "loss": 0.81594205, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53515625, + "step": 5298, + "time_per_iteration": 2.4228458404541016 + }, + { + "auxiliary_loss_clip": 0.01080002, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.0229069, + "balance_loss_mlp": 1.0242033, + "epoch": 0.3185931158875695, + "flos": 22705763777280.0, + "grad_norm": 1.6278739612813984, + "language_loss": 0.74050403, + "learning_rate": 3.0794895358670587e-06, + "loss": 0.76169056, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55859375, + "step": 5299, + "time_per_iteration": 2.3948004245758057 + }, + { + "auxiliary_loss_clip": 0.01081397, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.01760912, + "balance_loss_mlp": 1.0242449, + "epoch": 0.31865323914023747, + "flos": 24570369959040.0, + "grad_norm": 2.19396212851942, + "language_loss": 0.78422546, + "learning_rate": 3.079171454922327e-06, + "loss": 0.80538183, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5703125, + "step": 5300, + "time_per_iteration": 2.3989174365997314 + }, + { + "auxiliary_loss_clip": 0.01079522, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.01617098, + "balance_loss_mlp": 1.02372551, + "epoch": 0.31871336239290543, + "flos": 18185593495680.0, + "grad_norm": 1.9696395749252553, + "language_loss": 0.84092903, + "learning_rate": 3.0788533354647425e-06, + "loss": 0.86205423, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.55859375, + "step": 5301, + "time_per_iteration": 2.3700175285339355 + }, + { + "auxiliary_loss_clip": 0.01079199, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02050066, + "balance_loss_mlp": 1.02565467, + "epoch": 0.31877348564557345, + "flos": 21825917804160.0, + "grad_norm": 2.099518613113411, + "language_loss": 0.77145398, + "learning_rate": 3.078535177505657e-06, + "loss": 0.79261839, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.53515625, + "step": 5302, + "time_per_iteration": 2.3479936122894287 + }, + { + "auxiliary_loss_clip": 0.01074655, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.01954257, + "balance_loss_mlp": 1.02352607, + "epoch": 0.3188336088982414, + "flos": 22014239011200.0, + "grad_norm": 1.6542924805411356, + "language_loss": 0.78942561, + "learning_rate": 3.0782169810564256e-06, + "loss": 0.81052822, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5078125, + "step": 5303, + "time_per_iteration": 2.408334493637085 + }, + { + "auxiliary_loss_clip": 0.01084304, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02434969, + "balance_loss_mlp": 1.02641249, + "epoch": 0.3188937321509094, + "flos": 20046848186880.0, + "grad_norm": 2.0001558558790142, + "language_loss": 0.72907531, + "learning_rate": 3.0778987461284035e-06, + "loss": 0.75033325, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.578125, + "step": 5304, + "time_per_iteration": 2.374227523803711 + }, + { + "auxiliary_loss_clip": 0.01076543, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.01899385, + "balance_loss_mlp": 1.02409315, + "epoch": 0.31895385540357735, + "flos": 25846934244480.0, + "grad_norm": 1.952239126447074, + "language_loss": 0.72067142, + "learning_rate": 3.077580472732948e-06, + "loss": 0.74177378, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5234375, + "step": 5305, + "time_per_iteration": 2.4203715324401855 + }, + { + "auxiliary_loss_clip": 0.01078933, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.01834631, + "balance_loss_mlp": 1.0255084, + "epoch": 0.3190139786562453, + "flos": 23476575974400.0, + "grad_norm": 1.710631185794342, + "language_loss": 0.6401546, + "learning_rate": 3.077262160881417e-06, + "loss": 0.66127276, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.53515625, + "step": 5306, + "time_per_iteration": 2.393486976623535 + }, + { + "auxiliary_loss_clip": 0.0108025, + "auxiliary_loss_mlp": 0.01034127, + "balance_loss_clip": 1.01727045, + "balance_loss_mlp": 1.0257448, + "epoch": 0.3190741019089133, + "flos": 29094275756160.0, + "grad_norm": 1.9610930205302717, + "language_loss": 0.7924794, + "learning_rate": 3.07694381058517e-06, + "loss": 0.81362319, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.546875, + "step": 5307, + "time_per_iteration": 2.442568063735962 + }, + { + "auxiliary_loss_clip": 0.01074879, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01525021, + "balance_loss_mlp": 1.02398574, + "epoch": 0.31913422516158124, + "flos": 17128563039360.0, + "grad_norm": 1.662184571938068, + "language_loss": 0.77445477, + "learning_rate": 3.07662542185557e-06, + "loss": 0.79551548, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5078125, + "step": 5308, + "time_per_iteration": 2.352698802947998 + }, + { + "auxiliary_loss_clip": 0.01086511, + "auxiliary_loss_mlp": 0.01034301, + "balance_loss_clip": 1.01606214, + "balance_loss_mlp": 1.02551162, + "epoch": 0.3191943484142492, + "flos": 16068949142400.0, + "grad_norm": 2.2970064580292457, + "language_loss": 0.73742926, + "learning_rate": 3.0763069947039774e-06, + "loss": 0.75863743, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.609375, + "step": 5309, + "time_per_iteration": 2.354645252227783 + }, + { + "auxiliary_loss_clip": 0.01079275, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.01563466, + "balance_loss_mlp": 1.02524686, + "epoch": 0.3192544716669172, + "flos": 22965063511680.0, + "grad_norm": 2.175398532649715, + "language_loss": 0.82634771, + "learning_rate": 3.0759885291417574e-06, + "loss": 0.84744048, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.5390625, + "step": 5310, + "time_per_iteration": 3.7492876052856445 + }, + { + "auxiliary_loss_clip": 0.01077036, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.01672697, + "balance_loss_mlp": 1.02340722, + "epoch": 0.31931459491958514, + "flos": 26869121297280.0, + "grad_norm": 1.3851259056368521, + "language_loss": 0.78265637, + "learning_rate": 3.0756700251802745e-06, + "loss": 0.80374658, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5390625, + "step": 5311, + "time_per_iteration": 2.4239578247070312 + }, + { + "auxiliary_loss_clip": 0.0107738, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.01640153, + "balance_loss_mlp": 1.02321672, + "epoch": 0.3193747181722531, + "flos": 21835413694080.0, + "grad_norm": 1.7725353980341623, + "language_loss": 0.84363174, + "learning_rate": 3.0753514828308942e-06, + "loss": 0.86473691, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5390625, + "step": 5312, + "time_per_iteration": 2.3842127323150635 + }, + { + "auxiliary_loss_clip": 0.01081476, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.01817369, + "balance_loss_mlp": 1.02505708, + "epoch": 0.31943484142492107, + "flos": 18324233971200.0, + "grad_norm": 2.2289129262842793, + "language_loss": 0.78590673, + "learning_rate": 3.0750329021049863e-06, + "loss": 0.80707633, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.56640625, + "step": 5313, + "time_per_iteration": 3.7440524101257324 + }, + { + "auxiliary_loss_clip": 0.01076528, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.01303971, + "balance_loss_mlp": 1.02394986, + "epoch": 0.31949496467758903, + "flos": 21614762701440.0, + "grad_norm": 2.69514675467478, + "language_loss": 0.86007148, + "learning_rate": 3.074714283013919e-06, + "loss": 0.88112479, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5234375, + "step": 5314, + "time_per_iteration": 3.768296003341675 + }, + { + "auxiliary_loss_clip": 0.01077563, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.01520741, + "balance_loss_mlp": 1.0241971, + "epoch": 0.31955508793025705, + "flos": 21759198462720.0, + "grad_norm": 2.3285616252378714, + "language_loss": 0.79701555, + "learning_rate": 3.074395625569064e-06, + "loss": 0.81810158, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53515625, + "step": 5315, + "time_per_iteration": 2.379410743713379 + }, + { + "auxiliary_loss_clip": 0.01080402, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.01719558, + "balance_loss_mlp": 1.02462316, + "epoch": 0.319615211182925, + "flos": 17163406442880.0, + "grad_norm": 1.660955287492213, + "language_loss": 0.68103802, + "learning_rate": 3.074076929781793e-06, + "loss": 0.70217216, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55859375, + "step": 5316, + "time_per_iteration": 2.3701882362365723 + }, + { + "auxiliary_loss_clip": 0.01078854, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.01768994, + "balance_loss_mlp": 1.02430809, + "epoch": 0.319675334435593, + "flos": 28111505495040.0, + "grad_norm": 1.895388721289121, + "language_loss": 0.6942125, + "learning_rate": 3.073758195663479e-06, + "loss": 0.71532118, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.546875, + "step": 5317, + "time_per_iteration": 2.439493417739868 + }, + { + "auxiliary_loss_clip": 0.01014945, + "auxiliary_loss_mlp": 0.01007199, + "balance_loss_clip": 1.00601327, + "balance_loss_mlp": 1.00331116, + "epoch": 0.31973545768826095, + "flos": 69497266798080.0, + "grad_norm": 0.7302130338257007, + "language_loss": 0.53033507, + "learning_rate": 3.0734394232254967e-06, + "loss": 0.55055654, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.11621094, + "step": 5318, + "time_per_iteration": 4.543852090835571 + }, + { + "auxiliary_loss_clip": 0.01077381, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.01823258, + "balance_loss_mlp": 1.02470493, + "epoch": 0.3197955809409289, + "flos": 13698346492800.0, + "grad_norm": 3.015290669098223, + "language_loss": 0.8383435, + "learning_rate": 3.0731206124792225e-06, + "loss": 0.85944486, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.52734375, + "step": 5319, + "time_per_iteration": 2.3598384857177734 + }, + { + "auxiliary_loss_clip": 0.01077311, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.02203143, + "balance_loss_mlp": 1.02440691, + "epoch": 0.3198557041935969, + "flos": 33216750207360.0, + "grad_norm": 2.114462507644159, + "language_loss": 0.63758969, + "learning_rate": 3.0728017634360345e-06, + "loss": 0.65874124, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53125, + "step": 5320, + "time_per_iteration": 2.4845988750457764 + }, + { + "auxiliary_loss_clip": 0.01082309, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.01903701, + "balance_loss_mlp": 1.02537179, + "epoch": 0.31991582744626484, + "flos": 23730918295680.0, + "grad_norm": 1.8644225581946614, + "language_loss": 0.71038461, + "learning_rate": 3.072482876107311e-06, + "loss": 0.7315526, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5703125, + "step": 5321, + "time_per_iteration": 2.394752264022827 + }, + { + "auxiliary_loss_clip": 0.010844, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.02035344, + "balance_loss_mlp": 1.02610743, + "epoch": 0.3199759506989328, + "flos": 18549877288320.0, + "grad_norm": 2.51645326805439, + "language_loss": 0.86026931, + "learning_rate": 3.072163950504432e-06, + "loss": 0.88149107, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.58203125, + "step": 5322, + "time_per_iteration": 2.354313611984253 + }, + { + "auxiliary_loss_clip": 0.01077182, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01500106, + "balance_loss_mlp": 1.02371693, + "epoch": 0.3200360739516008, + "flos": 22417799950080.0, + "grad_norm": 1.6746175190444528, + "language_loss": 0.8324002, + "learning_rate": 3.0718449866387805e-06, + "loss": 0.85347468, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53515625, + "step": 5323, + "time_per_iteration": 2.394345998764038 + }, + { + "auxiliary_loss_clip": 0.0107649, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.01686072, + "balance_loss_mlp": 1.0236522, + "epoch": 0.32009619720426874, + "flos": 20594181571200.0, + "grad_norm": 1.7991578629235503, + "language_loss": 0.78456134, + "learning_rate": 3.071525984521738e-06, + "loss": 0.80564952, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.52734375, + "step": 5324, + "time_per_iteration": 2.375988006591797 + }, + { + "auxiliary_loss_clip": 0.01077051, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.01416802, + "balance_loss_mlp": 1.02404213, + "epoch": 0.3201563204569367, + "flos": 18146735285760.0, + "grad_norm": 1.7299068565888955, + "language_loss": 0.79906166, + "learning_rate": 3.0712069441646896e-06, + "loss": 0.82013065, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 5325, + "time_per_iteration": 2.3500137329101562 + }, + { + "auxiliary_loss_clip": 0.01079942, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.01796389, + "balance_loss_mlp": 1.02517414, + "epoch": 0.32021644370960467, + "flos": 31682945957760.0, + "grad_norm": 1.7353914335932255, + "language_loss": 0.74854422, + "learning_rate": 3.0708878655790207e-06, + "loss": 0.7696777, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.546875, + "step": 5326, + "time_per_iteration": 2.473125457763672 + }, + { + "auxiliary_loss_clip": 0.01078257, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.02567434, + "epoch": 0.32027656696227264, + "flos": 26864827200000.0, + "grad_norm": 1.7238630991112263, + "language_loss": 0.80849326, + "learning_rate": 3.070568748776118e-06, + "loss": 0.82961857, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.52734375, + "step": 5327, + "time_per_iteration": 2.444814920425415 + }, + { + "auxiliary_loss_clip": 0.01082048, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.00933397, + "balance_loss_mlp": 1.02633858, + "epoch": 0.32033669021494066, + "flos": 24168798967680.0, + "grad_norm": 1.5346637189642018, + "language_loss": 0.77680606, + "learning_rate": 3.0702495937673713e-06, + "loss": 0.79788041, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5546875, + "step": 5328, + "time_per_iteration": 2.422175645828247 + }, + { + "auxiliary_loss_clip": 0.01079682, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.01577044, + "balance_loss_mlp": 1.02425337, + "epoch": 0.3203968134676086, + "flos": 24459660437760.0, + "grad_norm": 1.6415220813276596, + "language_loss": 0.74103975, + "learning_rate": 3.0699304005641686e-06, + "loss": 0.76216424, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5546875, + "step": 5329, + "time_per_iteration": 2.408017158508301 + }, + { + "auxiliary_loss_clip": 0.0107571, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.01276469, + "balance_loss_mlp": 1.02347875, + "epoch": 0.3204569367202766, + "flos": 18003730890240.0, + "grad_norm": 1.6329958919204735, + "language_loss": 0.70584631, + "learning_rate": 3.069611169177903e-06, + "loss": 0.72686857, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.5234375, + "step": 5330, + "time_per_iteration": 2.3712151050567627 + }, + { + "auxiliary_loss_clip": 0.01083769, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.01284981, + "balance_loss_mlp": 1.02580059, + "epoch": 0.32051705997294455, + "flos": 30588418834560.0, + "grad_norm": 1.8023699623724145, + "language_loss": 0.79893219, + "learning_rate": 3.069291899619966e-06, + "loss": 0.82007635, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.578125, + "step": 5331, + "time_per_iteration": 2.4474527835845947 + }, + { + "auxiliary_loss_clip": 0.01014206, + "auxiliary_loss_mlp": 0.01005651, + "balance_loss_clip": 1.00445902, + "balance_loss_mlp": 1.00218296, + "epoch": 0.3205771832256125, + "flos": 68414855783040.0, + "grad_norm": 0.8261636075296693, + "language_loss": 0.57823443, + "learning_rate": 3.0689725919017517e-06, + "loss": 0.59843302, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.12011719, + "step": 5332, + "time_per_iteration": 2.952845335006714 + }, + { + "auxiliary_loss_clip": 0.01081163, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.01600242, + "balance_loss_mlp": 1.02478945, + "epoch": 0.3206373064782805, + "flos": 30442691352960.0, + "grad_norm": 1.5109880214283495, + "language_loss": 0.73105484, + "learning_rate": 3.068653246034655e-06, + "loss": 0.75218832, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5625, + "step": 5333, + "time_per_iteration": 2.4440903663635254 + }, + { + "auxiliary_loss_clip": 0.01079604, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.02090633, + "balance_loss_mlp": 1.02470422, + "epoch": 0.32069742973094845, + "flos": 22053411423360.0, + "grad_norm": 1.5521675736367253, + "language_loss": 0.7039721, + "learning_rate": 3.0683338620300728e-06, + "loss": 0.72514749, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.55078125, + "step": 5334, + "time_per_iteration": 2.3876729011535645 + }, + { + "auxiliary_loss_clip": 0.01078196, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.01683152, + "balance_loss_mlp": 1.02403474, + "epoch": 0.3207575529836164, + "flos": 22052922664320.0, + "grad_norm": 1.8526497599509197, + "language_loss": 0.775653, + "learning_rate": 3.068014439899404e-06, + "loss": 0.79676288, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.54296875, + "step": 5335, + "time_per_iteration": 2.3775601387023926 + }, + { + "auxiliary_loss_clip": 0.01078461, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01471066, + "balance_loss_mlp": 1.02449226, + "epoch": 0.3208176762362844, + "flos": 34056132048000.0, + "grad_norm": 1.6648730709315847, + "language_loss": 0.68008214, + "learning_rate": 3.0676949796540458e-06, + "loss": 0.70117021, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5390625, + "step": 5336, + "time_per_iteration": 2.484494924545288 + }, + { + "auxiliary_loss_clip": 0.01080373, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.01808918, + "balance_loss_mlp": 1.02567554, + "epoch": 0.32087779948895234, + "flos": 21797637736320.0, + "grad_norm": 9.52818582865673, + "language_loss": 0.79062629, + "learning_rate": 3.067375481305401e-06, + "loss": 0.81177551, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 5337, + "time_per_iteration": 2.381190538406372 + }, + { + "auxiliary_loss_clip": 0.01073155, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.0164988, + "balance_loss_mlp": 1.02252102, + "epoch": 0.3209379227416203, + "flos": 21433039741440.0, + "grad_norm": 1.7488093555085955, + "language_loss": 0.74007773, + "learning_rate": 3.0670559448648707e-06, + "loss": 0.76111436, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.5078125, + "step": 5338, + "time_per_iteration": 2.383863687515259 + }, + { + "auxiliary_loss_clip": 0.01079571, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.01314247, + "balance_loss_mlp": 1.02386236, + "epoch": 0.3209980459942883, + "flos": 25847876851200.0, + "grad_norm": 1.7597184247151785, + "language_loss": 0.62648952, + "learning_rate": 3.0667363703438588e-06, + "loss": 0.64757484, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.55859375, + "step": 5339, + "time_per_iteration": 2.406446695327759 + }, + { + "auxiliary_loss_clip": 0.01078526, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.01575601, + "balance_loss_mlp": 1.02463818, + "epoch": 0.32105816924695624, + "flos": 19098153279360.0, + "grad_norm": 2.4589022355520385, + "language_loss": 0.8209306, + "learning_rate": 3.0664167577537696e-06, + "loss": 0.8420341, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5340, + "time_per_iteration": 2.366525650024414 + }, + { + "auxiliary_loss_clip": 0.01078453, + "auxiliary_loss_mlp": 0.01040906, + "balance_loss_clip": 1.02472901, + "balance_loss_mlp": 1.02388883, + "epoch": 0.3211182924996242, + "flos": 16580915452800.0, + "grad_norm": 1.8944822944965334, + "language_loss": 0.76445788, + "learning_rate": 3.0660971071060095e-06, + "loss": 0.78565145, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.546875, + "step": 5341, + "time_per_iteration": 2.3488073348999023 + }, + { + "auxiliary_loss_clip": 0.01076127, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.01875222, + "balance_loss_mlp": 1.02412963, + "epoch": 0.3211784157522922, + "flos": 22671164753280.0, + "grad_norm": 1.6767059818845793, + "language_loss": 0.79426581, + "learning_rate": 3.0657774184119854e-06, + "loss": 0.81536341, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.51953125, + "step": 5342, + "time_per_iteration": 2.406992197036743 + }, + { + "auxiliary_loss_clip": 0.01079338, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.01804614, + "balance_loss_mlp": 1.02480423, + "epoch": 0.3212385390049602, + "flos": 20557732245120.0, + "grad_norm": 2.854786770329366, + "language_loss": 0.75216693, + "learning_rate": 3.065457691683108e-06, + "loss": 0.7733053, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 5343, + "time_per_iteration": 2.3806638717651367 + }, + { + "auxiliary_loss_clip": 0.01077548, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.01543427, + "balance_loss_mlp": 1.02391636, + "epoch": 0.32129866225762815, + "flos": 24789973610880.0, + "grad_norm": 2.594467174111739, + "language_loss": 0.82471192, + "learning_rate": 3.0651379269307853e-06, + "loss": 0.84579945, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53515625, + "step": 5344, + "time_per_iteration": 2.4152557849884033 + }, + { + "auxiliary_loss_clip": 0.01078085, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.01802278, + "balance_loss_mlp": 1.02296662, + "epoch": 0.3213587855102961, + "flos": 18365954912640.0, + "grad_norm": 1.9454076014182453, + "language_loss": 0.79905093, + "learning_rate": 3.06481812416643e-06, + "loss": 0.8201713, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.55078125, + "step": 5345, + "time_per_iteration": 2.339587926864624 + }, + { + "auxiliary_loss_clip": 0.01077468, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02013516, + "balance_loss_mlp": 1.02376652, + "epoch": 0.3214189087629641, + "flos": 27014778956160.0, + "grad_norm": 1.6835044905221574, + "language_loss": 0.82979214, + "learning_rate": 3.0644982834014545e-06, + "loss": 0.85092378, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5390625, + "step": 5346, + "time_per_iteration": 2.42110538482666 + }, + { + "auxiliary_loss_clip": 0.01078451, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.01922059, + "balance_loss_mlp": 1.0234127, + "epoch": 0.32147903201563205, + "flos": 23147170496640.0, + "grad_norm": 1.461422133704922, + "language_loss": 0.8155455, + "learning_rate": 3.0641784046472745e-06, + "loss": 0.83667964, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.55078125, + "step": 5347, + "time_per_iteration": 2.4126431941986084 + }, + { + "auxiliary_loss_clip": 0.01077756, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.01629162, + "balance_loss_mlp": 1.02431095, + "epoch": 0.3215391552683, + "flos": 16579833200640.0, + "grad_norm": 2.213795023887685, + "language_loss": 0.79823768, + "learning_rate": 3.063858487915304e-06, + "loss": 0.81933963, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.53515625, + "step": 5348, + "time_per_iteration": 2.358952760696411 + }, + { + "auxiliary_loss_clip": 0.01080646, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.02431464, + "balance_loss_mlp": 1.02657938, + "epoch": 0.321599278520968, + "flos": 17820855855360.0, + "grad_norm": 1.9932386942977218, + "language_loss": 0.84975469, + "learning_rate": 3.0635385332169606e-06, + "loss": 0.87095582, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5390625, + "step": 5349, + "time_per_iteration": 3.757690191268921 + }, + { + "auxiliary_loss_clip": 0.01076711, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.01833105, + "balance_loss_mlp": 1.02378941, + "epoch": 0.32165940177363594, + "flos": 16250881570560.0, + "grad_norm": 1.612803389003936, + "language_loss": 0.80543709, + "learning_rate": 3.063218540563663e-06, + "loss": 0.82653975, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.52734375, + "step": 5350, + "time_per_iteration": 2.3480212688446045 + }, + { + "auxiliary_loss_clip": 0.010773, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.01612234, + "balance_loss_mlp": 1.0239172, + "epoch": 0.3217195250263039, + "flos": 27598666400640.0, + "grad_norm": 1.4459224358230414, + "language_loss": 0.80121368, + "learning_rate": 3.06289850996683e-06, + "loss": 0.82229781, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53125, + "step": 5351, + "time_per_iteration": 2.420304298400879 + }, + { + "auxiliary_loss_clip": 0.01078208, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.01911521, + "balance_loss_mlp": 1.02447629, + "epoch": 0.3217796482789719, + "flos": 21469523978880.0, + "grad_norm": 1.7297810608015358, + "language_loss": 0.75632811, + "learning_rate": 3.062578441437884e-06, + "loss": 0.77746028, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5390625, + "step": 5352, + "time_per_iteration": 2.3908603191375732 + }, + { + "auxiliary_loss_clip": 0.01078515, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.01867068, + "balance_loss_mlp": 1.0249629, + "epoch": 0.32183977153163984, + "flos": 21214518341760.0, + "grad_norm": 3.985252892116223, + "language_loss": 0.8181082, + "learning_rate": 3.062258334988246e-06, + "loss": 0.83924145, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.53515625, + "step": 5353, + "time_per_iteration": 5.1726415157318115 + }, + { + "auxiliary_loss_clip": 0.01075771, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.01659489, + "balance_loss_mlp": 1.02447271, + "epoch": 0.3218998947843078, + "flos": 24607028753280.0, + "grad_norm": 1.5477413546729974, + "language_loss": 0.73335499, + "learning_rate": 3.0619381906293414e-06, + "loss": 0.75443506, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51171875, + "step": 5354, + "time_per_iteration": 2.395334005355835 + }, + { + "auxiliary_loss_clip": 0.01076708, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.01655841, + "balance_loss_mlp": 1.0236063, + "epoch": 0.3219600180369758, + "flos": 22269558850560.0, + "grad_norm": 1.5242890369922673, + "language_loss": 0.83404744, + "learning_rate": 3.0616180083725943e-06, + "loss": 0.85512954, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.53125, + "step": 5355, + "time_per_iteration": 2.397935152053833 + }, + { + "auxiliary_loss_clip": 0.01083671, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.0153172, + "balance_loss_mlp": 1.02768159, + "epoch": 0.3220201412896438, + "flos": 14938251984000.0, + "grad_norm": 2.3672932936834035, + "language_loss": 0.71493244, + "learning_rate": 3.0612977882294306e-06, + "loss": 0.73607641, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5625, + "step": 5356, + "time_per_iteration": 2.342625856399536 + }, + { + "auxiliary_loss_clip": 0.01083924, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.01785326, + "balance_loss_mlp": 1.02603495, + "epoch": 0.32208026454231176, + "flos": 22666486631040.0, + "grad_norm": 2.537484418368024, + "language_loss": 0.67573178, + "learning_rate": 3.0609775302112793e-06, + "loss": 0.6969254, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.578125, + "step": 5357, + "time_per_iteration": 2.395383596420288 + }, + { + "auxiliary_loss_clip": 0.01017228, + "auxiliary_loss_mlp": 0.01002446, + "balance_loss_clip": 1.00114655, + "balance_loss_mlp": 1.00552416, + "epoch": 0.3221403877949797, + "flos": 64601606177280.0, + "grad_norm": 0.7897995587024933, + "language_loss": 0.58244151, + "learning_rate": 3.060657234329569e-06, + "loss": 0.6026383, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.1171875, + "step": 5358, + "time_per_iteration": 4.523858308792114 + }, + { + "auxiliary_loss_clip": 0.01077367, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.01972818, + "balance_loss_mlp": 1.02269447, + "epoch": 0.3222005110476477, + "flos": 20155986696960.0, + "grad_norm": 1.7286052533361376, + "language_loss": 0.83615881, + "learning_rate": 3.06033690059573e-06, + "loss": 0.85729128, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.546875, + "step": 5359, + "time_per_iteration": 2.361799478530884 + }, + { + "auxiliary_loss_clip": 0.01078628, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.01460195, + "balance_loss_mlp": 1.0245235, + "epoch": 0.32226063430031565, + "flos": 22673084878080.0, + "grad_norm": 1.689715791164159, + "language_loss": 0.79500908, + "learning_rate": 3.060016529021195e-06, + "loss": 0.81609607, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5390625, + "step": 5360, + "time_per_iteration": 2.4136157035827637 + }, + { + "auxiliary_loss_clip": 0.01014479, + "auxiliary_loss_mlp": 0.01000586, + "balance_loss_clip": 0.9994117, + "balance_loss_mlp": 1.0027318, + "epoch": 0.3223207575529836, + "flos": 63825312896640.0, + "grad_norm": 0.6529925605651171, + "language_loss": 0.56954265, + "learning_rate": 3.0596961196173965e-06, + "loss": 0.58969331, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.11767578, + "step": 5361, + "time_per_iteration": 2.896439790725708 + }, + { + "auxiliary_loss_clip": 0.0107876, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.0193764, + "balance_loss_mlp": 1.02444124, + "epoch": 0.3223808808056516, + "flos": 26868911829120.0, + "grad_norm": 5.140079638641671, + "language_loss": 0.70762086, + "learning_rate": 3.0593756723957695e-06, + "loss": 0.72876334, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.54296875, + "step": 5362, + "time_per_iteration": 2.4290249347686768 + }, + { + "auxiliary_loss_clip": 0.01078289, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.02153254, + "balance_loss_mlp": 1.02497351, + "epoch": 0.32244100405831955, + "flos": 26760122432640.0, + "grad_norm": 1.6110618745369967, + "language_loss": 0.73848045, + "learning_rate": 3.0590551873677493e-06, + "loss": 0.75962651, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.53125, + "step": 5363, + "time_per_iteration": 2.419034242630005 + }, + { + "auxiliary_loss_clip": 0.01082123, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.02108836, + "balance_loss_mlp": 1.024948, + "epoch": 0.3225011273109875, + "flos": 23801966645760.0, + "grad_norm": 2.0659613830640193, + "language_loss": 0.76461691, + "learning_rate": 3.058734664544774e-06, + "loss": 0.78581065, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.57421875, + "step": 5364, + "time_per_iteration": 2.4025800228118896 + }, + { + "auxiliary_loss_clip": 0.01079656, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.01593637, + "balance_loss_mlp": 1.02442658, + "epoch": 0.3225612505636555, + "flos": 17273557382400.0, + "grad_norm": 3.239717133902645, + "language_loss": 0.77195823, + "learning_rate": 3.0584141039382828e-06, + "loss": 0.793082, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.55078125, + "step": 5365, + "time_per_iteration": 2.3846027851104736 + }, + { + "auxiliary_loss_clip": 0.01084334, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.02034116, + "balance_loss_mlp": 1.02883041, + "epoch": 0.32262137381632344, + "flos": 23365168225920.0, + "grad_norm": 1.6422084171036462, + "language_loss": 0.76844335, + "learning_rate": 3.0580935055597135e-06, + "loss": 0.78965425, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5546875, + "step": 5366, + "time_per_iteration": 2.4153943061828613 + }, + { + "auxiliary_loss_clip": 0.01078505, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.01686645, + "balance_loss_mlp": 1.02536798, + "epoch": 0.3226814970689914, + "flos": 23257670549760.0, + "grad_norm": 1.8247870818913614, + "language_loss": 0.71983856, + "learning_rate": 3.057772869420509e-06, + "loss": 0.74094546, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.53125, + "step": 5367, + "time_per_iteration": 2.3906137943267822 + }, + { + "auxiliary_loss_clip": 0.01076456, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.01684797, + "balance_loss_mlp": 1.02413893, + "epoch": 0.32274162032165943, + "flos": 16394374725120.0, + "grad_norm": 2.0400868125342995, + "language_loss": 0.7415911, + "learning_rate": 3.057452195532112e-06, + "loss": 0.76267242, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5234375, + "step": 5368, + "time_per_iteration": 2.3711793422698975 + }, + { + "auxiliary_loss_clip": 0.01077534, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.01760924, + "balance_loss_mlp": 1.02591383, + "epoch": 0.3228017435743274, + "flos": 27853846594560.0, + "grad_norm": 1.5648915567637165, + "language_loss": 0.78616285, + "learning_rate": 3.057131483905967e-06, + "loss": 0.80726111, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.515625, + "step": 5369, + "time_per_iteration": 2.441316604614258 + }, + { + "auxiliary_loss_clip": 0.01076838, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.01285231, + "balance_loss_mlp": 1.02579892, + "epoch": 0.32286186682699536, + "flos": 19607780528640.0, + "grad_norm": 2.135077820571584, + "language_loss": 0.83170462, + "learning_rate": 3.0568107345535173e-06, + "loss": 0.85274273, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.51171875, + "step": 5370, + "time_per_iteration": 2.3850996494293213 + }, + { + "auxiliary_loss_clip": 0.01079691, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.01536465, + "balance_loss_mlp": 1.02564371, + "epoch": 0.3229219900796633, + "flos": 24132873312000.0, + "grad_norm": 2.4760410996302076, + "language_loss": 0.83028758, + "learning_rate": 3.0564899474862112e-06, + "loss": 0.85139334, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5390625, + "step": 5371, + "time_per_iteration": 2.4109177589416504 + }, + { + "auxiliary_loss_clip": 0.01082251, + "auxiliary_loss_mlp": 0.01037747, + "balance_loss_clip": 1.02056861, + "balance_loss_mlp": 1.02494669, + "epoch": 0.3229821133323313, + "flos": 17747747735040.0, + "grad_norm": 2.808182109081233, + "language_loss": 0.88815355, + "learning_rate": 3.056169122715497e-06, + "loss": 0.90935355, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5703125, + "step": 5372, + "time_per_iteration": 2.3465170860290527 + }, + { + "auxiliary_loss_clip": 0.01079449, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.01750314, + "balance_loss_mlp": 1.0259248, + "epoch": 0.32304223658499925, + "flos": 22344936209280.0, + "grad_norm": 2.2559786400364317, + "language_loss": 0.72769004, + "learning_rate": 3.055848260252823e-06, + "loss": 0.74881119, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53515625, + "step": 5373, + "time_per_iteration": 2.39790415763855 + }, + { + "auxiliary_loss_clip": 0.01078354, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.01712024, + "balance_loss_mlp": 1.02540016, + "epoch": 0.3231023598376672, + "flos": 18477327749760.0, + "grad_norm": 2.1306090181539434, + "language_loss": 0.81241184, + "learning_rate": 3.055527360109641e-06, + "loss": 0.83351153, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.53125, + "step": 5374, + "time_per_iteration": 2.3609631061553955 + }, + { + "auxiliary_loss_clip": 0.01079095, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.01794958, + "balance_loss_mlp": 1.02573276, + "epoch": 0.3231624830903352, + "flos": 27635080815360.0, + "grad_norm": 2.8729695217465667, + "language_loss": 0.87721264, + "learning_rate": 3.0552064222974024e-06, + "loss": 0.89833343, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53515625, + "step": 5375, + "time_per_iteration": 2.4294028282165527 + }, + { + "auxiliary_loss_clip": 0.0108127, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.01782823, + "balance_loss_mlp": 1.02376342, + "epoch": 0.32322260634300315, + "flos": 21725332577280.0, + "grad_norm": 2.5660122403635124, + "language_loss": 0.76166165, + "learning_rate": 3.054885446827561e-06, + "loss": 0.78282332, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.57421875, + "step": 5376, + "time_per_iteration": 2.383476972579956 + }, + { + "auxiliary_loss_clip": 0.010763, + "auxiliary_loss_mlp": 0.0102696, + "balance_loss_clip": 1.01236224, + "balance_loss_mlp": 1.02472198, + "epoch": 0.3232827295956711, + "flos": 22636565729280.0, + "grad_norm": 1.8001887070961868, + "language_loss": 0.6709525, + "learning_rate": 3.0545644337115716e-06, + "loss": 0.69198507, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.515625, + "step": 5377, + "time_per_iteration": 2.4060797691345215 + }, + { + "auxiliary_loss_clip": 0.01079296, + "auxiliary_loss_mlp": 0.01040704, + "balance_loss_clip": 1.02427065, + "balance_loss_mlp": 1.02596271, + "epoch": 0.3233428528483391, + "flos": 26321403888000.0, + "grad_norm": 1.3927412936248764, + "language_loss": 0.72254539, + "learning_rate": 3.0542433829608902e-06, + "loss": 0.74374539, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5378, + "time_per_iteration": 2.4265549182891846 + }, + { + "auxiliary_loss_clip": 0.01077624, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.01256919, + "balance_loss_mlp": 1.02334094, + "epoch": 0.32340297610100704, + "flos": 28583950279680.0, + "grad_norm": 2.592730588230886, + "language_loss": 0.81970894, + "learning_rate": 3.0539222945869742e-06, + "loss": 0.84077007, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.54296875, + "step": 5379, + "time_per_iteration": 2.519977569580078 + }, + { + "auxiliary_loss_clip": 0.0108063, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.01434147, + "balance_loss_mlp": 1.02522683, + "epoch": 0.323463099353675, + "flos": 30772480855680.0, + "grad_norm": 2.5958444129036886, + "language_loss": 0.78698713, + "learning_rate": 3.0536011686012827e-06, + "loss": 0.80808771, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5546875, + "step": 5380, + "time_per_iteration": 2.4635374546051025 + }, + { + "auxiliary_loss_clip": 0.01080093, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.014359, + "balance_loss_mlp": 1.02611732, + "epoch": 0.32352322260634303, + "flos": 25227435346560.0, + "grad_norm": 1.7120370289719287, + "language_loss": 0.76934105, + "learning_rate": 3.0532800050152752e-06, + "loss": 0.79043263, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5390625, + "step": 5381, + "time_per_iteration": 2.428349494934082 + }, + { + "auxiliary_loss_clip": 0.01075395, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.01452243, + "balance_loss_mlp": 1.02448523, + "epoch": 0.323583345859011, + "flos": 23329382215680.0, + "grad_norm": 1.6992395955451058, + "language_loss": 0.71832007, + "learning_rate": 3.052958803840414e-06, + "loss": 0.73935908, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.5078125, + "step": 5382, + "time_per_iteration": 2.3826966285705566 + }, + { + "auxiliary_loss_clip": 0.01080542, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.02308822, + "balance_loss_mlp": 1.02441061, + "epoch": 0.32364346911167896, + "flos": 26206470092160.0, + "grad_norm": 3.099905045241218, + "language_loss": 0.70067793, + "learning_rate": 3.0526375650881617e-06, + "loss": 0.72187161, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5625, + "step": 5383, + "time_per_iteration": 2.48154354095459 + }, + { + "auxiliary_loss_clip": 0.01075978, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.01544309, + "balance_loss_mlp": 1.02491498, + "epoch": 0.3237035923643469, + "flos": 23694643526400.0, + "grad_norm": 2.2161622735116917, + "language_loss": 0.78149533, + "learning_rate": 3.0523162887699824e-06, + "loss": 0.8025412, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.51171875, + "step": 5384, + "time_per_iteration": 2.3936946392059326 + }, + { + "auxiliary_loss_clip": 0.01082099, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.01907587, + "balance_loss_mlp": 1.02628994, + "epoch": 0.3237637156170149, + "flos": 14427856684800.0, + "grad_norm": 2.306999604704847, + "language_loss": 0.75232095, + "learning_rate": 3.051994974897342e-06, + "loss": 0.77348936, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.55859375, + "step": 5385, + "time_per_iteration": 2.379140853881836 + }, + { + "auxiliary_loss_clip": 0.01078455, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.01775217, + "balance_loss_mlp": 1.02491426, + "epoch": 0.32382383886968286, + "flos": 31061736403200.0, + "grad_norm": 2.0634454545759655, + "language_loss": 0.71936297, + "learning_rate": 3.051673623481706e-06, + "loss": 0.7404815, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 5386, + "time_per_iteration": 2.506383180618286 + }, + { + "auxiliary_loss_clip": 0.01079334, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.01449919, + "balance_loss_mlp": 1.02353239, + "epoch": 0.3238839621223508, + "flos": 23255855159040.0, + "grad_norm": 1.787912039359255, + "language_loss": 0.94587326, + "learning_rate": 3.0513522345345446e-06, + "loss": 0.96697545, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.55859375, + "step": 5387, + "time_per_iteration": 2.4188995361328125 + }, + { + "auxiliary_loss_clip": 0.01080641, + "auxiliary_loss_mlp": 0.01034944, + "balance_loss_clip": 1.01856422, + "balance_loss_mlp": 1.02532244, + "epoch": 0.3239440853750188, + "flos": 22963597234560.0, + "grad_norm": 3.300937406522664, + "language_loss": 0.77612454, + "learning_rate": 3.0510308080673256e-06, + "loss": 0.79728043, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5546875, + "step": 5388, + "time_per_iteration": 3.7762293815612793 + }, + { + "auxiliary_loss_clip": 0.01080741, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.01455331, + "balance_loss_mlp": 1.02494729, + "epoch": 0.32400420862768675, + "flos": 36245151383040.0, + "grad_norm": 1.896369953650391, + "language_loss": 0.71499395, + "learning_rate": 3.0507093440915214e-06, + "loss": 0.73611259, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55859375, + "step": 5389, + "time_per_iteration": 2.5134568214416504 + }, + { + "auxiliary_loss_clip": 0.01078189, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.0176239, + "balance_loss_mlp": 1.02494574, + "epoch": 0.3240643318803547, + "flos": 21615426017280.0, + "grad_norm": 2.2665248512681244, + "language_loss": 0.80806518, + "learning_rate": 3.0503878426186028e-06, + "loss": 0.82918477, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.53125, + "step": 5390, + "time_per_iteration": 2.3800621032714844 + }, + { + "auxiliary_loss_clip": 0.01081436, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.02858067, + "balance_loss_mlp": 1.02740049, + "epoch": 0.3241244551330227, + "flos": 23294468989440.0, + "grad_norm": 8.930226657265974, + "language_loss": 0.72171915, + "learning_rate": 3.050066303660044e-06, + "loss": 0.74297786, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5390625, + "step": 5391, + "time_per_iteration": 2.418109178543091 + }, + { + "auxiliary_loss_clip": 0.01076605, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.01793671, + "balance_loss_mlp": 1.02426147, + "epoch": 0.32418457838569065, + "flos": 14096461259520.0, + "grad_norm": 2.867852821443578, + "language_loss": 0.77113712, + "learning_rate": 3.0497447272273203e-06, + "loss": 0.79223019, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5234375, + "step": 5392, + "time_per_iteration": 3.818276882171631 + }, + { + "auxiliary_loss_clip": 0.01082108, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.01781976, + "balance_loss_mlp": 1.02654922, + "epoch": 0.3242447016383586, + "flos": 29751376055040.0, + "grad_norm": 1.8130150819251067, + "language_loss": 0.62515903, + "learning_rate": 3.049423113331907e-06, + "loss": 0.64631963, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5546875, + "step": 5393, + "time_per_iteration": 3.904017210006714 + }, + { + "auxiliary_loss_clip": 0.01078697, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.01972163, + "balance_loss_mlp": 1.02516043, + "epoch": 0.3243048248910266, + "flos": 24350102991360.0, + "grad_norm": 1.608725193401751, + "language_loss": 0.82639152, + "learning_rate": 3.049101461985283e-06, + "loss": 0.84752595, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53515625, + "step": 5394, + "time_per_iteration": 2.436965227127075 + }, + { + "auxiliary_loss_clip": 0.0107615, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.02332175, + "balance_loss_mlp": 1.02509201, + "epoch": 0.3243649481436946, + "flos": 24351883470720.0, + "grad_norm": 2.9218368106545247, + "language_loss": 0.81923747, + "learning_rate": 3.048779773198926e-06, + "loss": 0.84036779, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.51171875, + "step": 5395, + "time_per_iteration": 2.4302804470062256 + }, + { + "auxiliary_loss_clip": 0.01077724, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01837444, + "balance_loss_mlp": 1.02696395, + "epoch": 0.32442507139636256, + "flos": 22924250265600.0, + "grad_norm": 1.737781338974741, + "language_loss": 0.83581054, + "learning_rate": 3.048458046984317e-06, + "loss": 0.85690653, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.5078125, + "step": 5396, + "time_per_iteration": 2.405505895614624 + }, + { + "auxiliary_loss_clip": 0.01082508, + "auxiliary_loss_mlp": 0.01036973, + "balance_loss_clip": 1.02117813, + "balance_loss_mlp": 1.02619088, + "epoch": 0.32448519464903053, + "flos": 22199103993600.0, + "grad_norm": 1.8534963832756508, + "language_loss": 0.75302124, + "learning_rate": 3.0481362833529363e-06, + "loss": 0.77421606, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5625, + "step": 5397, + "time_per_iteration": 3.7684366703033447 + }, + { + "auxiliary_loss_clip": 0.01078287, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.01461387, + "balance_loss_mlp": 1.02486801, + "epoch": 0.3245453179016985, + "flos": 18837596736000.0, + "grad_norm": 2.2254877229769248, + "language_loss": 0.66831249, + "learning_rate": 3.0478144823162686e-06, + "loss": 0.68938887, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.53515625, + "step": 5398, + "time_per_iteration": 2.3500254154205322 + }, + { + "auxiliary_loss_clip": 0.01077051, + "auxiliary_loss_mlp": 0.01023777, + "balance_loss_clip": 1.00887644, + "balance_loss_mlp": 1.02394485, + "epoch": 0.32460544115436646, + "flos": 21177335877120.0, + "grad_norm": 1.4097211460049397, + "language_loss": 0.72865582, + "learning_rate": 3.0474926438857976e-06, + "loss": 0.74966413, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.53125, + "step": 5399, + "time_per_iteration": 2.4297893047332764 + }, + { + "auxiliary_loss_clip": 0.01079311, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.01638675, + "balance_loss_mlp": 1.02466607, + "epoch": 0.3246655644070344, + "flos": 21980058923520.0, + "grad_norm": 3.2654722497656126, + "language_loss": 0.713521, + "learning_rate": 3.047170768073008e-06, + "loss": 0.73463267, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.546875, + "step": 5400, + "time_per_iteration": 2.387359380722046 + }, + { + "auxiliary_loss_clip": 0.01081027, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.02026868, + "balance_loss_mlp": 1.025769, + "epoch": 0.3247256876597024, + "flos": 32158393119360.0, + "grad_norm": 2.1169328533256455, + "language_loss": 0.79567647, + "learning_rate": 3.046848854889388e-06, + "loss": 0.8168385, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.55078125, + "step": 5401, + "time_per_iteration": 2.4848508834838867 + }, + { + "auxiliary_loss_clip": 0.01079866, + "auxiliary_loss_mlp": 0.01038272, + "balance_loss_clip": 1.02216136, + "balance_loss_mlp": 1.02576995, + "epoch": 0.32478581091237035, + "flos": 20996450789760.0, + "grad_norm": 1.5743781154015057, + "language_loss": 0.87424928, + "learning_rate": 3.0465269043464243e-06, + "loss": 0.89543062, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.54296875, + "step": 5402, + "time_per_iteration": 2.3826820850372314 + }, + { + "auxiliary_loss_clip": 0.01074204, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.01690769, + "balance_loss_mlp": 1.02333927, + "epoch": 0.3248459341650383, + "flos": 17924199079680.0, + "grad_norm": 3.434140366835923, + "language_loss": 0.84468889, + "learning_rate": 3.0462049164556082e-06, + "loss": 0.86575258, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 5403, + "time_per_iteration": 2.3555710315704346 + }, + { + "auxiliary_loss_clip": 0.01081101, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.02975714, + "epoch": 0.3249060574177063, + "flos": 24534444303360.0, + "grad_norm": 2.19252206773407, + "language_loss": 0.86714506, + "learning_rate": 3.0458828912284293e-06, + "loss": 0.88827109, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.51171875, + "step": 5404, + "time_per_iteration": 2.424011707305908 + }, + { + "auxiliary_loss_clip": 0.01078396, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.0167017, + "balance_loss_mlp": 1.0257628, + "epoch": 0.32496618067037425, + "flos": 25993569421440.0, + "grad_norm": 1.5546801839984006, + "language_loss": 0.72701812, + "learning_rate": 3.0455608286763803e-06, + "loss": 0.74811774, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.52734375, + "step": 5405, + "time_per_iteration": 2.4844679832458496 + }, + { + "auxiliary_loss_clip": 0.01077176, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.0177412, + "balance_loss_mlp": 1.0244993, + "epoch": 0.3250263039230422, + "flos": 19572727656960.0, + "grad_norm": 1.641493441548095, + "language_loss": 0.82919037, + "learning_rate": 3.045238728810955e-06, + "loss": 0.85029519, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.52734375, + "step": 5406, + "time_per_iteration": 2.4126479625701904 + }, + { + "auxiliary_loss_clip": 0.0107731, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.01671171, + "balance_loss_mlp": 1.02566195, + "epoch": 0.3250864271757102, + "flos": 16762708235520.0, + "grad_norm": 1.7530919370193365, + "language_loss": 0.87812674, + "learning_rate": 3.0449165916436485e-06, + "loss": 0.89921427, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.515625, + "step": 5407, + "time_per_iteration": 2.373321533203125 + }, + { + "auxiliary_loss_clip": 0.0107742, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.01505244, + "balance_loss_mlp": 1.02433276, + "epoch": 0.3251465504283782, + "flos": 27818200229760.0, + "grad_norm": 1.6656429373095116, + "language_loss": 0.69662368, + "learning_rate": 3.044594417185956e-06, + "loss": 0.71770191, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.53125, + "step": 5408, + "time_per_iteration": 2.4401321411132812 + }, + { + "auxiliary_loss_clip": 0.01083446, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.01321888, + "balance_loss_mlp": 1.02557099, + "epoch": 0.32520667368104617, + "flos": 19062122889600.0, + "grad_norm": 1.7759011843616177, + "language_loss": 0.77459997, + "learning_rate": 3.044272205449376e-06, + "loss": 0.79572773, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.578125, + "step": 5409, + "time_per_iteration": 2.3784339427948 + }, + { + "auxiliary_loss_clip": 0.0107907, + "auxiliary_loss_mlp": 0.01037077, + "balance_loss_clip": 1.02132392, + "balance_loss_mlp": 1.02310836, + "epoch": 0.32526679693371413, + "flos": 29381017685760.0, + "grad_norm": 1.7387065242879542, + "language_loss": 0.83244413, + "learning_rate": 3.0439499564454073e-06, + "loss": 0.85360563, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5625, + "step": 5410, + "time_per_iteration": 2.433255910873413 + }, + { + "auxiliary_loss_clip": 0.01075946, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.01973033, + "balance_loss_mlp": 1.02407193, + "epoch": 0.3253269201863821, + "flos": 20703459726720.0, + "grad_norm": 1.541437800142103, + "language_loss": 0.7082814, + "learning_rate": 3.04362767018555e-06, + "loss": 0.72938603, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.515625, + "step": 5411, + "time_per_iteration": 2.3710265159606934 + }, + { + "auxiliary_loss_clip": 0.01079104, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.01905727, + "balance_loss_mlp": 1.0264312, + "epoch": 0.32538704343905006, + "flos": 29092914213120.0, + "grad_norm": 1.4822385106475402, + "language_loss": 0.82966936, + "learning_rate": 3.0433053466813053e-06, + "loss": 0.8507942, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.52734375, + "step": 5412, + "time_per_iteration": 2.4416842460632324 + }, + { + "auxiliary_loss_clip": 0.01078737, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.01546848, + "balance_loss_mlp": 1.02402878, + "epoch": 0.325447166691718, + "flos": 24675109637760.0, + "grad_norm": 1.7157391316373405, + "language_loss": 0.80915964, + "learning_rate": 3.042982985944177e-06, + "loss": 0.83026582, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 5413, + "time_per_iteration": 2.4012088775634766 + }, + { + "auxiliary_loss_clip": 0.01077817, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.01567328, + "balance_loss_mlp": 1.0245018, + "epoch": 0.325507289944386, + "flos": 21542073517440.0, + "grad_norm": 1.6593193477589114, + "language_loss": 0.76812875, + "learning_rate": 3.0426605879856685e-06, + "loss": 0.78922343, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53515625, + "step": 5414, + "time_per_iteration": 2.3885934352874756 + }, + { + "auxiliary_loss_clip": 0.01012807, + "auxiliary_loss_mlp": 0.01013497, + "balance_loss_clip": 1.01207817, + "balance_loss_mlp": 1.00135851, + "epoch": 0.32556741319705396, + "flos": 71515527206400.0, + "grad_norm": 0.9144087985326989, + "language_loss": 0.63917202, + "learning_rate": 3.0423381528172864e-06, + "loss": 0.65943509, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.11425781, + "step": 5415, + "time_per_iteration": 2.9493916034698486 + }, + { + "auxiliary_loss_clip": 0.0107686, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.01292014, + "balance_loss_mlp": 1.02416492, + "epoch": 0.3256275364497219, + "flos": 23731302320640.0, + "grad_norm": 1.7554377611460208, + "language_loss": 0.74007642, + "learning_rate": 3.042015680450536e-06, + "loss": 0.76112533, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.52734375, + "step": 5416, + "time_per_iteration": 2.4102070331573486 + }, + { + "auxiliary_loss_clip": 0.01013101, + "auxiliary_loss_mlp": 0.01002505, + "balance_loss_clip": 1.00118756, + "balance_loss_mlp": 1.00168014, + "epoch": 0.3256876597023899, + "flos": 67286043838080.0, + "grad_norm": 0.783166779440946, + "language_loss": 0.57991099, + "learning_rate": 3.041693170896926e-06, + "loss": 0.60006702, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.11425781, + "step": 5417, + "time_per_iteration": 3.112537384033203 + }, + { + "auxiliary_loss_clip": 0.01013247, + "auxiliary_loss_mlp": 0.00999626, + "balance_loss_clip": 0.99820149, + "balance_loss_mlp": 1.0018003, + "epoch": 0.32574778295505785, + "flos": 71278605653760.0, + "grad_norm": 0.888885935789413, + "language_loss": 0.63371241, + "learning_rate": 3.0413706241679674e-06, + "loss": 0.65384114, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.11425781, + "step": 5418, + "time_per_iteration": 3.155064344406128 + }, + { + "auxiliary_loss_clip": 0.01076001, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02075422, + "balance_loss_mlp": 1.02432656, + "epoch": 0.3258079062077258, + "flos": 20775345949440.0, + "grad_norm": 2.9823348293798655, + "language_loss": 0.69778025, + "learning_rate": 3.041048040275169e-06, + "loss": 0.71889329, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.515625, + "step": 5419, + "time_per_iteration": 2.3773465156555176 + }, + { + "auxiliary_loss_clip": 0.01078726, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.01542389, + "balance_loss_mlp": 1.02577317, + "epoch": 0.3258680294603938, + "flos": 22234401244800.0, + "grad_norm": 1.9098820078835552, + "language_loss": 0.77784669, + "learning_rate": 3.0407254192300444e-06, + "loss": 0.79895169, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5420, + "time_per_iteration": 2.4065260887145996 + }, + { + "auxiliary_loss_clip": 0.01079892, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.01897705, + "balance_loss_mlp": 1.02523708, + "epoch": 0.3259281527130618, + "flos": 26978748566400.0, + "grad_norm": 1.5051119461954516, + "language_loss": 0.79559088, + "learning_rate": 3.040402761044107e-06, + "loss": 0.81673896, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.546875, + "step": 5421, + "time_per_iteration": 2.4390525817871094 + }, + { + "auxiliary_loss_clip": 0.01074568, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.01919973, + "balance_loss_mlp": 1.0242002, + "epoch": 0.32598827596572977, + "flos": 26213033427840.0, + "grad_norm": 2.4132741181899204, + "language_loss": 0.75062263, + "learning_rate": 3.040080065728871e-06, + "loss": 0.77169627, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.50390625, + "step": 5422, + "time_per_iteration": 2.4640307426452637 + }, + { + "auxiliary_loss_clip": 0.01079739, + "auxiliary_loss_mlp": 0.01036498, + "balance_loss_clip": 1.02117372, + "balance_loss_mlp": 1.02581573, + "epoch": 0.32604839921839773, + "flos": 17638783781760.0, + "grad_norm": 2.2617686532229517, + "language_loss": 0.63190514, + "learning_rate": 3.0397573332958527e-06, + "loss": 0.65306753, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5390625, + "step": 5423, + "time_per_iteration": 2.3707315921783447 + }, + { + "auxiliary_loss_clip": 0.01072296, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01644063, + "balance_loss_mlp": 1.0233438, + "epoch": 0.3261085224710657, + "flos": 23621605228800.0, + "grad_norm": 1.660012476943711, + "language_loss": 0.74586529, + "learning_rate": 3.039434563756569e-06, + "loss": 0.76689208, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48828125, + "step": 5424, + "time_per_iteration": 2.4088220596313477 + }, + { + "auxiliary_loss_clip": 0.01073955, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.01474011, + "balance_loss_mlp": 1.02412653, + "epoch": 0.32616864572373366, + "flos": 23259276472320.0, + "grad_norm": 1.627510589085908, + "language_loss": 0.77298176, + "learning_rate": 3.0391117571225407e-06, + "loss": 0.7940082, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.49804688, + "step": 5425, + "time_per_iteration": 2.4299569129943848 + }, + { + "auxiliary_loss_clip": 0.0107992, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.01578176, + "balance_loss_mlp": 1.02493644, + "epoch": 0.32622876897640163, + "flos": 25592242809600.0, + "grad_norm": 2.4558171768548895, + "language_loss": 0.7800011, + "learning_rate": 3.0387889134052866e-06, + "loss": 0.80112982, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.546875, + "step": 5426, + "time_per_iteration": 2.4657649993896484 + }, + { + "auxiliary_loss_clip": 0.01078852, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.01478291, + "balance_loss_mlp": 1.02712727, + "epoch": 0.3262888922290696, + "flos": 22417904684160.0, + "grad_norm": 1.6946984512612686, + "language_loss": 0.74343133, + "learning_rate": 3.0384660326163277e-06, + "loss": 0.76451916, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.515625, + "step": 5427, + "time_per_iteration": 2.4272100925445557 + }, + { + "auxiliary_loss_clip": 0.01079176, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.01759815, + "balance_loss_mlp": 1.02467334, + "epoch": 0.32634901548173756, + "flos": 19717896556800.0, + "grad_norm": 2.333646384732536, + "language_loss": 0.79019922, + "learning_rate": 3.0381431147671875e-06, + "loss": 0.8113271, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.546875, + "step": 5428, + "time_per_iteration": 3.756551504135132 + }, + { + "auxiliary_loss_clip": 0.01075906, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.01808619, + "balance_loss_mlp": 1.02349102, + "epoch": 0.3264091387344055, + "flos": 16142022351360.0, + "grad_norm": 1.7366548579309573, + "language_loss": 0.71382821, + "learning_rate": 3.03782015986939e-06, + "loss": 0.7349149, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5234375, + "step": 5429, + "time_per_iteration": 2.368297576904297 + }, + { + "auxiliary_loss_clip": 0.01077668, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.01642549, + "balance_loss_mlp": 1.02595758, + "epoch": 0.3264692619870735, + "flos": 16398145152000.0, + "grad_norm": 1.7309756186465304, + "language_loss": 0.78436255, + "learning_rate": 3.037497167934461e-06, + "loss": 0.805444, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.515625, + "step": 5430, + "time_per_iteration": 2.3637843132019043 + }, + { + "auxiliary_loss_clip": 0.01079765, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.01623201, + "balance_loss_mlp": 1.02481902, + "epoch": 0.32652938523974145, + "flos": 22381245889920.0, + "grad_norm": 2.344577465160989, + "language_loss": 0.84277546, + "learning_rate": 3.037174138973927e-06, + "loss": 0.86390448, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.55078125, + "step": 5431, + "time_per_iteration": 2.3848962783813477 + }, + { + "auxiliary_loss_clip": 0.01074432, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.01854372, + "balance_loss_mlp": 1.02247286, + "epoch": 0.3265895084924094, + "flos": 21906985714560.0, + "grad_norm": 5.324207634185405, + "language_loss": 0.70300651, + "learning_rate": 3.0368510729993147e-06, + "loss": 0.72408426, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.51953125, + "step": 5432, + "time_per_iteration": 3.8911707401275635 + }, + { + "auxiliary_loss_clip": 0.01074675, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.01156664, + "balance_loss_mlp": 1.02224946, + "epoch": 0.3266496317450774, + "flos": 16066330790400.0, + "grad_norm": 2.310135841221492, + "language_loss": 0.84323114, + "learning_rate": 3.0365279700221555e-06, + "loss": 0.86423481, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.5234375, + "step": 5433, + "time_per_iteration": 3.7029778957366943 + }, + { + "auxiliary_loss_clip": 0.01075471, + "auxiliary_loss_mlp": 0.01027421, + "balance_loss_clip": 1.01315725, + "balance_loss_mlp": 1.02445185, + "epoch": 0.3267097549977454, + "flos": 22527147928320.0, + "grad_norm": 1.3751413669100931, + "language_loss": 0.85843301, + "learning_rate": 3.036204830053979e-06, + "loss": 0.879462, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.5078125, + "step": 5434, + "time_per_iteration": 2.393791437149048 + }, + { + "auxiliary_loss_clip": 0.01079266, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.01808953, + "balance_loss_mlp": 1.02544296, + "epoch": 0.32676987825041337, + "flos": 27269226011520.0, + "grad_norm": 1.767099675996129, + "language_loss": 0.87589449, + "learning_rate": 3.035881653106318e-06, + "loss": 0.89702582, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5390625, + "step": 5435, + "time_per_iteration": 2.4210550785064697 + }, + { + "auxiliary_loss_clip": 0.01077102, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.01865852, + "balance_loss_mlp": 1.02470124, + "epoch": 0.32683000150308134, + "flos": 11507511767040.0, + "grad_norm": 2.413629176530171, + "language_loss": 0.76559198, + "learning_rate": 3.035558439190705e-06, + "loss": 0.78670275, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5234375, + "step": 5436, + "time_per_iteration": 2.349705696105957 + }, + { + "auxiliary_loss_clip": 0.01078114, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01520216, + "balance_loss_mlp": 1.0256958, + "epoch": 0.3268901247557493, + "flos": 25629006337920.0, + "grad_norm": 1.5246901564698747, + "language_loss": 0.71808475, + "learning_rate": 3.0352351883186753e-06, + "loss": 0.73916364, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.5234375, + "step": 5437, + "time_per_iteration": 3.8887460231781006 + }, + { + "auxiliary_loss_clip": 0.01076429, + "auxiliary_loss_mlp": 0.01033303, + "balance_loss_clip": 1.0166254, + "balance_loss_mlp": 1.02238953, + "epoch": 0.32695024800841727, + "flos": 24859765152000.0, + "grad_norm": 1.5498046810882387, + "language_loss": 0.63579702, + "learning_rate": 3.034911900501765e-06, + "loss": 0.65689439, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5390625, + "step": 5438, + "time_per_iteration": 2.4118716716766357 + }, + { + "auxiliary_loss_clip": 0.01077552, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.01378727, + "balance_loss_mlp": 1.02547598, + "epoch": 0.32701037126108523, + "flos": 28838013310080.0, + "grad_norm": 1.453865626869374, + "language_loss": 0.74592376, + "learning_rate": 3.0345885757515104e-06, + "loss": 0.76699674, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.51953125, + "step": 5439, + "time_per_iteration": 2.428367853164673 + }, + { + "auxiliary_loss_clip": 0.01078362, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.0178982, + "balance_loss_mlp": 1.0250299, + "epoch": 0.3270704945137532, + "flos": 27963822977280.0, + "grad_norm": 1.851964283862922, + "language_loss": 0.74615502, + "learning_rate": 3.034265214079451e-06, + "loss": 0.76726878, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.53125, + "step": 5440, + "time_per_iteration": 2.430908203125 + }, + { + "auxiliary_loss_clip": 0.01077425, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.02284217, + "balance_loss_mlp": 1.0243032, + "epoch": 0.32713061776642116, + "flos": 23689721024640.0, + "grad_norm": 1.9272560702716868, + "language_loss": 0.84941757, + "learning_rate": 3.0339418154971262e-06, + "loss": 0.87057185, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53125, + "step": 5441, + "time_per_iteration": 2.3966825008392334 + }, + { + "auxiliary_loss_clip": 0.01079412, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.01808846, + "balance_loss_mlp": 1.02504659, + "epoch": 0.3271907410190891, + "flos": 22454528567040.0, + "grad_norm": 2.0114420325569675, + "language_loss": 0.86473727, + "learning_rate": 3.0336183800160786e-06, + "loss": 0.88587785, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.54296875, + "step": 5442, + "time_per_iteration": 2.380145788192749 + }, + { + "auxiliary_loss_clip": 0.01080376, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.01675582, + "balance_loss_mlp": 1.02519119, + "epoch": 0.3272508642717571, + "flos": 22819021827840.0, + "grad_norm": 1.5505837294884508, + "language_loss": 0.77489972, + "learning_rate": 3.033294907647849e-06, + "loss": 0.79603767, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55078125, + "step": 5443, + "time_per_iteration": 2.395960807800293 + }, + { + "auxiliary_loss_clip": 0.0107767, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.01604891, + "balance_loss_mlp": 1.02239799, + "epoch": 0.32731098752442506, + "flos": 11800572652800.0, + "grad_norm": 2.8015540557958403, + "language_loss": 0.81702423, + "learning_rate": 3.0329713984039824e-06, + "loss": 0.83811897, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5546875, + "step": 5444, + "time_per_iteration": 2.348742723464966 + }, + { + "auxiliary_loss_clip": 0.01077456, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.01850367, + "balance_loss_mlp": 1.0237186, + "epoch": 0.327371110777093, + "flos": 21026860450560.0, + "grad_norm": 2.0066738671434616, + "language_loss": 0.58649683, + "learning_rate": 3.032647852296024e-06, + "loss": 0.60761285, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5390625, + "step": 5445, + "time_per_iteration": 2.4022276401519775 + }, + { + "auxiliary_loss_clip": 0.01079605, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.01877713, + "balance_loss_mlp": 1.0249486, + "epoch": 0.327431234029761, + "flos": 19061110460160.0, + "grad_norm": 2.5663118672278142, + "language_loss": 0.88257396, + "learning_rate": 3.0323242693355195e-06, + "loss": 0.90372419, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.546875, + "step": 5446, + "time_per_iteration": 2.359149932861328 + }, + { + "auxiliary_loss_clip": 0.01083825, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.02052522, + "balance_loss_mlp": 1.02630043, + "epoch": 0.32749135728242895, + "flos": 25848016496640.0, + "grad_norm": 2.8254414686427483, + "language_loss": 0.79008245, + "learning_rate": 3.0320006495340175e-06, + "loss": 0.81130731, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.578125, + "step": 5447, + "time_per_iteration": 2.4090781211853027 + }, + { + "auxiliary_loss_clip": 0.01079066, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.01882958, + "balance_loss_mlp": 1.02469659, + "epoch": 0.327551480535097, + "flos": 20119502459520.0, + "grad_norm": 2.4382832938527472, + "language_loss": 0.73265076, + "learning_rate": 3.0316769929030672e-06, + "loss": 0.75377548, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.54296875, + "step": 5448, + "time_per_iteration": 2.3767454624176025 + }, + { + "auxiliary_loss_clip": 0.01077026, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.01808095, + "balance_loss_mlp": 1.02421916, + "epoch": 0.32761160378776494, + "flos": 28802297122560.0, + "grad_norm": 2.7915555525855007, + "language_loss": 0.66764009, + "learning_rate": 3.0313532994542185e-06, + "loss": 0.6887424, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.52734375, + "step": 5449, + "time_per_iteration": 2.4324452877044678 + }, + { + "auxiliary_loss_clip": 0.01075557, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.01382208, + "balance_loss_mlp": 1.02304649, + "epoch": 0.3276717270404329, + "flos": 26936713422720.0, + "grad_norm": 1.4328077652786053, + "language_loss": 0.65584016, + "learning_rate": 3.0310295691990234e-06, + "loss": 0.67688155, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.52734375, + "step": 5450, + "time_per_iteration": 2.4380178451538086 + }, + { + "auxiliary_loss_clip": 0.01079037, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.01591063, + "balance_loss_mlp": 1.02471006, + "epoch": 0.32773185029310087, + "flos": 25337237172480.0, + "grad_norm": 1.8563300319989955, + "language_loss": 0.77256566, + "learning_rate": 3.030705802149035e-06, + "loss": 0.79367125, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.54296875, + "step": 5451, + "time_per_iteration": 2.4091036319732666 + }, + { + "auxiliary_loss_clip": 0.01079516, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.01773548, + "balance_loss_mlp": 1.02491164, + "epoch": 0.32779197354576883, + "flos": 26390636847360.0, + "grad_norm": 2.9295776544066325, + "language_loss": 0.70404297, + "learning_rate": 3.030381998315808e-06, + "loss": 0.72518098, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.546875, + "step": 5452, + "time_per_iteration": 2.4223482608795166 + }, + { + "auxiliary_loss_clip": 0.01075831, + "auxiliary_loss_mlp": 0.01029826, + "balance_loss_clip": 1.01447225, + "balance_loss_mlp": 1.02386236, + "epoch": 0.3278520967984368, + "flos": 24898239336960.0, + "grad_norm": 1.5042685434288139, + "language_loss": 0.78481078, + "learning_rate": 3.030058157710899e-06, + "loss": 0.80586743, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.51953125, + "step": 5453, + "time_per_iteration": 2.4038357734680176 + }, + { + "auxiliary_loss_clip": 0.01078186, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.01371348, + "balance_loss_mlp": 1.02466023, + "epoch": 0.32791222005110476, + "flos": 29751690257280.0, + "grad_norm": 2.8454525940046937, + "language_loss": 0.75492507, + "learning_rate": 3.0297342803458624e-06, + "loss": 0.77599108, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.53515625, + "step": 5454, + "time_per_iteration": 2.445586681365967 + }, + { + "auxiliary_loss_clip": 0.01074595, + "auxiliary_loss_mlp": 0.01030052, + "balance_loss_clip": 1.01624203, + "balance_loss_mlp": 1.02453721, + "epoch": 0.32797234330377273, + "flos": 16507144016640.0, + "grad_norm": 1.7433348099328165, + "language_loss": 0.76670611, + "learning_rate": 3.029410366232259e-06, + "loss": 0.78775251, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.5, + "step": 5455, + "time_per_iteration": 2.3452672958374023 + }, + { + "auxiliary_loss_clip": 0.01080187, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.01670265, + "balance_loss_mlp": 1.02370977, + "epoch": 0.3280324665564407, + "flos": 26576723727360.0, + "grad_norm": 1.6577279543472647, + "language_loss": 0.73814428, + "learning_rate": 3.0290864153816467e-06, + "loss": 0.75928849, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5625, + "step": 5456, + "time_per_iteration": 2.44221830368042 + }, + { + "auxiliary_loss_clip": 0.01080212, + "auxiliary_loss_mlp": 0.01034765, + "balance_loss_clip": 1.01813531, + "balance_loss_mlp": 1.02497685, + "epoch": 0.32809258980910866, + "flos": 22928858565120.0, + "grad_norm": 1.3973542595297122, + "language_loss": 0.78002566, + "learning_rate": 3.028762427805588e-06, + "loss": 0.80117542, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.55078125, + "step": 5457, + "time_per_iteration": 2.3832404613494873 + }, + { + "auxiliary_loss_clip": 0.01078947, + "auxiliary_loss_mlp": 0.0103513, + "balance_loss_clip": 1.01989484, + "balance_loss_mlp": 1.0238179, + "epoch": 0.3281527130617766, + "flos": 22782747058560.0, + "grad_norm": 2.2032463858877755, + "language_loss": 0.78798318, + "learning_rate": 3.028438403515645e-06, + "loss": 0.80912393, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.55078125, + "step": 5458, + "time_per_iteration": 2.4138641357421875 + }, + { + "auxiliary_loss_clip": 0.01077177, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.01431465, + "balance_loss_mlp": 1.02429223, + "epoch": 0.3282128363144446, + "flos": 21249641036160.0, + "grad_norm": 1.8622662939696581, + "language_loss": 0.7289722, + "learning_rate": 3.0281143425233795e-06, + "loss": 0.7500394, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.52734375, + "step": 5459, + "time_per_iteration": 2.399421215057373 + }, + { + "auxiliary_loss_clip": 0.01080956, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.01768255, + "balance_loss_mlp": 1.02518368, + "epoch": 0.32827295956711255, + "flos": 30841853460480.0, + "grad_norm": 1.7544502809181561, + "language_loss": 0.84002161, + "learning_rate": 3.02779024484036e-06, + "loss": 0.86117017, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55859375, + "step": 5460, + "time_per_iteration": 2.464229106903076 + }, + { + "auxiliary_loss_clip": 0.01076967, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.01539469, + "balance_loss_mlp": 1.02279675, + "epoch": 0.3283330828197806, + "flos": 25914002699520.0, + "grad_norm": 1.7437057590676457, + "language_loss": 0.76681101, + "learning_rate": 3.0274661104781483e-06, + "loss": 0.78788352, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5390625, + "step": 5461, + "time_per_iteration": 2.402656078338623 + }, + { + "auxiliary_loss_clip": 0.0107789, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.01458812, + "balance_loss_mlp": 1.02455664, + "epoch": 0.32839320607244854, + "flos": 38580526604160.0, + "grad_norm": 1.9193213046648052, + "language_loss": 0.63180983, + "learning_rate": 3.027141939448315e-06, + "loss": 0.65290648, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.53125, + "step": 5462, + "time_per_iteration": 2.5308499336242676 + }, + { + "auxiliary_loss_clip": 0.0107776, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.0127486, + "balance_loss_mlp": 1.02419543, + "epoch": 0.3284533293251165, + "flos": 26649692202240.0, + "grad_norm": 1.6113829294151034, + "language_loss": 0.77892303, + "learning_rate": 3.0268177317624275e-06, + "loss": 0.79997581, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5390625, + "step": 5463, + "time_per_iteration": 2.426086664199829 + }, + { + "auxiliary_loss_clip": 0.01077726, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.02021456, + "balance_loss_mlp": 1.02386844, + "epoch": 0.32851345257778447, + "flos": 15303268915200.0, + "grad_norm": 2.125496897238556, + "language_loss": 0.69328785, + "learning_rate": 3.0264934874320566e-06, + "loss": 0.71442831, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5390625, + "step": 5464, + "time_per_iteration": 2.3595173358917236 + }, + { + "auxiliary_loss_clip": 0.01077268, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.01927733, + "balance_loss_mlp": 1.02579987, + "epoch": 0.32857357583045244, + "flos": 23512606364160.0, + "grad_norm": 2.222365487172947, + "language_loss": 0.72405088, + "learning_rate": 3.026169206468774e-06, + "loss": 0.74517351, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.515625, + "step": 5465, + "time_per_iteration": 2.3878488540649414 + }, + { + "auxiliary_loss_clip": 0.0107929, + "auxiliary_loss_mlp": 0.01027737, + "balance_loss_clip": 1.01265681, + "balance_loss_mlp": 1.02633595, + "epoch": 0.3286336990831204, + "flos": 20994181551360.0, + "grad_norm": 1.3946260054309163, + "language_loss": 0.82909477, + "learning_rate": 3.025844888884152e-06, + "loss": 0.85016495, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53125, + "step": 5466, + "time_per_iteration": 2.384462594985962 + }, + { + "auxiliary_loss_clip": 0.01077987, + "auxiliary_loss_mlp": 0.01031917, + "balance_loss_clip": 1.01669383, + "balance_loss_mlp": 1.02449858, + "epoch": 0.32869382233578837, + "flos": 23657705441280.0, + "grad_norm": 1.6523196533680227, + "language_loss": 0.79685318, + "learning_rate": 3.0255205346897646e-06, + "loss": 0.81795228, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53515625, + "step": 5467, + "time_per_iteration": 3.8258657455444336 + }, + { + "auxiliary_loss_clip": 0.01077876, + "auxiliary_loss_mlp": 0.0103499, + "balance_loss_clip": 1.01895642, + "balance_loss_mlp": 1.02396202, + "epoch": 0.32875394558845633, + "flos": 25335386870400.0, + "grad_norm": 1.5816377923003648, + "language_loss": 0.74493074, + "learning_rate": 3.0251961438971866e-06, + "loss": 0.7660594, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5468, + "time_per_iteration": 2.4155049324035645 + }, + { + "auxiliary_loss_clip": 0.01084114, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.01854181, + "balance_loss_mlp": 1.02674651, + "epoch": 0.3288140688411243, + "flos": 14902221594240.0, + "grad_norm": 2.5271646654174105, + "language_loss": 0.78453338, + "learning_rate": 3.024871716517996e-06, + "loss": 0.80573982, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.57421875, + "step": 5469, + "time_per_iteration": 2.3450305461883545 + }, + { + "auxiliary_loss_clip": 0.01077556, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.01290655, + "balance_loss_mlp": 1.02343345, + "epoch": 0.32887419209379226, + "flos": 18550366047360.0, + "grad_norm": 1.8224757690139617, + "language_loss": 0.81447303, + "learning_rate": 3.0245472525637706e-06, + "loss": 0.83553171, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5390625, + "step": 5470, + "time_per_iteration": 2.3762497901916504 + }, + { + "auxiliary_loss_clip": 0.0107779, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.01378453, + "balance_loss_mlp": 1.0233016, + "epoch": 0.3289343153464602, + "flos": 48103785360000.0, + "grad_norm": 1.6722158362914659, + "language_loss": 0.67574811, + "learning_rate": 3.0242227520460885e-06, + "loss": 0.69682676, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.546875, + "step": 5471, + "time_per_iteration": 3.962620496749878 + }, + { + "auxiliary_loss_clip": 0.01080355, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.01510489, + "balance_loss_mlp": 1.02359247, + "epoch": 0.3289944385991282, + "flos": 27599050425600.0, + "grad_norm": 7.410125431003147, + "language_loss": 0.64579719, + "learning_rate": 3.023898214976531e-06, + "loss": 0.66693652, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.56640625, + "step": 5472, + "time_per_iteration": 3.9157164096832275 + }, + { + "auxiliary_loss_clip": 0.01077792, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.0238626, + "epoch": 0.32905456185179616, + "flos": 20119292991360.0, + "grad_norm": 1.6672053581604682, + "language_loss": 0.88222539, + "learning_rate": 3.02357364136668e-06, + "loss": 0.90336084, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5390625, + "step": 5473, + "time_per_iteration": 2.389453172683716 + }, + { + "auxiliary_loss_clip": 0.01080474, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02100611, + "balance_loss_mlp": 1.02503395, + "epoch": 0.3291146851044642, + "flos": 23179255902720.0, + "grad_norm": 5.034410165739722, + "language_loss": 0.78064179, + "learning_rate": 3.023249031228119e-06, + "loss": 0.80183458, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.5546875, + "step": 5474, + "time_per_iteration": 2.3733670711517334 + }, + { + "auxiliary_loss_clip": 0.0101337, + "auxiliary_loss_mlp": 0.01002093, + "balance_loss_clip": 1.0008707, + "balance_loss_mlp": 1.00199521, + "epoch": 0.32917480835713214, + "flos": 67618626249600.0, + "grad_norm": 0.8068229367215125, + "language_loss": 0.60187316, + "learning_rate": 3.0229243845724323e-06, + "loss": 0.62202775, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.11376953, + "step": 5475, + "time_per_iteration": 2.99570631980896 + }, + { + "auxiliary_loss_clip": 0.01080725, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.01752687, + "balance_loss_mlp": 1.02312112, + "epoch": 0.3292349316098001, + "flos": 27963299306880.0, + "grad_norm": 3.683065370882492, + "language_loss": 0.76878214, + "learning_rate": 3.022599701411205e-06, + "loss": 0.78994703, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.578125, + "step": 5476, + "time_per_iteration": 3.75848388671875 + }, + { + "auxiliary_loss_clip": 0.01080818, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.02189863, + "balance_loss_mlp": 1.02523828, + "epoch": 0.3292950548624681, + "flos": 20262716323200.0, + "grad_norm": 1.6678265199555182, + "language_loss": 0.74488866, + "learning_rate": 3.0222749817560252e-06, + "loss": 0.76607984, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.55859375, + "step": 5477, + "time_per_iteration": 2.3645784854888916 + }, + { + "auxiliary_loss_clip": 0.01074658, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01579463, + "balance_loss_mlp": 1.02327657, + "epoch": 0.32935517811513604, + "flos": 20811969832320.0, + "grad_norm": 2.019059742157605, + "language_loss": 0.75214255, + "learning_rate": 3.0219502256184804e-06, + "loss": 0.77319813, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.515625, + "step": 5478, + "time_per_iteration": 2.383361577987671 + }, + { + "auxiliary_loss_clip": 0.01078613, + "auxiliary_loss_mlp": 0.01034305, + "balance_loss_clip": 1.01910627, + "balance_loss_mlp": 1.025846, + "epoch": 0.329415301367804, + "flos": 18440878423680.0, + "grad_norm": 2.0264913751417093, + "language_loss": 0.8089844, + "learning_rate": 3.0216254330101617e-06, + "loss": 0.83011365, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53125, + "step": 5479, + "time_per_iteration": 2.34741473197937 + }, + { + "auxiliary_loss_clip": 0.01012531, + "auxiliary_loss_mlp": 0.01002723, + "balance_loss_clip": 1.00146508, + "balance_loss_mlp": 1.00145316, + "epoch": 0.32947542462047197, + "flos": 66319367713920.0, + "grad_norm": 0.7611947774682117, + "language_loss": 0.56492686, + "learning_rate": 3.0213006039426587e-06, + "loss": 0.58507937, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.11083984, + "step": 5480, + "time_per_iteration": 3.1084365844726562 + }, + { + "auxiliary_loss_clip": 0.0107615, + "auxiliary_loss_mlp": 0.01033383, + "balance_loss_clip": 1.01784968, + "balance_loss_mlp": 1.02299666, + "epoch": 0.32953554787313993, + "flos": 23220488085120.0, + "grad_norm": 2.002717647561456, + "language_loss": 0.7727921, + "learning_rate": 3.0209757384275643e-06, + "loss": 0.79388744, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.53125, + "step": 5481, + "time_per_iteration": 2.3818211555480957 + }, + { + "auxiliary_loss_clip": 0.01081065, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.01504767, + "balance_loss_mlp": 1.02616775, + "epoch": 0.3295956711258079, + "flos": 27008460000000.0, + "grad_norm": 1.5277859408892422, + "language_loss": 0.78278458, + "learning_rate": 3.020650836476472e-06, + "loss": 0.80390239, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.546875, + "step": 5482, + "time_per_iteration": 2.418118476867676 + }, + { + "auxiliary_loss_clip": 0.01079361, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.01869941, + "balance_loss_mlp": 1.0245055, + "epoch": 0.32965579437847586, + "flos": 19170702817920.0, + "grad_norm": 2.179453532430209, + "language_loss": 0.73739564, + "learning_rate": 3.0203258981009767e-06, + "loss": 0.75853866, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.546875, + "step": 5483, + "time_per_iteration": 2.362783670425415 + }, + { + "auxiliary_loss_clip": 0.01079387, + "auxiliary_loss_mlp": 0.01028127, + "balance_loss_clip": 1.01248038, + "balance_loss_mlp": 1.02548862, + "epoch": 0.32971591763114383, + "flos": 30481200449280.0, + "grad_norm": 2.2287266426919, + "language_loss": 0.7526831, + "learning_rate": 3.0200009233126745e-06, + "loss": 0.77375823, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5390625, + "step": 5484, + "time_per_iteration": 2.4598166942596436 + }, + { + "auxiliary_loss_clip": 0.0107882, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.02034175, + "balance_loss_mlp": 1.02521205, + "epoch": 0.3297760408838118, + "flos": 16288657528320.0, + "grad_norm": 1.8076020325860054, + "language_loss": 0.71962059, + "learning_rate": 3.0196759121231636e-06, + "loss": 0.74076831, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 5485, + "time_per_iteration": 2.351759672164917 + }, + { + "auxiliary_loss_clip": 0.01076681, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.02534819, + "balance_loss_mlp": 1.02417111, + "epoch": 0.32983616413647976, + "flos": 29529712632960.0, + "grad_norm": 1.6391832855517314, + "language_loss": 0.80184996, + "learning_rate": 3.0193508645440424e-06, + "loss": 0.82302266, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5234375, + "step": 5486, + "time_per_iteration": 2.450106143951416 + }, + { + "auxiliary_loss_clip": 0.01075837, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.01403403, + "balance_loss_mlp": 1.02320254, + "epoch": 0.3298962873891478, + "flos": 20630351606400.0, + "grad_norm": 2.028862755313964, + "language_loss": 0.83958948, + "learning_rate": 3.0190257805869106e-06, + "loss": 0.86064625, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.52734375, + "step": 5487, + "time_per_iteration": 2.365351676940918 + }, + { + "auxiliary_loss_clip": 0.01082498, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.01861644, + "balance_loss_mlp": 1.02514911, + "epoch": 0.32995641064181574, + "flos": 14975120246400.0, + "grad_norm": 2.017911445322458, + "language_loss": 0.73766994, + "learning_rate": 3.01870066026337e-06, + "loss": 0.75885332, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.57421875, + "step": 5488, + "time_per_iteration": 2.3447751998901367 + }, + { + "auxiliary_loss_clip": 0.01077945, + "auxiliary_loss_mlp": 0.01036305, + "balance_loss_clip": 1.01941299, + "balance_loss_mlp": 1.02367687, + "epoch": 0.3300165338944837, + "flos": 18660447164160.0, + "grad_norm": 2.191391902059211, + "language_loss": 0.72819901, + "learning_rate": 3.018375503585023e-06, + "loss": 0.74934149, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5390625, + "step": 5489, + "time_per_iteration": 2.352830410003662 + }, + { + "auxiliary_loss_clip": 0.01075289, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.01680982, + "balance_loss_mlp": 1.02175522, + "epoch": 0.3300766571471517, + "flos": 25582816742400.0, + "grad_norm": 2.533477887498127, + "language_loss": 0.88714314, + "learning_rate": 3.018050310563474e-06, + "loss": 0.90822685, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.53515625, + "step": 5490, + "time_per_iteration": 2.3791744709014893 + }, + { + "auxiliary_loss_clip": 0.01078381, + "auxiliary_loss_mlp": 0.01033527, + "balance_loss_clip": 1.01782727, + "balance_loss_mlp": 1.02359843, + "epoch": 0.33013678039981964, + "flos": 11362726892160.0, + "grad_norm": 1.9242370586077664, + "language_loss": 0.83497536, + "learning_rate": 3.0177250812103286e-06, + "loss": 0.85609448, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.546875, + "step": 5491, + "time_per_iteration": 2.3475584983825684 + }, + { + "auxiliary_loss_clip": 0.01076319, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.01445293, + "balance_loss_mlp": 1.02417612, + "epoch": 0.3301969036524876, + "flos": 24820208714880.0, + "grad_norm": 1.9356188378003132, + "language_loss": 0.77759326, + "learning_rate": 3.017399815537193e-06, + "loss": 0.79866719, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5234375, + "step": 5492, + "time_per_iteration": 2.4017999172210693 + }, + { + "auxiliary_loss_clip": 0.01082598, + "auxiliary_loss_mlp": 0.01036579, + "balance_loss_clip": 1.01918674, + "balance_loss_mlp": 1.02648592, + "epoch": 0.33025702690515557, + "flos": 15960229568640.0, + "grad_norm": 3.011100252222635, + "language_loss": 0.74424094, + "learning_rate": 3.0170745135556744e-06, + "loss": 0.76543272, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 5493, + "time_per_iteration": 2.3523318767547607 + }, + { + "auxiliary_loss_clip": 0.01012345, + "auxiliary_loss_mlp": 0.01000917, + "balance_loss_clip": 0.99961793, + "balance_loss_mlp": 1.001477, + "epoch": 0.33031715015782354, + "flos": 59413582897920.0, + "grad_norm": 0.7828648661663578, + "language_loss": 0.53932202, + "learning_rate": 3.0167491752773826e-06, + "loss": 0.55945462, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.10839844, + "step": 5494, + "time_per_iteration": 2.961301326751709 + }, + { + "auxiliary_loss_clip": 0.01077667, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.01802456, + "balance_loss_mlp": 1.02586997, + "epoch": 0.3303772734104915, + "flos": 23183270709120.0, + "grad_norm": 5.756944714097515, + "language_loss": 0.8663829, + "learning_rate": 3.0164238007139285e-06, + "loss": 0.88750041, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.515625, + "step": 5495, + "time_per_iteration": 2.3944449424743652 + }, + { + "auxiliary_loss_clip": 0.01082321, + "auxiliary_loss_mlp": 0.01044525, + "balance_loss_clip": 1.02660799, + "balance_loss_mlp": 1.02547014, + "epoch": 0.33043739666315947, + "flos": 33070533966720.0, + "grad_norm": 3.088849380326435, + "language_loss": 0.73474109, + "learning_rate": 3.0160983898769233e-06, + "loss": 0.75600958, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.5703125, + "step": 5496, + "time_per_iteration": 2.476020097732544 + }, + { + "auxiliary_loss_clip": 0.01077013, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.01724291, + "balance_loss_mlp": 1.02416658, + "epoch": 0.33049751991582743, + "flos": 24894399087360.0, + "grad_norm": 2.065212289781025, + "language_loss": 0.72320926, + "learning_rate": 3.015772942777981e-06, + "loss": 0.74429905, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.52734375, + "step": 5497, + "time_per_iteration": 2.3937156200408936 + }, + { + "auxiliary_loss_clip": 0.01080199, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.01635528, + "balance_loss_mlp": 1.02741897, + "epoch": 0.3305576431684954, + "flos": 29459292687360.0, + "grad_norm": 1.8311271164735543, + "language_loss": 0.7991792, + "learning_rate": 3.015447459428714e-06, + "loss": 0.82029504, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.52734375, + "step": 5498, + "time_per_iteration": 2.4372262954711914 + }, + { + "auxiliary_loss_clip": 0.01077133, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.01810145, + "balance_loss_mlp": 1.02355289, + "epoch": 0.33061776642116336, + "flos": 22631363936640.0, + "grad_norm": 2.858740061844805, + "language_loss": 0.76234901, + "learning_rate": 3.01512193984074e-06, + "loss": 0.78346241, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.53515625, + "step": 5499, + "time_per_iteration": 2.391963005065918 + }, + { + "auxiliary_loss_clip": 0.01076669, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01641226, + "balance_loss_mlp": 1.02401519, + "epoch": 0.3306778896738313, + "flos": 25775117844480.0, + "grad_norm": 1.8301505462738015, + "language_loss": 0.79005277, + "learning_rate": 3.0147963840256748e-06, + "loss": 0.81113082, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.52734375, + "step": 5500, + "time_per_iteration": 2.4190316200256348 + }, + { + "auxiliary_loss_clip": 0.01081682, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.01567161, + "balance_loss_mlp": 1.02551925, + "epoch": 0.33073801292649935, + "flos": 36939050121600.0, + "grad_norm": 1.7803486323603586, + "language_loss": 0.6642729, + "learning_rate": 3.0144707919951376e-06, + "loss": 0.68542069, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5625, + "step": 5501, + "time_per_iteration": 2.5176336765289307 + }, + { + "auxiliary_loss_clip": 0.01079887, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.01729608, + "balance_loss_mlp": 1.02398086, + "epoch": 0.3307981361791673, + "flos": 12966951087360.0, + "grad_norm": 2.194220947815171, + "language_loss": 0.7766012, + "learning_rate": 3.014145163760747e-06, + "loss": 0.79775625, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.55859375, + "step": 5502, + "time_per_iteration": 2.33063006401062 + }, + { + "auxiliary_loss_clip": 0.01082904, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.01813066, + "balance_loss_mlp": 1.02699018, + "epoch": 0.3308582594318353, + "flos": 25373197739520.0, + "grad_norm": 1.8803017426210131, + "language_loss": 0.72392511, + "learning_rate": 3.013819499334124e-06, + "loss": 0.74509674, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.55859375, + "step": 5503, + "time_per_iteration": 2.4150242805480957 + }, + { + "auxiliary_loss_clip": 0.01079542, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.01608992, + "balance_loss_mlp": 1.02399158, + "epoch": 0.33091838268450324, + "flos": 26467375749120.0, + "grad_norm": 1.6156589334629954, + "language_loss": 0.75059319, + "learning_rate": 3.0134937987268913e-06, + "loss": 0.77170682, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5546875, + "step": 5504, + "time_per_iteration": 2.43715238571167 + }, + { + "auxiliary_loss_clip": 0.01079161, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.02143955, + "balance_loss_mlp": 1.02522111, + "epoch": 0.3309785059371712, + "flos": 24970055736960.0, + "grad_norm": 7.375256822080118, + "language_loss": 0.69805634, + "learning_rate": 3.013168061950672e-06, + "loss": 0.71922266, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5505, + "time_per_iteration": 2.5208687782287598 + }, + { + "auxiliary_loss_clip": 0.01080602, + "auxiliary_loss_mlp": 0.01040268, + "balance_loss_clip": 1.02393603, + "balance_loss_mlp": 1.026914, + "epoch": 0.3310386291898392, + "flos": 20445731003520.0, + "grad_norm": 1.613642332257, + "language_loss": 0.82061791, + "learning_rate": 3.0128422890170908e-06, + "loss": 0.84182662, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5390625, + "step": 5506, + "time_per_iteration": 3.81140398979187 + }, + { + "auxiliary_loss_clip": 0.01080435, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.01950145, + "balance_loss_mlp": 1.02583778, + "epoch": 0.33109875244250714, + "flos": 23181629875200.0, + "grad_norm": 1.7497305121320694, + "language_loss": 0.79462695, + "learning_rate": 3.0125164799377727e-06, + "loss": 0.81578875, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.546875, + "step": 5507, + "time_per_iteration": 2.378666400909424 + }, + { + "auxiliary_loss_clip": 0.01077581, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.02302921, + "balance_loss_mlp": 1.02317238, + "epoch": 0.3311588756951751, + "flos": 24167297779200.0, + "grad_norm": 1.590986125933765, + "language_loss": 0.71247351, + "learning_rate": 3.0121906347243473e-06, + "loss": 0.73364615, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.54296875, + "step": 5508, + "time_per_iteration": 2.388368844985962 + }, + { + "auxiliary_loss_clip": 0.01079258, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.0145123, + "balance_loss_mlp": 1.02706981, + "epoch": 0.33121899894784307, + "flos": 28144533507840.0, + "grad_norm": 1.7840217270823422, + "language_loss": 0.71811736, + "learning_rate": 3.011864753388441e-06, + "loss": 0.7392056, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.51953125, + "step": 5509, + "time_per_iteration": 2.4253482818603516 + }, + { + "auxiliary_loss_clip": 0.01079044, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.01773834, + "balance_loss_mlp": 1.02436996, + "epoch": 0.33127912220051103, + "flos": 29566441249920.0, + "grad_norm": 1.549841388085175, + "language_loss": 0.73306143, + "learning_rate": 3.0115388359416845e-06, + "loss": 0.75419545, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.546875, + "step": 5510, + "time_per_iteration": 2.436173677444458 + }, + { + "auxiliary_loss_clip": 0.01076664, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.01986647, + "balance_loss_mlp": 1.02390003, + "epoch": 0.331339245453179, + "flos": 14427961418880.0, + "grad_norm": 2.2954665977566537, + "language_loss": 0.87709373, + "learning_rate": 3.011212882395709e-06, + "loss": 0.89822054, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.52734375, + "step": 5511, + "time_per_iteration": 5.153402805328369 + }, + { + "auxiliary_loss_clip": 0.0107445, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01763129, + "balance_loss_mlp": 1.02489877, + "epoch": 0.33139936870584696, + "flos": 20886055470720.0, + "grad_norm": 1.7593741678117978, + "language_loss": 0.73084962, + "learning_rate": 3.010886892762147e-06, + "loss": 0.75190973, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.49609375, + "step": 5512, + "time_per_iteration": 2.3822708129882812 + }, + { + "auxiliary_loss_clip": 0.01078026, + "auxiliary_loss_mlp": 0.01035621, + "balance_loss_clip": 1.02036226, + "balance_loss_mlp": 1.02561522, + "epoch": 0.33145949195851493, + "flos": 36282857518080.0, + "grad_norm": 1.7300419795133566, + "language_loss": 0.72435218, + "learning_rate": 3.0105608670526317e-06, + "loss": 0.74548864, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5234375, + "step": 5513, + "time_per_iteration": 2.4983346462249756 + }, + { + "auxiliary_loss_clip": 0.01081727, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.01943088, + "balance_loss_mlp": 1.0249964, + "epoch": 0.33151961521118295, + "flos": 14278952269440.0, + "grad_norm": 1.9408920491872914, + "language_loss": 0.68344891, + "learning_rate": 3.010234805278799e-06, + "loss": 0.70463836, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.56640625, + "step": 5514, + "time_per_iteration": 2.3421661853790283 + }, + { + "auxiliary_loss_clip": 0.01079897, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.02193093, + "balance_loss_mlp": 1.02536178, + "epoch": 0.3315797384638509, + "flos": 20773356001920.0, + "grad_norm": 2.689355754922417, + "language_loss": 0.66556174, + "learning_rate": 3.0099087074522844e-06, + "loss": 0.68675697, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.546875, + "step": 5515, + "time_per_iteration": 2.378521203994751 + }, + { + "auxiliary_loss_clip": 0.01078409, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.01386523, + "balance_loss_mlp": 1.02416182, + "epoch": 0.3316398617165189, + "flos": 24678356394240.0, + "grad_norm": 1.5437653433032403, + "language_loss": 0.69410121, + "learning_rate": 3.009582573584726e-06, + "loss": 0.71519154, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5390625, + "step": 5516, + "time_per_iteration": 3.8009870052337646 + }, + { + "auxiliary_loss_clip": 0.01078705, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.01446319, + "balance_loss_mlp": 1.02335072, + "epoch": 0.33169998496918685, + "flos": 18586989930240.0, + "grad_norm": 2.5286791546493124, + "language_loss": 0.80638069, + "learning_rate": 3.0092564036877624e-06, + "loss": 0.82748091, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5546875, + "step": 5517, + "time_per_iteration": 2.361835241317749 + }, + { + "auxiliary_loss_clip": 0.0107473, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.01736951, + "balance_loss_mlp": 1.02369392, + "epoch": 0.3317601082218548, + "flos": 20192610579840.0, + "grad_norm": 1.8632426934898658, + "language_loss": 0.74322176, + "learning_rate": 3.0089301977730343e-06, + "loss": 0.76429451, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.51171875, + "step": 5518, + "time_per_iteration": 2.362668514251709 + }, + { + "auxiliary_loss_clip": 0.01013505, + "auxiliary_loss_mlp": 0.01001692, + "balance_loss_clip": 1.00026703, + "balance_loss_mlp": 1.00193334, + "epoch": 0.3318202314745228, + "flos": 68971301032320.0, + "grad_norm": 0.6087780292243, + "language_loss": 0.54339445, + "learning_rate": 3.008603955852182e-06, + "loss": 0.56354642, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.11572266, + "step": 5519, + "time_per_iteration": 3.1202445030212402 + }, + { + "auxiliary_loss_clip": 0.01077966, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01454747, + "balance_loss_mlp": 1.02486324, + "epoch": 0.33188035472719074, + "flos": 21499235412480.0, + "grad_norm": 2.2352145529889387, + "language_loss": 0.78812635, + "learning_rate": 3.00827767793685e-06, + "loss": 0.80922097, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.53125, + "step": 5520, + "time_per_iteration": 2.3668649196624756 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.01472282, + "balance_loss_mlp": 1.02562094, + "epoch": 0.3319404779798587, + "flos": 28869400488960.0, + "grad_norm": 1.677298408178834, + "language_loss": 0.76335812, + "learning_rate": 3.0079513640386806e-06, + "loss": 0.78441596, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5078125, + "step": 5521, + "time_per_iteration": 2.426194667816162 + }, + { + "auxiliary_loss_clip": 0.01078363, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.01630592, + "balance_loss_mlp": 1.02395725, + "epoch": 0.33200060123252667, + "flos": 23075773032960.0, + "grad_norm": 2.390168532822534, + "language_loss": 0.70330912, + "learning_rate": 3.00762501416932e-06, + "loss": 0.72441512, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.54296875, + "step": 5522, + "time_per_iteration": 2.376386880874634 + }, + { + "auxiliary_loss_clip": 0.01076956, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.01673639, + "balance_loss_mlp": 1.02484918, + "epoch": 0.33206072448519464, + "flos": 21141410221440.0, + "grad_norm": 1.7767204860105101, + "language_loss": 0.73826516, + "learning_rate": 3.007298628340414e-06, + "loss": 0.75935268, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.51953125, + "step": 5523, + "time_per_iteration": 2.4023706912994385 + }, + { + "auxiliary_loss_clip": 0.01073963, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.01871657, + "balance_loss_mlp": 1.02310383, + "epoch": 0.3321208477378626, + "flos": 13078254101760.0, + "grad_norm": 1.6769998269019346, + "language_loss": 0.81628752, + "learning_rate": 3.0069722065636114e-06, + "loss": 0.83737737, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5078125, + "step": 5524, + "time_per_iteration": 2.3407528400421143 + }, + { + "auxiliary_loss_clip": 0.0107441, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.01440465, + "balance_loss_mlp": 1.02322602, + "epoch": 0.33218097099053057, + "flos": 21214343784960.0, + "grad_norm": 1.9232459480305069, + "language_loss": 0.82582688, + "learning_rate": 3.006645748850561e-06, + "loss": 0.84686387, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.51171875, + "step": 5525, + "time_per_iteration": 2.3984243869781494 + }, + { + "auxiliary_loss_clip": 0.01012263, + "auxiliary_loss_mlp": 0.01001452, + "balance_loss_clip": 1.00016451, + "balance_loss_mlp": 1.00131154, + "epoch": 0.33224109424319853, + "flos": 64345483376640.0, + "grad_norm": 0.765381682479276, + "language_loss": 0.52468234, + "learning_rate": 3.006319255212913e-06, + "loss": 0.54481953, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.109375, + "step": 5526, + "time_per_iteration": 2.991244316101074 + }, + { + "auxiliary_loss_clip": 0.01078519, + "auxiliary_loss_mlp": 0.01038393, + "balance_loss_clip": 1.02202523, + "balance_loss_mlp": 1.02494764, + "epoch": 0.33230121749586655, + "flos": 17345094491520.0, + "grad_norm": 1.992856585999236, + "language_loss": 0.77381909, + "learning_rate": 3.0059927256623195e-06, + "loss": 0.79498827, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53515625, + "step": 5527, + "time_per_iteration": 2.3451356887817383 + }, + { + "auxiliary_loss_clip": 0.01079503, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.01830423, + "balance_loss_mlp": 1.02678227, + "epoch": 0.3323613407485345, + "flos": 20995962030720.0, + "grad_norm": 2.2897982068716547, + "language_loss": 0.71828073, + "learning_rate": 3.005666160210434e-06, + "loss": 0.73940748, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.52734375, + "step": 5528, + "time_per_iteration": 2.368333101272583 + }, + { + "auxiliary_loss_clip": 0.01076571, + "auxiliary_loss_mlp": 0.01025358, + "balance_loss_clip": 1.0109458, + "balance_loss_mlp": 1.02340984, + "epoch": 0.3324214640012025, + "flos": 13151676424320.0, + "grad_norm": 1.6105959128340976, + "language_loss": 0.82893622, + "learning_rate": 3.005339558868909e-06, + "loss": 0.8499555, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.53125, + "step": 5529, + "time_per_iteration": 2.357602834701538 + }, + { + "auxiliary_loss_clip": 0.01079475, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.01735663, + "balance_loss_mlp": 1.02448368, + "epoch": 0.33248158725387045, + "flos": 22272421582080.0, + "grad_norm": 2.561172822028531, + "language_loss": 0.69755512, + "learning_rate": 3.0050129216494017e-06, + "loss": 0.71868747, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 5530, + "time_per_iteration": 2.370915412902832 + }, + { + "auxiliary_loss_clip": 0.0107758, + "auxiliary_loss_mlp": 0.01037905, + "balance_loss_clip": 1.02079821, + "balance_loss_mlp": 1.02417314, + "epoch": 0.3325417105065384, + "flos": 20739943964160.0, + "grad_norm": 2.457266075947352, + "language_loss": 0.76323688, + "learning_rate": 3.004686248563569e-06, + "loss": 0.78439176, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.53515625, + "step": 5531, + "time_per_iteration": 2.375516414642334 + }, + { + "auxiliary_loss_clip": 0.0107839, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.01682234, + "balance_loss_mlp": 1.02428508, + "epoch": 0.3326018337592064, + "flos": 24789380117760.0, + "grad_norm": 1.9400348786361958, + "language_loss": 0.7339288, + "learning_rate": 3.0043595396230675e-06, + "loss": 0.75503206, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5390625, + "step": 5532, + "time_per_iteration": 2.408520460128784 + }, + { + "auxiliary_loss_clip": 0.01076965, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.01318359, + "balance_loss_mlp": 1.02466965, + "epoch": 0.33266195701187434, + "flos": 14500825159680.0, + "grad_norm": 2.0078269637567883, + "language_loss": 0.65814972, + "learning_rate": 3.004032794839558e-06, + "loss": 0.67920512, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5234375, + "step": 5533, + "time_per_iteration": 2.3536040782928467 + }, + { + "auxiliary_loss_clip": 0.01078917, + "auxiliary_loss_mlp": 0.01029114, + "balance_loss_clip": 1.01380777, + "balance_loss_mlp": 1.02548361, + "epoch": 0.3327220802645423, + "flos": 15303513294720.0, + "grad_norm": 2.0519894762515407, + "language_loss": 0.71043754, + "learning_rate": 3.0037060142247006e-06, + "loss": 0.73151791, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.53515625, + "step": 5534, + "time_per_iteration": 2.3512964248657227 + }, + { + "auxiliary_loss_clip": 0.01075729, + "auxiliary_loss_mlp": 0.01026959, + "balance_loss_clip": 1.0120821, + "balance_loss_mlp": 1.02500868, + "epoch": 0.3327822035172103, + "flos": 23476401417600.0, + "grad_norm": 2.238261373473902, + "language_loss": 0.66874146, + "learning_rate": 3.0033791977901582e-06, + "loss": 0.68976831, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 5535, + "time_per_iteration": 2.3810787200927734 + }, + { + "auxiliary_loss_clip": 0.01076021, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01820302, + "balance_loss_mlp": 1.02380037, + "epoch": 0.33284232676987824, + "flos": 25373337384960.0, + "grad_norm": 3.5856777647169307, + "language_loss": 0.72643864, + "learning_rate": 3.0030523455475923e-06, + "loss": 0.74753129, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5234375, + "step": 5536, + "time_per_iteration": 2.4156932830810547 + }, + { + "auxiliary_loss_clip": 0.01075153, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.01839089, + "balance_loss_mlp": 1.02250683, + "epoch": 0.3329024500225462, + "flos": 23693281983360.0, + "grad_norm": 1.7974857731830458, + "language_loss": 0.81679094, + "learning_rate": 3.0027254575086683e-06, + "loss": 0.83787668, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.52734375, + "step": 5537, + "time_per_iteration": 2.4111626148223877 + }, + { + "auxiliary_loss_clip": 0.01081624, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.01918864, + "balance_loss_mlp": 1.02702272, + "epoch": 0.33296257327521417, + "flos": 31721804167680.0, + "grad_norm": 3.531693637137087, + "language_loss": 0.713952, + "learning_rate": 3.0023985336850526e-06, + "loss": 0.73511702, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.546875, + "step": 5538, + "time_per_iteration": 2.465549945831299 + }, + { + "auxiliary_loss_clip": 0.01074623, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.01397192, + "balance_loss_mlp": 1.02383459, + "epoch": 0.33302269652788213, + "flos": 22743679380480.0, + "grad_norm": 1.7361844059001326, + "language_loss": 0.74090689, + "learning_rate": 3.0020715740884112e-06, + "loss": 0.76193959, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5078125, + "step": 5539, + "time_per_iteration": 2.3753397464752197 + }, + { + "auxiliary_loss_clip": 0.01079784, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.01948261, + "balance_loss_mlp": 1.02391446, + "epoch": 0.33308281978055015, + "flos": 11472947654400.0, + "grad_norm": 2.257751568342649, + "language_loss": 0.71164829, + "learning_rate": 3.001744578730413e-06, + "loss": 0.73280561, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.55859375, + "step": 5540, + "time_per_iteration": 2.357773542404175 + }, + { + "auxiliary_loss_clip": 0.01073691, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.01273119, + "balance_loss_mlp": 1.02216721, + "epoch": 0.3331429430332181, + "flos": 38212262916480.0, + "grad_norm": 1.6121686349596964, + "language_loss": 0.60606539, + "learning_rate": 3.0014175476227284e-06, + "loss": 0.62707698, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.515625, + "step": 5541, + "time_per_iteration": 2.5146448612213135 + }, + { + "auxiliary_loss_clip": 0.01076282, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.01733828, + "balance_loss_mlp": 1.02213919, + "epoch": 0.3332030662858861, + "flos": 22527566864640.0, + "grad_norm": 2.82336732415007, + "language_loss": 0.68974257, + "learning_rate": 3.0010904807770267e-06, + "loss": 0.71083981, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5390625, + "step": 5542, + "time_per_iteration": 2.381967544555664 + }, + { + "auxiliary_loss_clip": 0.0107615, + "auxiliary_loss_mlp": 0.01030972, + "balance_loss_clip": 1.01657724, + "balance_loss_mlp": 1.02423632, + "epoch": 0.33326318953855405, + "flos": 15996853451520.0, + "grad_norm": 1.5508973421615084, + "language_loss": 0.75223792, + "learning_rate": 3.0007633782049808e-06, + "loss": 0.77330911, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.51953125, + "step": 5543, + "time_per_iteration": 2.347586154937744 + }, + { + "auxiliary_loss_clip": 0.01079862, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.01336658, + "balance_loss_mlp": 1.02651429, + "epoch": 0.333323312791222, + "flos": 25592347543680.0, + "grad_norm": 37.04893001160876, + "language_loss": 0.74965572, + "learning_rate": 3.000436239918264e-06, + "loss": 0.77074289, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.53125, + "step": 5544, + "time_per_iteration": 2.416640281677246 + }, + { + "auxiliary_loss_clip": 0.01073255, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.01365125, + "balance_loss_mlp": 1.02256787, + "epoch": 0.33338343604389, + "flos": 25118366659200.0, + "grad_norm": 2.065096202813144, + "language_loss": 0.70477557, + "learning_rate": 3.0001090659285514e-06, + "loss": 0.72579312, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 5545, + "time_per_iteration": 2.3997411727905273 + }, + { + "auxiliary_loss_clip": 0.01073648, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.01487279, + "balance_loss_mlp": 1.02298927, + "epoch": 0.33344355929655795, + "flos": 16946316408960.0, + "grad_norm": 1.9542117608094833, + "language_loss": 0.69310373, + "learning_rate": 2.9997818562475194e-06, + "loss": 0.71413547, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5078125, + "step": 5546, + "time_per_iteration": 3.830307722091675 + }, + { + "auxiliary_loss_clip": 0.0107685, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.01484632, + "balance_loss_mlp": 1.02271593, + "epoch": 0.3335036825492259, + "flos": 27888410707200.0, + "grad_norm": 1.530255302838695, + "language_loss": 0.64674079, + "learning_rate": 2.999454610886844e-06, + "loss": 0.66781807, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5547, + "time_per_iteration": 2.445080280303955 + }, + { + "auxiliary_loss_clip": 0.01073803, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.01397347, + "balance_loss_mlp": 1.02250731, + "epoch": 0.3335638058018939, + "flos": 16178646234240.0, + "grad_norm": 2.3783957839733847, + "language_loss": 0.84989339, + "learning_rate": 2.999127329858205e-06, + "loss": 0.87091547, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.51171875, + "step": 5548, + "time_per_iteration": 2.3369579315185547 + }, + { + "auxiliary_loss_clip": 0.01075009, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01317048, + "balance_loss_mlp": 1.02133405, + "epoch": 0.33362392905456184, + "flos": 39894517733760.0, + "grad_norm": 2.024635341949424, + "language_loss": 0.74164677, + "learning_rate": 2.9988000131732813e-06, + "loss": 0.76268518, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 5549, + "time_per_iteration": 2.5600130558013916 + }, + { + "auxiliary_loss_clip": 0.01077087, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.01602232, + "balance_loss_mlp": 1.02468634, + "epoch": 0.3336840523072298, + "flos": 44269588938240.0, + "grad_norm": 2.005761780232257, + "language_loss": 0.68795037, + "learning_rate": 2.998472660843755e-06, + "loss": 0.70903265, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5234375, + "step": 5550, + "time_per_iteration": 4.060149669647217 + }, + { + "auxiliary_loss_clip": 0.01074862, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.01628661, + "balance_loss_mlp": 1.02353942, + "epoch": 0.33374417555989777, + "flos": 15084782426880.0, + "grad_norm": 1.7187397239201687, + "language_loss": 0.60497022, + "learning_rate": 2.998145272881307e-06, + "loss": 0.62602764, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.51171875, + "step": 5551, + "time_per_iteration": 3.7055156230926514 + }, + { + "auxiliary_loss_clip": 0.01074716, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.01322913, + "balance_loss_mlp": 1.02368164, + "epoch": 0.33380429881256574, + "flos": 15848333061120.0, + "grad_norm": 1.6057997482365778, + "language_loss": 0.70380235, + "learning_rate": 2.997817849297622e-06, + "loss": 0.72483003, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.51171875, + "step": 5552, + "time_per_iteration": 2.349670886993408 + }, + { + "auxiliary_loss_clip": 0.01075229, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01615965, + "balance_loss_mlp": 1.02256203, + "epoch": 0.33386442206523376, + "flos": 13479475979520.0, + "grad_norm": 2.0648316931796185, + "language_loss": 0.83294916, + "learning_rate": 2.997490390104385e-06, + "loss": 0.85401082, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.52734375, + "step": 5553, + "time_per_iteration": 2.3437111377716064 + }, + { + "auxiliary_loss_clip": 0.01076369, + "auxiliary_loss_mlp": 0.01037392, + "balance_loss_clip": 1.02192426, + "balance_loss_mlp": 1.0237931, + "epoch": 0.3339245453179017, + "flos": 16689739760640.0, + "grad_norm": 1.8340308159497167, + "language_loss": 0.81052446, + "learning_rate": 2.9971628953132815e-06, + "loss": 0.83166212, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5234375, + "step": 5554, + "time_per_iteration": 2.3397092819213867 + }, + { + "auxiliary_loss_clip": 0.01074086, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.01522732, + "balance_loss_mlp": 1.02320063, + "epoch": 0.3339846685705697, + "flos": 24609402725760.0, + "grad_norm": 1.519066197738685, + "language_loss": 0.81127423, + "learning_rate": 2.9968353649359996e-06, + "loss": 0.83231908, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 5555, + "time_per_iteration": 2.426910161972046 + }, + { + "auxiliary_loss_clip": 0.01073358, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.01359582, + "balance_loss_mlp": 1.02268112, + "epoch": 0.33404479182323765, + "flos": 30952562981760.0, + "grad_norm": 1.6786413419295263, + "language_loss": 0.74415982, + "learning_rate": 2.996507798984227e-06, + "loss": 0.76516962, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.5078125, + "step": 5556, + "time_per_iteration": 3.8798019886016846 + }, + { + "auxiliary_loss_clip": 0.01074321, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.01603329, + "balance_loss_mlp": 1.02379119, + "epoch": 0.3341049150759056, + "flos": 23512187427840.0, + "grad_norm": 1.8836318491891912, + "language_loss": 0.82044494, + "learning_rate": 2.9961801974696546e-06, + "loss": 0.8414914, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.5078125, + "step": 5557, + "time_per_iteration": 2.3874154090881348 + }, + { + "auxiliary_loss_clip": 0.01075775, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.01826465, + "balance_loss_mlp": 1.02358103, + "epoch": 0.3341650383285736, + "flos": 24025620015360.0, + "grad_norm": 2.2757069998563657, + "language_loss": 0.80175179, + "learning_rate": 2.995852560403974e-06, + "loss": 0.82283926, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5234375, + "step": 5558, + "time_per_iteration": 2.4520602226257324 + }, + { + "auxiliary_loss_clip": 0.01074148, + "auxiliary_loss_mlp": 0.01028048, + "balance_loss_clip": 1.01390362, + "balance_loss_mlp": 1.02303123, + "epoch": 0.33422516158124155, + "flos": 24900752954880.0, + "grad_norm": 1.7410947955627079, + "language_loss": 0.8146323, + "learning_rate": 2.9955248877988767e-06, + "loss": 0.83565426, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.51171875, + "step": 5559, + "time_per_iteration": 2.4129600524902344 + }, + { + "auxiliary_loss_clip": 0.01074564, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.01679003, + "balance_loss_mlp": 1.02415633, + "epoch": 0.3342852848339095, + "flos": 18332403229440.0, + "grad_norm": 3.4040807078479327, + "language_loss": 0.7177844, + "learning_rate": 2.9951971796660565e-06, + "loss": 0.73884928, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.50390625, + "step": 5560, + "time_per_iteration": 2.368154287338257 + }, + { + "auxiliary_loss_clip": 0.01077398, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.01735377, + "balance_loss_mlp": 1.02394676, + "epoch": 0.3343454080865775, + "flos": 30045170079360.0, + "grad_norm": 1.5090462505936386, + "language_loss": 0.75033867, + "learning_rate": 2.994869436017209e-06, + "loss": 0.77143919, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53515625, + "step": 5561, + "time_per_iteration": 2.4563941955566406 + }, + { + "auxiliary_loss_clip": 0.01075304, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.0162518, + "balance_loss_mlp": 1.02332568, + "epoch": 0.33440553133924544, + "flos": 16397900772480.0, + "grad_norm": 1.636777793952095, + "language_loss": 0.69598168, + "learning_rate": 2.9945416568640314e-06, + "loss": 0.71704608, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.51953125, + "step": 5562, + "time_per_iteration": 2.3684780597686768 + }, + { + "auxiliary_loss_clip": 0.0107414, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.01791883, + "balance_loss_mlp": 1.02342868, + "epoch": 0.3344656545919134, + "flos": 24240964481280.0, + "grad_norm": 2.016993099031963, + "language_loss": 0.64630532, + "learning_rate": 2.99421384221822e-06, + "loss": 0.66736472, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.5078125, + "step": 5563, + "time_per_iteration": 2.391129493713379 + }, + { + "auxiliary_loss_clip": 0.01078389, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.02034795, + "balance_loss_mlp": 1.02499962, + "epoch": 0.3345257778445814, + "flos": 52116911832960.0, + "grad_norm": 4.5676194274434065, + "language_loss": 0.74270809, + "learning_rate": 2.9938859920914735e-06, + "loss": 0.76385391, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53125, + "step": 5564, + "time_per_iteration": 2.6527907848358154 + }, + { + "auxiliary_loss_clip": 0.01016207, + "auxiliary_loss_mlp": 0.01003749, + "balance_loss_clip": 1.00217545, + "balance_loss_mlp": 1.00512195, + "epoch": 0.33458590109724934, + "flos": 68045614577280.0, + "grad_norm": 0.7741006322130335, + "language_loss": 0.55468792, + "learning_rate": 2.9935581064954934e-06, + "loss": 0.57488745, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.11083984, + "step": 5565, + "time_per_iteration": 2.97763991355896 + }, + { + "auxiliary_loss_clip": 0.01073709, + "auxiliary_loss_mlp": 0.01028739, + "balance_loss_clip": 1.01495206, + "balance_loss_mlp": 1.02421498, + "epoch": 0.3346460243499173, + "flos": 37413275385600.0, + "grad_norm": 2.1109905326199274, + "language_loss": 0.6467492, + "learning_rate": 2.9932301854419794e-06, + "loss": 0.66777366, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.49414062, + "step": 5566, + "time_per_iteration": 2.514491081237793 + }, + { + "auxiliary_loss_clip": 0.01075475, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01588249, + "balance_loss_mlp": 1.02459669, + "epoch": 0.3347061476025853, + "flos": 18696372819840.0, + "grad_norm": 2.2253847357918835, + "language_loss": 0.77230692, + "learning_rate": 2.9929022289426352e-06, + "loss": 0.79337156, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5078125, + "step": 5567, + "time_per_iteration": 2.34729266166687 + }, + { + "auxiliary_loss_clip": 0.01076869, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.01514792, + "balance_loss_mlp": 1.02460814, + "epoch": 0.3347662708552533, + "flos": 13916972626560.0, + "grad_norm": 2.313692891187955, + "language_loss": 0.75705385, + "learning_rate": 2.9925742370091645e-06, + "loss": 0.77812767, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.51953125, + "step": 5568, + "time_per_iteration": 2.3547399044036865 + }, + { + "auxiliary_loss_clip": 0.01076316, + "auxiliary_loss_mlp": 0.0103273, + "balance_loss_clip": 1.01811469, + "balance_loss_mlp": 1.02347338, + "epoch": 0.33482639410792125, + "flos": 19749528115200.0, + "grad_norm": 2.0441753961713163, + "language_loss": 0.70955795, + "learning_rate": 2.992246209653272e-06, + "loss": 0.7306484, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.52734375, + "step": 5569, + "time_per_iteration": 2.354771137237549 + }, + { + "auxiliary_loss_clip": 0.01076972, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.01256418, + "balance_loss_mlp": 1.02347755, + "epoch": 0.3348865173605892, + "flos": 16102186623360.0, + "grad_norm": 2.198998616787981, + "language_loss": 0.89482254, + "learning_rate": 2.9919181468866653e-06, + "loss": 0.91587257, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53125, + "step": 5570, + "time_per_iteration": 2.352205991744995 + }, + { + "auxiliary_loss_clip": 0.01074117, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.01072288, + "balance_loss_mlp": 1.02277756, + "epoch": 0.3349466406132572, + "flos": 25007796783360.0, + "grad_norm": 2.4539795075892705, + "language_loss": 0.79679501, + "learning_rate": 2.9915900487210514e-06, + "loss": 0.81779206, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.515625, + "step": 5571, + "time_per_iteration": 2.400282382965088 + }, + { + "auxiliary_loss_clip": 0.01015455, + "auxiliary_loss_mlp": 0.01005221, + "balance_loss_clip": 1.00393963, + "balance_loss_mlp": 1.00460982, + "epoch": 0.33500676386592515, + "flos": 54316647089280.0, + "grad_norm": 0.9081892765750889, + "language_loss": 0.55968076, + "learning_rate": 2.991261915168139e-06, + "loss": 0.57988751, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.10839844, + "step": 5572, + "time_per_iteration": 2.953749895095825 + }, + { + "auxiliary_loss_clip": 0.01076335, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.0207938, + "balance_loss_mlp": 1.02485335, + "epoch": 0.3350668871185931, + "flos": 26796117911040.0, + "grad_norm": 2.078657938560479, + "language_loss": 0.7806412, + "learning_rate": 2.990933746239639e-06, + "loss": 0.8017534, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.515625, + "step": 5573, + "time_per_iteration": 2.4105515480041504 + }, + { + "auxiliary_loss_clip": 0.0107709, + "auxiliary_loss_mlp": 0.01038027, + "balance_loss_clip": 1.02286911, + "balance_loss_mlp": 1.02408218, + "epoch": 0.3351270103712611, + "flos": 33509112865920.0, + "grad_norm": 2.509549001053669, + "language_loss": 0.72811186, + "learning_rate": 2.9906055419472622e-06, + "loss": 0.74926305, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.53125, + "step": 5574, + "time_per_iteration": 2.4823031425476074 + }, + { + "auxiliary_loss_clip": 0.01073969, + "auxiliary_loss_mlp": 0.01032803, + "balance_loss_clip": 1.01743698, + "balance_loss_mlp": 1.02348042, + "epoch": 0.33518713362392905, + "flos": 26505012061440.0, + "grad_norm": 1.6871356426871273, + "language_loss": 0.87748444, + "learning_rate": 2.9902773023027224e-06, + "loss": 0.89855218, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.50390625, + "step": 5575, + "time_per_iteration": 2.4122045040130615 + }, + { + "auxiliary_loss_clip": 0.01078844, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.01659584, + "balance_loss_mlp": 1.02330542, + "epoch": 0.335247256876597, + "flos": 17231557150080.0, + "grad_norm": 6.901726321552094, + "language_loss": 0.82656181, + "learning_rate": 2.9899490273177327e-06, + "loss": 0.8476879, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5546875, + "step": 5576, + "time_per_iteration": 2.358658790588379 + }, + { + "auxiliary_loss_clip": 0.0107583, + "auxiliary_loss_mlp": 0.01031376, + "balance_loss_clip": 1.01499701, + "balance_loss_mlp": 1.02272081, + "epoch": 0.335307380129265, + "flos": 25628203376640.0, + "grad_norm": 2.250754527870607, + "language_loss": 0.72949106, + "learning_rate": 2.9896207170040084e-06, + "loss": 0.75056314, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5577, + "time_per_iteration": 2.4029417037963867 + }, + { + "auxiliary_loss_clip": 0.01078041, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.01322389, + "balance_loss_mlp": 1.02676105, + "epoch": 0.33536750338193294, + "flos": 19679143080960.0, + "grad_norm": 1.8074639118780211, + "language_loss": 0.81616819, + "learning_rate": 2.989292371373266e-06, + "loss": 0.83724523, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.51171875, + "step": 5578, + "time_per_iteration": 2.389028310775757 + }, + { + "auxiliary_loss_clip": 0.01074314, + "auxiliary_loss_mlp": 0.01028994, + "balance_loss_clip": 1.01451552, + "balance_loss_mlp": 1.0248735, + "epoch": 0.3354276266346009, + "flos": 18331635179520.0, + "grad_norm": 1.6912751610589036, + "language_loss": 0.71834564, + "learning_rate": 2.9889639904372246e-06, + "loss": 0.73937869, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49414062, + "step": 5579, + "time_per_iteration": 2.3517847061157227 + }, + { + "auxiliary_loss_clip": 0.01079201, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.01599455, + "balance_loss_mlp": 1.02580476, + "epoch": 0.3354877498872689, + "flos": 17857584472320.0, + "grad_norm": 2.3857803422952277, + "language_loss": 0.81320035, + "learning_rate": 2.988635574207602e-06, + "loss": 0.8343128, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53125, + "step": 5580, + "time_per_iteration": 2.383861780166626 + }, + { + "auxiliary_loss_clip": 0.01077778, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.01293325, + "balance_loss_mlp": 1.02508652, + "epoch": 0.3355478731399369, + "flos": 24716586199680.0, + "grad_norm": 2.567139607894021, + "language_loss": 0.77625459, + "learning_rate": 2.988307122696119e-06, + "loss": 0.79731727, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.52734375, + "step": 5581, + "time_per_iteration": 2.416560649871826 + }, + { + "auxiliary_loss_clip": 0.01081128, + "auxiliary_loss_mlp": 0.01034889, + "balance_loss_clip": 1.0163753, + "balance_loss_mlp": 1.02548647, + "epoch": 0.33560799639260486, + "flos": 16872928997760.0, + "grad_norm": 2.4729237608761654, + "language_loss": 0.74867547, + "learning_rate": 2.9879786359144967e-06, + "loss": 0.76983559, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.5546875, + "step": 5582, + "time_per_iteration": 2.352843999862671 + }, + { + "auxiliary_loss_clip": 0.01075789, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.01754141, + "balance_loss_mlp": 1.02315283, + "epoch": 0.3356681196452728, + "flos": 18332507963520.0, + "grad_norm": 1.619946468347716, + "language_loss": 0.82264602, + "learning_rate": 2.9876501138744577e-06, + "loss": 0.84373367, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.52734375, + "step": 5583, + "time_per_iteration": 2.373098611831665 + }, + { + "auxiliary_loss_clip": 0.01077088, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.01961994, + "balance_loss_mlp": 1.02562809, + "epoch": 0.3357282428979408, + "flos": 34749192913920.0, + "grad_norm": 2.1014313550016785, + "language_loss": 0.77570271, + "learning_rate": 2.9873215565877274e-06, + "loss": 0.79682177, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.515625, + "step": 5584, + "time_per_iteration": 2.4951212406158447 + }, + { + "auxiliary_loss_clip": 0.01077104, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.01571941, + "balance_loss_mlp": 1.02345395, + "epoch": 0.33578836615060875, + "flos": 21579011602560.0, + "grad_norm": 2.3506498722560765, + "language_loss": 0.76042569, + "learning_rate": 2.9869929640660303e-06, + "loss": 0.78151071, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5390625, + "step": 5585, + "time_per_iteration": 2.3909871578216553 + }, + { + "auxiliary_loss_clip": 0.0107393, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.01537824, + "balance_loss_mlp": 1.023453, + "epoch": 0.3358484894032767, + "flos": 24529277422080.0, + "grad_norm": 1.4681236171573067, + "language_loss": 0.77592355, + "learning_rate": 2.9866643363210928e-06, + "loss": 0.79695916, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.50390625, + "step": 5586, + "time_per_iteration": 3.894225597381592 + }, + { + "auxiliary_loss_clip": 0.01080874, + "auxiliary_loss_mlp": 0.01038859, + "balance_loss_clip": 1.02164531, + "balance_loss_mlp": 1.02675009, + "epoch": 0.3359086126559447, + "flos": 22454493655680.0, + "grad_norm": 2.1479307669552004, + "language_loss": 0.80926239, + "learning_rate": 2.9863356733646437e-06, + "loss": 0.83045971, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5390625, + "step": 5587, + "time_per_iteration": 2.38244366645813 + }, + { + "auxiliary_loss_clip": 0.01073225, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.01838279, + "balance_loss_mlp": 1.02423763, + "epoch": 0.33596873590861265, + "flos": 16542790381440.0, + "grad_norm": 1.9372760399643874, + "language_loss": 0.66947579, + "learning_rate": 2.9860069752084115e-06, + "loss": 0.69053209, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49023438, + "step": 5588, + "time_per_iteration": 2.36731219291687 + }, + { + "auxiliary_loss_clip": 0.01076421, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.02563763, + "balance_loss_mlp": 1.0249002, + "epoch": 0.3360288591612806, + "flos": 31174470783360.0, + "grad_norm": 1.9303552909992168, + "language_loss": 0.69827455, + "learning_rate": 2.985678241864126e-06, + "loss": 0.71945071, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.515625, + "step": 5589, + "time_per_iteration": 2.453322410583496 + }, + { + "auxiliary_loss_clip": 0.01075426, + "auxiliary_loss_mlp": 0.0103508, + "balance_loss_clip": 1.01866531, + "balance_loss_mlp": 1.02233672, + "epoch": 0.3360889824139486, + "flos": 23695760689920.0, + "grad_norm": 1.6541257405760252, + "language_loss": 0.6743663, + "learning_rate": 2.9853494733435204e-06, + "loss": 0.69547141, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5590, + "time_per_iteration": 3.856215715408325 + }, + { + "auxiliary_loss_clip": 0.01072904, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.01961875, + "balance_loss_mlp": 1.02358758, + "epoch": 0.33614910566661654, + "flos": 19317093615360.0, + "grad_norm": 2.368900515365514, + "language_loss": 0.73454851, + "learning_rate": 2.985020669658326e-06, + "loss": 0.75563049, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.49414062, + "step": 5591, + "time_per_iteration": 2.393751621246338 + }, + { + "auxiliary_loss_clip": 0.01075056, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.01710773, + "balance_loss_mlp": 1.02314603, + "epoch": 0.3362092289192845, + "flos": 16471323095040.0, + "grad_norm": 2.0580623479939204, + "language_loss": 0.69646275, + "learning_rate": 2.984691830820278e-06, + "loss": 0.71753097, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.515625, + "step": 5592, + "time_per_iteration": 2.3774869441986084 + }, + { + "auxiliary_loss_clip": 0.01071907, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.01660371, + "balance_loss_mlp": 1.02291036, + "epoch": 0.33626935217195253, + "flos": 24242430758400.0, + "grad_norm": 2.658665761255701, + "language_loss": 0.76120383, + "learning_rate": 2.9843629568411114e-06, + "loss": 0.78223014, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48828125, + "step": 5593, + "time_per_iteration": 2.41409969329834 + }, + { + "auxiliary_loss_clip": 0.01074446, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.016348, + "balance_loss_mlp": 1.02114677, + "epoch": 0.3363294754246205, + "flos": 19717756911360.0, + "grad_norm": 1.9364391321455898, + "language_loss": 0.71527827, + "learning_rate": 2.984034047732563e-06, + "loss": 0.73635936, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.53125, + "step": 5594, + "time_per_iteration": 2.393780469894409 + }, + { + "auxiliary_loss_clip": 0.01078368, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.01581812, + "balance_loss_mlp": 1.02515757, + "epoch": 0.33638959867728846, + "flos": 22595333546880.0, + "grad_norm": 4.823768842939805, + "language_loss": 0.80069888, + "learning_rate": 2.983705103506371e-06, + "loss": 0.82179904, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53125, + "step": 5595, + "time_per_iteration": 3.772585153579712 + }, + { + "auxiliary_loss_clip": 0.0107574, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.01576948, + "balance_loss_mlp": 1.02415037, + "epoch": 0.3364497219299564, + "flos": 20993727703680.0, + "grad_norm": 3.183615868305529, + "language_loss": 0.81332552, + "learning_rate": 2.983376124174274e-06, + "loss": 0.83438021, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.515625, + "step": 5596, + "time_per_iteration": 2.4326045513153076 + }, + { + "auxiliary_loss_clip": 0.01075532, + "auxiliary_loss_mlp": 0.01026483, + "balance_loss_clip": 1.01266718, + "balance_loss_mlp": 1.02411282, + "epoch": 0.3365098451826244, + "flos": 25227435346560.0, + "grad_norm": 1.6261324892571685, + "language_loss": 0.75755507, + "learning_rate": 2.9830471097480133e-06, + "loss": 0.77857518, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.51171875, + "step": 5597, + "time_per_iteration": 2.4832892417907715 + }, + { + "auxiliary_loss_clip": 0.01074414, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.01399422, + "balance_loss_mlp": 1.02418804, + "epoch": 0.33656996843529235, + "flos": 24570544515840.0, + "grad_norm": 1.763349238154856, + "language_loss": 0.78587317, + "learning_rate": 2.982718060239329e-06, + "loss": 0.80689836, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.5, + "step": 5598, + "time_per_iteration": 2.4460082054138184 + }, + { + "auxiliary_loss_clip": 0.01079467, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01379418, + "balance_loss_mlp": 1.02409256, + "epoch": 0.3366300916879603, + "flos": 44089436989440.0, + "grad_norm": 6.318534761904872, + "language_loss": 0.64851892, + "learning_rate": 2.9823889756599652e-06, + "loss": 0.66961539, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5546875, + "step": 5599, + "time_per_iteration": 2.5885472297668457 + }, + { + "auxiliary_loss_clip": 0.01081874, + "auxiliary_loss_mlp": 0.0103895, + "balance_loss_clip": 1.02197492, + "balance_loss_mlp": 1.02561009, + "epoch": 0.3366902149406283, + "flos": 13879057023360.0, + "grad_norm": 3.913644062184387, + "language_loss": 0.82308364, + "learning_rate": 2.9820598560216653e-06, + "loss": 0.84429187, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5625, + "step": 5600, + "time_per_iteration": 2.360757350921631 + }, + { + "auxiliary_loss_clip": 0.01077675, + "auxiliary_loss_mlp": 0.01035691, + "balance_loss_clip": 1.0187397, + "balance_loss_mlp": 1.02370954, + "epoch": 0.33675033819329625, + "flos": 16252173290880.0, + "grad_norm": 2.5046643143532537, + "language_loss": 0.83401078, + "learning_rate": 2.9817307013361764e-06, + "loss": 0.85514444, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5390625, + "step": 5601, + "time_per_iteration": 2.3644134998321533 + }, + { + "auxiliary_loss_clip": 0.01076271, + "auxiliary_loss_mlp": 0.01028827, + "balance_loss_clip": 1.01521301, + "balance_loss_mlp": 1.02712429, + "epoch": 0.3368104614459642, + "flos": 17054861425920.0, + "grad_norm": 1.861699238392671, + "language_loss": 0.83444321, + "learning_rate": 2.9814015116152437e-06, + "loss": 0.85549414, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4921875, + "step": 5602, + "time_per_iteration": 2.3646373748779297 + }, + { + "auxiliary_loss_clip": 0.01077992, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01659739, + "balance_loss_mlp": 1.02583826, + "epoch": 0.3368705846986322, + "flos": 17857654295040.0, + "grad_norm": 2.0427291698062997, + "language_loss": 0.69899702, + "learning_rate": 2.9810722868706154e-06, + "loss": 0.72010434, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.51953125, + "step": 5603, + "time_per_iteration": 2.3502278327941895 + }, + { + "auxiliary_loss_clip": 0.01079199, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01384962, + "balance_loss_mlp": 1.0249238, + "epoch": 0.33693070795130015, + "flos": 22928404717440.0, + "grad_norm": 1.4579416458193406, + "language_loss": 0.83036739, + "learning_rate": 2.980743027114041e-06, + "loss": 0.85146081, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.54296875, + "step": 5604, + "time_per_iteration": 2.3966259956359863 + }, + { + "auxiliary_loss_clip": 0.01077618, + "auxiliary_loss_mlp": 0.01028898, + "balance_loss_clip": 1.01315093, + "balance_loss_mlp": 1.02480912, + "epoch": 0.3369908312039681, + "flos": 22016368604160.0, + "grad_norm": 1.3949106231684925, + "language_loss": 0.73859751, + "learning_rate": 2.98041373235727e-06, + "loss": 0.75966263, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.52734375, + "step": 5605, + "time_per_iteration": 2.3924612998962402 + }, + { + "auxiliary_loss_clip": 0.01078442, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.01965797, + "balance_loss_mlp": 1.02344894, + "epoch": 0.33705095445663613, + "flos": 11801166145920.0, + "grad_norm": 2.4049308667503704, + "language_loss": 0.83449692, + "learning_rate": 2.980084402612056e-06, + "loss": 0.85564131, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.55078125, + "step": 5606, + "time_per_iteration": 2.351778984069824 + }, + { + "auxiliary_loss_clip": 0.01074129, + "auxiliary_loss_mlp": 0.01029338, + "balance_loss_clip": 1.01455641, + "balance_loss_mlp": 1.0226649, + "epoch": 0.3371110777093041, + "flos": 25045223627520.0, + "grad_norm": 1.560513435668126, + "language_loss": 0.67990649, + "learning_rate": 2.97975503789015e-06, + "loss": 0.70094109, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.515625, + "step": 5607, + "time_per_iteration": 2.402966022491455 + }, + { + "auxiliary_loss_clip": 0.01079023, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.0140481, + "balance_loss_mlp": 1.02460265, + "epoch": 0.33717120096197206, + "flos": 26577805979520.0, + "grad_norm": 2.5935135639872477, + "language_loss": 0.70628291, + "learning_rate": 2.979425638203307e-06, + "loss": 0.72737193, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.54296875, + "step": 5608, + "time_per_iteration": 2.417232036590576 + }, + { + "auxiliary_loss_clip": 0.01077287, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.01711226, + "balance_loss_mlp": 1.02548313, + "epoch": 0.33723132421464, + "flos": 15157646167680.0, + "grad_norm": 1.9004754845051264, + "language_loss": 0.79216373, + "learning_rate": 2.9790962035632823e-06, + "loss": 0.81325138, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.515625, + "step": 5609, + "time_per_iteration": 2.3566930294036865 + }, + { + "auxiliary_loss_clip": 0.01076582, + "auxiliary_loss_mlp": 0.01036807, + "balance_loss_clip": 1.0208807, + "balance_loss_mlp": 1.02438807, + "epoch": 0.337291447467308, + "flos": 23435099412480.0, + "grad_norm": 4.681624607450182, + "language_loss": 0.82176632, + "learning_rate": 2.978766733981833e-06, + "loss": 0.84290016, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5234375, + "step": 5610, + "time_per_iteration": 2.4102094173431396 + }, + { + "auxiliary_loss_clip": 0.01073582, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.01439166, + "balance_loss_mlp": 1.0230006, + "epoch": 0.33735157071997596, + "flos": 17237212790400.0, + "grad_norm": 2.080899212381112, + "language_loss": 0.81895936, + "learning_rate": 2.9784372294707165e-06, + "loss": 0.83999848, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5078125, + "step": 5611, + "time_per_iteration": 2.347810745239258 + }, + { + "auxiliary_loss_clip": 0.01078331, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.01898026, + "balance_loss_mlp": 1.0247165, + "epoch": 0.3374116939726439, + "flos": 28256115813120.0, + "grad_norm": 1.6169619410858138, + "language_loss": 0.79374301, + "learning_rate": 2.9781076900416923e-06, + "loss": 0.81489432, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.53515625, + "step": 5612, + "time_per_iteration": 2.436657667160034 + }, + { + "auxiliary_loss_clip": 0.01075045, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.01448071, + "balance_loss_mlp": 1.02245164, + "epoch": 0.3374718172253119, + "flos": 35917910409600.0, + "grad_norm": 3.1761571225599345, + "language_loss": 0.69668424, + "learning_rate": 2.97777811570652e-06, + "loss": 0.71773589, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5234375, + "step": 5613, + "time_per_iteration": 2.487042188644409 + }, + { + "auxiliary_loss_clip": 0.01076619, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.01573491, + "balance_loss_mlp": 1.02403438, + "epoch": 0.33753194047797985, + "flos": 18185698229760.0, + "grad_norm": 4.249750231575481, + "language_loss": 0.7980001, + "learning_rate": 2.977448506476962e-06, + "loss": 0.81909221, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.52734375, + "step": 5614, + "time_per_iteration": 2.379701614379883 + }, + { + "auxiliary_loss_clip": 0.0107648, + "auxiliary_loss_mlp": 0.0103566, + "balance_loss_clip": 1.01924527, + "balance_loss_mlp": 1.02390707, + "epoch": 0.3375920637306478, + "flos": 23147798901120.0, + "grad_norm": 1.6485986299045274, + "language_loss": 0.90878582, + "learning_rate": 2.977118862364781e-06, + "loss": 0.9299072, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.52734375, + "step": 5615, + "time_per_iteration": 2.3953943252563477 + }, + { + "auxiliary_loss_clip": 0.01075264, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.01466739, + "balance_loss_mlp": 1.02394128, + "epoch": 0.3376521869833158, + "flos": 23111105195520.0, + "grad_norm": 2.526942021719091, + "language_loss": 0.81231046, + "learning_rate": 2.9767891833817424e-06, + "loss": 0.83335471, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.51171875, + "step": 5616, + "time_per_iteration": 2.390759229660034 + }, + { + "auxiliary_loss_clip": 0.01078023, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.01626003, + "balance_loss_mlp": 1.02376103, + "epoch": 0.33771231023598375, + "flos": 19273766751360.0, + "grad_norm": 2.126146631811231, + "language_loss": 0.83726001, + "learning_rate": 2.976459469539609e-06, + "loss": 0.85837483, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.54296875, + "step": 5617, + "time_per_iteration": 2.3498964309692383 + }, + { + "auxiliary_loss_clip": 0.01076682, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.01750875, + "balance_loss_mlp": 1.02386653, + "epoch": 0.3377724334886517, + "flos": 18149213992320.0, + "grad_norm": 1.3591091061291356, + "language_loss": 0.80304039, + "learning_rate": 2.97612972085015e-06, + "loss": 0.82412839, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.52734375, + "step": 5618, + "time_per_iteration": 2.3753347396850586 + }, + { + "auxiliary_loss_clip": 0.01076327, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.01401925, + "balance_loss_mlp": 1.02267492, + "epoch": 0.3378325567413197, + "flos": 25774803642240.0, + "grad_norm": 3.1814983270588586, + "language_loss": 0.70787764, + "learning_rate": 2.9757999373251315e-06, + "loss": 0.72893679, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53515625, + "step": 5619, + "time_per_iteration": 2.4107887744903564 + }, + { + "auxiliary_loss_clip": 0.0107475, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.02047396, + "balance_loss_mlp": 1.02219379, + "epoch": 0.3378926799939877, + "flos": 21316255643520.0, + "grad_norm": 2.647308501402637, + "language_loss": 0.69699872, + "learning_rate": 2.9754701189763236e-06, + "loss": 0.71811402, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5234375, + "step": 5620, + "time_per_iteration": 2.39211368560791 + }, + { + "auxiliary_loss_clip": 0.01078578, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.01637197, + "balance_loss_mlp": 1.02622032, + "epoch": 0.33795280324665566, + "flos": 24898867741440.0, + "grad_norm": 1.6244545301680078, + "language_loss": 0.67200458, + "learning_rate": 2.975140265815496e-06, + "loss": 0.69310403, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5234375, + "step": 5621, + "time_per_iteration": 2.4176483154296875 + }, + { + "auxiliary_loss_clip": 0.01073725, + "auxiliary_loss_mlp": 0.01031109, + "balance_loss_clip": 1.01666045, + "balance_loss_mlp": 1.02346575, + "epoch": 0.33801292649932363, + "flos": 24752791146240.0, + "grad_norm": 1.8086469645672796, + "language_loss": 0.82557905, + "learning_rate": 2.9748103778544213e-06, + "loss": 0.84662735, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.50390625, + "step": 5622, + "time_per_iteration": 2.4154491424560547 + }, + { + "auxiliary_loss_clip": 0.01074326, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.01735902, + "balance_loss_mlp": 1.02285719, + "epoch": 0.3380730497519916, + "flos": 26722765411200.0, + "grad_norm": 1.4356041179159156, + "language_loss": 0.7320528, + "learning_rate": 2.974480455104871e-06, + "loss": 0.75311476, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.515625, + "step": 5623, + "time_per_iteration": 2.4266836643218994 + }, + { + "auxiliary_loss_clip": 0.0101272, + "auxiliary_loss_mlp": 0.01003525, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00198269, + "epoch": 0.33813317300465956, + "flos": 70032241560960.0, + "grad_norm": 0.7436209765602, + "language_loss": 0.54921335, + "learning_rate": 2.9741504975786206e-06, + "loss": 0.56937581, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.10742188, + "step": 5624, + "time_per_iteration": 3.153130054473877 + }, + { + "auxiliary_loss_clip": 0.01077752, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02203345, + "balance_loss_mlp": 1.02473402, + "epoch": 0.3381932962573275, + "flos": 24096179606400.0, + "grad_norm": 2.549702212202259, + "language_loss": 0.72598791, + "learning_rate": 2.9738205052874444e-06, + "loss": 0.74713939, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53125, + "step": 5625, + "time_per_iteration": 2.3772730827331543 + }, + { + "auxiliary_loss_clip": 0.01076543, + "auxiliary_loss_mlp": 0.01030958, + "balance_loss_clip": 1.01548493, + "balance_loss_mlp": 1.02353954, + "epoch": 0.3382534195099955, + "flos": 19277327710080.0, + "grad_norm": 4.643797502931981, + "language_loss": 0.70201731, + "learning_rate": 2.9734904782431196e-06, + "loss": 0.72309232, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53125, + "step": 5626, + "time_per_iteration": 3.8215889930725098 + }, + { + "auxiliary_loss_clip": 0.01073943, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.01525116, + "balance_loss_mlp": 1.0228622, + "epoch": 0.33831354276266346, + "flos": 25225131196800.0, + "grad_norm": 1.6269249559112355, + "language_loss": 0.81360829, + "learning_rate": 2.973160416457423e-06, + "loss": 0.83465469, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.51171875, + "step": 5627, + "time_per_iteration": 2.4187004566192627 + }, + { + "auxiliary_loss_clip": 0.01081669, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.02085328, + "balance_loss_mlp": 1.02671492, + "epoch": 0.3383736660153314, + "flos": 23110895727360.0, + "grad_norm": 2.3035341468533828, + "language_loss": 0.80695158, + "learning_rate": 2.9728303199421354e-06, + "loss": 0.82813925, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.55078125, + "step": 5628, + "time_per_iteration": 2.369699001312256 + }, + { + "auxiliary_loss_clip": 0.01075366, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.01518178, + "balance_loss_mlp": 1.02260685, + "epoch": 0.3384337892679994, + "flos": 23476017392640.0, + "grad_norm": 1.9356229428978402, + "language_loss": 0.76929748, + "learning_rate": 2.9725001887090358e-06, + "loss": 0.79036283, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.52734375, + "step": 5629, + "time_per_iteration": 2.383164167404175 + }, + { + "auxiliary_loss_clip": 0.01076709, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.01631522, + "balance_loss_mlp": 1.02310622, + "epoch": 0.33849391252066735, + "flos": 19424835671040.0, + "grad_norm": 1.8089866933099867, + "language_loss": 0.84739345, + "learning_rate": 2.9721700227699055e-06, + "loss": 0.86848569, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.53515625, + "step": 5630, + "time_per_iteration": 3.8142900466918945 + }, + { + "auxiliary_loss_clip": 0.01076486, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.01495266, + "balance_loss_mlp": 1.02372074, + "epoch": 0.3385540357733353, + "flos": 21063903269760.0, + "grad_norm": 2.744936973249887, + "language_loss": 0.71665031, + "learning_rate": 2.9718398221365285e-06, + "loss": 0.73771334, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.52734375, + "step": 5631, + "time_per_iteration": 2.3694472312927246 + }, + { + "auxiliary_loss_clip": 0.01012176, + "auxiliary_loss_mlp": 0.01004933, + "balance_loss_clip": 1.00341284, + "balance_loss_mlp": 1.00143838, + "epoch": 0.3386141590260033, + "flos": 69205220208000.0, + "grad_norm": 0.8448382363837306, + "language_loss": 0.56190181, + "learning_rate": 2.9715095868206874e-06, + "loss": 0.58207297, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.10742188, + "step": 5632, + "time_per_iteration": 3.065547227859497 + }, + { + "auxiliary_loss_clip": 0.01075578, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.01295805, + "balance_loss_mlp": 1.02369523, + "epoch": 0.3386742822786713, + "flos": 25518331728000.0, + "grad_norm": 1.519673194388011, + "language_loss": 0.80069363, + "learning_rate": 2.9711793168341686e-06, + "loss": 0.82173479, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.51953125, + "step": 5633, + "time_per_iteration": 2.4332261085510254 + }, + { + "auxiliary_loss_clip": 0.01074967, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.01499772, + "balance_loss_mlp": 1.02392209, + "epoch": 0.33873440553133927, + "flos": 23621989253760.0, + "grad_norm": 1.8328984334662508, + "language_loss": 0.592664, + "learning_rate": 2.9708490121887587e-06, + "loss": 0.61371326, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.51171875, + "step": 5634, + "time_per_iteration": 2.3915348052978516 + }, + { + "auxiliary_loss_clip": 0.0107395, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01555061, + "balance_loss_mlp": 1.02264261, + "epoch": 0.33879452878400723, + "flos": 17088029084160.0, + "grad_norm": 2.1240742698835064, + "language_loss": 0.92571288, + "learning_rate": 2.9705186728962436e-06, + "loss": 0.94675547, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.51171875, + "step": 5635, + "time_per_iteration": 3.7240724563598633 + }, + { + "auxiliary_loss_clip": 0.01074799, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01378798, + "balance_loss_mlp": 1.02493238, + "epoch": 0.3388546520366752, + "flos": 15741149587200.0, + "grad_norm": 2.8554016811383764, + "language_loss": 0.75170875, + "learning_rate": 2.9701882989684145e-06, + "loss": 0.77273405, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.5, + "step": 5636, + "time_per_iteration": 2.3523852825164795 + }, + { + "auxiliary_loss_clip": 0.0107459, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.01800776, + "balance_loss_mlp": 1.02363765, + "epoch": 0.33891477528934316, + "flos": 22417660304640.0, + "grad_norm": 1.5282647174407973, + "language_loss": 0.83106989, + "learning_rate": 2.96985789041706e-06, + "loss": 0.85215294, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5078125, + "step": 5637, + "time_per_iteration": 2.402747392654419 + }, + { + "auxiliary_loss_clip": 0.0107723, + "auxiliary_loss_mlp": 0.01035413, + "balance_loss_clip": 1.01957023, + "balance_loss_mlp": 1.02447546, + "epoch": 0.3389748985420111, + "flos": 17273871584640.0, + "grad_norm": 2.1993904029405082, + "language_loss": 0.69675529, + "learning_rate": 2.9695274472539725e-06, + "loss": 0.71788174, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.52734375, + "step": 5638, + "time_per_iteration": 2.3540613651275635 + }, + { + "auxiliary_loss_clip": 0.01076904, + "auxiliary_loss_mlp": 0.01038821, + "balance_loss_clip": 1.02290702, + "balance_loss_mlp": 1.02506208, + "epoch": 0.3390350217946791, + "flos": 27743765477760.0, + "grad_norm": 1.695284926964946, + "language_loss": 0.80802101, + "learning_rate": 2.9691969694909443e-06, + "loss": 0.82917827, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.51953125, + "step": 5639, + "time_per_iteration": 2.4445531368255615 + }, + { + "auxiliary_loss_clip": 0.01078085, + "auxiliary_loss_mlp": 0.01039751, + "balance_loss_clip": 1.02337193, + "balance_loss_mlp": 1.02449226, + "epoch": 0.33909514504734706, + "flos": 20338756997760.0, + "grad_norm": 2.689527476927121, + "language_loss": 0.67899245, + "learning_rate": 2.9688664571397696e-06, + "loss": 0.70017081, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53515625, + "step": 5640, + "time_per_iteration": 2.36692214012146 + }, + { + "auxiliary_loss_clip": 0.01075868, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.021137, + "balance_loss_mlp": 1.02434814, + "epoch": 0.339155268300015, + "flos": 14829148385280.0, + "grad_norm": 1.674574474960228, + "language_loss": 0.69692838, + "learning_rate": 2.9685359102122432e-06, + "loss": 0.71805567, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.515625, + "step": 5641, + "time_per_iteration": 2.367981195449829 + }, + { + "auxiliary_loss_clip": 0.01077071, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.02202082, + "balance_loss_mlp": 1.02441025, + "epoch": 0.339215391552683, + "flos": 26066747364480.0, + "grad_norm": 1.7932676903481128, + "language_loss": 0.8727544, + "learning_rate": 2.9682053287201615e-06, + "loss": 0.89389545, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.52734375, + "step": 5642, + "time_per_iteration": 2.408924102783203 + }, + { + "auxiliary_loss_clip": 0.01070217, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.0143925, + "balance_loss_mlp": 1.02345514, + "epoch": 0.33927551480535095, + "flos": 14573828545920.0, + "grad_norm": 2.438411095618502, + "language_loss": 0.84156924, + "learning_rate": 2.967874712675322e-06, + "loss": 0.86254263, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.46875, + "step": 5643, + "time_per_iteration": 2.378606081008911 + }, + { + "auxiliary_loss_clip": 0.01076258, + "auxiliary_loss_mlp": 0.01039051, + "balance_loss_clip": 1.02366066, + "balance_loss_mlp": 1.02433157, + "epoch": 0.3393356380580189, + "flos": 23804445352320.0, + "grad_norm": 1.6008140968226678, + "language_loss": 0.76782477, + "learning_rate": 2.9675440620895233e-06, + "loss": 0.78897786, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.51953125, + "step": 5644, + "time_per_iteration": 2.3966500759124756 + }, + { + "auxiliary_loss_clip": 0.0107423, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02316117, + "balance_loss_mlp": 1.02269149, + "epoch": 0.3393957613106869, + "flos": 17346909882240.0, + "grad_norm": 4.295219821805532, + "language_loss": 0.62777728, + "learning_rate": 2.9672133769745664e-06, + "loss": 0.6489048, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.515625, + "step": 5645, + "time_per_iteration": 2.369154930114746 + }, + { + "auxiliary_loss_clip": 0.0107192, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.01165605, + "balance_loss_mlp": 1.02285695, + "epoch": 0.3394558845633549, + "flos": 28432846448640.0, + "grad_norm": 1.8700672525926678, + "language_loss": 0.76823199, + "learning_rate": 2.966882657342252e-06, + "loss": 0.78920817, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49023438, + "step": 5646, + "time_per_iteration": 2.427600145339966 + }, + { + "auxiliary_loss_clip": 0.01077248, + "auxiliary_loss_mlp": 0.01034818, + "balance_loss_clip": 1.0186888, + "balance_loss_mlp": 1.02365959, + "epoch": 0.33951600781602287, + "flos": 22085950677120.0, + "grad_norm": 2.0542152627864807, + "language_loss": 0.78814727, + "learning_rate": 2.9665519032043825e-06, + "loss": 0.80926788, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.53515625, + "step": 5647, + "time_per_iteration": 2.3867404460906982 + }, + { + "auxiliary_loss_clip": 0.01076388, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01215613, + "balance_loss_mlp": 1.02448642, + "epoch": 0.33957613106869083, + "flos": 23877134536320.0, + "grad_norm": 2.481785087077141, + "language_loss": 0.77033567, + "learning_rate": 2.9662211145727618e-06, + "loss": 0.79136753, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.51953125, + "step": 5648, + "time_per_iteration": 2.3942325115203857 + }, + { + "auxiliary_loss_clip": 0.01013995, + "auxiliary_loss_mlp": 0.01006688, + "balance_loss_clip": 1.00538874, + "balance_loss_mlp": 1.00304985, + "epoch": 0.3396362543213588, + "flos": 71239014305280.0, + "grad_norm": 0.774301705182907, + "language_loss": 0.56301385, + "learning_rate": 2.965890291459195e-06, + "loss": 0.58322066, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.109375, + "step": 5649, + "time_per_iteration": 3.0583043098449707 + }, + { + "auxiliary_loss_clip": 0.01074092, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.02161837, + "balance_loss_mlp": 1.02277708, + "epoch": 0.33969637757402676, + "flos": 25920426389760.0, + "grad_norm": 1.5210174824094578, + "language_loss": 0.85373539, + "learning_rate": 2.9655594338754887e-06, + "loss": 0.87484837, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51171875, + "step": 5650, + "time_per_iteration": 2.435272216796875 + }, + { + "auxiliary_loss_clip": 0.01072556, + "auxiliary_loss_mlp": 0.01022831, + "balance_loss_clip": 1.00887132, + "balance_loss_mlp": 1.02285469, + "epoch": 0.33975650082669473, + "flos": 35260286440320.0, + "grad_norm": 1.8909576118453764, + "language_loss": 0.71000648, + "learning_rate": 2.9652285418334496e-06, + "loss": 0.73096031, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.49609375, + "step": 5651, + "time_per_iteration": 2.5079214572906494 + }, + { + "auxiliary_loss_clip": 0.010732, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.01514578, + "balance_loss_mlp": 1.02314401, + "epoch": 0.3398166240793627, + "flos": 16646273251200.0, + "grad_norm": 1.8315780166776139, + "language_loss": 0.81215549, + "learning_rate": 2.964897615344886e-06, + "loss": 0.83318877, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5, + "step": 5652, + "time_per_iteration": 2.3635478019714355 + }, + { + "auxiliary_loss_clip": 0.01077169, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.01691341, + "balance_loss_mlp": 1.02482486, + "epoch": 0.33987674733203066, + "flos": 24061022000640.0, + "grad_norm": 1.7951883801322637, + "language_loss": 0.76595128, + "learning_rate": 2.9645666544216097e-06, + "loss": 0.7870537, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5234375, + "step": 5653, + "time_per_iteration": 2.417473554611206 + }, + { + "auxiliary_loss_clip": 0.01076131, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.01310778, + "balance_loss_mlp": 1.0246743, + "epoch": 0.3399368705846986, + "flos": 13250132058240.0, + "grad_norm": 2.875885198306419, + "language_loss": 0.86475319, + "learning_rate": 2.9642356590754298e-06, + "loss": 0.88579118, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.515625, + "step": 5654, + "time_per_iteration": 2.3092591762542725 + }, + { + "auxiliary_loss_clip": 0.01073479, + "auxiliary_loss_mlp": 0.01028222, + "balance_loss_clip": 1.01392317, + "balance_loss_mlp": 1.02191973, + "epoch": 0.3399969938373666, + "flos": 27011706756480.0, + "grad_norm": 2.428931871480884, + "language_loss": 0.65184164, + "learning_rate": 2.9639046293181603e-06, + "loss": 0.67285866, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.515625, + "step": 5655, + "time_per_iteration": 2.426403522491455 + }, + { + "auxiliary_loss_clip": 0.01075061, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.01450205, + "balance_loss_mlp": 1.02433681, + "epoch": 0.34005711709003456, + "flos": 28548792673920.0, + "grad_norm": 1.4290279978617195, + "language_loss": 0.76443708, + "learning_rate": 2.963573565161613e-06, + "loss": 0.78547001, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.5078125, + "step": 5656, + "time_per_iteration": 2.440124988555908 + }, + { + "auxiliary_loss_clip": 0.0107775, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.01574993, + "balance_loss_mlp": 1.02358961, + "epoch": 0.3401172403427025, + "flos": 21615914776320.0, + "grad_norm": 2.028324762550039, + "language_loss": 0.8266331, + "learning_rate": 2.963242466617605e-06, + "loss": 0.84772229, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5390625, + "step": 5657, + "time_per_iteration": 2.4245548248291016 + }, + { + "auxiliary_loss_clip": 0.01075183, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.01234102, + "balance_loss_mlp": 1.024786, + "epoch": 0.3401773635953705, + "flos": 25884570556800.0, + "grad_norm": 1.8957057597359546, + "language_loss": 0.85232812, + "learning_rate": 2.9629113336979505e-06, + "loss": 0.87334794, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.50390625, + "step": 5658, + "time_per_iteration": 2.4326703548431396 + }, + { + "auxiliary_loss_clip": 0.01012248, + "auxiliary_loss_mlp": 0.01002538, + "balance_loss_clip": 1.00134003, + "balance_loss_mlp": 1.00153422, + "epoch": 0.3402374868480385, + "flos": 65504704982400.0, + "grad_norm": 0.8148447127203781, + "language_loss": 0.59968007, + "learning_rate": 2.962580166414467e-06, + "loss": 0.61982793, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.10742188, + "step": 5659, + "time_per_iteration": 2.9893085956573486 + }, + { + "auxiliary_loss_clip": 0.01074024, + "auxiliary_loss_mlp": 0.01025055, + "balance_loss_clip": 1.01110721, + "balance_loss_mlp": 1.025442, + "epoch": 0.34029761010070647, + "flos": 24059450989440.0, + "grad_norm": 1.7796851025341553, + "language_loss": 0.6785512, + "learning_rate": 2.9622489647789742e-06, + "loss": 0.69954199, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.484375, + "step": 5660, + "time_per_iteration": 2.403658866882324 + }, + { + "auxiliary_loss_clip": 0.01078764, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.01771879, + "balance_loss_mlp": 1.02695012, + "epoch": 0.34035773335337444, + "flos": 27598491843840.0, + "grad_norm": 1.78381560535721, + "language_loss": 0.6722554, + "learning_rate": 2.9619177288032904e-06, + "loss": 0.69337779, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.51953125, + "step": 5661, + "time_per_iteration": 2.4223902225494385 + }, + { + "auxiliary_loss_clip": 0.0107266, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.01619661, + "balance_loss_mlp": 1.02369678, + "epoch": 0.3404178566060424, + "flos": 20811760364160.0, + "grad_norm": 1.8030479031776563, + "language_loss": 0.7913093, + "learning_rate": 2.9615864584992374e-06, + "loss": 0.81234097, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.49023438, + "step": 5662, + "time_per_iteration": 2.37856388092041 + }, + { + "auxiliary_loss_clip": 0.01075628, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.01933074, + "balance_loss_mlp": 1.02419043, + "epoch": 0.34047797985871037, + "flos": 26832357768960.0, + "grad_norm": 2.475650058666183, + "language_loss": 0.73567003, + "learning_rate": 2.961255153878637e-06, + "loss": 0.75676757, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.515625, + "step": 5663, + "time_per_iteration": 2.416792154312134 + }, + { + "auxiliary_loss_clip": 0.01070499, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.01439524, + "balance_loss_mlp": 1.02212024, + "epoch": 0.34053810311137833, + "flos": 19681621787520.0, + "grad_norm": 1.612172269073878, + "language_loss": 0.85902905, + "learning_rate": 2.9609238149533132e-06, + "loss": 0.88001305, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.484375, + "step": 5664, + "time_per_iteration": 2.3576905727386475 + }, + { + "auxiliary_loss_clip": 0.01074406, + "auxiliary_loss_mlp": 0.01026329, + "balance_loss_clip": 1.01238728, + "balance_loss_mlp": 1.02412224, + "epoch": 0.3405982263640463, + "flos": 21724669261440.0, + "grad_norm": 2.206593324272444, + "language_loss": 0.69139558, + "learning_rate": 2.9605924417350904e-06, + "loss": 0.71240294, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.5, + "step": 5665, + "time_per_iteration": 2.3931596279144287 + }, + { + "auxiliary_loss_clip": 0.01074031, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.01261675, + "balance_loss_mlp": 1.0229063, + "epoch": 0.34065834961671426, + "flos": 18040634064000.0, + "grad_norm": 2.777823712143208, + "language_loss": 0.72417438, + "learning_rate": 2.960261034235794e-06, + "loss": 0.74520099, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.51171875, + "step": 5666, + "time_per_iteration": 3.715991497039795 + }, + { + "auxiliary_loss_clip": 0.01075164, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.01895595, + "balance_loss_mlp": 1.02328968, + "epoch": 0.3407184728693822, + "flos": 21396276213120.0, + "grad_norm": 1.5275353853193336, + "language_loss": 0.73051095, + "learning_rate": 2.959929592467251e-06, + "loss": 0.75159669, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.51953125, + "step": 5667, + "time_per_iteration": 2.3947646617889404 + }, + { + "auxiliary_loss_clip": 0.01012561, + "auxiliary_loss_mlp": 0.0100436, + "balance_loss_clip": 1.00307214, + "balance_loss_mlp": 1.00175703, + "epoch": 0.3407785961220502, + "flos": 68684559102720.0, + "grad_norm": 0.8794491710731555, + "language_loss": 0.63289535, + "learning_rate": 2.959598116441291e-06, + "loss": 0.65306461, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.10791016, + "step": 5668, + "time_per_iteration": 3.17501163482666 + }, + { + "auxiliary_loss_clip": 0.01075604, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.01920295, + "balance_loss_mlp": 1.02472508, + "epoch": 0.34083871937471816, + "flos": 14063503069440.0, + "grad_norm": 2.23530494272223, + "language_loss": 0.72277373, + "learning_rate": 2.959266606169741e-06, + "loss": 0.74386638, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.51171875, + "step": 5669, + "time_per_iteration": 5.303999423980713 + }, + { + "auxiliary_loss_clip": 0.01077472, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.01693916, + "balance_loss_mlp": 1.02426696, + "epoch": 0.3408988426273861, + "flos": 17084677593600.0, + "grad_norm": 1.8950884689325542, + "language_loss": 0.8054558, + "learning_rate": 2.9589350616644353e-06, + "loss": 0.82655203, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.53125, + "step": 5670, + "time_per_iteration": 2.35649037361145 + }, + { + "auxiliary_loss_clip": 0.01075323, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.01763189, + "balance_loss_mlp": 1.02297938, + "epoch": 0.3409589658800541, + "flos": 24023420599680.0, + "grad_norm": 1.6016969120455145, + "language_loss": 0.77061087, + "learning_rate": 2.9586034829372026e-06, + "loss": 0.79168856, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5234375, + "step": 5671, + "time_per_iteration": 2.3982691764831543 + }, + { + "auxiliary_loss_clip": 0.0107555, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.01825142, + "balance_loss_mlp": 1.02337396, + "epoch": 0.34101908913272205, + "flos": 21140956373760.0, + "grad_norm": 1.796762478931232, + "language_loss": 0.74508572, + "learning_rate": 2.958271869999878e-06, + "loss": 0.76618481, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5234375, + "step": 5672, + "time_per_iteration": 2.4157934188842773 + }, + { + "auxiliary_loss_clip": 0.01075326, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.01557052, + "balance_loss_mlp": 1.02421045, + "epoch": 0.3410792123853901, + "flos": 15701209125120.0, + "grad_norm": 3.570387713207652, + "language_loss": 0.73293406, + "learning_rate": 2.9579402228642956e-06, + "loss": 0.75398248, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.5078125, + "step": 5673, + "time_per_iteration": 2.3417811393737793 + }, + { + "auxiliary_loss_clip": 0.01074595, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.01315713, + "balance_loss_mlp": 1.02400386, + "epoch": 0.34113933563805804, + "flos": 23034994698240.0, + "grad_norm": 1.963562023842391, + "language_loss": 0.79760337, + "learning_rate": 2.9576085415422902e-06, + "loss": 0.81861854, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.5078125, + "step": 5674, + "time_per_iteration": 3.866154193878174 + }, + { + "auxiliary_loss_clip": 0.01013711, + "auxiliary_loss_mlp": 0.01001131, + "balance_loss_clip": 0.99987978, + "balance_loss_mlp": 1.00293326, + "epoch": 0.341199458890726, + "flos": 69611294764800.0, + "grad_norm": 0.8143198738572112, + "language_loss": 0.56072396, + "learning_rate": 2.957276826045699e-06, + "loss": 0.58087242, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.10791016, + "step": 5675, + "time_per_iteration": 3.181320905685425 + }, + { + "auxiliary_loss_clip": 0.0107496, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.01872945, + "balance_loss_mlp": 1.02460063, + "epoch": 0.34125958214339397, + "flos": 22345250411520.0, + "grad_norm": 1.5788252027311565, + "language_loss": 0.78773433, + "learning_rate": 2.9569450763863606e-06, + "loss": 0.8088128, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.50390625, + "step": 5676, + "time_per_iteration": 2.386732578277588 + }, + { + "auxiliary_loss_clip": 0.01072465, + "auxiliary_loss_mlp": 0.01025233, + "balance_loss_clip": 1.00962305, + "balance_loss_mlp": 1.02226472, + "epoch": 0.34131970539606193, + "flos": 21870850590720.0, + "grad_norm": 1.7749481020339508, + "language_loss": 0.85304117, + "learning_rate": 2.9566132925761143e-06, + "loss": 0.87401807, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5, + "step": 5677, + "time_per_iteration": 2.4253628253936768 + }, + { + "auxiliary_loss_clip": 0.01072604, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.01908624, + "balance_loss_mlp": 1.02298474, + "epoch": 0.3413798286487299, + "flos": 24934583928960.0, + "grad_norm": 1.8824480524619214, + "language_loss": 0.79324758, + "learning_rate": 2.9562814746267996e-06, + "loss": 0.81430763, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49609375, + "step": 5678, + "time_per_iteration": 2.4013874530792236 + }, + { + "auxiliary_loss_clip": 0.01074352, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01456094, + "balance_loss_mlp": 1.02263176, + "epoch": 0.34143995190139786, + "flos": 25373197739520.0, + "grad_norm": 1.742050798159555, + "language_loss": 0.74857879, + "learning_rate": 2.9559496225502594e-06, + "loss": 0.76960969, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.515625, + "step": 5679, + "time_per_iteration": 2.414229154586792 + }, + { + "auxiliary_loss_clip": 0.010139, + "auxiliary_loss_mlp": 0.01010534, + "balance_loss_clip": 1.00940156, + "balance_loss_mlp": 1.00296545, + "epoch": 0.34150007515406583, + "flos": 67778876856960.0, + "grad_norm": 0.7177870234899756, + "language_loss": 0.59463364, + "learning_rate": 2.955617736358336e-06, + "loss": 0.61487794, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.109375, + "step": 5680, + "time_per_iteration": 3.1156423091888428 + }, + { + "auxiliary_loss_clip": 0.01073377, + "auxiliary_loss_mlp": 0.01025167, + "balance_loss_clip": 1.01211929, + "balance_loss_mlp": 1.02435422, + "epoch": 0.3415601984067338, + "flos": 20301399976320.0, + "grad_norm": 1.8957808482037422, + "language_loss": 0.7218442, + "learning_rate": 2.955285816062874e-06, + "loss": 0.74282968, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.49023438, + "step": 5681, + "time_per_iteration": 2.37471342086792 + }, + { + "auxiliary_loss_clip": 0.01071708, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.01884079, + "balance_loss_mlp": 1.02241039, + "epoch": 0.34162032165940176, + "flos": 26029983836160.0, + "grad_norm": 2.011187576118092, + "language_loss": 0.71353495, + "learning_rate": 2.9549538616757183e-06, + "loss": 0.73457569, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4921875, + "step": 5682, + "time_per_iteration": 2.421415328979492 + }, + { + "auxiliary_loss_clip": 0.0107595, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.016886, + "balance_loss_mlp": 1.02399075, + "epoch": 0.3416804449120697, + "flos": 28802087654400.0, + "grad_norm": 1.6609579092068152, + "language_loss": 0.84060884, + "learning_rate": 2.9546218732087154e-06, + "loss": 0.86169678, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.51953125, + "step": 5683, + "time_per_iteration": 2.4393839836120605 + }, + { + "auxiliary_loss_clip": 0.01078074, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.01802111, + "balance_loss_mlp": 1.02507877, + "epoch": 0.3417405681647377, + "flos": 22600500428160.0, + "grad_norm": 2.5783638337989148, + "language_loss": 0.70450306, + "learning_rate": 2.9542898506737135e-06, + "loss": 0.72562456, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53125, + "step": 5684, + "time_per_iteration": 2.4221436977386475 + }, + { + "auxiliary_loss_clip": 0.0107283, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.02378631, + "epoch": 0.34180069141740566, + "flos": 24715119922560.0, + "grad_norm": 1.3735198308248453, + "language_loss": 0.74791551, + "learning_rate": 2.953957794082562e-06, + "loss": 0.76894587, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.49023438, + "step": 5685, + "time_per_iteration": 2.4222681522369385 + }, + { + "auxiliary_loss_clip": 0.01073828, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.01708531, + "balance_loss_mlp": 1.02382946, + "epoch": 0.3418608146700737, + "flos": 30517440307200.0, + "grad_norm": 1.8220409488685603, + "language_loss": 0.70073551, + "learning_rate": 2.9536257034471107e-06, + "loss": 0.72178161, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.5, + "step": 5686, + "time_per_iteration": 2.4573311805725098 + }, + { + "auxiliary_loss_clip": 0.010748, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.016294, + "balance_loss_mlp": 1.02225232, + "epoch": 0.34192093792274164, + "flos": 15121441221120.0, + "grad_norm": 2.1157254569466764, + "language_loss": 0.77686048, + "learning_rate": 2.9532935787792114e-06, + "loss": 0.79792655, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5234375, + "step": 5687, + "time_per_iteration": 2.367379903793335 + }, + { + "auxiliary_loss_clip": 0.01075551, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.01678872, + "balance_loss_mlp": 1.02577603, + "epoch": 0.3419810611754096, + "flos": 13186973675520.0, + "grad_norm": 2.5256990769760366, + "language_loss": 0.8633939, + "learning_rate": 2.9529614200907157e-06, + "loss": 0.88446498, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.49804688, + "step": 5688, + "time_per_iteration": 2.3486032485961914 + }, + { + "auxiliary_loss_clip": 0.01080261, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.01519239, + "balance_loss_mlp": 1.02515614, + "epoch": 0.34204118442807757, + "flos": 19535265901440.0, + "grad_norm": 6.925714096368138, + "language_loss": 0.79477704, + "learning_rate": 2.9526292273934787e-06, + "loss": 0.81589508, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.55078125, + "step": 5689, + "time_per_iteration": 2.367412805557251 + }, + { + "auxiliary_loss_clip": 0.01074823, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01592612, + "balance_loss_mlp": 1.02324164, + "epoch": 0.34210130768074554, + "flos": 15193955848320.0, + "grad_norm": 2.095379269535975, + "language_loss": 0.73551762, + "learning_rate": 2.9522970006993547e-06, + "loss": 0.7565847, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.515625, + "step": 5690, + "time_per_iteration": 2.3497183322906494 + }, + { + "auxiliary_loss_clip": 0.01074195, + "auxiliary_loss_mlp": 0.01026784, + "balance_loss_clip": 1.01299095, + "balance_loss_mlp": 1.02267826, + "epoch": 0.3421614309334135, + "flos": 24935072688000.0, + "grad_norm": 2.5310555944899122, + "language_loss": 0.75868571, + "learning_rate": 2.9519647400202003e-06, + "loss": 0.77969539, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.515625, + "step": 5691, + "time_per_iteration": 2.4062180519104004 + }, + { + "auxiliary_loss_clip": 0.01072996, + "auxiliary_loss_mlp": 0.01028443, + "balance_loss_clip": 1.01408434, + "balance_loss_mlp": 1.02382326, + "epoch": 0.34222155418608147, + "flos": 21907544296320.0, + "grad_norm": 2.8975207884020935, + "language_loss": 0.68100238, + "learning_rate": 2.9516324453678733e-06, + "loss": 0.70201677, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4921875, + "step": 5692, + "time_per_iteration": 2.4295010566711426 + }, + { + "auxiliary_loss_clip": 0.01077801, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.01755166, + "balance_loss_mlp": 1.02483153, + "epoch": 0.34228167743874943, + "flos": 18113078868480.0, + "grad_norm": 3.498462332068761, + "language_loss": 0.71870959, + "learning_rate": 2.9513001167542316e-06, + "loss": 0.73981678, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53125, + "step": 5693, + "time_per_iteration": 2.3698456287384033 + }, + { + "auxiliary_loss_clip": 0.01074298, + "auxiliary_loss_mlp": 0.01036127, + "balance_loss_clip": 1.02198911, + "balance_loss_mlp": 1.02347028, + "epoch": 0.3423418006914174, + "flos": 21287521728000.0, + "grad_norm": 1.810947149540552, + "language_loss": 0.75413698, + "learning_rate": 2.9509677541911363e-06, + "loss": 0.7752412, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.5078125, + "step": 5694, + "time_per_iteration": 2.383002996444702 + }, + { + "auxiliary_loss_clip": 0.01074298, + "auxiliary_loss_mlp": 0.01026843, + "balance_loss_clip": 1.01333082, + "balance_loss_mlp": 1.02473545, + "epoch": 0.34240192394408536, + "flos": 19822601324160.0, + "grad_norm": 1.7129228357165343, + "language_loss": 0.79348469, + "learning_rate": 2.9506353576904483e-06, + "loss": 0.81449616, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.49609375, + "step": 5695, + "time_per_iteration": 2.3853683471679688 + }, + { + "auxiliary_loss_clip": 0.01074382, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.01774085, + "balance_loss_mlp": 1.02322435, + "epoch": 0.3424620471967533, + "flos": 24534374480640.0, + "grad_norm": 1.8839048115373787, + "language_loss": 0.73172057, + "learning_rate": 2.9503029272640296e-06, + "loss": 0.75278914, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.51171875, + "step": 5696, + "time_per_iteration": 2.4111664295196533 + }, + { + "auxiliary_loss_clip": 0.0107503, + "auxiliary_loss_mlp": 0.01037978, + "balance_loss_clip": 1.02321994, + "balance_loss_mlp": 1.02284837, + "epoch": 0.3425221704494213, + "flos": 25847702294400.0, + "grad_norm": 1.7880738321196663, + "language_loss": 0.7082808, + "learning_rate": 2.9499704629237436e-06, + "loss": 0.72941089, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5234375, + "step": 5697, + "time_per_iteration": 2.456502914428711 + }, + { + "auxiliary_loss_clip": 0.01072197, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01631629, + "balance_loss_mlp": 1.02377963, + "epoch": 0.34258229370208926, + "flos": 21539524988160.0, + "grad_norm": 1.919144394090135, + "language_loss": 0.81856322, + "learning_rate": 2.9496379646814555e-06, + "loss": 0.839589, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.484375, + "step": 5698, + "time_per_iteration": 2.4169797897338867 + }, + { + "auxiliary_loss_clip": 0.01074333, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02098382, + "balance_loss_mlp": 1.02245975, + "epoch": 0.3426424169547573, + "flos": 23651840332800.0, + "grad_norm": 2.518058786375559, + "language_loss": 0.68970788, + "learning_rate": 2.949305432549031e-06, + "loss": 0.7108165, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51953125, + "step": 5699, + "time_per_iteration": 2.3943941593170166 + }, + { + "auxiliary_loss_clip": 0.01075163, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.01354909, + "balance_loss_mlp": 1.0239923, + "epoch": 0.34270254020742524, + "flos": 24643722458880.0, + "grad_norm": 2.2104811468110777, + "language_loss": 0.72249305, + "learning_rate": 2.9489728665383382e-06, + "loss": 0.74353015, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.51171875, + "step": 5700, + "time_per_iteration": 2.397564172744751 + }, + { + "auxiliary_loss_clip": 0.01073098, + "auxiliary_loss_mlp": 0.01029279, + "balance_loss_clip": 1.01558805, + "balance_loss_mlp": 1.02317071, + "epoch": 0.3427626634600932, + "flos": 20995682739840.0, + "grad_norm": 2.6289647790545967, + "language_loss": 0.8181535, + "learning_rate": 2.948640266661244e-06, + "loss": 0.83917725, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.5, + "step": 5701, + "time_per_iteration": 2.3824145793914795 + }, + { + "auxiliary_loss_clip": 0.0107711, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.02019644, + "balance_loss_mlp": 1.02536964, + "epoch": 0.3428227867127612, + "flos": 21432725539200.0, + "grad_norm": 2.0152050073294117, + "language_loss": 0.71497083, + "learning_rate": 2.94830763292962e-06, + "loss": 0.7360881, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.515625, + "step": 5702, + "time_per_iteration": 2.3891875743865967 + }, + { + "auxiliary_loss_clip": 0.01013416, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.0035913, + "balance_loss_mlp": 1.00280738, + "epoch": 0.34288290996542914, + "flos": 55728709827840.0, + "grad_norm": 0.7810744800621326, + "language_loss": 0.5740124, + "learning_rate": 2.9479749653553347e-06, + "loss": 0.59419584, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.10644531, + "step": 5703, + "time_per_iteration": 2.9056954383850098 + }, + { + "auxiliary_loss_clip": 0.01077928, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.02021539, + "balance_loss_mlp": 1.02564716, + "epoch": 0.3429430332180971, + "flos": 20155777228800.0, + "grad_norm": 1.8204221027755783, + "language_loss": 0.75020349, + "learning_rate": 2.947642263950262e-06, + "loss": 0.77134013, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5234375, + "step": 5704, + "time_per_iteration": 2.372490406036377 + }, + { + "auxiliary_loss_clip": 0.01074965, + "auxiliary_loss_mlp": 0.01029224, + "balance_loss_clip": 1.01540124, + "balance_loss_mlp": 1.0250175, + "epoch": 0.34300315647076507, + "flos": 17964942503040.0, + "grad_norm": 1.879754833532729, + "language_loss": 0.72667003, + "learning_rate": 2.947309528726274e-06, + "loss": 0.7477119, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.5, + "step": 5705, + "time_per_iteration": 3.7764549255371094 + }, + { + "auxiliary_loss_clip": 0.01073615, + "auxiliary_loss_mlp": 0.01026075, + "balance_loss_clip": 1.01195419, + "balance_loss_mlp": 1.02365065, + "epoch": 0.34306327972343303, + "flos": 22085845943040.0, + "grad_norm": 6.5751739236510875, + "language_loss": 0.80007935, + "learning_rate": 2.9469767596952463e-06, + "loss": 0.82107627, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.5, + "step": 5706, + "time_per_iteration": 2.3817293643951416 + }, + { + "auxiliary_loss_clip": 0.01077495, + "auxiliary_loss_mlp": 0.01025772, + "balance_loss_clip": 1.01019716, + "balance_loss_mlp": 1.02438688, + "epoch": 0.343123402976101, + "flos": 18441681384960.0, + "grad_norm": 2.6645609762034494, + "language_loss": 0.82980669, + "learning_rate": 2.946643956869054e-06, + "loss": 0.85083938, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.53125, + "step": 5707, + "time_per_iteration": 2.365241289138794 + }, + { + "auxiliary_loss_clip": 0.0107762, + "auxiliary_loss_mlp": 0.0102833, + "balance_loss_clip": 1.01338148, + "balance_loss_mlp": 1.0263772, + "epoch": 0.34318352622876896, + "flos": 17162778038400.0, + "grad_norm": 2.960922892708378, + "language_loss": 0.75558245, + "learning_rate": 2.9463111202595734e-06, + "loss": 0.77664202, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.51171875, + "step": 5708, + "time_per_iteration": 2.36161208152771 + }, + { + "auxiliary_loss_clip": 0.01073583, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.01321423, + "balance_loss_mlp": 1.02404428, + "epoch": 0.34324364948143693, + "flos": 26686944489600.0, + "grad_norm": 1.7596122305417234, + "language_loss": 0.81427091, + "learning_rate": 2.945978249878683e-06, + "loss": 0.8352828, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49609375, + "step": 5709, + "time_per_iteration": 3.832428216934204 + }, + { + "auxiliary_loss_clip": 0.01076236, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.02000666, + "balance_loss_mlp": 1.02516234, + "epoch": 0.3433037727341049, + "flos": 21250513820160.0, + "grad_norm": 3.63411230336256, + "language_loss": 0.78820145, + "learning_rate": 2.9456453457382628e-06, + "loss": 0.80933654, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.51171875, + "step": 5710, + "time_per_iteration": 2.3868179321289062 + }, + { + "auxiliary_loss_clip": 0.01077579, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.017946, + "balance_loss_mlp": 1.02373505, + "epoch": 0.34336389598677286, + "flos": 20628431481600.0, + "grad_norm": 1.6910221146640076, + "language_loss": 0.69980818, + "learning_rate": 2.9453124078501926e-06, + "loss": 0.72092772, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5390625, + "step": 5711, + "time_per_iteration": 2.376746892929077 + }, + { + "auxiliary_loss_clip": 0.01074128, + "auxiliary_loss_mlp": 0.01029468, + "balance_loss_clip": 1.01438832, + "balance_loss_mlp": 1.02360702, + "epoch": 0.3434240192394409, + "flos": 14537693422080.0, + "grad_norm": 1.952955427006162, + "language_loss": 0.67864966, + "learning_rate": 2.944979436226354e-06, + "loss": 0.69968557, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.50390625, + "step": 5712, + "time_per_iteration": 2.3456666469573975 + }, + { + "auxiliary_loss_clip": 0.01012491, + "auxiliary_loss_mlp": 0.01004574, + "balance_loss_clip": 1.00324452, + "balance_loss_mlp": 1.00178766, + "epoch": 0.34348414249210885, + "flos": 58048828784640.0, + "grad_norm": 1.3914194816331944, + "language_loss": 0.58103466, + "learning_rate": 2.94464643087863e-06, + "loss": 0.60120523, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.10742188, + "step": 5713, + "time_per_iteration": 2.9882845878601074 + }, + { + "auxiliary_loss_clip": 0.01073632, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.01583195, + "balance_loss_mlp": 1.02346694, + "epoch": 0.3435442657447768, + "flos": 20703389904000.0, + "grad_norm": 1.8318763180632778, + "language_loss": 0.805336, + "learning_rate": 2.9443133918189054e-06, + "loss": 0.82638127, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5, + "step": 5714, + "time_per_iteration": 3.7624547481536865 + }, + { + "auxiliary_loss_clip": 0.01075892, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.01227617, + "balance_loss_mlp": 1.02396405, + "epoch": 0.3436043889974448, + "flos": 22929137856000.0, + "grad_norm": 1.9340453784652882, + "language_loss": 0.7198323, + "learning_rate": 2.943980319059064e-06, + "loss": 0.74086487, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.51953125, + "step": 5715, + "time_per_iteration": 2.388002872467041 + }, + { + "auxiliary_loss_clip": 0.01012159, + "auxiliary_loss_mlp": 0.01004033, + "balance_loss_clip": 1.00257826, + "balance_loss_mlp": 1.00159621, + "epoch": 0.34366451225011274, + "flos": 58399914216960.0, + "grad_norm": 0.961472637748732, + "language_loss": 0.65866983, + "learning_rate": 2.9436472126109943e-06, + "loss": 0.6788317, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.10546875, + "step": 5716, + "time_per_iteration": 3.1248884201049805 + }, + { + "auxiliary_loss_clip": 0.0107796, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.01971757, + "balance_loss_mlp": 1.02654815, + "epoch": 0.3437246355027807, + "flos": 15595387194240.0, + "grad_norm": 1.9599620798169002, + "language_loss": 0.7386241, + "learning_rate": 2.9433140724865824e-06, + "loss": 0.75973964, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.515625, + "step": 5717, + "time_per_iteration": 2.3608038425445557 + }, + { + "auxiliary_loss_clip": 0.01074512, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.01446342, + "balance_loss_mlp": 1.02328563, + "epoch": 0.34378475875544867, + "flos": 27671041382400.0, + "grad_norm": 1.7242431315742077, + "language_loss": 0.75316566, + "learning_rate": 2.9429808986977175e-06, + "loss": 0.77419877, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.51171875, + "step": 5718, + "time_per_iteration": 2.4267332553863525 + }, + { + "auxiliary_loss_clip": 0.01074498, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.02221656, + "balance_loss_mlp": 1.02298272, + "epoch": 0.34384488200811664, + "flos": 31430139736320.0, + "grad_norm": 2.09727195742519, + "language_loss": 0.63215935, + "learning_rate": 2.9426476912562905e-06, + "loss": 0.65327752, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.515625, + "step": 5719, + "time_per_iteration": 2.4573211669921875 + }, + { + "auxiliary_loss_clip": 0.01076853, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.01659739, + "balance_loss_mlp": 1.02301538, + "epoch": 0.3439050052607846, + "flos": 24898763007360.0, + "grad_norm": 2.7939078417818957, + "language_loss": 0.73268723, + "learning_rate": 2.9423144501741918e-06, + "loss": 0.75379127, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5390625, + "step": 5720, + "time_per_iteration": 2.404113531112671 + }, + { + "auxiliary_loss_clip": 0.0107499, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.01810896, + "balance_loss_mlp": 1.02303505, + "epoch": 0.34396512851345257, + "flos": 18149109258240.0, + "grad_norm": 3.081595879074298, + "language_loss": 0.73838741, + "learning_rate": 2.9419811754633143e-06, + "loss": 0.7594707, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.51953125, + "step": 5721, + "time_per_iteration": 2.3456480503082275 + }, + { + "auxiliary_loss_clip": 0.01077279, + "auxiliary_loss_mlp": 0.01037928, + "balance_loss_clip": 1.02213287, + "balance_loss_mlp": 1.02426636, + "epoch": 0.34402525176612053, + "flos": 16033512245760.0, + "grad_norm": 2.3607524072250174, + "language_loss": 0.80540323, + "learning_rate": 2.9416478671355516e-06, + "loss": 0.82655531, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53125, + "step": 5722, + "time_per_iteration": 2.3664419651031494 + }, + { + "auxiliary_loss_clip": 0.01075159, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01406729, + "balance_loss_mlp": 1.0242641, + "epoch": 0.3440853750187885, + "flos": 21177580256640.0, + "grad_norm": 1.6469063943501463, + "language_loss": 0.81590909, + "learning_rate": 2.9413145252027985e-06, + "loss": 0.83694184, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.5078125, + "step": 5723, + "time_per_iteration": 2.3793845176696777 + }, + { + "auxiliary_loss_clip": 0.01075197, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.01790333, + "balance_loss_mlp": 1.02287889, + "epoch": 0.34414549827145646, + "flos": 12677032224000.0, + "grad_norm": 2.004536861061174, + "language_loss": 0.81350088, + "learning_rate": 2.940981149676952e-06, + "loss": 0.83458149, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5234375, + "step": 5724, + "time_per_iteration": 2.359459400177002 + }, + { + "auxiliary_loss_clip": 0.01076813, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.01961946, + "balance_loss_mlp": 1.02426028, + "epoch": 0.3442056215241244, + "flos": 31283190357120.0, + "grad_norm": 1.7974641488688818, + "language_loss": 0.69345838, + "learning_rate": 2.940647740569908e-06, + "loss": 0.71457732, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.52734375, + "step": 5725, + "time_per_iteration": 2.451002359390259 + }, + { + "auxiliary_loss_clip": 0.01079179, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.02187443, + "balance_loss_mlp": 1.02365041, + "epoch": 0.34426574477679245, + "flos": 23366180655360.0, + "grad_norm": 1.3771175473156736, + "language_loss": 0.69205964, + "learning_rate": 2.9403142978935665e-06, + "loss": 0.71324199, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5546875, + "step": 5726, + "time_per_iteration": 2.3979878425598145 + }, + { + "auxiliary_loss_clip": 0.01074969, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.01743448, + "balance_loss_mlp": 1.02418554, + "epoch": 0.3443258680294604, + "flos": 24534269746560.0, + "grad_norm": 1.8254818200113507, + "language_loss": 0.72980255, + "learning_rate": 2.939980821659826e-06, + "loss": 0.75087065, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5078125, + "step": 5727, + "time_per_iteration": 2.400801658630371 + }, + { + "auxiliary_loss_clip": 0.01074399, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.01679194, + "balance_loss_mlp": 1.02309775, + "epoch": 0.3443859912821284, + "flos": 20229094817280.0, + "grad_norm": 2.052924977815989, + "language_loss": 0.8778612, + "learning_rate": 2.9396473118805886e-06, + "loss": 0.89892799, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.51171875, + "step": 5728, + "time_per_iteration": 2.3673133850097656 + }, + { + "auxiliary_loss_clip": 0.01074814, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.01354897, + "balance_loss_mlp": 1.02365649, + "epoch": 0.34444611453479634, + "flos": 24315364321920.0, + "grad_norm": 2.36386125166066, + "language_loss": 0.70071566, + "learning_rate": 2.9393137685677555e-06, + "loss": 0.72174501, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.51171875, + "step": 5729, + "time_per_iteration": 2.3869028091430664 + }, + { + "auxiliary_loss_clip": 0.01072309, + "auxiliary_loss_mlp": 0.01026316, + "balance_loss_clip": 1.01136732, + "balance_loss_mlp": 1.02240229, + "epoch": 0.3445062377874643, + "flos": 16982451532800.0, + "grad_norm": 1.9786301324215365, + "language_loss": 0.74284554, + "learning_rate": 2.9389801917332294e-06, + "loss": 0.76383179, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5, + "step": 5730, + "time_per_iteration": 2.359952688217163 + }, + { + "auxiliary_loss_clip": 0.01075309, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.01738501, + "balance_loss_mlp": 1.02374601, + "epoch": 0.3445663610401323, + "flos": 20301679267200.0, + "grad_norm": 2.4256585009243996, + "language_loss": 0.79217422, + "learning_rate": 2.938646581388917e-06, + "loss": 0.81325412, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.515625, + "step": 5731, + "time_per_iteration": 2.354198932647705 + }, + { + "auxiliary_loss_clip": 0.01073332, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.01530051, + "balance_loss_mlp": 1.02286911, + "epoch": 0.34462648429280024, + "flos": 15887191271040.0, + "grad_norm": 1.7999135472334495, + "language_loss": 0.7846632, + "learning_rate": 2.9383129375467214e-06, + "loss": 0.80570114, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.50390625, + "step": 5732, + "time_per_iteration": 2.3416993618011475 + }, + { + "auxiliary_loss_clip": 0.01016597, + "auxiliary_loss_mlp": 0.01018421, + "balance_loss_clip": 1.01696038, + "balance_loss_mlp": 1.00597429, + "epoch": 0.3446866075454682, + "flos": 59307760967040.0, + "grad_norm": 0.7502479164887117, + "language_loss": 0.53450692, + "learning_rate": 2.937979260218551e-06, + "loss": 0.55485713, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.10644531, + "step": 5733, + "time_per_iteration": 3.051616668701172 + }, + { + "auxiliary_loss_clip": 0.01077458, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.01828384, + "balance_loss_mlp": 1.02557707, + "epoch": 0.34474673079813617, + "flos": 22342771704960.0, + "grad_norm": 1.8022796957976845, + "language_loss": 0.83247119, + "learning_rate": 2.9376455494163137e-06, + "loss": 0.8535884, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.51953125, + "step": 5734, + "time_per_iteration": 2.3901491165161133 + }, + { + "auxiliary_loss_clip": 0.01077648, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.01475275, + "balance_loss_mlp": 1.02484977, + "epoch": 0.34480685405080413, + "flos": 27668981612160.0, + "grad_norm": 1.8166628704497338, + "language_loss": 0.92839354, + "learning_rate": 2.9373118051519185e-06, + "loss": 0.94946253, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.52734375, + "step": 5735, + "time_per_iteration": 2.4364943504333496 + }, + { + "auxiliary_loss_clip": 0.01078311, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.02165425, + "balance_loss_mlp": 1.02551818, + "epoch": 0.3448669773034721, + "flos": 22454912592000.0, + "grad_norm": 1.738102938193459, + "language_loss": 0.76464218, + "learning_rate": 2.936978027437276e-06, + "loss": 0.78580451, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.52734375, + "step": 5736, + "time_per_iteration": 2.4330787658691406 + }, + { + "auxiliary_loss_clip": 0.01079526, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.01942849, + "balance_loss_mlp": 1.02621675, + "epoch": 0.34492710055614006, + "flos": 24935037776640.0, + "grad_norm": 1.5953825999955018, + "language_loss": 0.7859174, + "learning_rate": 2.9366442162842976e-06, + "loss": 0.80706549, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53515625, + "step": 5737, + "time_per_iteration": 2.4368796348571777 + }, + { + "auxiliary_loss_clip": 0.01078198, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.01867485, + "balance_loss_mlp": 1.02395606, + "epoch": 0.34498722380880803, + "flos": 20119781750400.0, + "grad_norm": 2.3215257339896884, + "language_loss": 0.72001284, + "learning_rate": 2.936310371704897e-06, + "loss": 0.74114645, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.54296875, + "step": 5738, + "time_per_iteration": 2.37078595161438 + }, + { + "auxiliary_loss_clip": 0.01077618, + "auxiliary_loss_mlp": 0.01036955, + "balance_loss_clip": 1.0200274, + "balance_loss_mlp": 1.02273893, + "epoch": 0.34504734706147605, + "flos": 28436896166400.0, + "grad_norm": 1.927403471930639, + "language_loss": 0.81537116, + "learning_rate": 2.9359764937109877e-06, + "loss": 0.83651686, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.546875, + "step": 5739, + "time_per_iteration": 2.4279112815856934 + }, + { + "auxiliary_loss_clip": 0.01077766, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.01411963, + "balance_loss_mlp": 1.02528167, + "epoch": 0.345107470314144, + "flos": 22673364168960.0, + "grad_norm": 1.8140352206923913, + "language_loss": 0.82550031, + "learning_rate": 2.9356425823144847e-06, + "loss": 0.84658629, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5234375, + "step": 5740, + "time_per_iteration": 2.38163423538208 + }, + { + "auxiliary_loss_clip": 0.01076628, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.02053297, + "balance_loss_mlp": 1.02384496, + "epoch": 0.345167593566812, + "flos": 20629688290560.0, + "grad_norm": 2.20053714675375, + "language_loss": 0.76983535, + "learning_rate": 2.9353086375273047e-06, + "loss": 0.79096317, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.52734375, + "step": 5741, + "time_per_iteration": 2.39556622505188 + }, + { + "auxiliary_loss_clip": 0.01077453, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.01949894, + "balance_loss_mlp": 1.02289307, + "epoch": 0.34522771681947995, + "flos": 26213138161920.0, + "grad_norm": 2.7080009040108686, + "language_loss": 0.76901495, + "learning_rate": 2.9349746593613654e-06, + "loss": 0.79014397, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.546875, + "step": 5742, + "time_per_iteration": 2.4205451011657715 + }, + { + "auxiliary_loss_clip": 0.01076968, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.01612163, + "balance_loss_mlp": 1.02513111, + "epoch": 0.3452878400721479, + "flos": 19061354839680.0, + "grad_norm": 2.1362010721961755, + "language_loss": 0.75643522, + "learning_rate": 2.934640647828586e-06, + "loss": 0.77751452, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.51953125, + "step": 5743, + "time_per_iteration": 2.3760452270507812 + }, + { + "auxiliary_loss_clip": 0.01076572, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.01658332, + "balance_loss_mlp": 1.02589667, + "epoch": 0.3453479633248159, + "flos": 27928455903360.0, + "grad_norm": 1.7423094726165422, + "language_loss": 0.70363706, + "learning_rate": 2.934306602940885e-06, + "loss": 0.72471857, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5078125, + "step": 5744, + "time_per_iteration": 3.9053919315338135 + }, + { + "auxiliary_loss_clip": 0.01076273, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.02117252, + "balance_loss_mlp": 1.02527165, + "epoch": 0.34540808657748384, + "flos": 19605197088000.0, + "grad_norm": 1.730994897244149, + "language_loss": 0.79508853, + "learning_rate": 2.9339725247101855e-06, + "loss": 0.81622088, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5078125, + "step": 5745, + "time_per_iteration": 2.3645341396331787 + }, + { + "auxiliary_loss_clip": 0.01078754, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.02447414, + "epoch": 0.3454682098301518, + "flos": 20410643220480.0, + "grad_norm": 2.0932756279079814, + "language_loss": 0.89304304, + "learning_rate": 2.933638413148409e-06, + "loss": 0.91421044, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.54296875, + "step": 5746, + "time_per_iteration": 2.365699052810669 + }, + { + "auxiliary_loss_clip": 0.0107653, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.0184691, + "balance_loss_mlp": 1.02264893, + "epoch": 0.34552833308281977, + "flos": 21324040876800.0, + "grad_norm": 1.9816292143558374, + "language_loss": 0.63900352, + "learning_rate": 2.9333042682674788e-06, + "loss": 0.66011238, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5390625, + "step": 5747, + "time_per_iteration": 2.3807475566864014 + }, + { + "auxiliary_loss_clip": 0.01076452, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.01575661, + "balance_loss_mlp": 1.02542019, + "epoch": 0.34558845633548774, + "flos": 36242253740160.0, + "grad_norm": 3.251906901895553, + "language_loss": 0.72763515, + "learning_rate": 2.9329700900793207e-06, + "loss": 0.74870741, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 5748, + "time_per_iteration": 5.37058424949646 + }, + { + "auxiliary_loss_clip": 0.0107386, + "auxiliary_loss_mlp": 0.01025661, + "balance_loss_clip": 1.0116837, + "balance_loss_mlp": 1.02340102, + "epoch": 0.3456485795881557, + "flos": 22449606065280.0, + "grad_norm": 1.574322390276474, + "language_loss": 0.75496805, + "learning_rate": 2.9326358785958593e-06, + "loss": 0.77596331, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.50390625, + "step": 5749, + "time_per_iteration": 2.428903818130493 + }, + { + "auxiliary_loss_clip": 0.0101205, + "auxiliary_loss_mlp": 0.01000613, + "balance_loss_clip": 0.99915814, + "balance_loss_mlp": 1.00142384, + "epoch": 0.34570870284082367, + "flos": 62001135936000.0, + "grad_norm": 0.8768927374316948, + "language_loss": 0.62612498, + "learning_rate": 2.9323016338290227e-06, + "loss": 0.64625168, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.10644531, + "step": 5750, + "time_per_iteration": 2.904465675354004 + }, + { + "auxiliary_loss_clip": 0.01070569, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.0119102, + "balance_loss_mlp": 1.02240634, + "epoch": 0.34576882609349163, + "flos": 22781141136000.0, + "grad_norm": 1.781390994107412, + "language_loss": 0.80388081, + "learning_rate": 2.931967355790739e-06, + "loss": 0.82485062, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48242188, + "step": 5751, + "time_per_iteration": 2.4359335899353027 + }, + { + "auxiliary_loss_clip": 0.01074952, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.02043736, + "balance_loss_mlp": 1.02548003, + "epoch": 0.34582894934615965, + "flos": 12348010771200.0, + "grad_norm": 2.0951007095316347, + "language_loss": 0.83903956, + "learning_rate": 2.931633044492937e-06, + "loss": 0.86013579, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.49609375, + "step": 5752, + "time_per_iteration": 2.346137285232544 + }, + { + "auxiliary_loss_clip": 0.0101195, + "auxiliary_loss_mlp": 0.01000668, + "balance_loss_clip": 0.99930316, + "balance_loss_mlp": 1.00140488, + "epoch": 0.3458890725988276, + "flos": 70164563080320.0, + "grad_norm": 0.7362709478195214, + "language_loss": 0.63234472, + "learning_rate": 2.931298699947549e-06, + "loss": 0.65247089, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.10546875, + "step": 5753, + "time_per_iteration": 4.350102186203003 + }, + { + "auxiliary_loss_clip": 0.01074815, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.02073264, + "balance_loss_mlp": 1.02396595, + "epoch": 0.3459491958514956, + "flos": 17091624954240.0, + "grad_norm": 1.9361911633317213, + "language_loss": 0.71062148, + "learning_rate": 2.9309643221665054e-06, + "loss": 0.73173523, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5078125, + "step": 5754, + "time_per_iteration": 2.382948875427246 + }, + { + "auxiliary_loss_clip": 0.01076214, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.01380539, + "balance_loss_mlp": 1.02380681, + "epoch": 0.34600931910416355, + "flos": 16650113500800.0, + "grad_norm": 1.7727484939524747, + "language_loss": 0.72692454, + "learning_rate": 2.9306299111617402e-06, + "loss": 0.74798429, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5234375, + "step": 5755, + "time_per_iteration": 2.402243137359619 + }, + { + "auxiliary_loss_clip": 0.01074005, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.01872611, + "balance_loss_mlp": 1.0241015, + "epoch": 0.3460694423568315, + "flos": 38544635859840.0, + "grad_norm": 1.5349196514409829, + "language_loss": 0.72659063, + "learning_rate": 2.9302954669451875e-06, + "loss": 0.74766636, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49804688, + "step": 5756, + "time_per_iteration": 2.5276730060577393 + }, + { + "auxiliary_loss_clip": 0.01012249, + "auxiliary_loss_mlp": 0.0100983, + "balance_loss_clip": 1.00842357, + "balance_loss_mlp": 1.00163567, + "epoch": 0.3461295656094995, + "flos": 72077837564160.0, + "grad_norm": 0.7095907827505381, + "language_loss": 0.62560987, + "learning_rate": 2.9299609895287817e-06, + "loss": 0.64583063, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.10644531, + "step": 5757, + "time_per_iteration": 3.0415329933166504 + }, + { + "auxiliary_loss_clip": 0.01011606, + "auxiliary_loss_mlp": 0.01005802, + "balance_loss_clip": 1.00447905, + "balance_loss_mlp": 1.00110459, + "epoch": 0.34618968886216744, + "flos": 65457118932480.0, + "grad_norm": 0.8188537340470671, + "language_loss": 0.59229028, + "learning_rate": 2.929626478924461e-06, + "loss": 0.61246443, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.10498047, + "step": 5758, + "time_per_iteration": 3.0388827323913574 + }, + { + "auxiliary_loss_clip": 0.01075919, + "auxiliary_loss_mlp": 0.01036041, + "balance_loss_clip": 1.02117562, + "balance_loss_mlp": 1.02438259, + "epoch": 0.3462498121148354, + "flos": 23471548738560.0, + "grad_norm": 1.8880444198446622, + "language_loss": 0.71870965, + "learning_rate": 2.9292919351441626e-06, + "loss": 0.73982924, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.515625, + "step": 5759, + "time_per_iteration": 2.408294677734375 + }, + { + "auxiliary_loss_clip": 0.01074462, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.02254105, + "balance_loss_mlp": 1.02385116, + "epoch": 0.3463099353675034, + "flos": 24169636840320.0, + "grad_norm": 1.8755606585016409, + "language_loss": 0.83612382, + "learning_rate": 2.928957358199825e-06, + "loss": 0.8572486, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5078125, + "step": 5760, + "time_per_iteration": 2.436171770095825 + }, + { + "auxiliary_loss_clip": 0.01011966, + "auxiliary_loss_mlp": 0.01001194, + "balance_loss_clip": 0.99995399, + "balance_loss_mlp": 1.00140882, + "epoch": 0.34637005862017134, + "flos": 63697915745280.0, + "grad_norm": 0.8139112462603719, + "language_loss": 0.56570119, + "learning_rate": 2.9286227481033903e-06, + "loss": 0.58583277, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.10546875, + "step": 5761, + "time_per_iteration": 3.1018810272216797 + }, + { + "auxiliary_loss_clip": 0.01073732, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.01768374, + "balance_loss_mlp": 1.02344429, + "epoch": 0.3464301818728393, + "flos": 13144868709120.0, + "grad_norm": 2.0034356663293305, + "language_loss": 0.94723499, + "learning_rate": 2.9282881048667972e-06, + "loss": 0.96829712, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.50390625, + "step": 5762, + "time_per_iteration": 2.357673168182373 + }, + { + "auxiliary_loss_clip": 0.01076457, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.01435089, + "balance_loss_mlp": 1.02338696, + "epoch": 0.34649030512550727, + "flos": 29313879408000.0, + "grad_norm": 1.7542001792396027, + "language_loss": 0.69893539, + "learning_rate": 2.927953428501989e-06, + "loss": 0.71999967, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 5763, + "time_per_iteration": 2.442671775817871 + }, + { + "auxiliary_loss_clip": 0.01079704, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.019526, + "balance_loss_mlp": 1.02577722, + "epoch": 0.34655042837817523, + "flos": 23729801132160.0, + "grad_norm": 1.6769862060415226, + "language_loss": 0.83350599, + "learning_rate": 2.9276187190209107e-06, + "loss": 0.85467255, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5390625, + "step": 5764, + "time_per_iteration": 2.403332471847534 + }, + { + "auxiliary_loss_clip": 0.01075342, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.01154089, + "balance_loss_mlp": 1.02309275, + "epoch": 0.34661055163084326, + "flos": 22053132132480.0, + "grad_norm": 2.2522769181028206, + "language_loss": 0.67345691, + "learning_rate": 2.927283976435506e-06, + "loss": 0.69447756, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5234375, + "step": 5765, + "time_per_iteration": 2.3712351322174072 + }, + { + "auxiliary_loss_clip": 0.01076218, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.02057719, + "balance_loss_mlp": 1.0238992, + "epoch": 0.3466706748835112, + "flos": 21798126495360.0, + "grad_norm": 2.5534336520752845, + "language_loss": 0.76755512, + "learning_rate": 2.926949200757722e-06, + "loss": 0.78867447, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5234375, + "step": 5766, + "time_per_iteration": 2.39005184173584 + }, + { + "auxiliary_loss_clip": 0.01072689, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.01452541, + "balance_loss_mlp": 1.02277708, + "epoch": 0.3467307981361792, + "flos": 19460726415360.0, + "grad_norm": 1.3885360457512503, + "language_loss": 0.73505926, + "learning_rate": 2.926614391999505e-06, + "loss": 0.7560761, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5, + "step": 5767, + "time_per_iteration": 2.362863302230835 + }, + { + "auxiliary_loss_clip": 0.01078442, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.01603234, + "balance_loss_mlp": 1.02600789, + "epoch": 0.34679092138884715, + "flos": 24826283291520.0, + "grad_norm": 1.7762556141611965, + "language_loss": 0.77778924, + "learning_rate": 2.926279550172804e-06, + "loss": 0.79889715, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5234375, + "step": 5768, + "time_per_iteration": 2.4200541973114014 + }, + { + "auxiliary_loss_clip": 0.01013231, + "auxiliary_loss_mlp": 0.01001561, + "balance_loss_clip": 1.00019014, + "balance_loss_mlp": 1.00275207, + "epoch": 0.3468510446415151, + "flos": 63233116548480.0, + "grad_norm": 0.7683543475722181, + "language_loss": 0.57456195, + "learning_rate": 2.9259446752895686e-06, + "loss": 0.59470987, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.10498047, + "step": 5769, + "time_per_iteration": 3.068924903869629 + }, + { + "auxiliary_loss_clip": 0.01083012, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.01468158, + "balance_loss_mlp": 1.02535903, + "epoch": 0.3469111678941831, + "flos": 12120168038400.0, + "grad_norm": 3.5004369628595042, + "language_loss": 0.86012661, + "learning_rate": 2.9256097673617495e-06, + "loss": 0.88127929, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.578125, + "step": 5770, + "time_per_iteration": 2.387011766433716 + }, + { + "auxiliary_loss_clip": 0.01011965, + "auxiliary_loss_mlp": 0.01001873, + "balance_loss_clip": 1.00060976, + "balance_loss_mlp": 1.0014503, + "epoch": 0.34697129114685105, + "flos": 65931134728320.0, + "grad_norm": 0.7645425760205378, + "language_loss": 0.59758162, + "learning_rate": 2.9252748264012985e-06, + "loss": 0.61772001, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.10546875, + "step": 5771, + "time_per_iteration": 2.822774648666382 + }, + { + "auxiliary_loss_clip": 0.01074503, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.01688278, + "balance_loss_mlp": 1.02360356, + "epoch": 0.347031414399519, + "flos": 34452920183040.0, + "grad_norm": 1.6598753304082707, + "language_loss": 0.71974301, + "learning_rate": 2.9249398524201693e-06, + "loss": 0.74080938, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 5772, + "time_per_iteration": 2.4949748516082764 + }, + { + "auxiliary_loss_clip": 0.01076371, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.01287127, + "balance_loss_mlp": 1.02321267, + "epoch": 0.347091537652187, + "flos": 26942892733440.0, + "grad_norm": 1.3769018633624883, + "language_loss": 0.76191044, + "learning_rate": 2.9246048454303165e-06, + "loss": 0.78296363, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.53125, + "step": 5773, + "time_per_iteration": 2.4566125869750977 + }, + { + "auxiliary_loss_clip": 0.01076734, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.01663136, + "balance_loss_mlp": 1.02347696, + "epoch": 0.34715166090485494, + "flos": 21141165841920.0, + "grad_norm": 2.3084275675194594, + "language_loss": 0.7031635, + "learning_rate": 2.9242698054436942e-06, + "loss": 0.72426146, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5774, + "time_per_iteration": 2.4065067768096924 + }, + { + "auxiliary_loss_clip": 0.01074918, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.01771677, + "balance_loss_mlp": 1.02435637, + "epoch": 0.3472117841575229, + "flos": 23476855265280.0, + "grad_norm": 1.6153572545834267, + "language_loss": 0.76111162, + "learning_rate": 2.9239347324722605e-06, + "loss": 0.78218669, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.50390625, + "step": 5775, + "time_per_iteration": 2.5024466514587402 + }, + { + "auxiliary_loss_clip": 0.01077937, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.0144968, + "balance_loss_mlp": 1.02390623, + "epoch": 0.34727190741019087, + "flos": 17491869313920.0, + "grad_norm": 2.0187230457228624, + "language_loss": 0.77591276, + "learning_rate": 2.923599626527973e-06, + "loss": 0.79700303, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5390625, + "step": 5776, + "time_per_iteration": 2.374683380126953 + }, + { + "auxiliary_loss_clip": 0.01014615, + "auxiliary_loss_mlp": 0.01000357, + "balance_loss_clip": 0.99898612, + "balance_loss_mlp": 1.00433517, + "epoch": 0.34733203066285884, + "flos": 65261848498560.0, + "grad_norm": 1.13396177879421, + "language_loss": 0.63349223, + "learning_rate": 2.9232644876227904e-06, + "loss": 0.65364194, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.10253906, + "step": 5777, + "time_per_iteration": 3.0906662940979004 + }, + { + "auxiliary_loss_clip": 0.0107635, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.01598871, + "balance_loss_mlp": 1.02296972, + "epoch": 0.3473921539155268, + "flos": 28657442424960.0, + "grad_norm": 1.828737695489423, + "language_loss": 0.64427119, + "learning_rate": 2.9229293157686732e-06, + "loss": 0.66535383, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.53125, + "step": 5778, + "time_per_iteration": 2.5096380710601807 + }, + { + "auxiliary_loss_clip": 0.01079001, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.0173161, + "balance_loss_mlp": 1.02501488, + "epoch": 0.3474522771681948, + "flos": 40835497230720.0, + "grad_norm": 1.6874462169990343, + "language_loss": 0.71371233, + "learning_rate": 2.9225941109775825e-06, + "loss": 0.73483562, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5390625, + "step": 5779, + "time_per_iteration": 2.5633721351623535 + }, + { + "auxiliary_loss_clip": 0.01080528, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02174294, + "balance_loss_mlp": 1.02552652, + "epoch": 0.3475124004208628, + "flos": 24607412778240.0, + "grad_norm": 2.043754218686711, + "language_loss": 0.62217283, + "learning_rate": 2.9222588732614818e-06, + "loss": 0.64335632, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.55078125, + "step": 5780, + "time_per_iteration": 2.4177258014678955 + }, + { + "auxiliary_loss_clip": 0.01075642, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.01859534, + "balance_loss_mlp": 1.02438533, + "epoch": 0.34757252367353075, + "flos": 22710197520000.0, + "grad_norm": 1.6086040301959126, + "language_loss": 0.72151911, + "learning_rate": 2.921923602632333e-06, + "loss": 0.74261189, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.51171875, + "step": 5781, + "time_per_iteration": 2.430724620819092 + }, + { + "auxiliary_loss_clip": 0.01079651, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.02189898, + "balance_loss_mlp": 1.02653515, + "epoch": 0.3476326469261987, + "flos": 19827174712320.0, + "grad_norm": 1.8761444313037403, + "language_loss": 0.76921785, + "learning_rate": 2.9215882991021036e-06, + "loss": 0.7904098, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.53125, + "step": 5782, + "time_per_iteration": 2.3804807662963867 + }, + { + "auxiliary_loss_clip": 0.01075955, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.01630783, + "balance_loss_mlp": 1.02366221, + "epoch": 0.3476927701788667, + "flos": 19937081272320.0, + "grad_norm": 1.8881325478499325, + "language_loss": 0.62519693, + "learning_rate": 2.9212529626827582e-06, + "loss": 0.64627182, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5234375, + "step": 5783, + "time_per_iteration": 2.4045162200927734 + }, + { + "auxiliary_loss_clip": 0.01073196, + "auxiliary_loss_mlp": 0.0102567, + "balance_loss_clip": 1.01146054, + "balance_loss_mlp": 1.02310634, + "epoch": 0.34775289343153465, + "flos": 20734218501120.0, + "grad_norm": 1.6226022829266527, + "language_loss": 0.88506716, + "learning_rate": 2.9209175933862636e-06, + "loss": 0.90605581, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.5, + "step": 5784, + "time_per_iteration": 3.7488017082214355 + }, + { + "auxiliary_loss_clip": 0.01074304, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.01439333, + "balance_loss_mlp": 1.02360535, + "epoch": 0.3478130166842026, + "flos": 19353822232320.0, + "grad_norm": 1.5938023869846187, + "language_loss": 0.81219316, + "learning_rate": 2.92058219122459e-06, + "loss": 0.83323359, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5078125, + "step": 5785, + "time_per_iteration": 2.414529800415039 + }, + { + "auxiliary_loss_clip": 0.01079448, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.02221203, + "balance_loss_mlp": 1.02634561, + "epoch": 0.3478731399368706, + "flos": 22050199578240.0, + "grad_norm": 1.794738943084231, + "language_loss": 0.80747348, + "learning_rate": 2.9202467562097052e-06, + "loss": 0.82864165, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.53125, + "step": 5786, + "time_per_iteration": 2.4369876384735107 + }, + { + "auxiliary_loss_clip": 0.01076496, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.01723695, + "balance_loss_mlp": 1.02581239, + "epoch": 0.34793326318953854, + "flos": 18040459507200.0, + "grad_norm": 2.3785193508610014, + "language_loss": 0.75002062, + "learning_rate": 2.9199112883535813e-06, + "loss": 0.77111471, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5078125, + "step": 5787, + "time_per_iteration": 5.203081369400024 + }, + { + "auxiliary_loss_clip": 0.01076853, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.01446939, + "balance_loss_mlp": 1.02461243, + "epoch": 0.3479933864422065, + "flos": 29313390648960.0, + "grad_norm": 1.7350346884740975, + "language_loss": 0.81622982, + "learning_rate": 2.919575787668189e-06, + "loss": 0.83729744, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5234375, + "step": 5788, + "time_per_iteration": 2.4759206771850586 + }, + { + "auxiliary_loss_clip": 0.01081896, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.01657593, + "balance_loss_mlp": 1.02689099, + "epoch": 0.3480535096948745, + "flos": 20119677016320.0, + "grad_norm": 2.4067150975214235, + "language_loss": 0.8551451, + "learning_rate": 2.919240254165503e-06, + "loss": 0.87629896, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.55078125, + "step": 5789, + "time_per_iteration": 2.4148001670837402 + }, + { + "auxiliary_loss_clip": 0.01079605, + "auxiliary_loss_mlp": 0.01049796, + "balance_loss_clip": 1.03234363, + "balance_loss_mlp": 1.02595782, + "epoch": 0.34811363294754244, + "flos": 18548061897600.0, + "grad_norm": 1.7079288661975327, + "language_loss": 0.85120916, + "learning_rate": 2.918904687857497e-06, + "loss": 0.87250316, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.53515625, + "step": 5790, + "time_per_iteration": 2.3937594890594482 + }, + { + "auxiliary_loss_clip": 0.0107882, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02116656, + "balance_loss_mlp": 1.02550459, + "epoch": 0.3481737562002104, + "flos": 26869086385920.0, + "grad_norm": 2.0424667302257014, + "language_loss": 0.81423348, + "learning_rate": 2.9185690887561463e-06, + "loss": 0.8353976, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5791, + "time_per_iteration": 2.4153356552124023 + }, + { + "auxiliary_loss_clip": 0.01077823, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.01322699, + "balance_loss_mlp": 1.02349496, + "epoch": 0.3482338794528784, + "flos": 28907525560320.0, + "grad_norm": 1.8588390468281573, + "language_loss": 0.77465641, + "learning_rate": 2.918233456873428e-06, + "loss": 0.79572481, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.54296875, + "step": 5792, + "time_per_iteration": 2.45379900932312 + }, + { + "auxiliary_loss_clip": 0.01074614, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.01203263, + "balance_loss_mlp": 1.02255368, + "epoch": 0.3482940027055464, + "flos": 22199662575360.0, + "grad_norm": 1.6172860339390311, + "language_loss": 0.81855458, + "learning_rate": 2.9178977922213188e-06, + "loss": 0.83957553, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.51953125, + "step": 5793, + "time_per_iteration": 3.8589749336242676 + }, + { + "auxiliary_loss_clip": 0.01077809, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.02494013, + "balance_loss_mlp": 1.02469444, + "epoch": 0.34835412595821436, + "flos": 20301679267200.0, + "grad_norm": 1.7174350608394966, + "language_loss": 0.75724077, + "learning_rate": 2.917562094811799e-06, + "loss": 0.77843487, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.53125, + "step": 5794, + "time_per_iteration": 2.424049139022827 + }, + { + "auxiliary_loss_clip": 0.01076234, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.02066886, + "balance_loss_mlp": 1.02419746, + "epoch": 0.3484142492108823, + "flos": 20448628646400.0, + "grad_norm": 5.746312748421174, + "language_loss": 0.56843466, + "learning_rate": 2.917226364656848e-06, + "loss": 0.58955985, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5234375, + "step": 5795, + "time_per_iteration": 2.3890457153320312 + }, + { + "auxiliary_loss_clip": 0.0107618, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.01239753, + "balance_loss_mlp": 1.02457547, + "epoch": 0.3484743724635503, + "flos": 24351778736640.0, + "grad_norm": 1.7390058205596206, + "language_loss": 0.82748753, + "learning_rate": 2.9168906017684474e-06, + "loss": 0.84853005, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.515625, + "step": 5796, + "time_per_iteration": 2.425893545150757 + }, + { + "auxiliary_loss_clip": 0.01074377, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.01304209, + "balance_loss_mlp": 1.02409446, + "epoch": 0.34853449571621825, + "flos": 24351848559360.0, + "grad_norm": 1.7386246320570766, + "language_loss": 0.83200371, + "learning_rate": 2.91655480615858e-06, + "loss": 0.8530218, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.50390625, + "step": 5797, + "time_per_iteration": 2.4673445224761963 + }, + { + "auxiliary_loss_clip": 0.01073897, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.01665902, + "balance_loss_mlp": 1.02394271, + "epoch": 0.3485946189688862, + "flos": 27266572748160.0, + "grad_norm": 2.4555665395737285, + "language_loss": 0.73516202, + "learning_rate": 2.9162189778392286e-06, + "loss": 0.75622225, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5, + "step": 5798, + "time_per_iteration": 2.4390857219696045 + }, + { + "auxiliary_loss_clip": 0.0107446, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.01660144, + "balance_loss_mlp": 1.02269697, + "epoch": 0.3486547422215542, + "flos": 20155672494720.0, + "grad_norm": 2.020521921115232, + "language_loss": 0.75903696, + "learning_rate": 2.9158831168223797e-06, + "loss": 0.78010577, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.51953125, + "step": 5799, + "time_per_iteration": 2.391947031021118 + }, + { + "auxiliary_loss_clip": 0.01076702, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.01552272, + "balance_loss_mlp": 1.0248158, + "epoch": 0.34871486547422215, + "flos": 20229304285440.0, + "grad_norm": 2.0517329753676483, + "language_loss": 0.75460827, + "learning_rate": 2.915547223120018e-06, + "loss": 0.77567446, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.51953125, + "step": 5800, + "time_per_iteration": 2.4259848594665527 + }, + { + "auxiliary_loss_clip": 0.01079949, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.0218401, + "balance_loss_mlp": 1.02539814, + "epoch": 0.3487749887268901, + "flos": 44051591208960.0, + "grad_norm": 1.6037827883161688, + "language_loss": 0.66977096, + "learning_rate": 2.9152112967441307e-06, + "loss": 0.69095802, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.546875, + "step": 5801, + "time_per_iteration": 2.594118118286133 + }, + { + "auxiliary_loss_clip": 0.01074173, + "auxiliary_loss_mlp": 0.01029465, + "balance_loss_clip": 1.01387203, + "balance_loss_mlp": 1.02352047, + "epoch": 0.3488351119795581, + "flos": 23294015141760.0, + "grad_norm": 1.8131761048769601, + "language_loss": 0.76868844, + "learning_rate": 2.9148753377067063e-06, + "loss": 0.78972483, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5078125, + "step": 5802, + "time_per_iteration": 2.4443297386169434 + }, + { + "auxiliary_loss_clip": 0.01072151, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.01556194, + "balance_loss_mlp": 1.02278686, + "epoch": 0.34889523523222604, + "flos": 19933904338560.0, + "grad_norm": 1.5974508130425775, + "language_loss": 0.78238654, + "learning_rate": 2.9145393460197346e-06, + "loss": 0.8034122, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49414062, + "step": 5803, + "time_per_iteration": 2.3786134719848633 + }, + { + "auxiliary_loss_clip": 0.01075542, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.01674187, + "balance_loss_mlp": 1.02237821, + "epoch": 0.348955358484894, + "flos": 30444855857280.0, + "grad_norm": 2.55264393675343, + "language_loss": 0.67801392, + "learning_rate": 2.914203321695206e-06, + "loss": 0.69909281, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.53125, + "step": 5804, + "time_per_iteration": 2.4601457118988037 + }, + { + "auxiliary_loss_clip": 0.01072578, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.01994491, + "balance_loss_mlp": 1.02311182, + "epoch": 0.349015481737562, + "flos": 17999122590720.0, + "grad_norm": 1.7104962421911951, + "language_loss": 0.76287705, + "learning_rate": 2.913867264745113e-06, + "loss": 0.78395092, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49609375, + "step": 5805, + "time_per_iteration": 2.388007640838623 + }, + { + "auxiliary_loss_clip": 0.01076168, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01469731, + "balance_loss_mlp": 1.02544165, + "epoch": 0.34907560499023, + "flos": 27197269966080.0, + "grad_norm": 4.360438706111584, + "language_loss": 0.67598635, + "learning_rate": 2.913531175181448e-06, + "loss": 0.6970433, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 5806, + "time_per_iteration": 2.4748339653015137 + }, + { + "auxiliary_loss_clip": 0.01077264, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.01806498, + "balance_loss_mlp": 1.0251286, + "epoch": 0.34913572824289796, + "flos": 30225566407680.0, + "grad_norm": 1.412946999041995, + "language_loss": 0.71812558, + "learning_rate": 2.913195053016205e-06, + "loss": 0.73923028, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5234375, + "step": 5807, + "time_per_iteration": 2.527106285095215 + }, + { + "auxiliary_loss_clip": 0.01073411, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.01941895, + "balance_loss_mlp": 1.02175343, + "epoch": 0.3491958514955659, + "flos": 29970595681920.0, + "grad_norm": 1.828088224074478, + "language_loss": 0.73812759, + "learning_rate": 2.9128588982613794e-06, + "loss": 0.75921267, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.515625, + "step": 5808, + "time_per_iteration": 2.4526548385620117 + }, + { + "auxiliary_loss_clip": 0.01073649, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.01828885, + "balance_loss_mlp": 1.02470326, + "epoch": 0.3492559747482339, + "flos": 22782188476800.0, + "grad_norm": 1.5046794121200924, + "language_loss": 0.84514034, + "learning_rate": 2.912522710928968e-06, + "loss": 0.86620349, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49023438, + "step": 5809, + "time_per_iteration": 2.4007365703582764 + }, + { + "auxiliary_loss_clip": 0.0107256, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.01753116, + "balance_loss_mlp": 1.0239048, + "epoch": 0.34931609800090185, + "flos": 26066817187200.0, + "grad_norm": 1.813103706262116, + "language_loss": 0.80167866, + "learning_rate": 2.912186491030968e-06, + "loss": 0.82271469, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.48828125, + "step": 5810, + "time_per_iteration": 2.4301726818084717 + }, + { + "auxiliary_loss_clip": 0.01072106, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.01809931, + "balance_loss_mlp": 1.02249813, + "epoch": 0.3493762212535698, + "flos": 29240736376320.0, + "grad_norm": 1.6366058780106694, + "language_loss": 0.75798559, + "learning_rate": 2.911850238579379e-06, + "loss": 0.77903366, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49609375, + "step": 5811, + "time_per_iteration": 2.4376323223114014 + }, + { + "auxiliary_loss_clip": 0.0107568, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.0161581, + "balance_loss_mlp": 1.02264857, + "epoch": 0.3494363445062378, + "flos": 27124825161600.0, + "grad_norm": 1.3865002619728117, + "language_loss": 0.79880184, + "learning_rate": 2.9115139535862003e-06, + "loss": 0.81987703, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.53125, + "step": 5812, + "time_per_iteration": 2.4402496814727783 + }, + { + "auxiliary_loss_clip": 0.01073742, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.01657104, + "balance_loss_mlp": 1.02258897, + "epoch": 0.34949646775890575, + "flos": 12275391409920.0, + "grad_norm": 1.907659193732723, + "language_loss": 0.69322318, + "learning_rate": 2.9111776360634334e-06, + "loss": 0.71427113, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.51171875, + "step": 5813, + "time_per_iteration": 2.3609864711761475 + }, + { + "auxiliary_loss_clip": 0.01070174, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.01554418, + "balance_loss_mlp": 1.02205002, + "epoch": 0.3495565910115737, + "flos": 17164558517760.0, + "grad_norm": 1.864892791019543, + "language_loss": 0.76173961, + "learning_rate": 2.9108412860230806e-06, + "loss": 0.78274101, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 5814, + "time_per_iteration": 2.375277519226074 + }, + { + "auxiliary_loss_clip": 0.01075564, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.02041626, + "balance_loss_mlp": 1.0230937, + "epoch": 0.3496167142642417, + "flos": 26464547928960.0, + "grad_norm": 1.641588965021327, + "language_loss": 0.84404933, + "learning_rate": 2.910504903477145e-06, + "loss": 0.86517131, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5234375, + "step": 5815, + "time_per_iteration": 2.434227466583252 + }, + { + "auxiliary_loss_clip": 0.01070806, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.01516712, + "balance_loss_mlp": 1.02113342, + "epoch": 0.34967683751690964, + "flos": 17414048160000.0, + "grad_norm": 1.9441700650437517, + "language_loss": 0.77666688, + "learning_rate": 2.910168488437632e-06, + "loss": 0.79766285, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.49609375, + "step": 5816, + "time_per_iteration": 2.381499767303467 + }, + { + "auxiliary_loss_clip": 0.01074997, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.01498866, + "balance_loss_mlp": 1.02349591, + "epoch": 0.3497369607695776, + "flos": 22598964328320.0, + "grad_norm": 1.8082848661862878, + "language_loss": 0.58813262, + "learning_rate": 2.9098320409165462e-06, + "loss": 0.60917848, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.515625, + "step": 5817, + "time_per_iteration": 2.3892834186553955 + }, + { + "auxiliary_loss_clip": 0.0101592, + "auxiliary_loss_mlp": 0.01009118, + "balance_loss_clip": 1.00780642, + "balance_loss_mlp": 1.00540102, + "epoch": 0.34979708402224563, + "flos": 68526193708800.0, + "grad_norm": 0.87321725435861, + "language_loss": 0.59851706, + "learning_rate": 2.9094955609258954e-06, + "loss": 0.6187675, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.10546875, + "step": 5818, + "time_per_iteration": 3.0726945400238037 + }, + { + "auxiliary_loss_clip": 0.01070647, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.01235628, + "balance_loss_mlp": 1.02256739, + "epoch": 0.3498572072749136, + "flos": 18988630744320.0, + "grad_norm": 1.996602069957389, + "language_loss": 0.83271128, + "learning_rate": 2.909159048477688e-06, + "loss": 0.85368514, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48046875, + "step": 5819, + "time_per_iteration": 2.385380744934082 + }, + { + "auxiliary_loss_clip": 0.01071479, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.01358426, + "balance_loss_mlp": 1.0221417, + "epoch": 0.34991733052758156, + "flos": 27817641648000.0, + "grad_norm": 2.0543557592234083, + "language_loss": 0.71232194, + "learning_rate": 2.9088225035839327e-06, + "loss": 0.73331165, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4921875, + "step": 5820, + "time_per_iteration": 2.441352128982544 + }, + { + "auxiliary_loss_clip": 0.01073675, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.01569295, + "balance_loss_mlp": 1.02273846, + "epoch": 0.3499774537802495, + "flos": 33582779568000.0, + "grad_norm": 1.650895162186316, + "language_loss": 0.70354503, + "learning_rate": 2.9084859262566397e-06, + "loss": 0.72457588, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.5078125, + "step": 5821, + "time_per_iteration": 2.4730236530303955 + }, + { + "auxiliary_loss_clip": 0.01079223, + "auxiliary_loss_mlp": 0.0103638, + "balance_loss_clip": 1.01930976, + "balance_loss_mlp": 1.02490127, + "epoch": 0.3500375770329175, + "flos": 23475633367680.0, + "grad_norm": 1.9285327056340842, + "language_loss": 0.73762476, + "learning_rate": 2.9081493165078216e-06, + "loss": 0.75878084, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.54296875, + "step": 5822, + "time_per_iteration": 2.4335241317749023 + }, + { + "auxiliary_loss_clip": 0.01075611, + "auxiliary_loss_mlp": 0.0102836, + "balance_loss_clip": 1.01213551, + "balance_loss_mlp": 1.02340245, + "epoch": 0.35009770028558546, + "flos": 19025045159040.0, + "grad_norm": 2.431197042198075, + "language_loss": 0.78223145, + "learning_rate": 2.907812674349489e-06, + "loss": 0.80327117, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.51953125, + "step": 5823, + "time_per_iteration": 3.828317165374756 + }, + { + "auxiliary_loss_clip": 0.01011293, + "auxiliary_loss_mlp": 0.01006719, + "balance_loss_clip": 1.00555038, + "balance_loss_mlp": 1.00143886, + "epoch": 0.3501578235382534, + "flos": 68348555377920.0, + "grad_norm": 0.7144161359937008, + "language_loss": 0.5924117, + "learning_rate": 2.907475999793659e-06, + "loss": 0.6125918, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.09863281, + "step": 5824, + "time_per_iteration": 3.057630777359009 + }, + { + "auxiliary_loss_clip": 0.01074372, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_clip": 1.01267076, + "balance_loss_mlp": 1.02360642, + "epoch": 0.3502179467909214, + "flos": 21249850504320.0, + "grad_norm": 2.0157943792023287, + "language_loss": 0.84892666, + "learning_rate": 2.9071392928523433e-06, + "loss": 0.86995137, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5078125, + "step": 5825, + "time_per_iteration": 2.445631742477417 + }, + { + "auxiliary_loss_clip": 0.01075198, + "auxiliary_loss_mlp": 0.0102548, + "balance_loss_clip": 1.01110983, + "balance_loss_mlp": 1.02463436, + "epoch": 0.35027807004358935, + "flos": 11942285328000.0, + "grad_norm": 2.8645329266613824, + "language_loss": 0.83159238, + "learning_rate": 2.9068025535375603e-06, + "loss": 0.8525992, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.5078125, + "step": 5826, + "time_per_iteration": 2.364750385284424 + }, + { + "auxiliary_loss_clip": 0.01076329, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.02020931, + "balance_loss_mlp": 1.02465343, + "epoch": 0.3503381932962573, + "flos": 21469838181120.0, + "grad_norm": 1.482831130007531, + "language_loss": 0.808658, + "learning_rate": 2.9064657818613274e-06, + "loss": 0.82977796, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.515625, + "step": 5827, + "time_per_iteration": 5.1953113079071045 + }, + { + "auxiliary_loss_clip": 0.0107525, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.01473665, + "balance_loss_mlp": 1.02498889, + "epoch": 0.3503983165489253, + "flos": 21250059972480.0, + "grad_norm": 3.244150573028192, + "language_loss": 0.70889628, + "learning_rate": 2.906128977835661e-06, + "loss": 0.72993743, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.50390625, + "step": 5828, + "time_per_iteration": 2.397907018661499 + }, + { + "auxiliary_loss_clip": 0.01079964, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.01635885, + "balance_loss_mlp": 1.02630138, + "epoch": 0.35045843980159325, + "flos": 27814569448320.0, + "grad_norm": 1.781232522570234, + "language_loss": 0.79580939, + "learning_rate": 2.9057921414725838e-06, + "loss": 0.81694257, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.5390625, + "step": 5829, + "time_per_iteration": 2.4365835189819336 + }, + { + "auxiliary_loss_clip": 0.01077453, + "auxiliary_loss_mlp": 0.01040442, + "balance_loss_clip": 1.02403903, + "balance_loss_mlp": 1.02434659, + "epoch": 0.3505185630542612, + "flos": 25919972542080.0, + "grad_norm": 2.0281747068118174, + "language_loss": 0.72217435, + "learning_rate": 2.9054552727841136e-06, + "loss": 0.74335325, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5830, + "time_per_iteration": 2.428008794784546 + }, + { + "auxiliary_loss_clip": 0.01074093, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.0122267, + "balance_loss_mlp": 1.02410579, + "epoch": 0.35057868630692923, + "flos": 20520724337280.0, + "grad_norm": 2.4031299902788508, + "language_loss": 0.88610458, + "learning_rate": 2.905118371782275e-06, + "loss": 0.90711534, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5, + "step": 5831, + "time_per_iteration": 2.3890857696533203 + }, + { + "auxiliary_loss_clip": 0.01073385, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.01850271, + "balance_loss_mlp": 1.02219677, + "epoch": 0.3506388095595972, + "flos": 20447616216960.0, + "grad_norm": 1.8117341828354205, + "language_loss": 0.78457981, + "learning_rate": 2.9047814384790894e-06, + "loss": 0.8056531, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.51171875, + "step": 5832, + "time_per_iteration": 3.911949872970581 + }, + { + "auxiliary_loss_clip": 0.01074548, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.02183068, + "balance_loss_mlp": 1.02344561, + "epoch": 0.35069893281226516, + "flos": 23108626488960.0, + "grad_norm": 3.267834556160976, + "language_loss": 0.70697343, + "learning_rate": 2.9044444728865814e-06, + "loss": 0.7281003, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.51171875, + "step": 5833, + "time_per_iteration": 2.3969180583953857 + }, + { + "auxiliary_loss_clip": 0.01073233, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.01367474, + "balance_loss_mlp": 1.02462578, + "epoch": 0.35075905606493313, + "flos": 27270762111360.0, + "grad_norm": 1.395886004571591, + "language_loss": 0.80790132, + "learning_rate": 2.904107475016777e-06, + "loss": 0.82891405, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48632812, + "step": 5834, + "time_per_iteration": 2.6021265983581543 + }, + { + "auxiliary_loss_clip": 0.01074976, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01661253, + "balance_loss_mlp": 1.02431047, + "epoch": 0.3508191793176011, + "flos": 19127794890240.0, + "grad_norm": 1.9985227525423257, + "language_loss": 0.8406868, + "learning_rate": 2.903770444881702e-06, + "loss": 0.86175847, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5078125, + "step": 5835, + "time_per_iteration": 2.368988037109375 + }, + { + "auxiliary_loss_clip": 0.01072856, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.01981819, + "balance_loss_mlp": 1.02323973, + "epoch": 0.35087930257026906, + "flos": 25556386976640.0, + "grad_norm": 1.4509454469989345, + "language_loss": 0.76626897, + "learning_rate": 2.903433382493386e-06, + "loss": 0.78734314, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.49609375, + "step": 5836, + "time_per_iteration": 2.5065622329711914 + }, + { + "auxiliary_loss_clip": 0.0107682, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.01526499, + "balance_loss_mlp": 1.02583432, + "epoch": 0.350939425822937, + "flos": 18003277042560.0, + "grad_norm": 1.9367643988797245, + "language_loss": 0.77378464, + "learning_rate": 2.903096287863855e-06, + "loss": 0.79485965, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.51171875, + "step": 5837, + "time_per_iteration": 2.370356798171997 + }, + { + "auxiliary_loss_clip": 0.01073095, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.01420355, + "balance_loss_mlp": 1.02333236, + "epoch": 0.350999549075605, + "flos": 22272107379840.0, + "grad_norm": 1.7659622437642473, + "language_loss": 0.67823792, + "learning_rate": 2.902759161005141e-06, + "loss": 0.69925666, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.49804688, + "step": 5838, + "time_per_iteration": 2.4062552452087402 + }, + { + "auxiliary_loss_clip": 0.01075698, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.01408279, + "balance_loss_mlp": 1.02416635, + "epoch": 0.35105967232827295, + "flos": 14391407358720.0, + "grad_norm": 2.078667375600241, + "language_loss": 0.83085197, + "learning_rate": 2.9024220019292752e-06, + "loss": 0.85189605, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.515625, + "step": 5839, + "time_per_iteration": 2.354326009750366 + }, + { + "auxiliary_loss_clip": 0.01076081, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.01841605, + "balance_loss_mlp": 1.02299321, + "epoch": 0.3511197955809409, + "flos": 25081184194560.0, + "grad_norm": 1.6183001641249404, + "language_loss": 0.59279394, + "learning_rate": 2.902084810648289e-06, + "loss": 0.61389709, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53125, + "step": 5840, + "time_per_iteration": 2.420515537261963 + }, + { + "auxiliary_loss_clip": 0.01073923, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.01645458, + "balance_loss_mlp": 1.02264929, + "epoch": 0.3511799188336089, + "flos": 25882999545600.0, + "grad_norm": 2.231764468532241, + "language_loss": 0.87437105, + "learning_rate": 2.901747587174216e-06, + "loss": 0.89542437, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.51171875, + "step": 5841, + "time_per_iteration": 2.412553310394287 + }, + { + "auxiliary_loss_clip": 0.0107715, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.01469123, + "balance_loss_mlp": 1.02402568, + "epoch": 0.35124004208627685, + "flos": 20082704019840.0, + "grad_norm": 1.8000295990916717, + "language_loss": 0.75027156, + "learning_rate": 2.9014103315190916e-06, + "loss": 0.77135301, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.53125, + "step": 5842, + "time_per_iteration": 2.4183547496795654 + }, + { + "auxiliary_loss_clip": 0.01075919, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.02070045, + "balance_loss_mlp": 1.02348018, + "epoch": 0.3513001653389448, + "flos": 17782521315840.0, + "grad_norm": 2.6569198410275496, + "language_loss": 0.68676543, + "learning_rate": 2.9010730436949514e-06, + "loss": 0.70788109, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5234375, + "step": 5843, + "time_per_iteration": 2.3426473140716553 + }, + { + "auxiliary_loss_clip": 0.01075804, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.01873565, + "balance_loss_mlp": 1.02483296, + "epoch": 0.3513602885916128, + "flos": 29385870364800.0, + "grad_norm": 1.9303890909133155, + "language_loss": 0.65193594, + "learning_rate": 2.900735723713832e-06, + "loss": 0.67303848, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.51171875, + "step": 5844, + "time_per_iteration": 2.4219937324523926 + }, + { + "auxiliary_loss_clip": 0.01075904, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.01880348, + "balance_loss_mlp": 1.02411294, + "epoch": 0.3514204118442808, + "flos": 16178960436480.0, + "grad_norm": 1.8652018751226302, + "language_loss": 0.75374216, + "learning_rate": 2.9003983715877713e-06, + "loss": 0.77484691, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.515625, + "step": 5845, + "time_per_iteration": 2.391521692276001 + }, + { + "auxiliary_loss_clip": 0.01074055, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02029073, + "balance_loss_mlp": 1.02443659, + "epoch": 0.35148053509694877, + "flos": 23833737849600.0, + "grad_norm": 2.568877763249513, + "language_loss": 0.73095214, + "learning_rate": 2.9000609873288085e-06, + "loss": 0.75204837, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49609375, + "step": 5846, + "time_per_iteration": 2.373753547668457 + }, + { + "auxiliary_loss_clip": 0.01076269, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.0172565, + "balance_loss_mlp": 1.02514589, + "epoch": 0.35154065834961673, + "flos": 20990376213120.0, + "grad_norm": 1.6068390241702384, + "language_loss": 0.75692546, + "learning_rate": 2.8997235709489845e-06, + "loss": 0.77801108, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 5847, + "time_per_iteration": 2.4587488174438477 + }, + { + "auxiliary_loss_clip": 0.0107543, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01584053, + "balance_loss_mlp": 1.02389681, + "epoch": 0.3516007816022847, + "flos": 33254072317440.0, + "grad_norm": 2.1191199957805007, + "language_loss": 0.7246365, + "learning_rate": 2.8993861224603412e-06, + "loss": 0.74569136, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.515625, + "step": 5848, + "time_per_iteration": 2.530982494354248 + }, + { + "auxiliary_loss_clip": 0.01079916, + "auxiliary_loss_mlp": 0.01037422, + "balance_loss_clip": 1.01999366, + "balance_loss_mlp": 1.02587247, + "epoch": 0.35166090485495266, + "flos": 11726207723520.0, + "grad_norm": 28.206760223459955, + "language_loss": 0.90069497, + "learning_rate": 2.8990486418749205e-06, + "loss": 0.92186832, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.5390625, + "step": 5849, + "time_per_iteration": 2.4092886447906494 + }, + { + "auxiliary_loss_clip": 0.01074253, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.01482427, + "balance_loss_mlp": 1.02365971, + "epoch": 0.3517210281076206, + "flos": 22637333779200.0, + "grad_norm": 2.035788367149342, + "language_loss": 0.75290322, + "learning_rate": 2.8987111292047663e-06, + "loss": 0.77394116, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5078125, + "step": 5850, + "time_per_iteration": 2.414071798324585 + }, + { + "auxiliary_loss_clip": 0.0107581, + "auxiliary_loss_mlp": 0.01029625, + "balance_loss_clip": 1.01503372, + "balance_loss_mlp": 1.02625632, + "epoch": 0.3517811513602886, + "flos": 21321736727040.0, + "grad_norm": 1.4213073000169885, + "language_loss": 0.75725776, + "learning_rate": 2.898373584461924e-06, + "loss": 0.77831215, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49609375, + "step": 5851, + "time_per_iteration": 2.4352986812591553 + }, + { + "auxiliary_loss_clip": 0.01078671, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.01679087, + "balance_loss_mlp": 1.02562332, + "epoch": 0.35184127461295656, + "flos": 21031817863680.0, + "grad_norm": 1.9879022840375544, + "language_loss": 0.87389195, + "learning_rate": 2.8980360076584384e-06, + "loss": 0.8950063, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53125, + "step": 5852, + "time_per_iteration": 2.383826732635498 + }, + { + "auxiliary_loss_clip": 0.01073404, + "auxiliary_loss_mlp": 0.01027158, + "balance_loss_clip": 1.0128293, + "balance_loss_mlp": 1.02399099, + "epoch": 0.3519013978656245, + "flos": 22454179453440.0, + "grad_norm": 2.0853284929415112, + "language_loss": 0.67925978, + "learning_rate": 2.8976983988063586e-06, + "loss": 0.70026541, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49414062, + "step": 5853, + "time_per_iteration": 2.4830963611602783 + }, + { + "auxiliary_loss_clip": 0.01074363, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.01886415, + "balance_loss_mlp": 1.02310848, + "epoch": 0.3519615211182925, + "flos": 13114459048320.0, + "grad_norm": 1.5559215025064466, + "language_loss": 0.80806428, + "learning_rate": 2.8973607579177317e-06, + "loss": 0.82914853, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.51171875, + "step": 5854, + "time_per_iteration": 2.380789041519165 + }, + { + "auxiliary_loss_clip": 0.0107246, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.01477504, + "balance_loss_mlp": 1.02247334, + "epoch": 0.35202164437096045, + "flos": 19134148757760.0, + "grad_norm": 1.440726560838551, + "language_loss": 0.73182976, + "learning_rate": 2.8970230850046076e-06, + "loss": 0.75284386, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.5, + "step": 5855, + "time_per_iteration": 2.39998197555542 + }, + { + "auxiliary_loss_clip": 0.01071977, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.01595569, + "balance_loss_mlp": 1.02250242, + "epoch": 0.3520817676236284, + "flos": 26540972628480.0, + "grad_norm": 2.2127951257077836, + "language_loss": 0.71072859, + "learning_rate": 2.896685380079037e-06, + "loss": 0.73175633, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49414062, + "step": 5856, + "time_per_iteration": 2.4252126216888428 + }, + { + "auxiliary_loss_clip": 0.01076514, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.01808488, + "balance_loss_mlp": 1.02447963, + "epoch": 0.3521418908762964, + "flos": 44891776010880.0, + "grad_norm": 1.711461703634843, + "language_loss": 0.61526108, + "learning_rate": 2.896347643153072e-06, + "loss": 0.63638175, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.51953125, + "step": 5857, + "time_per_iteration": 2.608412265777588 + }, + { + "auxiliary_loss_clip": 0.01074462, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.01489115, + "balance_loss_mlp": 1.02292824, + "epoch": 0.3522020141289644, + "flos": 20186536003200.0, + "grad_norm": 1.9383542983882343, + "language_loss": 0.73661101, + "learning_rate": 2.896009874238765e-06, + "loss": 0.75766134, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.515625, + "step": 5858, + "time_per_iteration": 2.383868932723999 + }, + { + "auxiliary_loss_clip": 0.0107661, + "auxiliary_loss_mlp": 0.0103506, + "balance_loss_clip": 1.01953936, + "balance_loss_mlp": 1.02272761, + "epoch": 0.35226213738163237, + "flos": 27562670922240.0, + "grad_norm": 1.5275005693541637, + "language_loss": 0.76397693, + "learning_rate": 2.8956720733481707e-06, + "loss": 0.78509367, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5390625, + "step": 5859, + "time_per_iteration": 2.433192253112793 + }, + { + "auxiliary_loss_clip": 0.01081974, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.02261555, + "balance_loss_mlp": 1.02637196, + "epoch": 0.35232226063430033, + "flos": 22965203157120.0, + "grad_norm": 1.6797592247291404, + "language_loss": 0.72011071, + "learning_rate": 2.895334240493344e-06, + "loss": 0.74132454, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5546875, + "step": 5860, + "time_per_iteration": 2.3874709606170654 + }, + { + "auxiliary_loss_clip": 0.01077176, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.0176518, + "balance_loss_mlp": 1.02230132, + "epoch": 0.3523823838869683, + "flos": 19167386238720.0, + "grad_norm": 2.348800594671615, + "language_loss": 0.66274589, + "learning_rate": 2.8949963756863414e-06, + "loss": 0.68385756, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 5861, + "time_per_iteration": 2.3611271381378174 + }, + { + "auxiliary_loss_clip": 0.01073783, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01421082, + "balance_loss_mlp": 1.02372766, + "epoch": 0.35244250713963626, + "flos": 17930029276800.0, + "grad_norm": 1.7706961305058337, + "language_loss": 0.67007422, + "learning_rate": 2.8946584789392197e-06, + "loss": 0.69109738, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.5, + "step": 5862, + "time_per_iteration": 2.363870620727539 + }, + { + "auxiliary_loss_clip": 0.01077104, + "auxiliary_loss_mlp": 0.01032803, + "balance_loss_clip": 1.0167222, + "balance_loss_mlp": 1.02441454, + "epoch": 0.35250263039230423, + "flos": 21431503641600.0, + "grad_norm": 2.3277378732391774, + "language_loss": 0.77282941, + "learning_rate": 2.894320550264039e-06, + "loss": 0.7939285, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.52734375, + "step": 5863, + "time_per_iteration": 3.743237018585205 + }, + { + "auxiliary_loss_clip": 0.01076797, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.0196079, + "balance_loss_mlp": 1.02465439, + "epoch": 0.3525627536449722, + "flos": 27415651720320.0, + "grad_norm": 1.6787246104310511, + "language_loss": 0.74978757, + "learning_rate": 2.893982589672858e-06, + "loss": 0.77090716, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5234375, + "step": 5864, + "time_per_iteration": 2.422184705734253 + }, + { + "auxiliary_loss_clip": 0.01074159, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.02261543, + "balance_loss_mlp": 1.02355051, + "epoch": 0.35262287689764016, + "flos": 24788681890560.0, + "grad_norm": 2.15381827712521, + "language_loss": 0.79011428, + "learning_rate": 2.893644597177738e-06, + "loss": 0.81123072, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.50390625, + "step": 5865, + "time_per_iteration": 2.4226791858673096 + }, + { + "auxiliary_loss_clip": 0.01078665, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.01717925, + "balance_loss_mlp": 1.02550244, + "epoch": 0.3526830001503081, + "flos": 17820646387200.0, + "grad_norm": 1.883785408154615, + "language_loss": 0.80973965, + "learning_rate": 2.8933065727907417e-06, + "loss": 0.83086228, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53125, + "step": 5866, + "time_per_iteration": 3.7610630989074707 + }, + { + "auxiliary_loss_clip": 0.01079901, + "auxiliary_loss_mlp": 0.01033245, + "balance_loss_clip": 1.0147202, + "balance_loss_mlp": 1.02367556, + "epoch": 0.3527431234029761, + "flos": 18077118301440.0, + "grad_norm": 2.086356827126721, + "language_loss": 0.83860362, + "learning_rate": 2.8929685165239308e-06, + "loss": 0.85973513, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.5625, + "step": 5867, + "time_per_iteration": 3.7440896034240723 + }, + { + "auxiliary_loss_clip": 0.01077859, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.01700675, + "balance_loss_mlp": 1.02475953, + "epoch": 0.35280324665564405, + "flos": 19426336859520.0, + "grad_norm": 1.661167565472378, + "language_loss": 0.7399323, + "learning_rate": 2.892630428389371e-06, + "loss": 0.76104897, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.53125, + "step": 5868, + "time_per_iteration": 2.3896663188934326 + }, + { + "auxiliary_loss_clip": 0.01077802, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.01446378, + "balance_loss_mlp": 1.02448463, + "epoch": 0.352863369908312, + "flos": 21503040750720.0, + "grad_norm": 2.6046129995476885, + "language_loss": 0.61233103, + "learning_rate": 2.892292308399127e-06, + "loss": 0.63341665, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.53515625, + "step": 5869, + "time_per_iteration": 2.4027748107910156 + }, + { + "auxiliary_loss_clip": 0.01076204, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.01955581, + "balance_loss_mlp": 1.02358913, + "epoch": 0.35292349316098, + "flos": 22308417060480.0, + "grad_norm": 2.048940801439133, + "language_loss": 0.74459761, + "learning_rate": 2.8919541565652655e-06, + "loss": 0.76570749, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.52734375, + "step": 5870, + "time_per_iteration": 2.402498960494995 + }, + { + "auxiliary_loss_clip": 0.01074536, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.01572561, + "balance_loss_mlp": 1.02281451, + "epoch": 0.352983616413648, + "flos": 33108344835840.0, + "grad_norm": 1.591893236940386, + "language_loss": 0.71751237, + "learning_rate": 2.8916159728998555e-06, + "loss": 0.73857152, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.515625, + "step": 5871, + "time_per_iteration": 3.878713369369507 + }, + { + "auxiliary_loss_clip": 0.01071878, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.01571822, + "balance_loss_mlp": 1.02270341, + "epoch": 0.35304373966631597, + "flos": 18695639681280.0, + "grad_norm": 1.8012887879951427, + "language_loss": 0.73728526, + "learning_rate": 2.8912777574149642e-06, + "loss": 0.75830156, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4921875, + "step": 5872, + "time_per_iteration": 2.3558521270751953 + }, + { + "auxiliary_loss_clip": 0.01073731, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.01740646, + "balance_loss_mlp": 1.02281487, + "epoch": 0.35310386291898394, + "flos": 23363911416960.0, + "grad_norm": 1.6723231977884612, + "language_loss": 0.82846761, + "learning_rate": 2.8909395101226628e-06, + "loss": 0.84952873, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5078125, + "step": 5873, + "time_per_iteration": 2.396043539047241 + }, + { + "auxiliary_loss_clip": 0.01079075, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.0153271, + "balance_loss_mlp": 1.02438068, + "epoch": 0.3531639861716519, + "flos": 24460812512640.0, + "grad_norm": 1.9535666503686573, + "language_loss": 0.66170931, + "learning_rate": 2.8906012310350212e-06, + "loss": 0.68281794, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.546875, + "step": 5874, + "time_per_iteration": 2.390150785446167 + }, + { + "auxiliary_loss_clip": 0.01012705, + "auxiliary_loss_mlp": 0.01000866, + "balance_loss_clip": 0.99954325, + "balance_loss_mlp": 1.002509, + "epoch": 0.35322410942431987, + "flos": 70309347955200.0, + "grad_norm": 0.9091830235546952, + "language_loss": 0.61591953, + "learning_rate": 2.890262920164113e-06, + "loss": 0.63605529, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.1015625, + "step": 5875, + "time_per_iteration": 2.942614793777466 + }, + { + "auxiliary_loss_clip": 0.0107746, + "auxiliary_loss_mlp": 0.0103055, + "balance_loss_clip": 1.01539278, + "balance_loss_mlp": 1.02466393, + "epoch": 0.35328423267698783, + "flos": 19820087706240.0, + "grad_norm": 1.8210745894631823, + "language_loss": 0.7979157, + "learning_rate": 2.8899245775220113e-06, + "loss": 0.81899577, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.52734375, + "step": 5876, + "time_per_iteration": 2.3559935092926025 + }, + { + "auxiliary_loss_clip": 0.01012388, + "auxiliary_loss_mlp": 0.01000726, + "balance_loss_clip": 0.99947476, + "balance_loss_mlp": 1.00215995, + "epoch": 0.3533443559296558, + "flos": 60823516043520.0, + "grad_norm": 0.6723607187770596, + "language_loss": 0.56811762, + "learning_rate": 2.8895862031207906e-06, + "loss": 0.58824879, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.10253906, + "step": 5877, + "time_per_iteration": 3.133697748184204 + }, + { + "auxiliary_loss_clip": 0.01076172, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.01378298, + "balance_loss_mlp": 1.02378035, + "epoch": 0.35340447918232376, + "flos": 24754571625600.0, + "grad_norm": 1.701678658638952, + "language_loss": 0.719262, + "learning_rate": 2.889247796972527e-06, + "loss": 0.74032098, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5234375, + "step": 5878, + "time_per_iteration": 2.4063687324523926 + }, + { + "auxiliary_loss_clip": 0.01072906, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.01875675, + "balance_loss_mlp": 1.02110076, + "epoch": 0.3534646024349917, + "flos": 21795298675200.0, + "grad_norm": 1.580979400444761, + "language_loss": 0.78321564, + "learning_rate": 2.8889093590892965e-06, + "loss": 0.80428731, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.51953125, + "step": 5879, + "time_per_iteration": 2.4020001888275146 + }, + { + "auxiliary_loss_clip": 0.01078892, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.01694214, + "balance_loss_mlp": 1.0247345, + "epoch": 0.3535247256876597, + "flos": 20011062176640.0, + "grad_norm": 2.101443437287178, + "language_loss": 0.62793958, + "learning_rate": 2.8885708894831776e-06, + "loss": 0.64907402, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.54296875, + "step": 5880, + "time_per_iteration": 2.384622812271118 + }, + { + "auxiliary_loss_clip": 0.01074874, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01529992, + "balance_loss_mlp": 1.02300763, + "epoch": 0.35358484894032766, + "flos": 18186920127360.0, + "grad_norm": 1.9008485433476485, + "language_loss": 0.65353465, + "learning_rate": 2.8882323881662496e-06, + "loss": 0.67459929, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.51953125, + "step": 5881, + "time_per_iteration": 2.394958734512329 + }, + { + "auxiliary_loss_clip": 0.01072389, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.01245236, + "balance_loss_mlp": 1.02308178, + "epoch": 0.3536449721929956, + "flos": 22819266207360.0, + "grad_norm": 1.5645037558978367, + "language_loss": 0.758187, + "learning_rate": 2.887893855150592e-06, + "loss": 0.77917492, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.49414062, + "step": 5882, + "time_per_iteration": 2.385859966278076 + }, + { + "auxiliary_loss_clip": 0.01077255, + "auxiliary_loss_mlp": 0.01033896, + "balance_loss_clip": 1.01841033, + "balance_loss_mlp": 1.02386963, + "epoch": 0.3537050954456636, + "flos": 26431135891200.0, + "grad_norm": 2.616980383547137, + "language_loss": 0.70357871, + "learning_rate": 2.8875552904482874e-06, + "loss": 0.7246902, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.53515625, + "step": 5883, + "time_per_iteration": 2.4271581172943115 + }, + { + "auxiliary_loss_clip": 0.01079067, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.01709604, + "balance_loss_mlp": 1.02432811, + "epoch": 0.3537652186983316, + "flos": 17196329721600.0, + "grad_norm": 2.404685447682957, + "language_loss": 0.78723383, + "learning_rate": 2.8872166940714166e-06, + "loss": 0.80835396, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.546875, + "step": 5884, + "time_per_iteration": 2.345073699951172 + }, + { + "auxiliary_loss_clip": 0.010778, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.01872206, + "balance_loss_mlp": 1.02498007, + "epoch": 0.3538253419509996, + "flos": 19535754660480.0, + "grad_norm": 1.9494007214503086, + "language_loss": 0.75360185, + "learning_rate": 2.886878066032065e-06, + "loss": 0.77471679, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.53125, + "step": 5885, + "time_per_iteration": 2.387208938598633 + }, + { + "auxiliary_loss_clip": 0.01077986, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.01721382, + "balance_loss_mlp": 1.02465034, + "epoch": 0.35388546520366754, + "flos": 12127813626240.0, + "grad_norm": 2.3022689683142525, + "language_loss": 0.83416253, + "learning_rate": 2.8865394063423155e-06, + "loss": 0.85527289, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53515625, + "step": 5886, + "time_per_iteration": 2.3495311737060547 + }, + { + "auxiliary_loss_clip": 0.01074471, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.0141561, + "balance_loss_mlp": 1.02315307, + "epoch": 0.3539455884563355, + "flos": 19677257867520.0, + "grad_norm": 2.039683036946036, + "language_loss": 0.77759564, + "learning_rate": 2.8862007150142557e-06, + "loss": 0.79864556, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.51171875, + "step": 5887, + "time_per_iteration": 2.376030921936035 + }, + { + "auxiliary_loss_clip": 0.0107524, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.02040219, + "balance_loss_mlp": 1.02309012, + "epoch": 0.35400571170900347, + "flos": 18071218281600.0, + "grad_norm": 1.8087145519029344, + "language_loss": 0.65876943, + "learning_rate": 2.885861992059972e-06, + "loss": 0.67988944, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5234375, + "step": 5888, + "time_per_iteration": 2.3508224487304688 + }, + { + "auxiliary_loss_clip": 0.01074859, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.01972389, + "balance_loss_mlp": 1.02368033, + "epoch": 0.35406583496167143, + "flos": 26066852098560.0, + "grad_norm": 2.565955302156471, + "language_loss": 0.7759434, + "learning_rate": 2.8855232374915528e-06, + "loss": 0.79703766, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.51171875, + "step": 5889, + "time_per_iteration": 2.4281210899353027 + }, + { + "auxiliary_loss_clip": 0.01076147, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.02120578, + "balance_loss_mlp": 1.0248611, + "epoch": 0.3541259582143394, + "flos": 19791423613440.0, + "grad_norm": 1.6545030615509662, + "language_loss": 0.80782312, + "learning_rate": 2.885184451321087e-06, + "loss": 0.82895052, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.51171875, + "step": 5890, + "time_per_iteration": 2.372559070587158 + }, + { + "auxiliary_loss_clip": 0.01071039, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.01510835, + "balance_loss_mlp": 1.02188134, + "epoch": 0.35418608146700736, + "flos": 24021011715840.0, + "grad_norm": 1.6483791044769702, + "language_loss": 0.7966547, + "learning_rate": 2.884845633560664e-06, + "loss": 0.81765258, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4921875, + "step": 5891, + "time_per_iteration": 2.439776659011841 + }, + { + "auxiliary_loss_clip": 0.01075726, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.01836753, + "balance_loss_mlp": 1.02430964, + "epoch": 0.35424620471967533, + "flos": 12384948856320.0, + "grad_norm": 1.7290875428508632, + "language_loss": 0.72937632, + "learning_rate": 2.8845067842223776e-06, + "loss": 0.75048107, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.515625, + "step": 5892, + "time_per_iteration": 2.3800930976867676 + }, + { + "auxiliary_loss_clip": 0.01075378, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.02059865, + "balance_loss_mlp": 1.02440941, + "epoch": 0.3543063279723433, + "flos": 19672859036160.0, + "grad_norm": 2.1595526988730733, + "language_loss": 0.6732682, + "learning_rate": 2.884167903318319e-06, + "loss": 0.69438702, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5078125, + "step": 5893, + "time_per_iteration": 2.4048843383789062 + }, + { + "auxiliary_loss_clip": 0.01073617, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.01644659, + "balance_loss_mlp": 1.02289867, + "epoch": 0.35436645122501126, + "flos": 21908102878080.0, + "grad_norm": 1.7349576200799974, + "language_loss": 0.6976167, + "learning_rate": 2.8838289908605822e-06, + "loss": 0.71867639, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5078125, + "step": 5894, + "time_per_iteration": 2.385986804962158 + }, + { + "auxiliary_loss_clip": 0.0107549, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.01326466, + "balance_loss_mlp": 1.02557349, + "epoch": 0.3544265744776792, + "flos": 21718629596160.0, + "grad_norm": 2.645961968340466, + "language_loss": 0.74912483, + "learning_rate": 2.8834900468612624e-06, + "loss": 0.77015287, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.5, + "step": 5895, + "time_per_iteration": 2.4591476917266846 + }, + { + "auxiliary_loss_clip": 0.01074106, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.01627517, + "balance_loss_mlp": 1.02289915, + "epoch": 0.3544866977303472, + "flos": 21212214192000.0, + "grad_norm": 1.9440931471312937, + "language_loss": 0.8345443, + "learning_rate": 2.883151071332455e-06, + "loss": 0.85560095, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.51171875, + "step": 5896, + "time_per_iteration": 2.3896162509918213 + }, + { + "auxiliary_loss_clip": 0.01075096, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.02389169, + "balance_loss_mlp": 1.0239203, + "epoch": 0.35454682098301515, + "flos": 29310213715200.0, + "grad_norm": 1.6298269937323695, + "language_loss": 0.69902974, + "learning_rate": 2.8828120642862585e-06, + "loss": 0.72018236, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.51171875, + "step": 5897, + "time_per_iteration": 2.477522134780884 + }, + { + "auxiliary_loss_clip": 0.01074205, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.01820421, + "balance_loss_mlp": 1.02349758, + "epoch": 0.3546069442356832, + "flos": 24315434144640.0, + "grad_norm": 1.499286734717137, + "language_loss": 0.81830782, + "learning_rate": 2.882473025734769e-06, + "loss": 0.83938587, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5078125, + "step": 5898, + "time_per_iteration": 2.408853769302368 + }, + { + "auxiliary_loss_clip": 0.0107103, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.01895452, + "balance_loss_mlp": 1.02268291, + "epoch": 0.35466706748835114, + "flos": 22856169381120.0, + "grad_norm": 1.4404394282424455, + "language_loss": 0.74296194, + "learning_rate": 2.8821339556900883e-06, + "loss": 0.76399988, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48242188, + "step": 5899, + "time_per_iteration": 2.4350152015686035 + }, + { + "auxiliary_loss_clip": 0.01074276, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01613092, + "balance_loss_mlp": 1.02306354, + "epoch": 0.3547271907410191, + "flos": 28328839908480.0, + "grad_norm": 2.139747911416434, + "language_loss": 0.79579532, + "learning_rate": 2.8817948541643153e-06, + "loss": 0.81684673, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.51171875, + "step": 5900, + "time_per_iteration": 2.4274556636810303 + }, + { + "auxiliary_loss_clip": 0.01075679, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.01562381, + "balance_loss_mlp": 1.02413762, + "epoch": 0.35478731399368707, + "flos": 23512955477760.0, + "grad_norm": 1.8622299784497105, + "language_loss": 0.81282228, + "learning_rate": 2.8814557211695523e-06, + "loss": 0.8338964, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.515625, + "step": 5901, + "time_per_iteration": 2.447465658187866 + }, + { + "auxiliary_loss_clip": 0.01075801, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01333487, + "balance_loss_mlp": 1.02326739, + "epoch": 0.35484743724635504, + "flos": 18623334522240.0, + "grad_norm": 1.7799090708897565, + "language_loss": 0.757442, + "learning_rate": 2.8811165567179025e-06, + "loss": 0.7784878, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5234375, + "step": 5902, + "time_per_iteration": 2.3875577449798584 + }, + { + "auxiliary_loss_clip": 0.01074364, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.01726556, + "balance_loss_mlp": 1.02345991, + "epoch": 0.354907560499023, + "flos": 17383533765120.0, + "grad_norm": 3.485506888533945, + "language_loss": 0.70700645, + "learning_rate": 2.880777360821468e-06, + "loss": 0.72807217, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5078125, + "step": 5903, + "time_per_iteration": 3.7309510707855225 + }, + { + "auxiliary_loss_clip": 0.01076166, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01654553, + "balance_loss_mlp": 1.02330661, + "epoch": 0.35496768375169097, + "flos": 19207536168960.0, + "grad_norm": 2.8247905819090935, + "language_loss": 0.74162674, + "learning_rate": 2.8804381334923563e-06, + "loss": 0.76271129, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.52734375, + "step": 5904, + "time_per_iteration": 2.3524158000946045 + }, + { + "auxiliary_loss_clip": 0.01077683, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.01529133, + "balance_loss_mlp": 1.02583325, + "epoch": 0.35502780700435893, + "flos": 18331809736320.0, + "grad_norm": 8.969292883330912, + "language_loss": 0.79157579, + "learning_rate": 2.8800988747426722e-06, + "loss": 0.81266034, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.51953125, + "step": 5905, + "time_per_iteration": 2.3464064598083496 + }, + { + "auxiliary_loss_clip": 0.01067803, + "auxiliary_loss_mlp": 0.01029924, + "balance_loss_clip": 1.01639962, + "balance_loss_mlp": 1.02163506, + "epoch": 0.3550879302570269, + "flos": 15447704676480.0, + "grad_norm": 1.8276164085040287, + "language_loss": 0.72286129, + "learning_rate": 2.8797595845845225e-06, + "loss": 0.74383855, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4609375, + "step": 5906, + "time_per_iteration": 3.7140448093414307 + }, + { + "auxiliary_loss_clip": 0.01077125, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.01140714, + "balance_loss_mlp": 1.02378869, + "epoch": 0.35514805350969486, + "flos": 21978173710080.0, + "grad_norm": 1.9258495636842574, + "language_loss": 0.74568594, + "learning_rate": 2.879420263030017e-06, + "loss": 0.76673961, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.53515625, + "step": 5907, + "time_per_iteration": 3.828256607055664 + }, + { + "auxiliary_loss_clip": 0.01074815, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.01446307, + "balance_loss_mlp": 1.02332067, + "epoch": 0.3552081767623628, + "flos": 29860654210560.0, + "grad_norm": 1.6433169390756894, + "language_loss": 0.75686789, + "learning_rate": 2.8790809100912637e-06, + "loss": 0.77791482, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.515625, + "step": 5908, + "time_per_iteration": 2.4670584201812744 + }, + { + "auxiliary_loss_clip": 0.01075054, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.01433158, + "balance_loss_mlp": 1.02412367, + "epoch": 0.3552683000150308, + "flos": 26431066068480.0, + "grad_norm": 1.8866977599410624, + "language_loss": 0.68300748, + "learning_rate": 2.8787415257803742e-06, + "loss": 0.7040472, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.5078125, + "step": 5909, + "time_per_iteration": 2.509817123413086 + }, + { + "auxiliary_loss_clip": 0.01071519, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01430488, + "balance_loss_mlp": 1.02333391, + "epoch": 0.35532842326769876, + "flos": 19785139568640.0, + "grad_norm": 1.7310796374024051, + "language_loss": 0.78199911, + "learning_rate": 2.8784021101094605e-06, + "loss": 0.80299926, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48242188, + "step": 5910, + "time_per_iteration": 2.426288604736328 + }, + { + "auxiliary_loss_clip": 0.01076215, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01181817, + "balance_loss_mlp": 1.02370858, + "epoch": 0.3553885465203668, + "flos": 17238295042560.0, + "grad_norm": 1.7470070007300544, + "language_loss": 0.71116287, + "learning_rate": 2.878062663090635e-06, + "loss": 0.73220789, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5234375, + "step": 5911, + "time_per_iteration": 3.816206693649292 + }, + { + "auxiliary_loss_clip": 0.01071634, + "auxiliary_loss_mlp": 0.01026735, + "balance_loss_clip": 1.01283526, + "balance_loss_mlp": 1.02319586, + "epoch": 0.35544866977303474, + "flos": 14933608773120.0, + "grad_norm": 2.4276127251523456, + "language_loss": 0.86980754, + "learning_rate": 2.8777231847360117e-06, + "loss": 0.89079118, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.484375, + "step": 5912, + "time_per_iteration": 2.385530710220337 + }, + { + "auxiliary_loss_clip": 0.01072557, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.01207829, + "balance_loss_mlp": 1.0229423, + "epoch": 0.3555087930257027, + "flos": 19755009198720.0, + "grad_norm": 2.004730984511101, + "language_loss": 0.76809984, + "learning_rate": 2.8773836750577053e-06, + "loss": 0.78909194, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.49609375, + "step": 5913, + "time_per_iteration": 2.398463726043701 + }, + { + "auxiliary_loss_clip": 0.01072708, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.01374865, + "balance_loss_mlp": 1.02411163, + "epoch": 0.3555689162783707, + "flos": 21067219848960.0, + "grad_norm": 1.2736134280555174, + "language_loss": 0.82607269, + "learning_rate": 2.877044134067833e-06, + "loss": 0.84708679, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.484375, + "step": 5914, + "time_per_iteration": 2.3949437141418457 + }, + { + "auxiliary_loss_clip": 0.01072856, + "auxiliary_loss_mlp": 0.01027922, + "balance_loss_clip": 1.01286614, + "balance_loss_mlp": 1.02342296, + "epoch": 0.35562903953103864, + "flos": 33068334551040.0, + "grad_norm": 2.0536885243897727, + "language_loss": 0.70349467, + "learning_rate": 2.8767045617785108e-06, + "loss": 0.72450244, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49609375, + "step": 5915, + "time_per_iteration": 2.499976396560669 + }, + { + "auxiliary_loss_clip": 0.01070764, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.01661289, + "balance_loss_mlp": 1.02145088, + "epoch": 0.3556891627837066, + "flos": 20556824549760.0, + "grad_norm": 1.7392553373605206, + "language_loss": 0.76000738, + "learning_rate": 2.8763649582018584e-06, + "loss": 0.78102154, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49414062, + "step": 5916, + "time_per_iteration": 2.4060258865356445 + }, + { + "auxiliary_loss_clip": 0.01075412, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.01874971, + "balance_loss_mlp": 1.02454448, + "epoch": 0.35574928603637457, + "flos": 20702307651840.0, + "grad_norm": 1.5779878568317227, + "language_loss": 0.88140929, + "learning_rate": 2.876025323349995e-06, + "loss": 0.90249616, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.5078125, + "step": 5917, + "time_per_iteration": 2.3921902179718018 + }, + { + "auxiliary_loss_clip": 0.01072951, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.01175272, + "balance_loss_mlp": 1.02395844, + "epoch": 0.35580940928904253, + "flos": 15193711468800.0, + "grad_norm": 1.9180134136793532, + "language_loss": 0.74765903, + "learning_rate": 2.875685657235041e-06, + "loss": 0.76864356, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48828125, + "step": 5918, + "time_per_iteration": 2.412428379058838 + }, + { + "auxiliary_loss_clip": 0.01012443, + "auxiliary_loss_mlp": 0.0100438, + "balance_loss_clip": 1.00324166, + "balance_loss_mlp": 1.00213659, + "epoch": 0.3558695325417105, + "flos": 58636312099200.0, + "grad_norm": 0.9166926561251532, + "language_loss": 0.63805127, + "learning_rate": 2.8753459598691183e-06, + "loss": 0.65821946, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.10302734, + "step": 5919, + "time_per_iteration": 2.80796217918396 + }, + { + "auxiliary_loss_clip": 0.01075172, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.01707435, + "balance_loss_mlp": 1.02332199, + "epoch": 0.35592965579437846, + "flos": 22017136654080.0, + "grad_norm": 2.2192382856850954, + "language_loss": 0.73782456, + "learning_rate": 2.8750062312643495e-06, + "loss": 0.75889635, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.51953125, + "step": 5920, + "time_per_iteration": 2.43385910987854 + }, + { + "auxiliary_loss_clip": 0.01070828, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.01217747, + "balance_loss_mlp": 1.02149773, + "epoch": 0.35598977904704643, + "flos": 23366564680320.0, + "grad_norm": 1.7025904234366431, + "language_loss": 0.7757051, + "learning_rate": 2.8746664714328603e-06, + "loss": 0.7966845, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4921875, + "step": 5921, + "time_per_iteration": 2.422354221343994 + }, + { + "auxiliary_loss_clip": 0.01072123, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.01437283, + "balance_loss_mlp": 1.02315259, + "epoch": 0.3560499022997144, + "flos": 17784371617920.0, + "grad_norm": 2.1851720961550476, + "language_loss": 0.67276013, + "learning_rate": 2.8743266803867743e-06, + "loss": 0.69376391, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.49023438, + "step": 5922, + "time_per_iteration": 2.4101037979125977 + }, + { + "auxiliary_loss_clip": 0.0107533, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.01991677, + "balance_loss_mlp": 1.02396476, + "epoch": 0.35611002555238236, + "flos": 20739420293760.0, + "grad_norm": 1.9506992526186122, + "language_loss": 0.78942466, + "learning_rate": 2.8739868581382175e-06, + "loss": 0.81052095, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.515625, + "step": 5923, + "time_per_iteration": 2.4214835166931152 + }, + { + "auxiliary_loss_clip": 0.01075011, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.01836824, + "balance_loss_mlp": 1.02488649, + "epoch": 0.3561701488050504, + "flos": 19461250085760.0, + "grad_norm": 1.8827073097389189, + "language_loss": 0.85266215, + "learning_rate": 2.873647004699318e-06, + "loss": 0.87373507, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.5, + "step": 5924, + "time_per_iteration": 2.3856821060180664 + }, + { + "auxiliary_loss_clip": 0.01072474, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.01472116, + "balance_loss_mlp": 1.02325845, + "epoch": 0.35623027205771834, + "flos": 30773598019200.0, + "grad_norm": 1.8864423090769173, + "language_loss": 0.77786255, + "learning_rate": 2.8733071200822046e-06, + "loss": 0.79888129, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4921875, + "step": 5925, + "time_per_iteration": 2.446983814239502 + }, + { + "auxiliary_loss_clip": 0.01070506, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.01706064, + "balance_loss_mlp": 1.02089548, + "epoch": 0.3562903953103863, + "flos": 16980182294400.0, + "grad_norm": 1.9084098048431093, + "language_loss": 0.75571799, + "learning_rate": 2.8729672042990068e-06, + "loss": 0.77673841, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49609375, + "step": 5926, + "time_per_iteration": 2.363508462905884 + }, + { + "auxiliary_loss_clip": 0.01075721, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.01245046, + "balance_loss_mlp": 1.02421558, + "epoch": 0.3563505185630543, + "flos": 23838765085440.0, + "grad_norm": 2.025626013025771, + "language_loss": 0.69512618, + "learning_rate": 2.872627257361855e-06, + "loss": 0.71615392, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.515625, + "step": 5927, + "time_per_iteration": 2.383791923522949 + }, + { + "auxiliary_loss_clip": 0.01069768, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.01556206, + "balance_loss_mlp": 1.02203751, + "epoch": 0.35641064181572224, + "flos": 22272351759360.0, + "grad_norm": 1.977131925250172, + "language_loss": 0.79609823, + "learning_rate": 2.8722872792828803e-06, + "loss": 0.81708324, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.47851562, + "step": 5928, + "time_per_iteration": 2.395374298095703 + }, + { + "auxiliary_loss_clip": 0.01070851, + "auxiliary_loss_mlp": 0.010285, + "balance_loss_clip": 1.01433802, + "balance_loss_mlp": 1.02258492, + "epoch": 0.3564707650683902, + "flos": 23000186206080.0, + "grad_norm": 1.36971100217665, + "language_loss": 0.81821471, + "learning_rate": 2.8719472700742167e-06, + "loss": 0.83920825, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48242188, + "step": 5929, + "time_per_iteration": 2.419863700866699 + }, + { + "auxiliary_loss_clip": 0.01068277, + "auxiliary_loss_mlp": 0.01024995, + "balance_loss_clip": 1.01179874, + "balance_loss_mlp": 1.02052593, + "epoch": 0.35653088832105817, + "flos": 14683385992320.0, + "grad_norm": 1.596973328497599, + "language_loss": 0.70001251, + "learning_rate": 2.871607229747998e-06, + "loss": 0.72094524, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4765625, + "step": 5930, + "time_per_iteration": 2.342053174972534 + }, + { + "auxiliary_loss_clip": 0.01075997, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.015064, + "balance_loss_mlp": 1.02533948, + "epoch": 0.35659101157372614, + "flos": 23475947569920.0, + "grad_norm": 1.8892181752957757, + "language_loss": 0.67771393, + "learning_rate": 2.8712671583163596e-06, + "loss": 0.69876921, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5078125, + "step": 5931, + "time_per_iteration": 2.424328088760376 + }, + { + "auxiliary_loss_clip": 0.01070913, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.01791549, + "balance_loss_mlp": 1.0222764, + "epoch": 0.3566511348263941, + "flos": 26577456865920.0, + "grad_norm": 1.715579462192903, + "language_loss": 0.6755209, + "learning_rate": 2.870927055791437e-06, + "loss": 0.6965493, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48828125, + "step": 5932, + "time_per_iteration": 2.4113881587982178 + }, + { + "auxiliary_loss_clip": 0.01069909, + "auxiliary_loss_mlp": 0.01024306, + "balance_loss_clip": 1.01120543, + "balance_loss_mlp": 1.02280855, + "epoch": 0.35671125807906207, + "flos": 13114179757440.0, + "grad_norm": 2.152898948611348, + "language_loss": 0.78931725, + "learning_rate": 2.8705869221853684e-06, + "loss": 0.81025946, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.47070312, + "step": 5933, + "time_per_iteration": 2.379514217376709 + }, + { + "auxiliary_loss_clip": 0.01070714, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02138042, + "balance_loss_mlp": 1.02115858, + "epoch": 0.35677138133173003, + "flos": 32999171414400.0, + "grad_norm": 1.481716828316316, + "language_loss": 0.69572234, + "learning_rate": 2.8702467575102914e-06, + "loss": 0.71678329, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49609375, + "step": 5934, + "time_per_iteration": 2.4865353107452393 + }, + { + "auxiliary_loss_clip": 0.0107718, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.0198679, + "balance_loss_mlp": 1.02353334, + "epoch": 0.356831504584398, + "flos": 20776777315200.0, + "grad_norm": 1.6308150932066683, + "language_loss": 0.70757735, + "learning_rate": 2.869906561778347e-06, + "loss": 0.72871661, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5390625, + "step": 5935, + "time_per_iteration": 2.417588472366333 + }, + { + "auxiliary_loss_clip": 0.01071965, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.01504302, + "balance_loss_mlp": 1.02198422, + "epoch": 0.35689162783706596, + "flos": 12164786622720.0, + "grad_norm": 2.6017334537563785, + "language_loss": 0.78478062, + "learning_rate": 2.869566335001674e-06, + "loss": 0.80579704, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5, + "step": 5936, + "time_per_iteration": 2.3403899669647217 + }, + { + "auxiliary_loss_clip": 0.01070823, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.01793408, + "balance_loss_mlp": 1.0215863, + "epoch": 0.356951751089734, + "flos": 23840371008000.0, + "grad_norm": 1.360430489831818, + "language_loss": 0.64434779, + "learning_rate": 2.8692260771924167e-06, + "loss": 0.66538608, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4921875, + "step": 5937, + "time_per_iteration": 2.4048120975494385 + }, + { + "auxiliary_loss_clip": 0.01074406, + "auxiliary_loss_mlp": 0.01026844, + "balance_loss_clip": 1.01247382, + "balance_loss_mlp": 1.02333021, + "epoch": 0.35701187434240195, + "flos": 11721564512640.0, + "grad_norm": 2.338908034518505, + "language_loss": 0.7841239, + "learning_rate": 2.868885788362715e-06, + "loss": 0.80513638, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.5078125, + "step": 5938, + "time_per_iteration": 2.3586747646331787 + }, + { + "auxiliary_loss_clip": 0.01073713, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.01928055, + "balance_loss_mlp": 1.02315617, + "epoch": 0.3570719975950699, + "flos": 24897750577920.0, + "grad_norm": 1.504809101025306, + "language_loss": 0.8034789, + "learning_rate": 2.868545468524716e-06, + "loss": 0.82455516, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.50390625, + "step": 5939, + "time_per_iteration": 2.4168505668640137 + }, + { + "auxiliary_loss_clip": 0.01073777, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.01325536, + "balance_loss_mlp": 1.02135611, + "epoch": 0.3571321208477379, + "flos": 25993639244160.0, + "grad_norm": 1.79766284266332, + "language_loss": 0.79158193, + "learning_rate": 2.8682051176905624e-06, + "loss": 0.81260484, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5234375, + "step": 5940, + "time_per_iteration": 2.4209342002868652 + }, + { + "auxiliary_loss_clip": 0.01073551, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.01330376, + "balance_loss_mlp": 1.02250683, + "epoch": 0.35719224410040584, + "flos": 14500790248320.0, + "grad_norm": 1.8942428759598329, + "language_loss": 0.71959144, + "learning_rate": 2.867864735872402e-06, + "loss": 0.7406159, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5078125, + "step": 5941, + "time_per_iteration": 2.4017865657806396 + }, + { + "auxiliary_loss_clip": 0.01074913, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.01592779, + "balance_loss_mlp": 1.02432132, + "epoch": 0.3572523673530738, + "flos": 31174121669760.0, + "grad_norm": 2.034368847560607, + "language_loss": 0.63886237, + "learning_rate": 2.8675243230823815e-06, + "loss": 0.65992182, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 5942, + "time_per_iteration": 4.0378193855285645 + }, + { + "auxiliary_loss_clip": 0.01071789, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.02050829, + "balance_loss_mlp": 1.02242017, + "epoch": 0.3573124906057418, + "flos": 15851056147200.0, + "grad_norm": 1.8992011189116973, + "language_loss": 0.73817796, + "learning_rate": 2.86718387933265e-06, + "loss": 0.75926054, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.49414062, + "step": 5943, + "time_per_iteration": 2.3711822032928467 + }, + { + "auxiliary_loss_clip": 0.01013196, + "auxiliary_loss_mlp": 0.01000807, + "balance_loss_clip": 0.99947822, + "balance_loss_mlp": 1.00293803, + "epoch": 0.35737261385840974, + "flos": 60819989996160.0, + "grad_norm": 0.8011237422544815, + "language_loss": 0.6077981, + "learning_rate": 2.8668434046353557e-06, + "loss": 0.62793815, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.10253906, + "step": 5944, + "time_per_iteration": 3.124643087387085 + }, + { + "auxiliary_loss_clip": 0.01069537, + "auxiliary_loss_mlp": 0.01025334, + "balance_loss_clip": 1.01138711, + "balance_loss_mlp": 1.02124238, + "epoch": 0.3574327371110777, + "flos": 18842763617280.0, + "grad_norm": 1.719653871564029, + "language_loss": 0.72688848, + "learning_rate": 2.86650289900265e-06, + "loss": 0.74783719, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48242188, + "step": 5945, + "time_per_iteration": 3.7560200691223145 + }, + { + "auxiliary_loss_clip": 0.01071227, + "auxiliary_loss_mlp": 0.01030704, + "balance_loss_clip": 1.01599336, + "balance_loss_mlp": 1.02127099, + "epoch": 0.35749286036374567, + "flos": 23548566931200.0, + "grad_norm": 1.6907981232767915, + "language_loss": 0.80688787, + "learning_rate": 2.8661623624466856e-06, + "loss": 0.8279072, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5, + "step": 5946, + "time_per_iteration": 3.8214290142059326 + }, + { + "auxiliary_loss_clip": 0.01074525, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.0230881, + "balance_loss_mlp": 1.0244472, + "epoch": 0.35755298361641363, + "flos": 21104437224960.0, + "grad_norm": 1.3280999474170685, + "language_loss": 0.68914711, + "learning_rate": 2.8658217949796133e-06, + "loss": 0.71027148, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5, + "step": 5947, + "time_per_iteration": 2.4186766147613525 + }, + { + "auxiliary_loss_clip": 0.01069792, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.02277255, + "balance_loss_mlp": 1.02240419, + "epoch": 0.3576131068690816, + "flos": 19244020406400.0, + "grad_norm": 1.6894771416948158, + "language_loss": 0.7563026, + "learning_rate": 2.8654811966135893e-06, + "loss": 0.77736843, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47460938, + "step": 5948, + "time_per_iteration": 2.3672938346862793 + }, + { + "auxiliary_loss_clip": 0.01069503, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01949406, + "balance_loss_mlp": 1.02019787, + "epoch": 0.35767323012174956, + "flos": 28653532352640.0, + "grad_norm": 5.932336696304652, + "language_loss": 0.70936704, + "learning_rate": 2.865140567360767e-06, + "loss": 0.73039985, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4921875, + "step": 5949, + "time_per_iteration": 2.471717596054077 + }, + { + "auxiliary_loss_clip": 0.01072815, + "auxiliary_loss_mlp": 0.01035241, + "balance_loss_clip": 1.02138329, + "balance_loss_mlp": 1.02277851, + "epoch": 0.35773335337441753, + "flos": 17084607770880.0, + "grad_norm": 1.8392819708553219, + "language_loss": 0.77234703, + "learning_rate": 2.864799907233304e-06, + "loss": 0.79342759, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.5, + "step": 5950, + "time_per_iteration": 3.8080577850341797 + }, + { + "auxiliary_loss_clip": 0.01072638, + "auxiliary_loss_mlp": 0.01028006, + "balance_loss_clip": 1.01333153, + "balance_loss_mlp": 1.02253437, + "epoch": 0.35779347662708555, + "flos": 15887680030080.0, + "grad_norm": 1.6855518394982005, + "language_loss": 0.73074478, + "learning_rate": 2.8644592162433565e-06, + "loss": 0.75175124, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5, + "step": 5951, + "time_per_iteration": 2.385679006576538 + }, + { + "auxiliary_loss_clip": 0.01075451, + "auxiliary_loss_mlp": 0.0102857, + "balance_loss_clip": 1.01308489, + "balance_loss_mlp": 1.02243567, + "epoch": 0.3578535998797535, + "flos": 28657547159040.0, + "grad_norm": 2.0627297901646666, + "language_loss": 0.70312607, + "learning_rate": 2.864118494403083e-06, + "loss": 0.72416627, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.53125, + "step": 5952, + "time_per_iteration": 2.4737606048583984 + }, + { + "auxiliary_loss_clip": 0.01070435, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.01501608, + "balance_loss_mlp": 1.02144802, + "epoch": 0.3579137231324215, + "flos": 37850911678080.0, + "grad_norm": 1.7315607420422052, + "language_loss": 0.6982621, + "learning_rate": 2.863777741724643e-06, + "loss": 0.71925539, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48828125, + "step": 5953, + "time_per_iteration": 2.5504329204559326 + }, + { + "auxiliary_loss_clip": 0.01070049, + "auxiliary_loss_mlp": 0.01027952, + "balance_loss_clip": 1.01369417, + "balance_loss_mlp": 1.02118003, + "epoch": 0.35797384638508944, + "flos": 22345739170560.0, + "grad_norm": 1.522911635126259, + "language_loss": 0.66758895, + "learning_rate": 2.863436958220198e-06, + "loss": 0.68856895, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48828125, + "step": 5954, + "time_per_iteration": 2.4199447631835938 + }, + { + "auxiliary_loss_clip": 0.0107244, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01595175, + "balance_loss_mlp": 1.02268529, + "epoch": 0.3580339696377574, + "flos": 13588858869120.0, + "grad_norm": 1.9268731718788137, + "language_loss": 0.74515939, + "learning_rate": 2.8630961439019087e-06, + "loss": 0.76618397, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49609375, + "step": 5955, + "time_per_iteration": 2.4272725582122803 + }, + { + "auxiliary_loss_clip": 0.01068141, + "auxiliary_loss_mlp": 0.01022692, + "balance_loss_clip": 1.00941253, + "balance_loss_mlp": 1.02124548, + "epoch": 0.3580940928904254, + "flos": 23767123242240.0, + "grad_norm": 1.6735730058771812, + "language_loss": 0.76419848, + "learning_rate": 2.8627552987819382e-06, + "loss": 0.78510684, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46875, + "step": 5956, + "time_per_iteration": 2.405097484588623 + }, + { + "auxiliary_loss_clip": 0.01069308, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.01375628, + "balance_loss_mlp": 1.02230978, + "epoch": 0.35815421614309334, + "flos": 19462856008320.0, + "grad_norm": 1.5454168655842875, + "language_loss": 0.73009193, + "learning_rate": 2.86241442287245e-06, + "loss": 0.75105453, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.47070312, + "step": 5957, + "time_per_iteration": 2.3796825408935547 + }, + { + "auxiliary_loss_clip": 0.01071461, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.0146687, + "balance_loss_mlp": 1.02231061, + "epoch": 0.3582143393957613, + "flos": 23367053439360.0, + "grad_norm": 1.7028257216885643, + "language_loss": 0.70873713, + "learning_rate": 2.86207351618561e-06, + "loss": 0.72974467, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4921875, + "step": 5958, + "time_per_iteration": 2.3916237354278564 + }, + { + "auxiliary_loss_clip": 0.01069399, + "auxiliary_loss_mlp": 0.01026143, + "balance_loss_clip": 1.01275599, + "balance_loss_mlp": 1.02178752, + "epoch": 0.35827446264842927, + "flos": 26322067203840.0, + "grad_norm": 1.5966503158089345, + "language_loss": 0.8835175, + "learning_rate": 2.8617325787335833e-06, + "loss": 0.90447289, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4765625, + "step": 5959, + "time_per_iteration": 2.4356231689453125 + }, + { + "auxiliary_loss_clip": 0.01070174, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01886547, + "balance_loss_mlp": 1.02187371, + "epoch": 0.35833458590109724, + "flos": 30445274793600.0, + "grad_norm": 1.659003537861445, + "language_loss": 0.58252156, + "learning_rate": 2.861391610528538e-06, + "loss": 0.60355556, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48242188, + "step": 5960, + "time_per_iteration": 2.448814630508423 + }, + { + "auxiliary_loss_clip": 0.01071052, + "auxiliary_loss_mlp": 0.01025945, + "balance_loss_clip": 1.01110339, + "balance_loss_mlp": 1.02143323, + "epoch": 0.3583947091537652, + "flos": 14829008739840.0, + "grad_norm": 2.03019221311439, + "language_loss": 0.76461655, + "learning_rate": 2.8610506115826415e-06, + "loss": 0.78558648, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49414062, + "step": 5961, + "time_per_iteration": 2.3817083835601807 + }, + { + "auxiliary_loss_clip": 0.01072396, + "auxiliary_loss_mlp": 0.01026293, + "balance_loss_clip": 1.01171958, + "balance_loss_mlp": 1.02307522, + "epoch": 0.35845483240643317, + "flos": 34239216551040.0, + "grad_norm": 1.6862321619024208, + "language_loss": 0.70352829, + "learning_rate": 2.8607095819080633e-06, + "loss": 0.7245152, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.49414062, + "step": 5962, + "time_per_iteration": 2.4933602809906006 + }, + { + "auxiliary_loss_clip": 0.01070173, + "auxiliary_loss_mlp": 0.01028243, + "balance_loss_clip": 1.01541662, + "balance_loss_mlp": 1.02336693, + "epoch": 0.35851495565910113, + "flos": 20959023945600.0, + "grad_norm": 1.6460268097452655, + "language_loss": 0.74522746, + "learning_rate": 2.8603685215169745e-06, + "loss": 0.76621163, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.46875, + "step": 5963, + "time_per_iteration": 2.4234609603881836 + }, + { + "auxiliary_loss_clip": 0.01070613, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.01486123, + "balance_loss_mlp": 1.0227766, + "epoch": 0.35857507891176915, + "flos": 22308766174080.0, + "grad_norm": 1.5310674884993207, + "language_loss": 0.78604966, + "learning_rate": 2.8600274304215458e-06, + "loss": 0.80705464, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.47851562, + "step": 5964, + "time_per_iteration": 2.41050124168396 + }, + { + "auxiliary_loss_clip": 0.01072615, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.01299191, + "balance_loss_mlp": 1.02206707, + "epoch": 0.3586352021644371, + "flos": 23366739237120.0, + "grad_norm": 2.025921451943204, + "language_loss": 0.6644938, + "learning_rate": 2.859686308633951e-06, + "loss": 0.68549573, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.50390625, + "step": 5965, + "time_per_iteration": 2.408123254776001 + }, + { + "auxiliary_loss_clip": 0.01072317, + "auxiliary_loss_mlp": 0.01026799, + "balance_loss_clip": 1.01197517, + "balance_loss_mlp": 1.02330947, + "epoch": 0.3586953254171051, + "flos": 27848156042880.0, + "grad_norm": 1.5425050498290698, + "language_loss": 0.79170668, + "learning_rate": 2.8593451561663634e-06, + "loss": 0.81269795, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49023438, + "step": 5966, + "time_per_iteration": 2.4352409839630127 + }, + { + "auxiliary_loss_clip": 0.01070282, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.01453137, + "balance_loss_mlp": 1.02103496, + "epoch": 0.35875544866977305, + "flos": 19499479891200.0, + "grad_norm": 1.9073311257023076, + "language_loss": 0.91068411, + "learning_rate": 2.859003973030957e-06, + "loss": 0.93168628, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4921875, + "step": 5967, + "time_per_iteration": 2.383220672607422 + }, + { + "auxiliary_loss_clip": 0.01075283, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.01794267, + "balance_loss_mlp": 1.02471089, + "epoch": 0.358815571922441, + "flos": 21470047649280.0, + "grad_norm": 1.7376755701244664, + "language_loss": 0.8022114, + "learning_rate": 2.858662759239909e-06, + "loss": 0.82329679, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 5968, + "time_per_iteration": 2.3881704807281494 + }, + { + "auxiliary_loss_clip": 0.01076556, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.02344513, + "balance_loss_mlp": 1.02425456, + "epoch": 0.358875695175109, + "flos": 21834331441920.0, + "grad_norm": 2.1063316108064427, + "language_loss": 0.84497571, + "learning_rate": 2.858321514805395e-06, + "loss": 0.86613536, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5234375, + "step": 5969, + "time_per_iteration": 2.417069673538208 + }, + { + "auxiliary_loss_clip": 0.01070347, + "auxiliary_loss_mlp": 0.01024977, + "balance_loss_clip": 1.01148224, + "balance_loss_mlp": 1.02203286, + "epoch": 0.35893581842777694, + "flos": 32010361488000.0, + "grad_norm": 1.758826799255447, + "language_loss": 0.72189152, + "learning_rate": 2.8579802397395953e-06, + "loss": 0.7428447, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.484375, + "step": 5970, + "time_per_iteration": 2.47866153717041 + }, + { + "auxiliary_loss_clip": 0.01071148, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.01675689, + "balance_loss_mlp": 1.02226007, + "epoch": 0.3589959416804449, + "flos": 20484763770240.0, + "grad_norm": 1.7765434929520485, + "language_loss": 0.79491836, + "learning_rate": 2.857638934054687e-06, + "loss": 0.8159349, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48828125, + "step": 5971, + "time_per_iteration": 2.4473137855529785 + }, + { + "auxiliary_loss_clip": 0.01071605, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01579535, + "balance_loss_mlp": 1.02054417, + "epoch": 0.3590560649331129, + "flos": 16179728486400.0, + "grad_norm": 1.7992220947713973, + "language_loss": 0.78177643, + "learning_rate": 2.8572975977628517e-06, + "loss": 0.80280107, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 5972, + "time_per_iteration": 2.359095573425293 + }, + { + "auxiliary_loss_clip": 0.0107055, + "auxiliary_loss_mlp": 0.01032108, + "balance_loss_clip": 1.0173142, + "balance_loss_mlp": 1.02126408, + "epoch": 0.35911618818578084, + "flos": 20374368451200.0, + "grad_norm": 1.9280626901090425, + "language_loss": 0.81194162, + "learning_rate": 2.8569562308762697e-06, + "loss": 0.83296818, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.4921875, + "step": 5973, + "time_per_iteration": 2.3903608322143555 + }, + { + "auxiliary_loss_clip": 0.01013822, + "auxiliary_loss_mlp": 0.00999089, + "balance_loss_clip": 0.99780113, + "balance_loss_mlp": 1.00375342, + "epoch": 0.3591763114384488, + "flos": 41234308358400.0, + "grad_norm": 0.9096813525259001, + "language_loss": 0.5677613, + "learning_rate": 2.8566148334071245e-06, + "loss": 0.58789039, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.10058594, + "step": 5974, + "time_per_iteration": 2.891608238220215 + }, + { + "auxiliary_loss_clip": 0.01071886, + "auxiliary_loss_mlp": 0.01028706, + "balance_loss_clip": 1.01536608, + "balance_loss_mlp": 1.02229309, + "epoch": 0.35923643469111677, + "flos": 18694522517760.0, + "grad_norm": 1.9743637413899624, + "language_loss": 0.69251728, + "learning_rate": 2.8562734053675997e-06, + "loss": 0.71352315, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.49609375, + "step": 5975, + "time_per_iteration": 2.4206223487854004 + }, + { + "auxiliary_loss_clip": 0.01070207, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01723135, + "balance_loss_mlp": 1.02263165, + "epoch": 0.35929655794378473, + "flos": 25008774301440.0, + "grad_norm": 1.7627566589357815, + "language_loss": 0.79994309, + "learning_rate": 2.8559319467698794e-06, + "loss": 0.82095206, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47460938, + "step": 5976, + "time_per_iteration": 2.4443769454956055 + }, + { + "auxiliary_loss_clip": 0.0107081, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.01508832, + "balance_loss_mlp": 1.02138186, + "epoch": 0.35935668119645275, + "flos": 14974701310080.0, + "grad_norm": 1.80127599447291, + "language_loss": 0.7893914, + "learning_rate": 2.855590457626149e-06, + "loss": 0.81040287, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49414062, + "step": 5977, + "time_per_iteration": 2.3912503719329834 + }, + { + "auxiliary_loss_clip": 0.01069086, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.02023149, + "balance_loss_mlp": 1.02143776, + "epoch": 0.3594168044491207, + "flos": 21177091497600.0, + "grad_norm": 2.660693871903989, + "language_loss": 0.80078697, + "learning_rate": 2.855248937948597e-06, + "loss": 0.82181215, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4765625, + "step": 5978, + "time_per_iteration": 2.388336658477783 + }, + { + "auxiliary_loss_clip": 0.0106953, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.01004541, + "balance_loss_mlp": 1.02077723, + "epoch": 0.3594769277017887, + "flos": 27670936648320.0, + "grad_norm": 1.809516799260894, + "language_loss": 0.6769433, + "learning_rate": 2.8549073877494096e-06, + "loss": 0.69788361, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48828125, + "step": 5979, + "time_per_iteration": 2.4456100463867188 + }, + { + "auxiliary_loss_clip": 0.0107007, + "auxiliary_loss_mlp": 0.01025456, + "balance_loss_clip": 1.01171136, + "balance_loss_mlp": 1.02133811, + "epoch": 0.35953705095445665, + "flos": 23001233546880.0, + "grad_norm": 2.679761879983404, + "language_loss": 0.67270786, + "learning_rate": 2.8545658070407773e-06, + "loss": 0.69366312, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48828125, + "step": 5980, + "time_per_iteration": 2.3917486667633057 + }, + { + "auxiliary_loss_clip": 0.01071541, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.01663971, + "balance_loss_mlp": 1.02089095, + "epoch": 0.3595971742071246, + "flos": 25512990289920.0, + "grad_norm": 1.9069745761872319, + "language_loss": 0.71115279, + "learning_rate": 2.8542241958348894e-06, + "loss": 0.73219407, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.50390625, + "step": 5981, + "time_per_iteration": 3.8275861740112305 + }, + { + "auxiliary_loss_clip": 0.01074017, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.01759088, + "balance_loss_mlp": 1.02403331, + "epoch": 0.3596572974597926, + "flos": 29861247703680.0, + "grad_norm": 2.186124021653524, + "language_loss": 0.71259987, + "learning_rate": 2.8538825541439367e-06, + "loss": 0.73367763, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5, + "step": 5982, + "time_per_iteration": 2.4482645988464355 + }, + { + "auxiliary_loss_clip": 0.01068776, + "auxiliary_loss_mlp": 0.01032037, + "balance_loss_clip": 1.01901364, + "balance_loss_mlp": 1.0227052, + "epoch": 0.35971742071246054, + "flos": 23111419397760.0, + "grad_norm": 1.6310207903392884, + "language_loss": 0.75598907, + "learning_rate": 2.8535408819801127e-06, + "loss": 0.77699721, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4609375, + "step": 5983, + "time_per_iteration": 2.424999713897705 + }, + { + "auxiliary_loss_clip": 0.01076745, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01652908, + "balance_loss_mlp": 1.02407491, + "epoch": 0.3597775439651285, + "flos": 16724478430080.0, + "grad_norm": 1.6772260668171775, + "language_loss": 0.76604366, + "learning_rate": 2.85319917935561e-06, + "loss": 0.78713775, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.52734375, + "step": 5984, + "time_per_iteration": 2.3655178546905518 + }, + { + "auxiliary_loss_clip": 0.01069737, + "auxiliary_loss_mlp": 0.01029488, + "balance_loss_clip": 1.01650012, + "balance_loss_mlp": 1.02234197, + "epoch": 0.3598376672177965, + "flos": 19718455138560.0, + "grad_norm": 2.5706400797306372, + "language_loss": 0.86202085, + "learning_rate": 2.8528574462826234e-06, + "loss": 0.88301313, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.47460938, + "step": 5985, + "time_per_iteration": 5.212001085281372 + }, + { + "auxiliary_loss_clip": 0.01068186, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.0196445, + "balance_loss_mlp": 1.02062666, + "epoch": 0.35989779047046444, + "flos": 17310565290240.0, + "grad_norm": 1.31571106849939, + "language_loss": 0.72440183, + "learning_rate": 2.852515682773348e-06, + "loss": 0.74542469, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4765625, + "step": 5986, + "time_per_iteration": 2.3966708183288574 + }, + { + "auxiliary_loss_clip": 0.0107331, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.01806188, + "balance_loss_mlp": 1.02130544, + "epoch": 0.3599579137231324, + "flos": 22710127697280.0, + "grad_norm": 2.956739643243447, + "language_loss": 0.74059355, + "learning_rate": 2.8521738888399815e-06, + "loss": 0.76166123, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.51953125, + "step": 5987, + "time_per_iteration": 2.427263021469116 + }, + { + "auxiliary_loss_clip": 0.0107469, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.01565039, + "balance_loss_mlp": 1.02407086, + "epoch": 0.36001803697580037, + "flos": 20958814477440.0, + "grad_norm": 2.5739309117662703, + "language_loss": 0.69163907, + "learning_rate": 2.8518320644947204e-06, + "loss": 0.71269071, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.50390625, + "step": 5988, + "time_per_iteration": 2.4609618186950684 + }, + { + "auxiliary_loss_clip": 0.01073051, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.01384687, + "balance_loss_mlp": 1.02237487, + "epoch": 0.36007816022846834, + "flos": 20484519390720.0, + "grad_norm": 1.8033435565128877, + "language_loss": 0.73564243, + "learning_rate": 2.851490209749764e-06, + "loss": 0.75666106, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5078125, + "step": 5989, + "time_per_iteration": 3.8666367530822754 + }, + { + "auxiliary_loss_clip": 0.01069402, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.01235521, + "balance_loss_mlp": 1.02149057, + "epoch": 0.36013828348113636, + "flos": 27999993012480.0, + "grad_norm": 2.594888852621829, + "language_loss": 0.80210066, + "learning_rate": 2.8511483246173126e-06, + "loss": 0.82305861, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48046875, + "step": 5990, + "time_per_iteration": 2.440448045730591 + }, + { + "auxiliary_loss_clip": 0.01073075, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.01153827, + "balance_loss_mlp": 1.02331042, + "epoch": 0.3601984067338043, + "flos": 20081202831360.0, + "grad_norm": 1.6472624849151765, + "language_loss": 0.840931, + "learning_rate": 2.8508064091095664e-06, + "loss": 0.86192489, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.49609375, + "step": 5991, + "time_per_iteration": 2.396472930908203 + }, + { + "auxiliary_loss_clip": 0.01072402, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.01835823, + "balance_loss_mlp": 1.02134836, + "epoch": 0.3602585299864723, + "flos": 18616806097920.0, + "grad_norm": 1.7126643977310503, + "language_loss": 0.75447881, + "learning_rate": 2.8504644632387286e-06, + "loss": 0.77553165, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.51171875, + "step": 5992, + "time_per_iteration": 2.3715078830718994 + }, + { + "auxiliary_loss_clip": 0.01069857, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.02072811, + "balance_loss_mlp": 1.0215801, + "epoch": 0.36031865323914025, + "flos": 19571994518400.0, + "grad_norm": 1.8280424757494191, + "language_loss": 0.77356052, + "learning_rate": 2.850122487017002e-06, + "loss": 0.79461169, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.48242188, + "step": 5993, + "time_per_iteration": 2.375019073486328 + }, + { + "auxiliary_loss_clip": 0.01074143, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.02153707, + "balance_loss_mlp": 1.02269566, + "epoch": 0.3603787764918082, + "flos": 17489739720960.0, + "grad_norm": 1.6386141039743205, + "language_loss": 0.74430043, + "learning_rate": 2.84978048045659e-06, + "loss": 0.76540458, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.515625, + "step": 5994, + "time_per_iteration": 2.3964900970458984 + }, + { + "auxiliary_loss_clip": 0.0107325, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.0149461, + "balance_loss_mlp": 1.02215815, + "epoch": 0.3604388997444762, + "flos": 15522488542080.0, + "grad_norm": 1.689677266658668, + "language_loss": 0.68651265, + "learning_rate": 2.8494384435696987e-06, + "loss": 0.70754117, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5078125, + "step": 5995, + "time_per_iteration": 2.356747627258301 + }, + { + "auxiliary_loss_clip": 0.01072455, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.01810753, + "balance_loss_mlp": 1.02186203, + "epoch": 0.36049902299714415, + "flos": 17309936885760.0, + "grad_norm": 1.7972816808866687, + "language_loss": 0.82576621, + "learning_rate": 2.849096376368534e-06, + "loss": 0.84682024, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 5996, + "time_per_iteration": 2.375746965408325 + }, + { + "auxiliary_loss_clip": 0.01071046, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.01374257, + "balance_loss_mlp": 1.02251923, + "epoch": 0.3605591462498121, + "flos": 17055070894080.0, + "grad_norm": 1.6360357918131012, + "language_loss": 0.73591554, + "learning_rate": 2.8487542788653044e-06, + "loss": 0.75690454, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.484375, + "step": 5997, + "time_per_iteration": 2.3556747436523438 + }, + { + "auxiliary_loss_clip": 0.0106946, + "auxiliary_loss_mlp": 0.01029605, + "balance_loss_clip": 1.01534796, + "balance_loss_mlp": 1.02197778, + "epoch": 0.3606192695024801, + "flos": 16835921089920.0, + "grad_norm": 2.443251493045155, + "language_loss": 0.68559325, + "learning_rate": 2.848412151072218e-06, + "loss": 0.70658386, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47460938, + "step": 5998, + "time_per_iteration": 2.381971597671509 + }, + { + "auxiliary_loss_clip": 0.01072282, + "auxiliary_loss_mlp": 0.01021889, + "balance_loss_clip": 1.00744081, + "balance_loss_mlp": 1.02302206, + "epoch": 0.36067939275514804, + "flos": 12128860967040.0, + "grad_norm": 2.1652778127333745, + "language_loss": 0.77397305, + "learning_rate": 2.8480699930014834e-06, + "loss": 0.79491478, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4921875, + "step": 5999, + "time_per_iteration": 2.3386337757110596 + }, + { + "auxiliary_loss_clip": 0.01072589, + "auxiliary_loss_mlp": 0.01035818, + "balance_loss_clip": 1.02178073, + "balance_loss_mlp": 1.02274752, + "epoch": 0.360739516007816, + "flos": 18040459507200.0, + "grad_norm": 5.906092126339494, + "language_loss": 0.78284979, + "learning_rate": 2.847727804665313e-06, + "loss": 0.80393386, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49804688, + "step": 6000, + "time_per_iteration": 2.3734095096588135 + }, + { + "auxiliary_loss_clip": 0.01070591, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.0152092, + "balance_loss_mlp": 1.02154922, + "epoch": 0.360799639260484, + "flos": 18548864858880.0, + "grad_norm": 3.522855557431247, + "language_loss": 0.78478992, + "learning_rate": 2.8473855860759175e-06, + "loss": 0.80579108, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49023438, + "step": 6001, + "time_per_iteration": 2.3571958541870117 + }, + { + "auxiliary_loss_clip": 0.01068303, + "auxiliary_loss_mlp": 0.01021467, + "balance_loss_clip": 1.00745416, + "balance_loss_mlp": 1.02169609, + "epoch": 0.36085976251315194, + "flos": 19681028294400.0, + "grad_norm": 2.384961015490449, + "language_loss": 0.83246374, + "learning_rate": 2.847043337245511e-06, + "loss": 0.85336137, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46679688, + "step": 6002, + "time_per_iteration": 2.3922977447509766 + }, + { + "auxiliary_loss_clip": 0.01066386, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.01165187, + "balance_loss_mlp": 1.02045155, + "epoch": 0.3609198857658199, + "flos": 24198021642240.0, + "grad_norm": 1.9957726217601077, + "language_loss": 0.90845191, + "learning_rate": 2.8467010581863058e-06, + "loss": 0.92936337, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45898438, + "step": 6003, + "time_per_iteration": 2.384376287460327 + }, + { + "auxiliary_loss_clip": 0.01014298, + "auxiliary_loss_mlp": 0.0099876, + "balance_loss_clip": 0.99746621, + "balance_loss_mlp": 1.00391531, + "epoch": 0.3609800090184879, + "flos": 57112946346240.0, + "grad_norm": 0.8654188719359439, + "language_loss": 0.53336197, + "learning_rate": 2.8463587489105175e-06, + "loss": 0.55349255, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.10351562, + "step": 6004, + "time_per_iteration": 2.9778435230255127 + }, + { + "auxiliary_loss_clip": 0.01070475, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.01256752, + "balance_loss_mlp": 1.02094913, + "epoch": 0.3610401322711559, + "flos": 20810259175680.0, + "grad_norm": 1.8112342696294805, + "language_loss": 0.77950227, + "learning_rate": 2.846016409430363e-06, + "loss": 0.80048561, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.49414062, + "step": 6005, + "time_per_iteration": 2.40242862701416 + }, + { + "auxiliary_loss_clip": 0.01071135, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.01550364, + "balance_loss_mlp": 1.02229643, + "epoch": 0.36110025552382385, + "flos": 13698311581440.0, + "grad_norm": 3.3247251940557345, + "language_loss": 0.79514426, + "learning_rate": 2.8456740397580586e-06, + "loss": 0.81614923, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48828125, + "step": 6006, + "time_per_iteration": 2.397096633911133 + }, + { + "auxiliary_loss_clip": 0.01073198, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.01353323, + "balance_loss_mlp": 1.02276611, + "epoch": 0.3611603787764918, + "flos": 22453935073920.0, + "grad_norm": 3.3961878709665156, + "language_loss": 0.84467876, + "learning_rate": 2.845331639905824e-06, + "loss": 0.86570913, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.50390625, + "step": 6007, + "time_per_iteration": 2.3960461616516113 + }, + { + "auxiliary_loss_clip": 0.01074799, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.01552045, + "balance_loss_mlp": 1.02260673, + "epoch": 0.3612205020291598, + "flos": 20885601623040.0, + "grad_norm": 1.929858206572468, + "language_loss": 0.73038328, + "learning_rate": 2.844989209885877e-06, + "loss": 0.75145, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5234375, + "step": 6008, + "time_per_iteration": 2.411940574645996 + }, + { + "auxiliary_loss_clip": 0.01068807, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.01484811, + "balance_loss_mlp": 1.02130961, + "epoch": 0.36128062528182775, + "flos": 15741079764480.0, + "grad_norm": 1.8621695631660327, + "language_loss": 0.827088, + "learning_rate": 2.844646749710439e-06, + "loss": 0.84806877, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47460938, + "step": 6009, + "time_per_iteration": 2.3955862522125244 + }, + { + "auxiliary_loss_clip": 0.01072142, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01379824, + "balance_loss_mlp": 1.02278244, + "epoch": 0.3613407485344957, + "flos": 16763546108160.0, + "grad_norm": 2.1010749206175316, + "language_loss": 0.76096261, + "learning_rate": 2.844304259391731e-06, + "loss": 0.78197145, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4921875, + "step": 6010, + "time_per_iteration": 2.392655611038208 + }, + { + "auxiliary_loss_clip": 0.01071585, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01671231, + "balance_loss_mlp": 1.02281487, + "epoch": 0.3614008717871637, + "flos": 20370283822080.0, + "grad_norm": 1.7025574288875394, + "language_loss": 0.82709467, + "learning_rate": 2.8439617389419757e-06, + "loss": 0.84812677, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48828125, + "step": 6011, + "time_per_iteration": 2.410205125808716 + }, + { + "auxiliary_loss_clip": 0.01075987, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.01916671, + "balance_loss_mlp": 1.02419984, + "epoch": 0.36146099503983165, + "flos": 22775764786560.0, + "grad_norm": 2.1693312184923372, + "language_loss": 0.62887549, + "learning_rate": 2.843619188373397e-06, + "loss": 0.64998496, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.51953125, + "step": 6012, + "time_per_iteration": 2.446655511856079 + }, + { + "auxiliary_loss_clip": 0.01066514, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.01835549, + "balance_loss_mlp": 1.0201298, + "epoch": 0.3615211182924996, + "flos": 22995717552000.0, + "grad_norm": 1.799851830212082, + "language_loss": 0.83540189, + "learning_rate": 2.843276607698219e-06, + "loss": 0.85638869, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46289062, + "step": 6013, + "time_per_iteration": 2.403985023498535 + }, + { + "auxiliary_loss_clip": 0.01070088, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.01508391, + "balance_loss_mlp": 1.02161551, + "epoch": 0.3615812415451676, + "flos": 16647320592000.0, + "grad_norm": 1.830283251842251, + "language_loss": 0.91465521, + "learning_rate": 2.8429339969286687e-06, + "loss": 0.93565953, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.484375, + "step": 6014, + "time_per_iteration": 2.4018900394439697 + }, + { + "auxiliary_loss_clip": 0.01070047, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.01532149, + "balance_loss_mlp": 1.02177155, + "epoch": 0.36164136479783554, + "flos": 21319153286400.0, + "grad_norm": 1.6945475273154569, + "language_loss": 0.73786128, + "learning_rate": 2.8425913560769725e-06, + "loss": 0.75886428, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48242188, + "step": 6015, + "time_per_iteration": 2.405226469039917 + }, + { + "auxiliary_loss_clip": 0.01072748, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.0158546, + "balance_loss_mlp": 1.02209187, + "epoch": 0.3617014880505035, + "flos": 24168449854080.0, + "grad_norm": 2.2412958755721237, + "language_loss": 0.6504758, + "learning_rate": 2.8422486851553577e-06, + "loss": 0.67151141, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5078125, + "step": 6016, + "time_per_iteration": 2.425557851791382 + }, + { + "auxiliary_loss_clip": 0.01073209, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.01727951, + "balance_loss_mlp": 1.02278531, + "epoch": 0.3617616113031715, + "flos": 39013414951680.0, + "grad_norm": 1.7495379388897212, + "language_loss": 0.71684813, + "learning_rate": 2.8419059841760545e-06, + "loss": 0.73791862, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.50390625, + "step": 6017, + "time_per_iteration": 2.5464694499969482 + }, + { + "auxiliary_loss_clip": 0.01073047, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_clip": 1.01409292, + "balance_loss_mlp": 1.02167881, + "epoch": 0.3618217345558395, + "flos": 12130013041920.0, + "grad_norm": 1.8318374531220054, + "language_loss": 0.73157543, + "learning_rate": 2.8415632531512916e-06, + "loss": 0.75260037, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.515625, + "step": 6018, + "time_per_iteration": 2.374448299407959 + }, + { + "auxiliary_loss_clip": 0.0106963, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.01555562, + "balance_loss_mlp": 1.02175033, + "epoch": 0.36188185780850746, + "flos": 24933885701760.0, + "grad_norm": 2.0418854675693727, + "language_loss": 0.69575953, + "learning_rate": 2.841220492093301e-06, + "loss": 0.71675861, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47851562, + "step": 6019, + "time_per_iteration": 2.425837278366089 + }, + { + "auxiliary_loss_clip": 0.01074115, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.01635003, + "balance_loss_mlp": 1.02307045, + "epoch": 0.3619419810611754, + "flos": 20957802048000.0, + "grad_norm": 1.9137437844254053, + "language_loss": 0.76357806, + "learning_rate": 2.840877701014316e-06, + "loss": 0.78463715, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5078125, + "step": 6020, + "time_per_iteration": 2.4055187702178955 + }, + { + "auxiliary_loss_clip": 0.0107369, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.01760221, + "balance_loss_mlp": 1.02433956, + "epoch": 0.3620021043138434, + "flos": 22527776332800.0, + "grad_norm": 1.662826101099494, + "language_loss": 0.73821962, + "learning_rate": 2.840534879926567e-06, + "loss": 0.75928688, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4921875, + "step": 6021, + "time_per_iteration": 3.8287951946258545 + }, + { + "auxiliary_loss_clip": 0.01071361, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.01703238, + "balance_loss_mlp": 1.02220607, + "epoch": 0.36206222756651135, + "flos": 15595771219200.0, + "grad_norm": 1.668204654199499, + "language_loss": 0.80771255, + "learning_rate": 2.8401920288422915e-06, + "loss": 0.82873923, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4921875, + "step": 6022, + "time_per_iteration": 2.3824174404144287 + }, + { + "auxiliary_loss_clip": 0.01069059, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.01399469, + "balance_loss_mlp": 1.02202117, + "epoch": 0.3621223508191793, + "flos": 23586028686720.0, + "grad_norm": 1.8072920444109888, + "language_loss": 0.81503475, + "learning_rate": 2.8398491477737235e-06, + "loss": 0.83600485, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46875, + "step": 6023, + "time_per_iteration": 2.398380994796753 + }, + { + "auxiliary_loss_clip": 0.01071812, + "auxiliary_loss_mlp": 0.0102838, + "balance_loss_clip": 1.01320481, + "balance_loss_mlp": 1.02203727, + "epoch": 0.3621824740718473, + "flos": 22308801085440.0, + "grad_norm": 1.5989021913224075, + "language_loss": 0.79522765, + "learning_rate": 2.8395062367330997e-06, + "loss": 0.81622958, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.49804688, + "step": 6024, + "time_per_iteration": 3.8028693199157715 + }, + { + "auxiliary_loss_clip": 0.01068689, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.01492524, + "balance_loss_mlp": 1.02201271, + "epoch": 0.36224259732451525, + "flos": 16762708235520.0, + "grad_norm": 2.760747341964129, + "language_loss": 0.75033462, + "learning_rate": 2.839163295732658e-06, + "loss": 0.77130175, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46679688, + "step": 6025, + "time_per_iteration": 3.770988702774048 + }, + { + "auxiliary_loss_clip": 0.01069876, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.01622677, + "balance_loss_mlp": 1.02261126, + "epoch": 0.3623027205771832, + "flos": 23148601862400.0, + "grad_norm": 2.1020669072743066, + "language_loss": 0.72191185, + "learning_rate": 2.8388203247846365e-06, + "loss": 0.74290782, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47265625, + "step": 6026, + "time_per_iteration": 2.397810220718384 + }, + { + "auxiliary_loss_clip": 0.01077714, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.01856709, + "balance_loss_mlp": 1.02483678, + "epoch": 0.3623628438298512, + "flos": 28547884978560.0, + "grad_norm": 2.0490089708430705, + "language_loss": 0.78985703, + "learning_rate": 2.8384773239012757e-06, + "loss": 0.81097728, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.52734375, + "step": 6027, + "time_per_iteration": 2.450732707977295 + }, + { + "auxiliary_loss_clip": 0.01073279, + "auxiliary_loss_mlp": 0.01035291, + "balance_loss_clip": 1.01925683, + "balance_loss_mlp": 1.02260876, + "epoch": 0.36242296708251914, + "flos": 25483732704000.0, + "grad_norm": 2.390060660765317, + "language_loss": 0.67954075, + "learning_rate": 2.838134293094815e-06, + "loss": 0.70062649, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5078125, + "step": 6028, + "time_per_iteration": 2.428199291229248 + }, + { + "auxiliary_loss_clip": 0.01071619, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.01357889, + "balance_loss_mlp": 1.02303064, + "epoch": 0.3624830903351871, + "flos": 16289425578240.0, + "grad_norm": 1.6164910617338464, + "language_loss": 0.85275388, + "learning_rate": 2.8377912323774986e-06, + "loss": 0.87374812, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48632812, + "step": 6029, + "time_per_iteration": 3.7839155197143555 + }, + { + "auxiliary_loss_clip": 0.01070903, + "auxiliary_loss_mlp": 0.01027415, + "balance_loss_clip": 1.01374197, + "balance_loss_mlp": 1.02290821, + "epoch": 0.36254321358785513, + "flos": 18295325498880.0, + "grad_norm": 1.6603485673202667, + "language_loss": 0.72660106, + "learning_rate": 2.8374481417615675e-06, + "loss": 0.74758422, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48046875, + "step": 6030, + "time_per_iteration": 2.3626651763916016 + }, + { + "auxiliary_loss_clip": 0.01074222, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.01628411, + "balance_loss_mlp": 1.02194142, + "epoch": 0.3626033368405231, + "flos": 14864445636480.0, + "grad_norm": 2.3157037466546644, + "language_loss": 0.74142039, + "learning_rate": 2.8371050212592664e-06, + "loss": 0.76249719, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5234375, + "step": 6031, + "time_per_iteration": 2.3725292682647705 + }, + { + "auxiliary_loss_clip": 0.010703, + "auxiliary_loss_mlp": 0.01023962, + "balance_loss_clip": 1.00937104, + "balance_loss_mlp": 1.02210462, + "epoch": 0.36266346009319106, + "flos": 22305589240320.0, + "grad_norm": 1.6102340557882369, + "language_loss": 0.7318635, + "learning_rate": 2.8367618708828413e-06, + "loss": 0.75280613, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.48242188, + "step": 6032, + "time_per_iteration": 2.393301248550415 + }, + { + "auxiliary_loss_clip": 0.01072592, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.01608527, + "balance_loss_mlp": 1.02226877, + "epoch": 0.362723583345859, + "flos": 18221379505920.0, + "grad_norm": 2.0396954233271827, + "language_loss": 0.78155452, + "learning_rate": 2.836418690644536e-06, + "loss": 0.80258536, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.50390625, + "step": 6033, + "time_per_iteration": 2.3853368759155273 + }, + { + "auxiliary_loss_clip": 0.01012328, + "auxiliary_loss_mlp": 0.01019246, + "balance_loss_clip": 1.01789856, + "balance_loss_mlp": 1.00241518, + "epoch": 0.362783706598527, + "flos": 68495818959360.0, + "grad_norm": 0.801836756334226, + "language_loss": 0.64749706, + "learning_rate": 2.8360754805566004e-06, + "loss": 0.66781282, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.09912109, + "step": 6034, + "time_per_iteration": 3.11136794090271 + }, + { + "auxiliary_loss_clip": 0.01071413, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.01399183, + "balance_loss_mlp": 1.02213168, + "epoch": 0.36284382985119495, + "flos": 26575432007040.0, + "grad_norm": 1.6497360784137405, + "language_loss": 0.89779735, + "learning_rate": 2.835732240631281e-06, + "loss": 0.91880488, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.4921875, + "step": 6035, + "time_per_iteration": 2.427647113800049 + }, + { + "auxiliary_loss_clip": 0.01073043, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.01587117, + "balance_loss_mlp": 1.02252495, + "epoch": 0.3629039531038629, + "flos": 20155742317440.0, + "grad_norm": 1.7384524033812592, + "language_loss": 0.73809171, + "learning_rate": 2.8353889708808274e-06, + "loss": 0.75912702, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.50390625, + "step": 6036, + "time_per_iteration": 2.3848750591278076 + }, + { + "auxiliary_loss_clip": 0.01072986, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.01377857, + "balance_loss_mlp": 1.02243233, + "epoch": 0.3629640763565309, + "flos": 18624696065280.0, + "grad_norm": 1.8271408483473113, + "language_loss": 0.75926924, + "learning_rate": 2.835045671317491e-06, + "loss": 0.78029221, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.50390625, + "step": 6037, + "time_per_iteration": 2.4064931869506836 + }, + { + "auxiliary_loss_clip": 0.01070234, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02629948, + "balance_loss_mlp": 1.02303052, + "epoch": 0.36302419960919885, + "flos": 19570493329920.0, + "grad_norm": 1.5238921625190627, + "language_loss": 0.78709567, + "learning_rate": 2.834702341953522e-06, + "loss": 0.80821192, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.47265625, + "step": 6038, + "time_per_iteration": 2.392469644546509 + }, + { + "auxiliary_loss_clip": 0.01012324, + "auxiliary_loss_mlp": 0.01001427, + "balance_loss_clip": 1.00004971, + "balance_loss_mlp": 1.00219309, + "epoch": 0.3630843228618668, + "flos": 63794239920000.0, + "grad_norm": 0.8251691126315029, + "language_loss": 0.6337781, + "learning_rate": 2.8343589828011737e-06, + "loss": 0.65391564, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.1015625, + "step": 6039, + "time_per_iteration": 3.1344316005706787 + }, + { + "auxiliary_loss_clip": 0.01071489, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.02149868, + "balance_loss_mlp": 1.02284336, + "epoch": 0.3631444461145348, + "flos": 21834087062400.0, + "grad_norm": 2.4876093444982805, + "language_loss": 0.71191859, + "learning_rate": 2.8340155938726993e-06, + "loss": 0.73299551, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48632812, + "step": 6040, + "time_per_iteration": 2.410862922668457 + }, + { + "auxiliary_loss_clip": 0.01076693, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.01563394, + "balance_loss_mlp": 1.02409446, + "epoch": 0.36320456936720275, + "flos": 21721073391360.0, + "grad_norm": 1.9267410556946198, + "language_loss": 0.81326181, + "learning_rate": 2.833672175180354e-06, + "loss": 0.83433628, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.52734375, + "step": 6041, + "time_per_iteration": 2.4156813621520996 + }, + { + "auxiliary_loss_clip": 0.01074447, + "auxiliary_loss_mlp": 0.01028398, + "balance_loss_clip": 1.01278138, + "balance_loss_mlp": 1.02320158, + "epoch": 0.3632646926198707, + "flos": 17018132808960.0, + "grad_norm": 1.8663442910249932, + "language_loss": 0.83524156, + "learning_rate": 2.8333287267363934e-06, + "loss": 0.85626996, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51171875, + "step": 6042, + "time_per_iteration": 2.3927791118621826 + }, + { + "auxiliary_loss_clip": 0.0107211, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01739573, + "balance_loss_mlp": 1.0238142, + "epoch": 0.36332481587253873, + "flos": 23330045531520.0, + "grad_norm": 1.5747586370196147, + "language_loss": 0.78099209, + "learning_rate": 2.832985248553074e-06, + "loss": 0.80203754, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48242188, + "step": 6043, + "time_per_iteration": 2.4068548679351807 + }, + { + "auxiliary_loss_clip": 0.01070407, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.01822472, + "balance_loss_mlp": 1.02241945, + "epoch": 0.3633849391252067, + "flos": 10742774146560.0, + "grad_norm": 3.0625358771682882, + "language_loss": 0.65889776, + "learning_rate": 2.8326417406426536e-06, + "loss": 0.67994392, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.48046875, + "step": 6044, + "time_per_iteration": 2.3587169647216797 + }, + { + "auxiliary_loss_clip": 0.01071919, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.01363468, + "balance_loss_mlp": 1.02365458, + "epoch": 0.36344506237787466, + "flos": 25847946673920.0, + "grad_norm": 1.6559850047449243, + "language_loss": 0.8122344, + "learning_rate": 2.8322982030173908e-06, + "loss": 0.83324373, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.484375, + "step": 6045, + "time_per_iteration": 2.45621395111084 + }, + { + "auxiliary_loss_clip": 0.0107271, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.01638985, + "balance_loss_mlp": 1.02294445, + "epoch": 0.3635051856305426, + "flos": 30152737578240.0, + "grad_norm": 1.7701918222092998, + "language_loss": 0.65188402, + "learning_rate": 2.8319546356895467e-06, + "loss": 0.67293257, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.49609375, + "step": 6046, + "time_per_iteration": 2.4616801738739014 + }, + { + "auxiliary_loss_clip": 0.01072185, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.01732159, + "balance_loss_mlp": 1.02223921, + "epoch": 0.3635653088832106, + "flos": 22197358425600.0, + "grad_norm": 1.7518953487820985, + "language_loss": 0.77624506, + "learning_rate": 2.831611038671382e-06, + "loss": 0.79729009, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5, + "step": 6047, + "time_per_iteration": 2.422403573989868 + }, + { + "auxiliary_loss_clip": 0.01076261, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.02086687, + "balance_loss_mlp": 1.0221169, + "epoch": 0.36362543213587856, + "flos": 24785993715840.0, + "grad_norm": 1.5879397421556394, + "language_loss": 0.79469538, + "learning_rate": 2.8312674119751585e-06, + "loss": 0.81584144, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.5390625, + "step": 6048, + "time_per_iteration": 2.4356281757354736 + }, + { + "auxiliary_loss_clip": 0.01011234, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 0.99994522, + "balance_loss_mlp": 1.00127006, + "epoch": 0.3636855553885465, + "flos": 62522877427200.0, + "grad_norm": 0.7530051565630759, + "language_loss": 0.52588218, + "learning_rate": 2.8309237556131385e-06, + "loss": 0.54600608, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.09960938, + "step": 6049, + "time_per_iteration": 3.103388786315918 + }, + { + "auxiliary_loss_clip": 0.01073577, + "auxiliary_loss_mlp": 0.01029463, + "balance_loss_clip": 1.01375103, + "balance_loss_mlp": 1.02340925, + "epoch": 0.3637456786412145, + "flos": 24059520812160.0, + "grad_norm": 2.0125450543831747, + "language_loss": 0.86114162, + "learning_rate": 2.8305800695975873e-06, + "loss": 0.88217199, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.50390625, + "step": 6050, + "time_per_iteration": 2.4318606853485107 + }, + { + "auxiliary_loss_clip": 0.01072245, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.01792586, + "balance_loss_mlp": 1.02397776, + "epoch": 0.36380580189388245, + "flos": 16690542721920.0, + "grad_norm": 1.8121395456824636, + "language_loss": 0.77918768, + "learning_rate": 2.8302363539407703e-06, + "loss": 0.80023092, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.484375, + "step": 6051, + "time_per_iteration": 2.387392520904541 + }, + { + "auxiliary_loss_clip": 0.01072581, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.0183177, + "balance_loss_mlp": 1.02278066, + "epoch": 0.3638659251465504, + "flos": 25113060132480.0, + "grad_norm": 1.7123928093346799, + "language_loss": 0.82470536, + "learning_rate": 2.829892608654953e-06, + "loss": 0.84575599, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.49804688, + "step": 6052, + "time_per_iteration": 2.4124338626861572 + }, + { + "auxiliary_loss_clip": 0.01068165, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.02117634, + "epoch": 0.3639260483992184, + "flos": 23001896862720.0, + "grad_norm": 1.4943925841120913, + "language_loss": 0.7220093, + "learning_rate": 2.829548833752404e-06, + "loss": 0.74298918, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.47070312, + "step": 6053, + "time_per_iteration": 2.447387933731079 + }, + { + "auxiliary_loss_clip": 0.01011196, + "auxiliary_loss_mlp": 0.010035, + "balance_loss_clip": 1.00231433, + "balance_loss_mlp": 1.00151181, + "epoch": 0.36398617165188635, + "flos": 70712839071360.0, + "grad_norm": 0.7724249829209577, + "language_loss": 0.61200237, + "learning_rate": 2.8292050292453904e-06, + "loss": 0.63214934, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.09667969, + "step": 6054, + "time_per_iteration": 3.0855648517608643 + }, + { + "auxiliary_loss_clip": 0.01071131, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.01699007, + "balance_loss_mlp": 1.02134585, + "epoch": 0.3640462949045543, + "flos": 22234401244800.0, + "grad_norm": 1.8421544709503386, + "language_loss": 0.75803816, + "learning_rate": 2.828861195146182e-06, + "loss": 0.77907354, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49804688, + "step": 6055, + "time_per_iteration": 2.430845022201538 + }, + { + "auxiliary_loss_clip": 0.01072917, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.02200162, + "balance_loss_mlp": 1.0230329, + "epoch": 0.3641064181572223, + "flos": 21542457542400.0, + "grad_norm": 1.50952682916665, + "language_loss": 0.75209242, + "learning_rate": 2.82851733146705e-06, + "loss": 0.77319348, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5, + "step": 6056, + "time_per_iteration": 2.3948845863342285 + }, + { + "auxiliary_loss_clip": 0.0107158, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02169693, + "balance_loss_mlp": 1.02250767, + "epoch": 0.3641665414098903, + "flos": 22272212113920.0, + "grad_norm": 1.777259886806799, + "language_loss": 0.80024457, + "learning_rate": 2.8281734382202657e-06, + "loss": 0.8213309, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4921875, + "step": 6057, + "time_per_iteration": 2.384439468383789 + }, + { + "auxiliary_loss_clip": 0.01071641, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.01478541, + "balance_loss_mlp": 1.02222943, + "epoch": 0.36422666466255826, + "flos": 28328420972160.0, + "grad_norm": 2.0290703185367143, + "language_loss": 0.80716157, + "learning_rate": 2.8278295154181017e-06, + "loss": 0.82816988, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49414062, + "step": 6058, + "time_per_iteration": 2.474151372909546 + }, + { + "auxiliary_loss_clip": 0.01071565, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01655245, + "balance_loss_mlp": 1.02245402, + "epoch": 0.36428678791522623, + "flos": 24169357549440.0, + "grad_norm": 1.7015808799638785, + "language_loss": 0.85123634, + "learning_rate": 2.8274855630728316e-06, + "loss": 0.87227386, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.4921875, + "step": 6059, + "time_per_iteration": 2.4142816066741943 + }, + { + "auxiliary_loss_clip": 0.01071468, + "auxiliary_loss_mlp": 0.01031453, + "balance_loss_clip": 1.01624775, + "balance_loss_mlp": 1.02149642, + "epoch": 0.3643469111678942, + "flos": 22527357396480.0, + "grad_norm": 1.4250841475354012, + "language_loss": 0.88126129, + "learning_rate": 2.82714158119673e-06, + "loss": 0.90229052, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5, + "step": 6060, + "time_per_iteration": 2.4167838096618652 + }, + { + "auxiliary_loss_clip": 0.01072664, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.01926088, + "balance_loss_mlp": 1.02345657, + "epoch": 0.36440703442056216, + "flos": 19425603720960.0, + "grad_norm": 3.0504392611882754, + "language_loss": 0.67363739, + "learning_rate": 2.826797569802074e-06, + "loss": 0.69471419, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.4921875, + "step": 6061, + "time_per_iteration": 3.7727997303009033 + }, + { + "auxiliary_loss_clip": 0.010745, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01421785, + "balance_loss_mlp": 1.02394438, + "epoch": 0.3644671576732301, + "flos": 18039551811840.0, + "grad_norm": 2.047186421643374, + "language_loss": 0.74945015, + "learning_rate": 2.826453528901139e-06, + "loss": 0.77049369, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.50390625, + "step": 6062, + "time_per_iteration": 2.374467372894287 + }, + { + "auxiliary_loss_clip": 0.01071705, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.01338673, + "balance_loss_mlp": 1.02322197, + "epoch": 0.3645272809258981, + "flos": 21541759315200.0, + "grad_norm": 1.7153839465780016, + "language_loss": 0.72601569, + "learning_rate": 2.826109458506203e-06, + "loss": 0.74702626, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.484375, + "step": 6063, + "time_per_iteration": 2.4319028854370117 + }, + { + "auxiliary_loss_clip": 0.01072168, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.01460075, + "balance_loss_mlp": 1.02368081, + "epoch": 0.36458740417856605, + "flos": 22745774062080.0, + "grad_norm": 1.8262591781232413, + "language_loss": 0.67901099, + "learning_rate": 2.825765358629546e-06, + "loss": 0.70002508, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48632812, + "step": 6064, + "time_per_iteration": 3.8194692134857178 + }, + { + "auxiliary_loss_clip": 0.01074266, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.01827288, + "balance_loss_mlp": 1.02300286, + "epoch": 0.364647527431234, + "flos": 26139471459840.0, + "grad_norm": 2.0185268382777224, + "language_loss": 0.74218488, + "learning_rate": 2.825421229283447e-06, + "loss": 0.76326197, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.515625, + "step": 6065, + "time_per_iteration": 3.8645195960998535 + }, + { + "auxiliary_loss_clip": 0.01074578, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.01641214, + "balance_loss_mlp": 1.02254605, + "epoch": 0.364707650683902, + "flos": 31028568744960.0, + "grad_norm": 2.5926110301281207, + "language_loss": 0.75493026, + "learning_rate": 2.825077070480188e-06, + "loss": 0.77601194, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.51953125, + "step": 6066, + "time_per_iteration": 2.461808204650879 + }, + { + "auxiliary_loss_clip": 0.01069396, + "auxiliary_loss_mlp": 0.01025117, + "balance_loss_clip": 1.01133609, + "balance_loss_mlp": 1.02257264, + "epoch": 0.36476777393656995, + "flos": 19571889784320.0, + "grad_norm": 2.0832752967370727, + "language_loss": 0.76463497, + "learning_rate": 2.8247328822320505e-06, + "loss": 0.78558004, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46875, + "step": 6067, + "time_per_iteration": 2.4324355125427246 + }, + { + "auxiliary_loss_clip": 0.01070256, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.01634336, + "balance_loss_mlp": 1.022964, + "epoch": 0.3648278971892379, + "flos": 17747887380480.0, + "grad_norm": 2.5657297006787023, + "language_loss": 0.71200514, + "learning_rate": 2.8243886645513176e-06, + "loss": 0.73300815, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47265625, + "step": 6068, + "time_per_iteration": 2.388589859008789 + }, + { + "auxiliary_loss_clip": 0.01072116, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.01584864, + "balance_loss_mlp": 1.02157402, + "epoch": 0.3648880204419059, + "flos": 17930203833600.0, + "grad_norm": 2.291649456482206, + "language_loss": 0.73609048, + "learning_rate": 2.8240444174502747e-06, + "loss": 0.7571187, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 6069, + "time_per_iteration": 3.752021551132202 + }, + { + "auxiliary_loss_clip": 0.01075793, + "auxiliary_loss_mlp": 0.01028242, + "balance_loss_clip": 1.0126853, + "balance_loss_mlp": 1.02370799, + "epoch": 0.3649481436945739, + "flos": 22637159222400.0, + "grad_norm": 3.2000084945749094, + "language_loss": 0.66391349, + "learning_rate": 2.8237001409412055e-06, + "loss": 0.68495381, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5234375, + "step": 6070, + "time_per_iteration": 2.419123411178589 + }, + { + "auxiliary_loss_clip": 0.01069843, + "auxiliary_loss_mlp": 0.01025939, + "balance_loss_clip": 1.01214683, + "balance_loss_mlp": 1.02234411, + "epoch": 0.36500826694724187, + "flos": 21578592666240.0, + "grad_norm": 1.761209779560282, + "language_loss": 0.74285257, + "learning_rate": 2.8233558350363974e-06, + "loss": 0.76381034, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.47460938, + "step": 6071, + "time_per_iteration": 2.412306785583496 + }, + { + "auxiliary_loss_clip": 0.01070685, + "auxiliary_loss_mlp": 0.01028248, + "balance_loss_clip": 1.01219058, + "balance_loss_mlp": 1.02212548, + "epoch": 0.36506839019990983, + "flos": 13771664081280.0, + "grad_norm": 2.774690388639247, + "language_loss": 0.88472986, + "learning_rate": 2.823011499748137e-06, + "loss": 0.90571928, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.484375, + "step": 6072, + "time_per_iteration": 2.3721706867218018 + }, + { + "auxiliary_loss_clip": 0.01071892, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01757336, + "balance_loss_mlp": 1.0234673, + "epoch": 0.3651285134525778, + "flos": 17274011230080.0, + "grad_norm": 2.1909947639588734, + "language_loss": 0.72709632, + "learning_rate": 2.8226671350887136e-06, + "loss": 0.74814063, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.484375, + "step": 6073, + "time_per_iteration": 2.3870556354522705 + }, + { + "auxiliary_loss_clip": 0.01074635, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.01974654, + "balance_loss_mlp": 1.0242734, + "epoch": 0.36518863670524576, + "flos": 21906915891840.0, + "grad_norm": 2.1040590566084507, + "language_loss": 0.67018306, + "learning_rate": 2.8223227410704163e-06, + "loss": 0.69128215, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.50390625, + "step": 6074, + "time_per_iteration": 2.3765158653259277 + }, + { + "auxiliary_loss_clip": 0.01070799, + "auxiliary_loss_mlp": 0.01028698, + "balance_loss_clip": 1.01332605, + "balance_loss_mlp": 1.0223608, + "epoch": 0.3652487599579137, + "flos": 27121054734720.0, + "grad_norm": 1.4666752069201787, + "language_loss": 0.72824764, + "learning_rate": 2.8219783177055355e-06, + "loss": 0.74924266, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.484375, + "step": 6075, + "time_per_iteration": 2.4290153980255127 + }, + { + "auxiliary_loss_clip": 0.01076606, + "auxiliary_loss_mlp": 0.01033471, + "balance_loss_clip": 1.01736557, + "balance_loss_mlp": 1.02360809, + "epoch": 0.3653088832105817, + "flos": 19754555351040.0, + "grad_norm": 2.235700546025527, + "language_loss": 0.89782155, + "learning_rate": 2.821633865006363e-06, + "loss": 0.91892231, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.52734375, + "step": 6076, + "time_per_iteration": 2.3699605464935303 + }, + { + "auxiliary_loss_clip": 0.01069922, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.01412868, + "balance_loss_mlp": 1.0224843, + "epoch": 0.36536900646324966, + "flos": 13114179757440.0, + "grad_norm": 2.0333809197559445, + "language_loss": 0.69961256, + "learning_rate": 2.8212893829851914e-06, + "loss": 0.72059953, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47460938, + "step": 6077, + "time_per_iteration": 2.3759055137634277 + }, + { + "auxiliary_loss_clip": 0.0101222, + "auxiliary_loss_mlp": 0.01003624, + "balance_loss_clip": 1.00216389, + "balance_loss_mlp": 1.00172603, + "epoch": 0.3654291297159176, + "flos": 71096743048320.0, + "grad_norm": 0.7510107083192209, + "language_loss": 0.61749446, + "learning_rate": 2.8209448716543145e-06, + "loss": 0.63765287, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.10498047, + "step": 6078, + "time_per_iteration": 3.1172690391540527 + }, + { + "auxiliary_loss_clip": 0.01071253, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.01387787, + "balance_loss_mlp": 1.0223639, + "epoch": 0.3654892529685856, + "flos": 23616508170240.0, + "grad_norm": 2.2250549627565275, + "language_loss": 0.78407478, + "learning_rate": 2.8206003310260265e-06, + "loss": 0.80506825, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48828125, + "step": 6079, + "time_per_iteration": 2.3918392658233643 + }, + { + "auxiliary_loss_clip": 0.01076139, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.01537561, + "balance_loss_mlp": 1.02561152, + "epoch": 0.36554937622125355, + "flos": 43469135130240.0, + "grad_norm": 1.7005091387442286, + "language_loss": 0.62789857, + "learning_rate": 2.820255761112624e-06, + "loss": 0.64897001, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.50390625, + "step": 6080, + "time_per_iteration": 2.578517436981201 + }, + { + "auxiliary_loss_clip": 0.0107477, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.01839948, + "balance_loss_mlp": 1.02311301, + "epoch": 0.3656094994739215, + "flos": 23293526382720.0, + "grad_norm": 2.974710498766856, + "language_loss": 0.66998851, + "learning_rate": 2.819911161926403e-06, + "loss": 0.69107664, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.515625, + "step": 6081, + "time_per_iteration": 2.4205496311187744 + }, + { + "auxiliary_loss_clip": 0.01077766, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.01962256, + "balance_loss_mlp": 1.02398562, + "epoch": 0.3656696227265895, + "flos": 24570823806720.0, + "grad_norm": 1.5921363746692543, + "language_loss": 0.74886107, + "learning_rate": 2.8195665334796617e-06, + "loss": 0.76999199, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.53515625, + "step": 6082, + "time_per_iteration": 2.4470021724700928 + }, + { + "auxiliary_loss_clip": 0.01074657, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.01280308, + "balance_loss_mlp": 1.02507901, + "epoch": 0.3657297459792575, + "flos": 27927129271680.0, + "grad_norm": 1.873788943133173, + "language_loss": 0.83015347, + "learning_rate": 2.8192218757846993e-06, + "loss": 0.85117501, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49609375, + "step": 6083, + "time_per_iteration": 2.4452130794525146 + }, + { + "auxiliary_loss_clip": 0.01012076, + "auxiliary_loss_mlp": 0.01001497, + "balance_loss_clip": 1.00010812, + "balance_loss_mlp": 1.00187576, + "epoch": 0.36578986923192547, + "flos": 67389631441920.0, + "grad_norm": 0.8066155189438377, + "language_loss": 0.59282637, + "learning_rate": 2.8188771888538148e-06, + "loss": 0.61296201, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.10205078, + "step": 6084, + "time_per_iteration": 3.1411938667297363 + }, + { + "auxiliary_loss_clip": 0.01072586, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.01979709, + "balance_loss_mlp": 1.02367342, + "epoch": 0.36584999248459343, + "flos": 20226546288000.0, + "grad_norm": 1.8096110485277122, + "language_loss": 0.73080671, + "learning_rate": 2.8185324726993102e-06, + "loss": 0.75188875, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.48828125, + "step": 6085, + "time_per_iteration": 2.3918511867523193 + }, + { + "auxiliary_loss_clip": 0.01075101, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.0192461, + "balance_loss_mlp": 1.02546644, + "epoch": 0.3659101157372614, + "flos": 19061459573760.0, + "grad_norm": 1.7548351969433356, + "language_loss": 0.78040498, + "learning_rate": 2.8181877273334875e-06, + "loss": 0.80148733, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.49609375, + "step": 6086, + "time_per_iteration": 2.3726563453674316 + }, + { + "auxiliary_loss_clip": 0.01069736, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.01596522, + "balance_loss_mlp": 1.02252769, + "epoch": 0.36597023898992936, + "flos": 30809384029440.0, + "grad_norm": 1.9169474830485742, + "language_loss": 0.76484811, + "learning_rate": 2.8178429527686484e-06, + "loss": 0.78584319, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.47265625, + "step": 6087, + "time_per_iteration": 2.4766108989715576 + }, + { + "auxiliary_loss_clip": 0.01073796, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01267409, + "balance_loss_mlp": 1.02283359, + "epoch": 0.36603036224259733, + "flos": 20520759248640.0, + "grad_norm": 4.837126570229189, + "language_loss": 0.70253181, + "learning_rate": 2.817498149017099e-06, + "loss": 0.72354448, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 6088, + "time_per_iteration": 2.374274253845215 + }, + { + "auxiliary_loss_clip": 0.01077826, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.01544309, + "balance_loss_mlp": 1.02386379, + "epoch": 0.3660904854952653, + "flos": 38327790205440.0, + "grad_norm": 1.4893284857481737, + "language_loss": 0.79942602, + "learning_rate": 2.8171533160911432e-06, + "loss": 0.82052696, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5390625, + "step": 6089, + "time_per_iteration": 2.571160316467285 + }, + { + "auxiliary_loss_clip": 0.0107228, + "auxiliary_loss_mlp": 0.01027273, + "balance_loss_clip": 1.01303935, + "balance_loss_mlp": 1.02323604, + "epoch": 0.36615060874793326, + "flos": 21834471087360.0, + "grad_norm": 1.7539765661715723, + "language_loss": 0.71559191, + "learning_rate": 2.8168084540030873e-06, + "loss": 0.73658746, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.49023438, + "step": 6090, + "time_per_iteration": 2.3875107765197754 + }, + { + "auxiliary_loss_clip": 0.01069674, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.01669431, + "balance_loss_mlp": 1.02335072, + "epoch": 0.3662107320006012, + "flos": 16580601250560.0, + "grad_norm": 1.6736934618082873, + "language_loss": 0.74514467, + "learning_rate": 2.8164635627652394e-06, + "loss": 0.7661382, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4609375, + "step": 6091, + "time_per_iteration": 2.438204050064087 + }, + { + "auxiliary_loss_clip": 0.01073348, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.01793921, + "balance_loss_mlp": 1.02414513, + "epoch": 0.3662708552532692, + "flos": 20957348200320.0, + "grad_norm": 1.7218782317398558, + "language_loss": 0.72412252, + "learning_rate": 2.8161186423899067e-06, + "loss": 0.74517834, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4921875, + "step": 6092, + "time_per_iteration": 2.3788232803344727 + }, + { + "auxiliary_loss_clip": 0.01074102, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.01696229, + "balance_loss_mlp": 1.02433026, + "epoch": 0.36633097850593715, + "flos": 21901783921920.0, + "grad_norm": 3.503077265508304, + "language_loss": 0.78127027, + "learning_rate": 2.8157736928893995e-06, + "loss": 0.80233216, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.49804688, + "step": 6093, + "time_per_iteration": 2.3988356590270996 + }, + { + "auxiliary_loss_clip": 0.01073091, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.0170908, + "balance_loss_mlp": 1.0216434, + "epoch": 0.3663911017586051, + "flos": 32852745705600.0, + "grad_norm": 2.905177954514504, + "language_loss": 0.74240935, + "learning_rate": 2.815428714276027e-06, + "loss": 0.76346028, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.515625, + "step": 6094, + "time_per_iteration": 2.4882853031158447 + }, + { + "auxiliary_loss_clip": 0.01076868, + "auxiliary_loss_mlp": 0.01033887, + "balance_loss_clip": 1.01922989, + "balance_loss_mlp": 1.02538097, + "epoch": 0.3664512250112731, + "flos": 27270517731840.0, + "grad_norm": 1.6249883141313535, + "language_loss": 0.79396409, + "learning_rate": 2.8150837065621016e-06, + "loss": 0.8150717, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.515625, + "step": 6095, + "time_per_iteration": 2.4629571437835693 + }, + { + "auxiliary_loss_clip": 0.01073984, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.01143146, + "balance_loss_mlp": 1.02212191, + "epoch": 0.3665113482639411, + "flos": 17783498833920.0, + "grad_norm": 2.5555783459106873, + "language_loss": 0.73315299, + "learning_rate": 2.8147386697599346e-06, + "loss": 0.75417387, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.515625, + "step": 6096, + "time_per_iteration": 2.3589084148406982 + }, + { + "auxiliary_loss_clip": 0.01073089, + "auxiliary_loss_mlp": 0.0102737, + "balance_loss_clip": 1.0123142, + "balance_loss_mlp": 1.02268195, + "epoch": 0.36657147151660907, + "flos": 27853392746880.0, + "grad_norm": 1.7622378318724488, + "language_loss": 0.66725016, + "learning_rate": 2.8143936038818412e-06, + "loss": 0.68825483, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.50390625, + "step": 6097, + "time_per_iteration": 2.444540023803711 + }, + { + "auxiliary_loss_clip": 0.01074246, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.02024198, + "balance_loss_mlp": 1.02339411, + "epoch": 0.36663159476927704, + "flos": 25372848625920.0, + "grad_norm": 1.5661748626378365, + "language_loss": 0.7748847, + "learning_rate": 2.8140485089401344e-06, + "loss": 0.79597843, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.5078125, + "step": 6098, + "time_per_iteration": 2.4213931560516357 + }, + { + "auxiliary_loss_clip": 0.01070718, + "auxiliary_loss_mlp": 0.01027452, + "balance_loss_clip": 1.01336694, + "balance_loss_mlp": 1.02303529, + "epoch": 0.366691718021945, + "flos": 21356265928320.0, + "grad_norm": 1.6360923787194308, + "language_loss": 0.73556, + "learning_rate": 2.8137033849471305e-06, + "loss": 0.75654173, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4765625, + "step": 6099, + "time_per_iteration": 2.388598918914795 + }, + { + "auxiliary_loss_clip": 0.01068536, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.01951194, + "balance_loss_mlp": 1.02262831, + "epoch": 0.36675184127461297, + "flos": 16799436852480.0, + "grad_norm": 1.8494578564905462, + "language_loss": 0.84355438, + "learning_rate": 2.8133582319151456e-06, + "loss": 0.86457634, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45898438, + "step": 6100, + "time_per_iteration": 3.8065085411071777 + }, + { + "auxiliary_loss_clip": 0.01074379, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01202273, + "balance_loss_mlp": 1.02243161, + "epoch": 0.36681196452728093, + "flos": 21905484526080.0, + "grad_norm": 2.4890628118262144, + "language_loss": 0.70205688, + "learning_rate": 2.8130130498564975e-06, + "loss": 0.72306836, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.51953125, + "step": 6101, + "time_per_iteration": 2.4087295532226562 + }, + { + "auxiliary_loss_clip": 0.01073664, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.01923847, + "balance_loss_mlp": 1.02254891, + "epoch": 0.3668720877799489, + "flos": 17711472965760.0, + "grad_norm": 2.367555584335422, + "language_loss": 0.76163924, + "learning_rate": 2.8126678387835057e-06, + "loss": 0.78272951, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.51171875, + "step": 6102, + "time_per_iteration": 2.3563108444213867 + }, + { + "auxiliary_loss_clip": 0.01076192, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.01512623, + "balance_loss_mlp": 1.02343702, + "epoch": 0.36693221103261686, + "flos": 47043717615360.0, + "grad_norm": 1.7124634897630733, + "language_loss": 0.70331347, + "learning_rate": 2.812322598708489e-06, + "loss": 0.72439349, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.52734375, + "step": 6103, + "time_per_iteration": 4.036387920379639 + }, + { + "auxiliary_loss_clip": 0.01074232, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.01553893, + "balance_loss_mlp": 1.02312946, + "epoch": 0.3669923342852848, + "flos": 15960020100480.0, + "grad_norm": 1.9806521522904572, + "language_loss": 0.71845764, + "learning_rate": 2.811977329643768e-06, + "loss": 0.73950016, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.51171875, + "step": 6104, + "time_per_iteration": 2.397360324859619 + }, + { + "auxiliary_loss_clip": 0.0107291, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.01295865, + "balance_loss_mlp": 1.02321506, + "epoch": 0.3670524575379528, + "flos": 19973460775680.0, + "grad_norm": 1.728501736126496, + "language_loss": 0.81408119, + "learning_rate": 2.8116320316016646e-06, + "loss": 0.83509409, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49609375, + "step": 6105, + "time_per_iteration": 3.898510217666626 + }, + { + "auxiliary_loss_clip": 0.01078725, + "auxiliary_loss_mlp": 0.01035895, + "balance_loss_clip": 1.01938462, + "balance_loss_mlp": 1.02527511, + "epoch": 0.36711258079062076, + "flos": 25701765344640.0, + "grad_norm": 1.6666451576869006, + "language_loss": 0.8094269, + "learning_rate": 2.8112867045945016e-06, + "loss": 0.83057308, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.53125, + "step": 6106, + "time_per_iteration": 2.4523539543151855 + }, + { + "auxiliary_loss_clip": 0.01011356, + "auxiliary_loss_mlp": 0.01009177, + "balance_loss_clip": 1.00791371, + "balance_loss_mlp": 1.00122011, + "epoch": 0.3671727040432887, + "flos": 60769364791680.0, + "grad_norm": 0.6889625019557507, + "language_loss": 0.59163457, + "learning_rate": 2.8109413486346044e-06, + "loss": 0.61183989, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.1015625, + "step": 6107, + "time_per_iteration": 3.0986409187316895 + }, + { + "auxiliary_loss_clip": 0.01072713, + "auxiliary_loss_mlp": 0.01026048, + "balance_loss_clip": 1.0108968, + "balance_loss_mlp": 1.02281106, + "epoch": 0.3672328272959567, + "flos": 18660307518720.0, + "grad_norm": 1.4899114665702824, + "language_loss": 0.74680829, + "learning_rate": 2.810595963734295e-06, + "loss": 0.76779592, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5, + "step": 6108, + "time_per_iteration": 3.8320603370666504 + }, + { + "auxiliary_loss_clip": 0.01073165, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.01732183, + "balance_loss_mlp": 1.02256417, + "epoch": 0.3672929505486247, + "flos": 15048158544000.0, + "grad_norm": 2.2105412007391614, + "language_loss": 0.73425055, + "learning_rate": 2.810250549905901e-06, + "loss": 0.75530934, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5078125, + "step": 6109, + "time_per_iteration": 2.378941774368286 + }, + { + "auxiliary_loss_clip": 0.01072392, + "auxiliary_loss_mlp": 0.01027169, + "balance_loss_clip": 1.01346624, + "balance_loss_mlp": 1.02250171, + "epoch": 0.3673530738012927, + "flos": 20588456108160.0, + "grad_norm": 2.1505767009493573, + "language_loss": 0.52575916, + "learning_rate": 2.80990510716175e-06, + "loss": 0.54675484, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.5, + "step": 6110, + "time_per_iteration": 2.4200210571289062 + }, + { + "auxiliary_loss_clip": 0.01073563, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.01585174, + "balance_loss_mlp": 1.02452254, + "epoch": 0.36741319705396064, + "flos": 21688743605760.0, + "grad_norm": 1.530504211503779, + "language_loss": 0.80748588, + "learning_rate": 2.8095596355141676e-06, + "loss": 0.82852477, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49023438, + "step": 6111, + "time_per_iteration": 2.420271635055542 + }, + { + "auxiliary_loss_clip": 0.0107125, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.0179708, + "balance_loss_mlp": 1.02336311, + "epoch": 0.3674733203066286, + "flos": 29860898590080.0, + "grad_norm": 1.4739418271489957, + "language_loss": 0.72328079, + "learning_rate": 2.809214134975485e-06, + "loss": 0.74432081, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47851562, + "step": 6112, + "time_per_iteration": 2.4564170837402344 + }, + { + "auxiliary_loss_clip": 0.01073792, + "auxiliary_loss_mlp": 0.01038512, + "balance_loss_clip": 1.02386117, + "balance_loss_mlp": 1.02396154, + "epoch": 0.36753344355929657, + "flos": 18256118175360.0, + "grad_norm": 1.527052847752993, + "language_loss": 0.85310948, + "learning_rate": 2.8088686055580315e-06, + "loss": 0.87423253, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49804688, + "step": 6113, + "time_per_iteration": 2.3928356170654297 + }, + { + "auxiliary_loss_clip": 0.01075114, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.01707113, + "balance_loss_mlp": 1.02370787, + "epoch": 0.36759356681196453, + "flos": 25299984885120.0, + "grad_norm": 1.7608819048036308, + "language_loss": 0.78340703, + "learning_rate": 2.8085230472741377e-06, + "loss": 0.80448067, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.515625, + "step": 6114, + "time_per_iteration": 2.4111688137054443 + }, + { + "auxiliary_loss_clip": 0.010784, + "auxiliary_loss_mlp": 0.0103524, + "balance_loss_clip": 1.01853895, + "balance_loss_mlp": 1.02454901, + "epoch": 0.3676536900646325, + "flos": 21031887686400.0, + "grad_norm": 1.763906217716276, + "language_loss": 0.67075121, + "learning_rate": 2.808177460136137e-06, + "loss": 0.69188762, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5390625, + "step": 6115, + "time_per_iteration": 2.3944694995880127 + }, + { + "auxiliary_loss_clip": 0.01071051, + "auxiliary_loss_mlp": 0.01026193, + "balance_loss_clip": 1.01204896, + "balance_loss_mlp": 1.02275336, + "epoch": 0.36771381331730046, + "flos": 16287610187520.0, + "grad_norm": 2.6289992092190957, + "language_loss": 0.76909393, + "learning_rate": 2.807831844156361e-06, + "loss": 0.79006636, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48242188, + "step": 6116, + "time_per_iteration": 2.3609845638275146 + }, + { + "auxiliary_loss_clip": 0.01071148, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.01534724, + "balance_loss_mlp": 1.02222228, + "epoch": 0.36777393656996843, + "flos": 22308870908160.0, + "grad_norm": 1.967186657552637, + "language_loss": 0.63121545, + "learning_rate": 2.8074861993471444e-06, + "loss": 0.65221786, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48828125, + "step": 6117, + "time_per_iteration": 2.448568820953369 + }, + { + "auxiliary_loss_clip": 0.01071838, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.01490819, + "balance_loss_mlp": 1.02300096, + "epoch": 0.3678340598226364, + "flos": 26832846528000.0, + "grad_norm": 2.381143549490389, + "language_loss": 0.73201048, + "learning_rate": 2.807140525720822e-06, + "loss": 0.75302595, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.48828125, + "step": 6118, + "time_per_iteration": 2.4490222930908203 + }, + { + "auxiliary_loss_clip": 0.01077614, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.01834214, + "balance_loss_mlp": 1.02380753, + "epoch": 0.36789418307530436, + "flos": 21760664739840.0, + "grad_norm": 1.8290684607763328, + "language_loss": 0.72595912, + "learning_rate": 2.8067948232897314e-06, + "loss": 0.74709117, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.5390625, + "step": 6119, + "time_per_iteration": 2.434501886367798 + }, + { + "auxiliary_loss_clip": 0.0107327, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.01558518, + "balance_loss_mlp": 1.02385104, + "epoch": 0.3679543063279723, + "flos": 15923291483520.0, + "grad_norm": 1.7799372833722993, + "language_loss": 0.80388439, + "learning_rate": 2.806449092066209e-06, + "loss": 0.82491863, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.49414062, + "step": 6120, + "time_per_iteration": 2.3701846599578857 + }, + { + "auxiliary_loss_clip": 0.01072336, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.0213995, + "balance_loss_mlp": 1.02289307, + "epoch": 0.3680144295806403, + "flos": 24274516164480.0, + "grad_norm": 1.9158094248540187, + "language_loss": 0.64188147, + "learning_rate": 2.8061033320625923e-06, + "loss": 0.66296947, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49609375, + "step": 6121, + "time_per_iteration": 2.4181020259857178 + }, + { + "auxiliary_loss_clip": 0.01077199, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.01809859, + "balance_loss_mlp": 1.02500939, + "epoch": 0.36807455283330826, + "flos": 26102952311040.0, + "grad_norm": 3.524007778996557, + "language_loss": 0.71480983, + "learning_rate": 2.8057575432912215e-06, + "loss": 0.73591709, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5234375, + "step": 6122, + "time_per_iteration": 2.4268269538879395 + }, + { + "auxiliary_loss_clip": 0.01071565, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.01360464, + "balance_loss_mlp": 1.02429342, + "epoch": 0.3681346760859763, + "flos": 24643827192960.0, + "grad_norm": 1.9234544111295393, + "language_loss": 0.69032305, + "learning_rate": 2.805411725764436e-06, + "loss": 0.71132362, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.47265625, + "step": 6123, + "time_per_iteration": 2.4417355060577393 + }, + { + "auxiliary_loss_clip": 0.01077047, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.01446366, + "balance_loss_mlp": 1.02417839, + "epoch": 0.36819479933864424, + "flos": 23877239270400.0, + "grad_norm": 2.033338858680913, + "language_loss": 0.73455763, + "learning_rate": 2.805065879494579e-06, + "loss": 0.75563937, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.52734375, + "step": 6124, + "time_per_iteration": 2.4074692726135254 + }, + { + "auxiliary_loss_clip": 0.01073393, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02058899, + "balance_loss_mlp": 1.02219164, + "epoch": 0.3682549225913122, + "flos": 25552895840640.0, + "grad_norm": 2.353104072639115, + "language_loss": 0.7406553, + "learning_rate": 2.804720004493991e-06, + "loss": 0.76175153, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51171875, + "step": 6125, + "time_per_iteration": 2.4441373348236084 + }, + { + "auxiliary_loss_clip": 0.01075917, + "auxiliary_loss_mlp": 0.01035083, + "balance_loss_clip": 1.0186677, + "balance_loss_mlp": 1.02454197, + "epoch": 0.36831504584398017, + "flos": 16945653093120.0, + "grad_norm": 1.7670871319194332, + "language_loss": 0.78394169, + "learning_rate": 2.804374100775016e-06, + "loss": 0.80505168, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.51171875, + "step": 6126, + "time_per_iteration": 2.352067470550537 + }, + { + "auxiliary_loss_clip": 0.01075314, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.01406729, + "balance_loss_mlp": 1.02231216, + "epoch": 0.36837516909664814, + "flos": 19864042974720.0, + "grad_norm": 2.2242182044548526, + "language_loss": 0.66127962, + "learning_rate": 2.8040281683499985e-06, + "loss": 0.68234676, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.53125, + "step": 6127, + "time_per_iteration": 2.408078193664551 + }, + { + "auxiliary_loss_clip": 0.01078459, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.01140451, + "balance_loss_mlp": 1.02582002, + "epoch": 0.3684352923493161, + "flos": 37625652385920.0, + "grad_norm": 1.7119671063003425, + "language_loss": 0.70323122, + "learning_rate": 2.8036822072312835e-06, + "loss": 0.72429293, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.52734375, + "step": 6128, + "time_per_iteration": 2.5249085426330566 + }, + { + "auxiliary_loss_clip": 0.01076101, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.01847959, + "balance_loss_mlp": 1.02550697, + "epoch": 0.36849541560198407, + "flos": 14464620213120.0, + "grad_norm": 1.723113560882796, + "language_loss": 0.75043875, + "learning_rate": 2.803336217431218e-06, + "loss": 0.77153438, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 6129, + "time_per_iteration": 2.3905277252197266 + }, + { + "auxiliary_loss_clip": 0.01073848, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.01796436, + "balance_loss_mlp": 1.02329254, + "epoch": 0.36855553885465203, + "flos": 25769706583680.0, + "grad_norm": 1.5522875672930927, + "language_loss": 0.84871697, + "learning_rate": 2.80299019896215e-06, + "loss": 0.86978114, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.50390625, + "step": 6130, + "time_per_iteration": 2.446664333343506 + }, + { + "auxiliary_loss_clip": 0.01012405, + "auxiliary_loss_mlp": 0.01001117, + "balance_loss_clip": 0.9997645, + "balance_loss_mlp": 1.00186753, + "epoch": 0.36861566210732, + "flos": 65045701071360.0, + "grad_norm": 0.809886112055028, + "language_loss": 0.60249758, + "learning_rate": 2.8026441518364262e-06, + "loss": 0.62263286, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.0135498, + "router_z_loss_mlp": 0.10546875, + "step": 6131, + "time_per_iteration": 3.138420343399048 + }, + { + "auxiliary_loss_clip": 0.01070932, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.01451361, + "balance_loss_mlp": 1.0219245, + "epoch": 0.36867578535998796, + "flos": 30953226297600.0, + "grad_norm": 1.5012132493405446, + "language_loss": 0.72874516, + "learning_rate": 2.8022980760663977e-06, + "loss": 0.74974728, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.4921875, + "step": 6132, + "time_per_iteration": 2.466062545776367 + }, + { + "auxiliary_loss_clip": 0.01076173, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.01473641, + "balance_loss_mlp": 1.02380884, + "epoch": 0.3687359086126559, + "flos": 28836756501120.0, + "grad_norm": 1.768903603637774, + "language_loss": 0.74087232, + "learning_rate": 2.8019519716644147e-06, + "loss": 0.76194292, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5234375, + "step": 6133, + "time_per_iteration": 2.467952013015747 + }, + { + "auxiliary_loss_clip": 0.0107123, + "auxiliary_loss_mlp": 0.01032985, + "balance_loss_clip": 1.01807165, + "balance_loss_mlp": 1.02409494, + "epoch": 0.3687960318653239, + "flos": 21395752542720.0, + "grad_norm": 2.7257354337482638, + "language_loss": 0.71565998, + "learning_rate": 2.801605838642829e-06, + "loss": 0.73670214, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.47265625, + "step": 6134, + "time_per_iteration": 2.389812707901001 + }, + { + "auxiliary_loss_clip": 0.01073092, + "auxiliary_loss_mlp": 0.01028475, + "balance_loss_clip": 1.01295376, + "balance_loss_mlp": 1.02352655, + "epoch": 0.36885615511799186, + "flos": 20265020472960.0, + "grad_norm": 1.632198933011989, + "language_loss": 0.75171113, + "learning_rate": 2.8012596770139933e-06, + "loss": 0.77272677, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.49609375, + "step": 6135, + "time_per_iteration": 2.402489185333252 + }, + { + "auxiliary_loss_clip": 0.01012098, + "auxiliary_loss_mlp": 0.00997505, + "balance_loss_clip": 0.99619401, + "balance_loss_mlp": 1.00174928, + "epoch": 0.3689162783706599, + "flos": 63085922789760.0, + "grad_norm": 0.8170817831202992, + "language_loss": 0.58822632, + "learning_rate": 2.80091348679026e-06, + "loss": 0.60832238, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.10351562, + "step": 6136, + "time_per_iteration": 2.981640338897705 + }, + { + "auxiliary_loss_clip": 0.01072818, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.01381636, + "balance_loss_mlp": 1.0235889, + "epoch": 0.36897640162332784, + "flos": 10961225723520.0, + "grad_norm": 1.8049396648927767, + "language_loss": 0.79098797, + "learning_rate": 2.800567267983985e-06, + "loss": 0.8120048, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4921875, + "step": 6137, + "time_per_iteration": 2.38376522064209 + }, + { + "auxiliary_loss_clip": 0.01075635, + "auxiliary_loss_mlp": 0.01040549, + "balance_loss_clip": 1.02424133, + "balance_loss_mlp": 1.02535903, + "epoch": 0.3690365248759958, + "flos": 20703250258560.0, + "grad_norm": 1.7896905927516755, + "language_loss": 0.7387743, + "learning_rate": 2.8002210206075233e-06, + "loss": 0.75993609, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5, + "step": 6138, + "time_per_iteration": 2.426119327545166 + }, + { + "auxiliary_loss_clip": 0.01077474, + "auxiliary_loss_mlp": 0.01032328, + "balance_loss_clip": 1.01556754, + "balance_loss_mlp": 1.02376294, + "epoch": 0.3690966481286638, + "flos": 31825182303360.0, + "grad_norm": 1.7406841382641254, + "language_loss": 0.65257591, + "learning_rate": 2.7998747446732315e-06, + "loss": 0.67367387, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.53515625, + "step": 6139, + "time_per_iteration": 3.9093217849731445 + }, + { + "auxiliary_loss_clip": 0.01070738, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.01935506, + "balance_loss_mlp": 1.02215028, + "epoch": 0.36915677138133174, + "flos": 13114109934720.0, + "grad_norm": 2.0513219928312214, + "language_loss": 0.71410334, + "learning_rate": 2.7995284401934677e-06, + "loss": 0.73515773, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.484375, + "step": 6140, + "time_per_iteration": 2.3560144901275635 + }, + { + "auxiliary_loss_clip": 0.01011733, + "auxiliary_loss_mlp": 0.01012969, + "balance_loss_clip": 1.01162171, + "balance_loss_mlp": 1.00160146, + "epoch": 0.3692168946339997, + "flos": 68683372116480.0, + "grad_norm": 0.7434864879686168, + "language_loss": 0.59351194, + "learning_rate": 2.7991821071805906e-06, + "loss": 0.61375892, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.1015625, + "step": 6141, + "time_per_iteration": 3.1399033069610596 + }, + { + "auxiliary_loss_clip": 0.01072174, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.01775515, + "balance_loss_mlp": 1.02185011, + "epoch": 0.36927701788666767, + "flos": 22016787540480.0, + "grad_norm": 1.6896237242923156, + "language_loss": 0.75097811, + "learning_rate": 2.7988357456469605e-06, + "loss": 0.77202976, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.50390625, + "step": 6142, + "time_per_iteration": 2.39654803276062 + }, + { + "auxiliary_loss_clip": 0.01072031, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.01261878, + "balance_loss_mlp": 1.02329087, + "epoch": 0.36933714113933563, + "flos": 21834505998720.0, + "grad_norm": 2.1855094497804997, + "language_loss": 0.76777643, + "learning_rate": 2.7984893556049365e-06, + "loss": 0.78877193, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.48828125, + "step": 6143, + "time_per_iteration": 3.908581018447876 + }, + { + "auxiliary_loss_clip": 0.01071934, + "auxiliary_loss_mlp": 0.01027748, + "balance_loss_clip": 1.01339555, + "balance_loss_mlp": 1.02334738, + "epoch": 0.3693972643920036, + "flos": 23690698542720.0, + "grad_norm": 1.5964966895470476, + "language_loss": 0.82052958, + "learning_rate": 2.7981429370668815e-06, + "loss": 0.84152639, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48632812, + "step": 6144, + "time_per_iteration": 3.936103343963623 + }, + { + "auxiliary_loss_clip": 0.01072841, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.01730764, + "balance_loss_mlp": 1.0217663, + "epoch": 0.36945738764467156, + "flos": 22855645710720.0, + "grad_norm": 2.649624064880404, + "language_loss": 0.77396673, + "learning_rate": 2.797796490045158e-06, + "loss": 0.79501355, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.51171875, + "step": 6145, + "time_per_iteration": 2.4452884197235107 + }, + { + "auxiliary_loss_clip": 0.0107695, + "auxiliary_loss_mlp": 0.0102642, + "balance_loss_clip": 1.01095879, + "balance_loss_mlp": 1.02534926, + "epoch": 0.36951751089733953, + "flos": 16615060629120.0, + "grad_norm": 2.019876258482959, + "language_loss": 0.76790512, + "learning_rate": 2.7974500145521304e-06, + "loss": 0.78893888, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.515625, + "step": 6146, + "time_per_iteration": 2.370117664337158 + }, + { + "auxiliary_loss_clip": 0.01074658, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.01977897, + "balance_loss_mlp": 1.02355146, + "epoch": 0.3695776341500075, + "flos": 18913602499200.0, + "grad_norm": 1.495732302996305, + "language_loss": 0.80735236, + "learning_rate": 2.7971035106001636e-06, + "loss": 0.82845902, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.51171875, + "step": 6147, + "time_per_iteration": 3.8307507038116455 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.01432931, + "balance_loss_mlp": 1.02287328, + "epoch": 0.36963775740267546, + "flos": 20807571000960.0, + "grad_norm": 1.788172256452552, + "language_loss": 0.83178854, + "learning_rate": 2.796756978201622e-06, + "loss": 0.85281712, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5078125, + "step": 6148, + "time_per_iteration": 2.429959535598755 + }, + { + "auxiliary_loss_clip": 0.01071161, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01699615, + "balance_loss_mlp": 1.02326953, + "epoch": 0.3696978806553435, + "flos": 26060847344640.0, + "grad_norm": 2.394569391033449, + "language_loss": 0.73839736, + "learning_rate": 2.7964104173688735e-06, + "loss": 0.75943446, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.47851562, + "step": 6149, + "time_per_iteration": 2.4272594451904297 + }, + { + "auxiliary_loss_clip": 0.01074895, + "auxiliary_loss_mlp": 0.01036226, + "balance_loss_clip": 1.01898813, + "balance_loss_mlp": 1.02445626, + "epoch": 0.36975800390801145, + "flos": 26832706882560.0, + "grad_norm": 2.2923998020270555, + "language_loss": 0.70792317, + "learning_rate": 2.796063828114286e-06, + "loss": 0.72903436, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.50390625, + "step": 6150, + "time_per_iteration": 2.439673662185669 + }, + { + "auxiliary_loss_clip": 0.01073981, + "auxiliary_loss_mlp": 0.01037773, + "balance_loss_clip": 1.02262712, + "balance_loss_mlp": 1.02396321, + "epoch": 0.3698181271606794, + "flos": 21141549866880.0, + "grad_norm": 1.5323359610565865, + "language_loss": 0.8186698, + "learning_rate": 2.795717210450228e-06, + "loss": 0.8397873, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.5, + "step": 6151, + "time_per_iteration": 2.443286657333374 + }, + { + "auxiliary_loss_clip": 0.01013235, + "auxiliary_loss_mlp": 0.01001615, + "balance_loss_clip": 1.00013053, + "balance_loss_mlp": 1.0029459, + "epoch": 0.3698782504133474, + "flos": 66739478503680.0, + "grad_norm": 0.7747616968948969, + "language_loss": 0.63107443, + "learning_rate": 2.7953705643890705e-06, + "loss": 0.65122294, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.10253906, + "step": 6152, + "time_per_iteration": 3.131995439529419 + }, + { + "auxiliary_loss_clip": 0.0107044, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.01950431, + "balance_loss_mlp": 1.02314484, + "epoch": 0.36993837366601534, + "flos": 24310511642880.0, + "grad_norm": 2.1349998952096096, + "language_loss": 0.7014755, + "learning_rate": 2.7950238899431827e-06, + "loss": 0.72251964, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47265625, + "step": 6153, + "time_per_iteration": 2.4402735233306885 + }, + { + "auxiliary_loss_clip": 0.01072548, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.01481295, + "balance_loss_mlp": 1.02292311, + "epoch": 0.3699984969186833, + "flos": 24348147955200.0, + "grad_norm": 1.689951032689083, + "language_loss": 0.72462368, + "learning_rate": 2.7946771871249374e-06, + "loss": 0.74565518, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.49609375, + "step": 6154, + "time_per_iteration": 2.4188525676727295 + }, + { + "auxiliary_loss_clip": 0.01072557, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.01965499, + "balance_loss_mlp": 1.02387547, + "epoch": 0.37005862017135127, + "flos": 19828117319040.0, + "grad_norm": 1.6489161345322185, + "language_loss": 0.67670542, + "learning_rate": 2.794330455946707e-06, + "loss": 0.69776672, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48632812, + "step": 6155, + "time_per_iteration": 2.3896164894104004 + }, + { + "auxiliary_loss_clip": 0.01072978, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01282251, + "balance_loss_mlp": 1.02416444, + "epoch": 0.37011874342401924, + "flos": 19572762568320.0, + "grad_norm": 1.726609775642434, + "language_loss": 0.6764698, + "learning_rate": 2.7939836964208665e-06, + "loss": 0.69747233, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48632812, + "step": 6156, + "time_per_iteration": 2.384624481201172 + }, + { + "auxiliary_loss_clip": 0.0107055, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.01825714, + "balance_loss_mlp": 1.02271199, + "epoch": 0.3701788666766872, + "flos": 20373356021760.0, + "grad_norm": 1.6829890275997992, + "language_loss": 0.82059813, + "learning_rate": 2.7936369085597895e-06, + "loss": 0.8416248, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4765625, + "step": 6157, + "time_per_iteration": 2.3858392238616943 + }, + { + "auxiliary_loss_clip": 0.01075272, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.02302527, + "epoch": 0.37023898992935517, + "flos": 15340032443520.0, + "grad_norm": 2.2596027278673816, + "language_loss": 0.76417756, + "learning_rate": 2.793290092375853e-06, + "loss": 0.78528368, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.5234375, + "step": 6158, + "time_per_iteration": 2.3714630603790283 + }, + { + "auxiliary_loss_clip": 0.01074523, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.01333809, + "balance_loss_mlp": 1.02275419, + "epoch": 0.37029911318202313, + "flos": 19572902213760.0, + "grad_norm": 2.1155851363161347, + "language_loss": 0.69438392, + "learning_rate": 2.7929432478814346e-06, + "loss": 0.71541995, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.51953125, + "step": 6159, + "time_per_iteration": 2.3774023056030273 + }, + { + "auxiliary_loss_clip": 0.01069322, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.01967525, + "balance_loss_mlp": 1.02098835, + "epoch": 0.3703592364346911, + "flos": 26212160643840.0, + "grad_norm": 2.283030665095097, + "language_loss": 0.70785308, + "learning_rate": 2.7925963750889108e-06, + "loss": 0.72888285, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.484375, + "step": 6160, + "time_per_iteration": 2.4150218963623047 + }, + { + "auxiliary_loss_clip": 0.01067627, + "auxiliary_loss_mlp": 0.01025522, + "balance_loss_clip": 1.01220083, + "balance_loss_mlp": 1.02091527, + "epoch": 0.37041935968735906, + "flos": 20047267123200.0, + "grad_norm": 1.5429641045666378, + "language_loss": 0.79158479, + "learning_rate": 2.792249474010661e-06, + "loss": 0.81251633, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46875, + "step": 6161, + "time_per_iteration": 2.4009366035461426 + }, + { + "auxiliary_loss_clip": 0.01072354, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.01769161, + "balance_loss_mlp": 1.02363896, + "epoch": 0.3704794829400271, + "flos": 24132663843840.0, + "grad_norm": 1.6410845084206767, + "language_loss": 0.7925939, + "learning_rate": 2.791902544659065e-06, + "loss": 0.81365436, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.48632812, + "step": 6162, + "time_per_iteration": 2.418328046798706 + }, + { + "auxiliary_loss_clip": 0.01073634, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02153099, + "balance_loss_mlp": 1.02438009, + "epoch": 0.37053960619269505, + "flos": 14865981736320.0, + "grad_norm": 1.7897845932250385, + "language_loss": 0.79305756, + "learning_rate": 2.7915555870465047e-06, + "loss": 0.81415278, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4921875, + "step": 6163, + "time_per_iteration": 2.374377965927124 + }, + { + "auxiliary_loss_clip": 0.01073365, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.01530254, + "balance_loss_mlp": 1.0234623, + "epoch": 0.370599729445363, + "flos": 21360420380160.0, + "grad_norm": 1.640969722414908, + "language_loss": 0.68600202, + "learning_rate": 2.791208601185362e-06, + "loss": 0.70704174, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49804688, + "step": 6164, + "time_per_iteration": 2.399181365966797 + }, + { + "auxiliary_loss_clip": 0.01077313, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.01463354, + "balance_loss_mlp": 1.02592254, + "epoch": 0.370659852698031, + "flos": 26827958937600.0, + "grad_norm": 2.197070402237643, + "language_loss": 0.82759511, + "learning_rate": 2.7908615870880185e-06, + "loss": 0.84867883, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.515625, + "step": 6165, + "time_per_iteration": 2.458951234817505 + }, + { + "auxiliary_loss_clip": 0.01075641, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01656222, + "balance_loss_mlp": 1.02415347, + "epoch": 0.37071997595069894, + "flos": 19098013633920.0, + "grad_norm": 1.9676734329284329, + "language_loss": 0.6925658, + "learning_rate": 2.7905145447668605e-06, + "loss": 0.71365392, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.515625, + "step": 6166, + "time_per_iteration": 2.410658597946167 + }, + { + "auxiliary_loss_clip": 0.01012471, + "auxiliary_loss_mlp": 0.01002587, + "balance_loss_clip": 1.00098932, + "balance_loss_mlp": 1.00274086, + "epoch": 0.3707800992033669, + "flos": 52175809163520.0, + "grad_norm": 0.7916298208244803, + "language_loss": 0.56858432, + "learning_rate": 2.790167474234271e-06, + "loss": 0.58873487, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.09716797, + "step": 6167, + "time_per_iteration": 2.9531548023223877 + }, + { + "auxiliary_loss_clip": 0.01071433, + "auxiliary_loss_mlp": 0.01024003, + "balance_loss_clip": 1.01077676, + "balance_loss_mlp": 1.02367759, + "epoch": 0.3708402224560349, + "flos": 19900806503040.0, + "grad_norm": 1.8341806717633984, + "language_loss": 0.74896836, + "learning_rate": 2.7898203755026377e-06, + "loss": 0.76992279, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4765625, + "step": 6168, + "time_per_iteration": 2.458956003189087 + }, + { + "auxiliary_loss_clip": 0.01073177, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.0148499, + "balance_loss_mlp": 1.02347374, + "epoch": 0.37090034570870284, + "flos": 20006698256640.0, + "grad_norm": 1.6362399849231193, + "language_loss": 0.73621279, + "learning_rate": 2.7894732485843465e-06, + "loss": 0.75724137, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49609375, + "step": 6169, + "time_per_iteration": 2.4200477600097656 + }, + { + "auxiliary_loss_clip": 0.01070128, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.01733732, + "balance_loss_mlp": 1.02295423, + "epoch": 0.3709604689613708, + "flos": 24133536627840.0, + "grad_norm": 2.3200258407583605, + "language_loss": 0.79219866, + "learning_rate": 2.7891260934917854e-06, + "loss": 0.81321013, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47265625, + "step": 6170, + "time_per_iteration": 2.410937547683716 + }, + { + "auxiliary_loss_clip": 0.01075555, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.01871562, + "balance_loss_mlp": 1.02447486, + "epoch": 0.37102059221403877, + "flos": 23875004943360.0, + "grad_norm": 1.7120475613100137, + "language_loss": 0.76155788, + "learning_rate": 2.7887789102373444e-06, + "loss": 0.78265667, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51171875, + "step": 6171, + "time_per_iteration": 2.4557530879974365 + }, + { + "auxiliary_loss_clip": 0.01073001, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.01421499, + "balance_loss_mlp": 1.02395082, + "epoch": 0.37108071546670673, + "flos": 14500406223360.0, + "grad_norm": 2.0934543148346725, + "language_loss": 0.80144608, + "learning_rate": 2.7884316988334125e-06, + "loss": 0.822469, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49023438, + "step": 6172, + "time_per_iteration": 2.373897075653076 + }, + { + "auxiliary_loss_clip": 0.01073653, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.02107489, + "balance_loss_mlp": 1.02205753, + "epoch": 0.3711408387193747, + "flos": 34561360465920.0, + "grad_norm": 1.6175478360151696, + "language_loss": 0.5934695, + "learning_rate": 2.7880844592923815e-06, + "loss": 0.6145736, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.515625, + "step": 6173, + "time_per_iteration": 2.538891315460205 + }, + { + "auxiliary_loss_clip": 0.0107288, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.01729608, + "balance_loss_mlp": 1.02335238, + "epoch": 0.37120096197204266, + "flos": 17309762328960.0, + "grad_norm": 1.7995129521863482, + "language_loss": 0.8051486, + "learning_rate": 2.787737191626644e-06, + "loss": 0.82620013, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49609375, + "step": 6174, + "time_per_iteration": 2.3775181770324707 + }, + { + "auxiliary_loss_clip": 0.01069493, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.01350641, + "balance_loss_mlp": 1.02220082, + "epoch": 0.37126108522471063, + "flos": 30662748852480.0, + "grad_norm": 8.105359517295618, + "language_loss": 0.80235422, + "learning_rate": 2.787389895848591e-06, + "loss": 0.82332814, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47265625, + "step": 6175, + "time_per_iteration": 2.4975883960723877 + }, + { + "auxiliary_loss_clip": 0.01074522, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.02551639, + "balance_loss_mlp": 1.02468562, + "epoch": 0.37132120847737865, + "flos": 25154466871680.0, + "grad_norm": 1.646359475074023, + "language_loss": 0.88616079, + "learning_rate": 2.78704257197062e-06, + "loss": 0.90731406, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49804688, + "step": 6176, + "time_per_iteration": 2.419347047805786 + }, + { + "auxiliary_loss_clip": 0.01074123, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.02093101, + "balance_loss_mlp": 1.02398229, + "epoch": 0.3713813317300466, + "flos": 21212458571520.0, + "grad_norm": 1.5523897519054592, + "language_loss": 0.73386168, + "learning_rate": 2.7866952200051224e-06, + "loss": 0.754951, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.5, + "step": 6177, + "time_per_iteration": 2.4250547885894775 + }, + { + "auxiliary_loss_clip": 0.01071404, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.02051044, + "balance_loss_mlp": 1.02271843, + "epoch": 0.3714414549827146, + "flos": 21615565662720.0, + "grad_norm": 1.7272531332666148, + "language_loss": 0.75224185, + "learning_rate": 2.7863478399644973e-06, + "loss": 0.77331245, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.48632812, + "step": 6178, + "time_per_iteration": 2.4231603145599365 + }, + { + "auxiliary_loss_clip": 0.0107545, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.02227855, + "balance_loss_mlp": 1.02573061, + "epoch": 0.37150157823538255, + "flos": 19971331182720.0, + "grad_norm": 1.6729144938403504, + "language_loss": 0.71863556, + "learning_rate": 2.786000431861139e-06, + "loss": 0.73975694, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49804688, + "step": 6179, + "time_per_iteration": 3.814878463745117 + }, + { + "auxiliary_loss_clip": 0.01074192, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.018255, + "balance_loss_mlp": 1.02353489, + "epoch": 0.3715617014880505, + "flos": 24859485861120.0, + "grad_norm": 1.6387236846173039, + "language_loss": 0.70228338, + "learning_rate": 2.7856529957074484e-06, + "loss": 0.7233547, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5078125, + "step": 6180, + "time_per_iteration": 2.4519412517547607 + }, + { + "auxiliary_loss_clip": 0.01070058, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.01530373, + "balance_loss_mlp": 1.02216136, + "epoch": 0.3716218247407185, + "flos": 20448035153280.0, + "grad_norm": 4.263361426471605, + "language_loss": 0.77865577, + "learning_rate": 2.7853055315158233e-06, + "loss": 0.79964721, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.47851562, + "step": 6181, + "time_per_iteration": 2.4347891807556152 + }, + { + "auxiliary_loss_clip": 0.01070912, + "auxiliary_loss_mlp": 0.01031494, + "balance_loss_clip": 1.01698649, + "balance_loss_mlp": 1.02277517, + "epoch": 0.37168194799338644, + "flos": 24132349641600.0, + "grad_norm": 2.0063929499321476, + "language_loss": 0.78351223, + "learning_rate": 2.7849580392986633e-06, + "loss": 0.80453634, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 6182, + "time_per_iteration": 3.87495756149292 + }, + { + "auxiliary_loss_clip": 0.01012116, + "auxiliary_loss_mlp": 0.01015715, + "balance_loss_clip": 1.01415372, + "balance_loss_mlp": 1.00223398, + "epoch": 0.3717420712460544, + "flos": 67405481199360.0, + "grad_norm": 0.7888858361685097, + "language_loss": 0.57507634, + "learning_rate": 2.7846105190683705e-06, + "loss": 0.59535468, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.09863281, + "step": 6183, + "time_per_iteration": 3.0429883003234863 + }, + { + "auxiliary_loss_clip": 0.01074568, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.01740015, + "balance_loss_mlp": 1.02184033, + "epoch": 0.37180219449872237, + "flos": 22375974274560.0, + "grad_norm": 3.168425218040862, + "language_loss": 0.76708633, + "learning_rate": 2.7842629708373466e-06, + "loss": 0.78816789, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.52734375, + "step": 6184, + "time_per_iteration": 3.7743980884552 + }, + { + "auxiliary_loss_clip": 0.0107153, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01381278, + "balance_loss_mlp": 1.0239737, + "epoch": 0.37186231775139034, + "flos": 21868860643200.0, + "grad_norm": 1.880182044908937, + "language_loss": 0.72111183, + "learning_rate": 2.7839153946179943e-06, + "loss": 0.7421149, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4765625, + "step": 6185, + "time_per_iteration": 2.4437339305877686 + }, + { + "auxiliary_loss_clip": 0.01072195, + "auxiliary_loss_mlp": 0.01020814, + "balance_loss_clip": 1.00696802, + "balance_loss_mlp": 1.02383614, + "epoch": 0.3719224410040583, + "flos": 22414238991360.0, + "grad_norm": 1.716124519664065, + "language_loss": 0.77161324, + "learning_rate": 2.783567790422718e-06, + "loss": 0.79254329, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48242188, + "step": 6186, + "time_per_iteration": 2.3991236686706543 + }, + { + "auxiliary_loss_clip": 0.01076559, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.016765, + "balance_loss_mlp": 1.02423143, + "epoch": 0.37198256425672627, + "flos": 25150172774400.0, + "grad_norm": 1.6683382242907194, + "language_loss": 0.83254975, + "learning_rate": 2.7832201582639227e-06, + "loss": 0.85364223, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5234375, + "step": 6187, + "time_per_iteration": 3.8323800563812256 + }, + { + "auxiliary_loss_clip": 0.01072231, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.02053714, + "balance_loss_mlp": 1.02339578, + "epoch": 0.37204268750939423, + "flos": 21137360503680.0, + "grad_norm": 2.4403374394653894, + "language_loss": 0.84715891, + "learning_rate": 2.782872498154015e-06, + "loss": 0.8682276, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48828125, + "step": 6188, + "time_per_iteration": 2.4364564418792725 + }, + { + "auxiliary_loss_clip": 0.01073667, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.01457644, + "balance_loss_mlp": 1.02422071, + "epoch": 0.37210281076206225, + "flos": 21505763836800.0, + "grad_norm": 1.6357768777067616, + "language_loss": 0.65399086, + "learning_rate": 2.782524810105401e-06, + "loss": 0.6750226, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.49414062, + "step": 6189, + "time_per_iteration": 2.435692548751831 + }, + { + "auxiliary_loss_clip": 0.01076552, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.01834989, + "balance_loss_mlp": 1.0265429, + "epoch": 0.3721629340147302, + "flos": 17346874970880.0, + "grad_norm": 1.7951459138673946, + "language_loss": 0.83917522, + "learning_rate": 2.78217709413049e-06, + "loss": 0.86027175, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5, + "step": 6190, + "time_per_iteration": 2.3571572303771973 + }, + { + "auxiliary_loss_clip": 0.01074819, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.01549053, + "balance_loss_mlp": 1.02322471, + "epoch": 0.3722230572673982, + "flos": 16431557189760.0, + "grad_norm": 2.362273291297379, + "language_loss": 0.87694013, + "learning_rate": 2.781829350241691e-06, + "loss": 0.89799178, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.515625, + "step": 6191, + "time_per_iteration": 2.3645830154418945 + }, + { + "auxiliary_loss_clip": 0.01073861, + "auxiliary_loss_mlp": 0.01031657, + "balance_loss_clip": 1.01458609, + "balance_loss_mlp": 1.02215326, + "epoch": 0.37228318052006615, + "flos": 22673608548480.0, + "grad_norm": 1.7807218420149122, + "language_loss": 0.73598015, + "learning_rate": 2.7814815784514125e-06, + "loss": 0.75703537, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.515625, + "step": 6192, + "time_per_iteration": 2.40130352973938 + }, + { + "auxiliary_loss_clip": 0.01070789, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.02030671, + "balance_loss_mlp": 1.02252173, + "epoch": 0.3723433037727341, + "flos": 25264303608960.0, + "grad_norm": 2.00977431482234, + "language_loss": 0.79427123, + "learning_rate": 2.7811337787720674e-06, + "loss": 0.81532073, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48242188, + "step": 6193, + "time_per_iteration": 2.4418416023254395 + }, + { + "auxiliary_loss_clip": 0.01073343, + "auxiliary_loss_mlp": 0.01034885, + "balance_loss_clip": 1.02055621, + "balance_loss_mlp": 1.02290344, + "epoch": 0.3724034270254021, + "flos": 10523903633280.0, + "grad_norm": 1.7143225133463045, + "language_loss": 0.84252727, + "learning_rate": 2.7807859512160663e-06, + "loss": 0.86360955, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.50390625, + "step": 6194, + "time_per_iteration": 2.358292818069458 + }, + { + "auxiliary_loss_clip": 0.0107146, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.02523792, + "balance_loss_mlp": 1.02197599, + "epoch": 0.37246355027807004, + "flos": 20265195029760.0, + "grad_norm": 2.2598525844751607, + "language_loss": 0.77382863, + "learning_rate": 2.7804380957958238e-06, + "loss": 0.79493612, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49414062, + "step": 6195, + "time_per_iteration": 2.4039194583892822 + }, + { + "auxiliary_loss_clip": 0.010717, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.01815605, + "balance_loss_mlp": 1.02313817, + "epoch": 0.372523673530738, + "flos": 19499549713920.0, + "grad_norm": 1.4917777586955658, + "language_loss": 0.78952169, + "learning_rate": 2.780090212523753e-06, + "loss": 0.81057477, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.48632812, + "step": 6196, + "time_per_iteration": 2.3927886486053467 + }, + { + "auxiliary_loss_clip": 0.01071872, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.01787806, + "balance_loss_mlp": 1.02293777, + "epoch": 0.372583796783406, + "flos": 16763301728640.0, + "grad_norm": 1.9642784373385083, + "language_loss": 0.82945043, + "learning_rate": 2.779742301412269e-06, + "loss": 0.85048997, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48828125, + "step": 6197, + "time_per_iteration": 2.562487840652466 + }, + { + "auxiliary_loss_clip": 0.01070065, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.01587713, + "balance_loss_mlp": 1.0225631, + "epoch": 0.37264392003607394, + "flos": 22636879931520.0, + "grad_norm": 1.5517358097417642, + "language_loss": 0.64610058, + "learning_rate": 2.7793943624737884e-06, + "loss": 0.6671105, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.47460938, + "step": 6198, + "time_per_iteration": 2.4047329425811768 + }, + { + "auxiliary_loss_clip": 0.0106942, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02477741, + "balance_loss_mlp": 1.02189946, + "epoch": 0.3727040432887419, + "flos": 19972134144000.0, + "grad_norm": 1.5870133253278214, + "language_loss": 0.73951387, + "learning_rate": 2.7790463957207275e-06, + "loss": 0.76059961, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4765625, + "step": 6199, + "time_per_iteration": 2.5048775672912598 + }, + { + "auxiliary_loss_clip": 0.01069121, + "auxiliary_loss_mlp": 0.01024381, + "balance_loss_clip": 1.01048076, + "balance_loss_mlp": 1.02145684, + "epoch": 0.37276416654140987, + "flos": 63896991517440.0, + "grad_norm": 1.7664831402604675, + "language_loss": 0.80773234, + "learning_rate": 2.7786984011655045e-06, + "loss": 0.82866734, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4765625, + "step": 6200, + "time_per_iteration": 2.8172550201416016 + }, + { + "auxiliary_loss_clip": 0.01072431, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.01771438, + "balance_loss_mlp": 1.02333105, + "epoch": 0.37282428979407783, + "flos": 39784401705600.0, + "grad_norm": 1.9718792875855191, + "language_loss": 0.60570848, + "learning_rate": 2.7783503788205383e-06, + "loss": 0.62675607, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49023438, + "step": 6201, + "time_per_iteration": 2.592897415161133 + }, + { + "auxiliary_loss_clip": 0.01074313, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.01644909, + "balance_loss_mlp": 1.02458799, + "epoch": 0.37288441304674586, + "flos": 22707998104320.0, + "grad_norm": 1.7946640034925792, + "language_loss": 0.73126602, + "learning_rate": 2.7780023286982502e-06, + "loss": 0.75232196, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49609375, + "step": 6202, + "time_per_iteration": 2.426520824432373 + }, + { + "auxiliary_loss_clip": 0.01071296, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.01842213, + "balance_loss_mlp": 1.02342236, + "epoch": 0.3729445362994138, + "flos": 18769306383360.0, + "grad_norm": 1.905286330760698, + "language_loss": 0.73945296, + "learning_rate": 2.77765425081106e-06, + "loss": 0.76049066, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47851562, + "step": 6203, + "time_per_iteration": 2.429581642150879 + }, + { + "auxiliary_loss_clip": 0.01069211, + "auxiliary_loss_mlp": 0.01026896, + "balance_loss_clip": 1.01419401, + "balance_loss_mlp": 1.0224328, + "epoch": 0.3730046595520818, + "flos": 22455087148800.0, + "grad_norm": 1.6515603762382831, + "language_loss": 0.8148706, + "learning_rate": 2.7773061451713893e-06, + "loss": 0.83583176, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.46679688, + "step": 6204, + "time_per_iteration": 2.428969144821167 + }, + { + "auxiliary_loss_clip": 0.01073931, + "auxiliary_loss_mlp": 0.01035608, + "balance_loss_clip": 1.0206176, + "balance_loss_mlp": 1.02339745, + "epoch": 0.37306478280474975, + "flos": 24315224676480.0, + "grad_norm": 1.952317168419333, + "language_loss": 0.75136554, + "learning_rate": 2.776958011791662e-06, + "loss": 0.772461, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 6205, + "time_per_iteration": 2.4917941093444824 + }, + { + "auxiliary_loss_clip": 0.01070959, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.02197933, + "balance_loss_mlp": 1.02249265, + "epoch": 0.3731249060574177, + "flos": 15814257707520.0, + "grad_norm": 2.0624817279397627, + "language_loss": 0.7796436, + "learning_rate": 2.776609850684302e-06, + "loss": 0.80072081, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.484375, + "step": 6206, + "time_per_iteration": 2.3968253135681152 + }, + { + "auxiliary_loss_clip": 0.01071396, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.01899123, + "balance_loss_mlp": 1.02178884, + "epoch": 0.3731850293100857, + "flos": 19827069978240.0, + "grad_norm": 2.0639849207425702, + "language_loss": 0.93374777, + "learning_rate": 2.7762616618617346e-06, + "loss": 0.95480537, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49609375, + "step": 6207, + "time_per_iteration": 2.437605381011963 + }, + { + "auxiliary_loss_clip": 0.01073339, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01380968, + "balance_loss_mlp": 1.02353597, + "epoch": 0.37324515256275365, + "flos": 19061354839680.0, + "grad_norm": 2.021518420268413, + "language_loss": 0.82872462, + "learning_rate": 2.7759134453363847e-06, + "loss": 0.84974027, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49804688, + "step": 6208, + "time_per_iteration": 2.428740978240967 + }, + { + "auxiliary_loss_clip": 0.0107438, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.01751077, + "balance_loss_mlp": 1.02334714, + "epoch": 0.3733052758154216, + "flos": 20703285169920.0, + "grad_norm": 1.9384499984513741, + "language_loss": 0.73320979, + "learning_rate": 2.7755652011206798e-06, + "loss": 0.75428545, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5078125, + "step": 6209, + "time_per_iteration": 2.4449281692504883 + }, + { + "auxiliary_loss_clip": 0.01072479, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01419723, + "balance_loss_mlp": 1.0235002, + "epoch": 0.3733653990680896, + "flos": 20192470934400.0, + "grad_norm": 2.684803602523182, + "language_loss": 0.70224679, + "learning_rate": 2.7752169292270485e-06, + "loss": 0.72326779, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.48828125, + "step": 6210, + "time_per_iteration": 2.4383599758148193 + }, + { + "auxiliary_loss_clip": 0.01073842, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.01439178, + "balance_loss_mlp": 1.02291262, + "epoch": 0.37342552232075754, + "flos": 20338617352320.0, + "grad_norm": 1.5257492057054218, + "language_loss": 0.77836812, + "learning_rate": 2.7748686296679184e-06, + "loss": 0.79940307, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 6211, + "time_per_iteration": 2.4451894760131836 + }, + { + "auxiliary_loss_clip": 0.01073375, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.02191854, + "balance_loss_mlp": 1.0232141, + "epoch": 0.3734856455734255, + "flos": 35516409240960.0, + "grad_norm": 1.6208511725525019, + "language_loss": 0.73387438, + "learning_rate": 2.7745203024557207e-06, + "loss": 0.75497729, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5, + "step": 6212, + "time_per_iteration": 2.5342884063720703 + }, + { + "auxiliary_loss_clip": 0.0107942, + "auxiliary_loss_mlp": 0.01045293, + "balance_loss_clip": 1.02907455, + "balance_loss_mlp": 1.02517772, + "epoch": 0.37354576882609347, + "flos": 21141235664640.0, + "grad_norm": 2.6233294817691717, + "language_loss": 0.821738, + "learning_rate": 2.7741719476028855e-06, + "loss": 0.84298515, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.54296875, + "step": 6213, + "time_per_iteration": 2.439648151397705 + }, + { + "auxiliary_loss_clip": 0.01075524, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.02059782, + "balance_loss_mlp": 1.02497196, + "epoch": 0.37360589207876144, + "flos": 21505728925440.0, + "grad_norm": 2.459126521009793, + "language_loss": 0.78650653, + "learning_rate": 2.773823565121844e-06, + "loss": 0.80762172, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5078125, + "step": 6214, + "time_per_iteration": 2.4146270751953125 + }, + { + "auxiliary_loss_clip": 0.010713, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.02341223, + "balance_loss_mlp": 1.02261221, + "epoch": 0.37366601533142946, + "flos": 38434275452160.0, + "grad_norm": 1.6021105794750954, + "language_loss": 0.84825546, + "learning_rate": 2.7734751550250306e-06, + "loss": 0.86934805, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.48632812, + "step": 6215, + "time_per_iteration": 2.5981287956237793 + }, + { + "auxiliary_loss_clip": 0.01074024, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.0203898, + "balance_loss_mlp": 1.02218866, + "epoch": 0.3737261385840974, + "flos": 18440215107840.0, + "grad_norm": 1.6171487134210691, + "language_loss": 0.68683094, + "learning_rate": 2.773126717324879e-06, + "loss": 0.70793933, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.515625, + "step": 6216, + "time_per_iteration": 2.409013509750366 + }, + { + "auxiliary_loss_clip": 0.01074612, + "auxiliary_loss_mlp": 0.01031845, + "balance_loss_clip": 1.01682472, + "balance_loss_mlp": 1.02353728, + "epoch": 0.3737862618367654, + "flos": 22928753831040.0, + "grad_norm": 2.794683952874038, + "language_loss": 0.65873545, + "learning_rate": 2.7727782520338227e-06, + "loss": 0.67980003, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 6217, + "time_per_iteration": 2.4490344524383545 + }, + { + "auxiliary_loss_clip": 0.01074285, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.01288247, + "balance_loss_mlp": 1.02340353, + "epoch": 0.37384638508943335, + "flos": 15408881377920.0, + "grad_norm": 1.8113159017269491, + "language_loss": 0.80279231, + "learning_rate": 2.772429759164299e-06, + "loss": 0.82381666, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 6218, + "time_per_iteration": 3.7649877071380615 + }, + { + "auxiliary_loss_clip": 0.01069091, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.01571488, + "balance_loss_mlp": 1.02250803, + "epoch": 0.3739065083421013, + "flos": 24279648134400.0, + "grad_norm": 1.8279393161731508, + "language_loss": 0.7868017, + "learning_rate": 2.7720812387287444e-06, + "loss": 0.80778825, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 6219, + "time_per_iteration": 2.487558126449585 + }, + { + "auxiliary_loss_clip": 0.0107171, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.01756525, + "balance_loss_mlp": 1.02380347, + "epoch": 0.3739666315947693, + "flos": 23001722305920.0, + "grad_norm": 2.4528825866951887, + "language_loss": 0.76458478, + "learning_rate": 2.771732690739596e-06, + "loss": 0.78563643, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.47851562, + "step": 6220, + "time_per_iteration": 2.4168243408203125 + }, + { + "auxiliary_loss_clip": 0.01073589, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.01837349, + "balance_loss_mlp": 1.02267933, + "epoch": 0.37402675484743725, + "flos": 19390097001600.0, + "grad_norm": 1.5716476723891935, + "language_loss": 0.82563639, + "learning_rate": 2.771384115209293e-06, + "loss": 0.8467111, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5078125, + "step": 6221, + "time_per_iteration": 2.4402990341186523 + }, + { + "auxiliary_loss_clip": 0.01073214, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.02105641, + "balance_loss_mlp": 1.0236305, + "epoch": 0.3740868781001052, + "flos": 17125281371520.0, + "grad_norm": 1.7713849061384412, + "language_loss": 0.76439321, + "learning_rate": 2.771035512150275e-06, + "loss": 0.78548568, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.49609375, + "step": 6222, + "time_per_iteration": 3.8430683612823486 + }, + { + "auxiliary_loss_clip": 0.0107383, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.01489902, + "balance_loss_mlp": 1.02368045, + "epoch": 0.3741470013527732, + "flos": 20042589000960.0, + "grad_norm": 1.6175378588570886, + "language_loss": 0.70520711, + "learning_rate": 2.770686881574983e-06, + "loss": 0.72625136, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5, + "step": 6223, + "time_per_iteration": 3.9344568252563477 + }, + { + "auxiliary_loss_clip": 0.01073837, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.01515806, + "balance_loss_mlp": 1.02440631, + "epoch": 0.37420712460544114, + "flos": 36895967637120.0, + "grad_norm": 1.8608568225668625, + "language_loss": 0.68456465, + "learning_rate": 2.770338223495859e-06, + "loss": 0.70560867, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49414062, + "step": 6224, + "time_per_iteration": 2.5350565910339355 + }, + { + "auxiliary_loss_clip": 0.0107091, + "auxiliary_loss_mlp": 0.01031762, + "balance_loss_clip": 1.01719451, + "balance_loss_mlp": 1.02378225, + "epoch": 0.3742672478581091, + "flos": 22200081511680.0, + "grad_norm": 1.75345015961603, + "language_loss": 0.70153582, + "learning_rate": 2.7699895379253447e-06, + "loss": 0.72256255, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.47070312, + "step": 6225, + "time_per_iteration": 2.4380321502685547 + }, + { + "auxiliary_loss_clip": 0.01071065, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.01430035, + "balance_loss_mlp": 1.02285171, + "epoch": 0.3743273711107771, + "flos": 24680381253120.0, + "grad_norm": 2.474053064681131, + "language_loss": 0.78778261, + "learning_rate": 2.7696408248758846e-06, + "loss": 0.80878878, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.48242188, + "step": 6226, + "time_per_iteration": 2.4330577850341797 + }, + { + "auxiliary_loss_clip": 0.01072624, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.01613379, + "balance_loss_mlp": 1.02253902, + "epoch": 0.37438749436344504, + "flos": 24458543274240.0, + "grad_norm": 1.8687216456812434, + "language_loss": 0.68062204, + "learning_rate": 2.7692920843599238e-06, + "loss": 0.7016654, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5, + "step": 6227, + "time_per_iteration": 3.9569149017333984 + }, + { + "auxiliary_loss_clip": 0.01072357, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.01247859, + "balance_loss_mlp": 1.02367711, + "epoch": 0.374447617616113, + "flos": 21797672647680.0, + "grad_norm": 1.5764803169889852, + "language_loss": 0.82882261, + "learning_rate": 2.7689433163899073e-06, + "loss": 0.84981292, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48632812, + "step": 6228, + "time_per_iteration": 2.3952555656433105 + }, + { + "auxiliary_loss_clip": 0.01071655, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.02030945, + "balance_loss_mlp": 1.02389169, + "epoch": 0.374507740868781, + "flos": 17967211741440.0, + "grad_norm": 1.4758673450195732, + "language_loss": 0.74699509, + "learning_rate": 2.7685945209782816e-06, + "loss": 0.76806861, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4765625, + "step": 6229, + "time_per_iteration": 2.408698558807373 + }, + { + "auxiliary_loss_clip": 0.0107123, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.01397026, + "balance_loss_mlp": 1.02218044, + "epoch": 0.374567864121449, + "flos": 16104944620800.0, + "grad_norm": 1.8773641338396432, + "language_loss": 0.8345238, + "learning_rate": 2.7682456981374946e-06, + "loss": 0.85553372, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.48828125, + "step": 6230, + "time_per_iteration": 2.366354465484619 + }, + { + "auxiliary_loss_clip": 0.01073039, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.01930881, + "balance_loss_mlp": 1.02375996, + "epoch": 0.37462798737411696, + "flos": 25772045644800.0, + "grad_norm": 2.4945950179972467, + "language_loss": 0.72984105, + "learning_rate": 2.7678968478799943e-06, + "loss": 0.75092417, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.4921875, + "step": 6231, + "time_per_iteration": 2.485710620880127 + }, + { + "auxiliary_loss_clip": 0.01075258, + "auxiliary_loss_mlp": 0.01031794, + "balance_loss_clip": 1.01655364, + "balance_loss_mlp": 1.02421534, + "epoch": 0.3746881106267849, + "flos": 16653569725440.0, + "grad_norm": 4.6471805573035505, + "language_loss": 0.73937887, + "learning_rate": 2.767547970218231e-06, + "loss": 0.76044941, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.51171875, + "step": 6232, + "time_per_iteration": 2.3835694789886475 + }, + { + "auxiliary_loss_clip": 0.0107136, + "auxiliary_loss_mlp": 0.01027617, + "balance_loss_clip": 1.01240623, + "balance_loss_mlp": 1.02195716, + "epoch": 0.3747482338794529, + "flos": 26176758658560.0, + "grad_norm": 1.6102416345045005, + "language_loss": 0.77547932, + "learning_rate": 2.767199065164655e-06, + "loss": 0.79646909, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49414062, + "step": 6233, + "time_per_iteration": 2.4585635662078857 + }, + { + "auxiliary_loss_clip": 0.01072705, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01750541, + "balance_loss_mlp": 1.02270055, + "epoch": 0.37480835713212085, + "flos": 12020246127360.0, + "grad_norm": 1.7627136175414742, + "language_loss": 0.69532347, + "learning_rate": 2.7668501327317184e-06, + "loss": 0.71637094, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5, + "step": 6234, + "time_per_iteration": 2.3981785774230957 + }, + { + "auxiliary_loss_clip": 0.01071549, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.01517558, + "balance_loss_mlp": 1.02282214, + "epoch": 0.3748684803847888, + "flos": 19678340119680.0, + "grad_norm": 1.9495729458196984, + "language_loss": 0.82885414, + "learning_rate": 2.7665011729318727e-06, + "loss": 0.84985918, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48828125, + "step": 6235, + "time_per_iteration": 2.409662961959839 + }, + { + "auxiliary_loss_clip": 0.01075161, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.01858401, + "balance_loss_mlp": 1.02454495, + "epoch": 0.3749286036374568, + "flos": 20520165755520.0, + "grad_norm": 1.9546016377829813, + "language_loss": 0.78228092, + "learning_rate": 2.7661521857775715e-06, + "loss": 0.80336571, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5078125, + "step": 6236, + "time_per_iteration": 2.4079718589782715 + }, + { + "auxiliary_loss_clip": 0.01074648, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.01854134, + "balance_loss_mlp": 1.0232482, + "epoch": 0.37498872689012475, + "flos": 20703564460800.0, + "grad_norm": 3.050010797417045, + "language_loss": 0.73828101, + "learning_rate": 2.76580317128127e-06, + "loss": 0.75938118, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.515625, + "step": 6237, + "time_per_iteration": 2.4097652435302734 + }, + { + "auxiliary_loss_clip": 0.01073537, + "auxiliary_loss_mlp": 0.01028589, + "balance_loss_clip": 1.0125674, + "balance_loss_mlp": 1.02277398, + "epoch": 0.3750488501427927, + "flos": 21573914544000.0, + "grad_norm": 2.0114703892423718, + "language_loss": 0.93086565, + "learning_rate": 2.765454129455423e-06, + "loss": 0.95188695, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5078125, + "step": 6238, + "time_per_iteration": 2.401618480682373 + }, + { + "auxiliary_loss_clip": 0.01072132, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.0143919, + "balance_loss_mlp": 1.02237415, + "epoch": 0.3751089733954607, + "flos": 15922977281280.0, + "grad_norm": 2.053237020034645, + "language_loss": 0.71100038, + "learning_rate": 2.765105060312487e-06, + "loss": 0.73201525, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.49804688, + "step": 6239, + "time_per_iteration": 2.4259181022644043 + }, + { + "auxiliary_loss_clip": 0.01077219, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.01528382, + "balance_loss_mlp": 1.025388, + "epoch": 0.37516909664812864, + "flos": 36283136808960.0, + "grad_norm": 1.5102689779306429, + "language_loss": 0.65338063, + "learning_rate": 2.76475596386492e-06, + "loss": 0.674456, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.515625, + "step": 6240, + "time_per_iteration": 2.530618667602539 + }, + { + "auxiliary_loss_clip": 0.01073841, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.01401806, + "balance_loss_mlp": 1.02319646, + "epoch": 0.3752292199007966, + "flos": 13515087432960.0, + "grad_norm": 1.6989651921009004, + "language_loss": 0.75595129, + "learning_rate": 2.764406840125179e-06, + "loss": 0.77697843, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.50390625, + "step": 6241, + "time_per_iteration": 2.4018259048461914 + }, + { + "auxiliary_loss_clip": 0.01074721, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.0213958, + "balance_loss_mlp": 1.02353191, + "epoch": 0.3752893431534646, + "flos": 27196885941120.0, + "grad_norm": 2.089112001114548, + "language_loss": 0.84214926, + "learning_rate": 2.7640576891057246e-06, + "loss": 0.86328304, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.51171875, + "step": 6242, + "time_per_iteration": 2.448578357696533 + }, + { + "auxiliary_loss_clip": 0.01074038, + "auxiliary_loss_mlp": 0.01035701, + "balance_loss_clip": 1.02131283, + "balance_loss_mlp": 1.02355623, + "epoch": 0.3753494664061326, + "flos": 30006381692160.0, + "grad_norm": 1.7973205343506828, + "language_loss": 0.68096197, + "learning_rate": 2.763708510819017e-06, + "loss": 0.70205939, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.50390625, + "step": 6243, + "time_per_iteration": 2.481720209121704 + }, + { + "auxiliary_loss_clip": 0.01073019, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.01986682, + "balance_loss_mlp": 1.02390313, + "epoch": 0.37540958965880056, + "flos": 24460812512640.0, + "grad_norm": 1.995312742969244, + "language_loss": 0.83813632, + "learning_rate": 2.7633593052775174e-06, + "loss": 0.85922945, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.4921875, + "step": 6244, + "time_per_iteration": 2.432147741317749 + }, + { + "auxiliary_loss_clip": 0.01070223, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.0146023, + "balance_loss_mlp": 1.02315938, + "epoch": 0.3754697129114685, + "flos": 16507458218880.0, + "grad_norm": 2.526467172015851, + "language_loss": 0.83259434, + "learning_rate": 2.763010072493687e-06, + "loss": 0.853585, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46875, + "step": 6245, + "time_per_iteration": 2.388734817504883 + }, + { + "auxiliary_loss_clip": 0.01072486, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.02055764, + "balance_loss_mlp": 1.02326655, + "epoch": 0.3755298361641365, + "flos": 19389887533440.0, + "grad_norm": 4.46910041998433, + "language_loss": 0.63639015, + "learning_rate": 2.76266081247999e-06, + "loss": 0.65747261, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.4921875, + "step": 6246, + "time_per_iteration": 2.402914047241211 + }, + { + "auxiliary_loss_clip": 0.01074434, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.01923394, + "balance_loss_mlp": 1.02348852, + "epoch": 0.37558995941680445, + "flos": 14719521116160.0, + "grad_norm": 1.6823006376077325, + "language_loss": 0.74856913, + "learning_rate": 2.7623115252488905e-06, + "loss": 0.76967275, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.51171875, + "step": 6247, + "time_per_iteration": 2.3875598907470703 + }, + { + "auxiliary_loss_clip": 0.01074235, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.01266456, + "balance_loss_mlp": 1.02294374, + "epoch": 0.3756500826694724, + "flos": 21688813428480.0, + "grad_norm": 13.28785055875454, + "language_loss": 0.54971582, + "learning_rate": 2.7619622108128534e-06, + "loss": 0.57073557, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.51171875, + "step": 6248, + "time_per_iteration": 2.4299628734588623 + }, + { + "auxiliary_loss_clip": 0.01072113, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01741385, + "balance_loss_mlp": 1.02287054, + "epoch": 0.3757102059221404, + "flos": 26504453479680.0, + "grad_norm": 2.633121587450112, + "language_loss": 0.84367877, + "learning_rate": 2.7616128691843452e-06, + "loss": 0.86472297, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4921875, + "step": 6249, + "time_per_iteration": 2.438007354736328 + }, + { + "auxiliary_loss_clip": 0.01071086, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.01693547, + "balance_loss_mlp": 1.02238822, + "epoch": 0.37577032917480835, + "flos": 37336676129280.0, + "grad_norm": 1.6156659381612262, + "language_loss": 0.67711782, + "learning_rate": 2.761263500375832e-06, + "loss": 0.69814253, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48632812, + "step": 6250, + "time_per_iteration": 2.5422446727752686 + }, + { + "auxiliary_loss_clip": 0.01075228, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.02216983, + "balance_loss_mlp": 1.02463162, + "epoch": 0.3758304524274763, + "flos": 21907509384960.0, + "grad_norm": 2.4290308739049267, + "language_loss": 0.77889025, + "learning_rate": 2.760914104399784e-06, + "loss": 0.80000186, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.50390625, + "step": 6251, + "time_per_iteration": 2.395646095275879 + }, + { + "auxiliary_loss_clip": 0.01073676, + "auxiliary_loss_mlp": 0.01036992, + "balance_loss_clip": 1.02190053, + "balance_loss_mlp": 1.02341557, + "epoch": 0.3758905756801443, + "flos": 36568028436480.0, + "grad_norm": 1.835856414744348, + "language_loss": 0.5781337, + "learning_rate": 2.7605646812686687e-06, + "loss": 0.59924042, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.50390625, + "step": 6252, + "time_per_iteration": 2.5421323776245117 + }, + { + "auxiliary_loss_clip": 0.01074985, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.01913857, + "balance_loss_mlp": 1.02336311, + "epoch": 0.37595069893281224, + "flos": 24527811144960.0, + "grad_norm": 1.7371595917188212, + "language_loss": 0.89704341, + "learning_rate": 2.7602152309949552e-06, + "loss": 0.91813552, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.515625, + "step": 6253, + "time_per_iteration": 2.42207407951355 + }, + { + "auxiliary_loss_clip": 0.010721, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.01974916, + "balance_loss_mlp": 1.02340984, + "epoch": 0.3760108221854802, + "flos": 16434105719040.0, + "grad_norm": 1.754969666250421, + "language_loss": 0.76611519, + "learning_rate": 2.7598657535911166e-06, + "loss": 0.78717726, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48632812, + "step": 6254, + "time_per_iteration": 2.3783488273620605 + }, + { + "auxiliary_loss_clip": 0.01076465, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02150869, + "balance_loss_mlp": 1.02485919, + "epoch": 0.37607094543814823, + "flos": 13770896031360.0, + "grad_norm": 2.2643559385653274, + "language_loss": 0.61856663, + "learning_rate": 2.759516249069623e-06, + "loss": 0.63970739, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.515625, + "step": 6255, + "time_per_iteration": 2.3731582164764404 + }, + { + "auxiliary_loss_clip": 0.01075863, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.01719642, + "balance_loss_mlp": 1.02263391, + "epoch": 0.3761310686908162, + "flos": 19857095614080.0, + "grad_norm": 2.859601823354851, + "language_loss": 0.73764277, + "learning_rate": 2.7591667174429487e-06, + "loss": 0.75873852, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.53125, + "step": 6256, + "time_per_iteration": 2.405026912689209 + }, + { + "auxiliary_loss_clip": 0.01077443, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.02024484, + "balance_loss_mlp": 1.02585077, + "epoch": 0.37619119194348416, + "flos": 12749965787520.0, + "grad_norm": 1.8261874370215634, + "language_loss": 0.70658994, + "learning_rate": 2.758817158723568e-06, + "loss": 0.72772372, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.515625, + "step": 6257, + "time_per_iteration": 3.853148937225342 + }, + { + "auxiliary_loss_clip": 0.01072975, + "auxiliary_loss_mlp": 0.01028035, + "balance_loss_clip": 1.01311588, + "balance_loss_mlp": 1.02380705, + "epoch": 0.3762513151961521, + "flos": 17529575448960.0, + "grad_norm": 2.414141865492626, + "language_loss": 0.84515548, + "learning_rate": 2.7584675729239537e-06, + "loss": 0.86616558, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4921875, + "step": 6258, + "time_per_iteration": 2.411872148513794 + }, + { + "auxiliary_loss_clip": 0.01071784, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.01638472, + "balance_loss_mlp": 1.02268147, + "epoch": 0.3763114384488201, + "flos": 23616438347520.0, + "grad_norm": 1.4496009404322672, + "language_loss": 0.80593866, + "learning_rate": 2.7581179600565833e-06, + "loss": 0.82695937, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4921875, + "step": 6259, + "time_per_iteration": 2.4118783473968506 + }, + { + "auxiliary_loss_clip": 0.0107643, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.01738453, + "balance_loss_mlp": 1.02409172, + "epoch": 0.37637156170148806, + "flos": 25405911550080.0, + "grad_norm": 2.117599986329863, + "language_loss": 0.70945382, + "learning_rate": 2.7577683201339324e-06, + "loss": 0.73055607, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5234375, + "step": 6260, + "time_per_iteration": 2.4490718841552734 + }, + { + "auxiliary_loss_clip": 0.01072644, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.01315427, + "balance_loss_mlp": 1.02171111, + "epoch": 0.376431684954156, + "flos": 23439777534720.0, + "grad_norm": 1.690756104317432, + "language_loss": 0.78478593, + "learning_rate": 2.75741865316848e-06, + "loss": 0.80579889, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5078125, + "step": 6261, + "time_per_iteration": 3.8701090812683105 + }, + { + "auxiliary_loss_clip": 0.01078287, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.01501119, + "balance_loss_mlp": 1.0252254, + "epoch": 0.376491808206824, + "flos": 34203046515840.0, + "grad_norm": 1.7092523183873836, + "language_loss": 0.80223846, + "learning_rate": 2.757068959172704e-06, + "loss": 0.82333195, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.53125, + "step": 6262, + "time_per_iteration": 2.504891872406006 + }, + { + "auxiliary_loss_clip": 0.01070617, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.01481318, + "balance_loss_mlp": 1.02237606, + "epoch": 0.37655193145949195, + "flos": 35184315588480.0, + "grad_norm": 2.4167735123511265, + "language_loss": 0.79974389, + "learning_rate": 2.7567192381590837e-06, + "loss": 0.82074803, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48242188, + "step": 6263, + "time_per_iteration": 3.9764513969421387 + }, + { + "auxiliary_loss_clip": 0.01073939, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.0184176, + "balance_loss_mlp": 1.02370751, + "epoch": 0.3766120547121599, + "flos": 16760962667520.0, + "grad_norm": 2.864044229868821, + "language_loss": 0.79476255, + "learning_rate": 2.756369490140101e-06, + "loss": 0.81583941, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.50390625, + "step": 6264, + "time_per_iteration": 2.3893048763275146 + }, + { + "auxiliary_loss_clip": 0.01070372, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.01699543, + "balance_loss_mlp": 1.02184439, + "epoch": 0.3766721779648279, + "flos": 23549230247040.0, + "grad_norm": 1.7936473927642498, + "language_loss": 0.73654813, + "learning_rate": 2.756019715128236e-06, + "loss": 0.75757939, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.484375, + "step": 6265, + "time_per_iteration": 2.4322702884674072 + }, + { + "auxiliary_loss_clip": 0.01072317, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01732612, + "balance_loss_mlp": 1.02465034, + "epoch": 0.37673230121749585, + "flos": 29128001996160.0, + "grad_norm": 1.6215015501810288, + "language_loss": 0.68630373, + "learning_rate": 2.755669913135973e-06, + "loss": 0.70733112, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4765625, + "step": 6266, + "time_per_iteration": 2.470392942428589 + }, + { + "auxiliary_loss_clip": 0.01075878, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.01776886, + "balance_loss_mlp": 1.02227426, + "epoch": 0.3767924244701638, + "flos": 28145545937280.0, + "grad_norm": 3.0757996640837706, + "language_loss": 0.63183129, + "learning_rate": 2.755320084175794e-06, + "loss": 0.65292561, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.53515625, + "step": 6267, + "time_per_iteration": 3.894496202468872 + }, + { + "auxiliary_loss_clip": 0.01014141, + "auxiliary_loss_mlp": 0.01000666, + "balance_loss_clip": 0.99904507, + "balance_loss_mlp": 1.0044384, + "epoch": 0.37685254772283183, + "flos": 60794153723520.0, + "grad_norm": 0.7232547072605194, + "language_loss": 0.5886519, + "learning_rate": 2.7549702282601847e-06, + "loss": 0.60880005, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.09667969, + "step": 6268, + "time_per_iteration": 3.177900791168213 + }, + { + "auxiliary_loss_clip": 0.01075304, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.01556742, + "balance_loss_mlp": 1.02371275, + "epoch": 0.3769126709754998, + "flos": 26031310467840.0, + "grad_norm": 2.5875969271903445, + "language_loss": 0.65139586, + "learning_rate": 2.7546203454016294e-06, + "loss": 0.67246711, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.515625, + "step": 6269, + "time_per_iteration": 2.4589014053344727 + }, + { + "auxiliary_loss_clip": 0.01074055, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.01830125, + "balance_loss_mlp": 1.02519464, + "epoch": 0.37697279422816776, + "flos": 23578941680640.0, + "grad_norm": 1.7417019894549575, + "language_loss": 0.70729268, + "learning_rate": 2.7542704356126154e-06, + "loss": 0.7283777, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.48828125, + "step": 6270, + "time_per_iteration": 2.4513440132141113 + }, + { + "auxiliary_loss_clip": 0.01012673, + "auxiliary_loss_mlp": 0.01001323, + "balance_loss_clip": 0.99986249, + "balance_loss_mlp": 1.00271273, + "epoch": 0.3770329174808357, + "flos": 64742550802560.0, + "grad_norm": 0.7303118194290956, + "language_loss": 0.56058031, + "learning_rate": 2.7539204989056295e-06, + "loss": 0.58072025, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.09960938, + "step": 6271, + "time_per_iteration": 3.072298288345337 + }, + { + "auxiliary_loss_clip": 0.01069461, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01455879, + "balance_loss_mlp": 1.02062869, + "epoch": 0.3770930407335037, + "flos": 21834226707840.0, + "grad_norm": 6.395244620300128, + "language_loss": 0.7960465, + "learning_rate": 2.753570535293161e-06, + "loss": 0.81704307, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.48828125, + "step": 6272, + "time_per_iteration": 2.4121525287628174 + }, + { + "auxiliary_loss_clip": 0.01070385, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.01643252, + "balance_loss_mlp": 1.02212548, + "epoch": 0.37715316398617166, + "flos": 22746786491520.0, + "grad_norm": 1.5296451727932296, + "language_loss": 0.73967814, + "learning_rate": 2.753220544787698e-06, + "loss": 0.76069564, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48242188, + "step": 6273, + "time_per_iteration": 2.4095168113708496 + }, + { + "auxiliary_loss_clip": 0.01072926, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.01832223, + "balance_loss_mlp": 1.02345395, + "epoch": 0.3772132872388396, + "flos": 18913637410560.0, + "grad_norm": 1.4279827035167507, + "language_loss": 0.7181142, + "learning_rate": 2.7528705274017315e-06, + "loss": 0.73917747, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49414062, + "step": 6274, + "time_per_iteration": 2.4185595512390137 + }, + { + "auxiliary_loss_clip": 0.01074572, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.01608706, + "balance_loss_mlp": 1.0242064, + "epoch": 0.3772734104915076, + "flos": 17345303959680.0, + "grad_norm": 1.5912507120991288, + "language_loss": 0.83663416, + "learning_rate": 2.752520483147752e-06, + "loss": 0.85768199, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.50390625, + "step": 6275, + "time_per_iteration": 2.3772263526916504 + }, + { + "auxiliary_loss_clip": 0.01070714, + "auxiliary_loss_mlp": 0.01024163, + "balance_loss_clip": 1.01109779, + "balance_loss_mlp": 1.02379036, + "epoch": 0.37733353374417555, + "flos": 32341023774720.0, + "grad_norm": 1.9142428283276909, + "language_loss": 0.75319606, + "learning_rate": 2.7521704120382523e-06, + "loss": 0.77414483, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46875, + "step": 6276, + "time_per_iteration": 2.4952826499938965 + }, + { + "auxiliary_loss_clip": 0.01074544, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01324272, + "balance_loss_mlp": 1.02363646, + "epoch": 0.3773936569968435, + "flos": 23359756965120.0, + "grad_norm": 2.3124840352635823, + "language_loss": 0.85441828, + "learning_rate": 2.7518203140857255e-06, + "loss": 0.87545919, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.5078125, + "step": 6277, + "time_per_iteration": 2.402688980102539 + }, + { + "auxiliary_loss_clip": 0.01071467, + "auxiliary_loss_mlp": 0.01024563, + "balance_loss_clip": 1.0106039, + "balance_loss_mlp": 1.02440333, + "epoch": 0.3774537802495115, + "flos": 21465823374720.0, + "grad_norm": 1.7312496003702398, + "language_loss": 0.78453618, + "learning_rate": 2.7514701893026656e-06, + "loss": 0.80549651, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47070312, + "step": 6278, + "time_per_iteration": 2.406852960586548 + }, + { + "auxiliary_loss_clip": 0.01076556, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.01885355, + "balance_loss_mlp": 1.02478933, + "epoch": 0.37751390350217945, + "flos": 24972534443520.0, + "grad_norm": 1.6764196658342847, + "language_loss": 0.83040118, + "learning_rate": 2.751120037701568e-06, + "loss": 0.85151172, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.515625, + "step": 6279, + "time_per_iteration": 2.440063714981079 + }, + { + "auxiliary_loss_clip": 0.01072209, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.01636267, + "balance_loss_mlp": 1.02347875, + "epoch": 0.3775740267548474, + "flos": 27817851116160.0, + "grad_norm": 2.0566915307246942, + "language_loss": 0.7479161, + "learning_rate": 2.7507698592949276e-06, + "loss": 0.76894021, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48828125, + "step": 6280, + "time_per_iteration": 2.4522063732147217 + }, + { + "auxiliary_loss_clip": 0.0107077, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.01828361, + "balance_loss_mlp": 1.02410412, + "epoch": 0.3776341500075154, + "flos": 22564120924800.0, + "grad_norm": 1.4303838571205254, + "language_loss": 0.76384938, + "learning_rate": 2.750419654095243e-06, + "loss": 0.78487211, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.46875, + "step": 6281, + "time_per_iteration": 2.4241533279418945 + }, + { + "auxiliary_loss_clip": 0.01071908, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.01518691, + "balance_loss_mlp": 1.02314448, + "epoch": 0.3776942732601834, + "flos": 23076087235200.0, + "grad_norm": 1.3792810654890517, + "language_loss": 0.84199941, + "learning_rate": 2.75006942211501e-06, + "loss": 0.86301994, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48828125, + "step": 6282, + "time_per_iteration": 2.464942693710327 + }, + { + "auxiliary_loss_clip": 0.01071301, + "auxiliary_loss_mlp": 0.01028181, + "balance_loss_clip": 1.01394749, + "balance_loss_mlp": 1.02381754, + "epoch": 0.37775439651285136, + "flos": 21723377541120.0, + "grad_norm": 1.6139905577321032, + "language_loss": 0.69656861, + "learning_rate": 2.74971916336673e-06, + "loss": 0.71756351, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47460938, + "step": 6283, + "time_per_iteration": 2.400484561920166 + }, + { + "auxiliary_loss_clip": 0.01073656, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.01753283, + "balance_loss_mlp": 1.02443361, + "epoch": 0.37781451976551933, + "flos": 23986622160000.0, + "grad_norm": 1.6480396498006487, + "language_loss": 0.76377159, + "learning_rate": 2.7493688778629012e-06, + "loss": 0.78483856, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.4921875, + "step": 6284, + "time_per_iteration": 2.422675132751465 + }, + { + "auxiliary_loss_clip": 0.01077091, + "auxiliary_loss_mlp": 0.0103539, + "balance_loss_clip": 1.01945162, + "balance_loss_mlp": 1.02470207, + "epoch": 0.3778746430181873, + "flos": 13727324787840.0, + "grad_norm": 2.566403663663605, + "language_loss": 0.85713154, + "learning_rate": 2.7490185656160244e-06, + "loss": 0.87825632, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5234375, + "step": 6285, + "time_per_iteration": 2.3761963844299316 + }, + { + "auxiliary_loss_clip": 0.01074062, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.01485157, + "balance_loss_mlp": 1.02363992, + "epoch": 0.37793476627085526, + "flos": 19459574340480.0, + "grad_norm": 2.094610939693785, + "language_loss": 0.84092492, + "learning_rate": 2.7486682266386025e-06, + "loss": 0.86197972, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.50390625, + "step": 6286, + "time_per_iteration": 2.3991265296936035 + }, + { + "auxiliary_loss_clip": 0.01070327, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.01445818, + "balance_loss_mlp": 1.02181339, + "epoch": 0.3779948895235232, + "flos": 10706254997760.0, + "grad_norm": 1.9934881542143204, + "language_loss": 0.82329166, + "learning_rate": 2.748317860943137e-06, + "loss": 0.84428483, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.484375, + "step": 6287, + "time_per_iteration": 2.378243923187256 + }, + { + "auxiliary_loss_clip": 0.01072337, + "auxiliary_loss_mlp": 0.01033723, + "balance_loss_clip": 1.01854157, + "balance_loss_mlp": 1.02278781, + "epoch": 0.3780550127761912, + "flos": 22308905819520.0, + "grad_norm": 2.230633692956723, + "language_loss": 0.73443872, + "learning_rate": 2.747967468542132e-06, + "loss": 0.75549936, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49609375, + "step": 6288, + "time_per_iteration": 2.41206693649292 + }, + { + "auxiliary_loss_clip": 0.01074087, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.01541138, + "balance_loss_mlp": 1.02432442, + "epoch": 0.37811513602885916, + "flos": 28949351235840.0, + "grad_norm": 1.5667994640064582, + "language_loss": 0.75021863, + "learning_rate": 2.7476170494480915e-06, + "loss": 0.77125275, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.49609375, + "step": 6289, + "time_per_iteration": 2.455432891845703 + }, + { + "auxiliary_loss_clip": 0.01072542, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01656473, + "balance_loss_mlp": 1.02365494, + "epoch": 0.3781752592815271, + "flos": 23111803422720.0, + "grad_norm": 1.979789191022597, + "language_loss": 0.739528, + "learning_rate": 2.7472666036735225e-06, + "loss": 0.76056266, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48828125, + "step": 6290, + "time_per_iteration": 2.4248533248901367 + }, + { + "auxiliary_loss_clip": 0.01073788, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.0163914, + "balance_loss_mlp": 1.02196646, + "epoch": 0.3782353825341951, + "flos": 19754904464640.0, + "grad_norm": 1.9777769257529845, + "language_loss": 0.79205358, + "learning_rate": 2.74691613123093e-06, + "loss": 0.81312513, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.51953125, + "step": 6291, + "time_per_iteration": 2.3897156715393066 + }, + { + "auxiliary_loss_clip": 0.01074113, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.01504517, + "balance_loss_mlp": 1.0225873, + "epoch": 0.37829550578686305, + "flos": 22049850464640.0, + "grad_norm": 1.7934932271908763, + "language_loss": 0.7603035, + "learning_rate": 2.746565632132822e-06, + "loss": 0.78135592, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.515625, + "step": 6292, + "time_per_iteration": 2.4194183349609375 + }, + { + "auxiliary_loss_clip": 0.01074045, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.02020836, + "balance_loss_mlp": 1.02358246, + "epoch": 0.378355629039531, + "flos": 16469472792960.0, + "grad_norm": 1.6865967932240056, + "language_loss": 0.69078517, + "learning_rate": 2.746215106391707e-06, + "loss": 0.71189624, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.50390625, + "step": 6293, + "time_per_iteration": 2.408169984817505 + }, + { + "auxiliary_loss_clip": 0.01072111, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.01699996, + "balance_loss_mlp": 1.02243948, + "epoch": 0.378415752292199, + "flos": 19973809889280.0, + "grad_norm": 1.7124043676158545, + "language_loss": 0.71046495, + "learning_rate": 2.745864554020095e-06, + "loss": 0.73150283, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49804688, + "step": 6294, + "time_per_iteration": 2.3941314220428467 + }, + { + "auxiliary_loss_clip": 0.01077332, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.01368988, + "balance_loss_mlp": 1.02372241, + "epoch": 0.378475875544867, + "flos": 14646517729920.0, + "grad_norm": 1.936339766769601, + "language_loss": 0.82476676, + "learning_rate": 2.7455139750304947e-06, + "loss": 0.8458451, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.5390625, + "step": 6295, + "time_per_iteration": 2.3830618858337402 + }, + { + "auxiliary_loss_clip": 0.01072647, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.0137043, + "balance_loss_mlp": 1.02266371, + "epoch": 0.37853599879753497, + "flos": 26649796936320.0, + "grad_norm": 1.8331920370295154, + "language_loss": 0.71501195, + "learning_rate": 2.7451633694354194e-06, + "loss": 0.73603237, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5, + "step": 6296, + "time_per_iteration": 3.8663713932037354 + }, + { + "auxiliary_loss_clip": 0.0107309, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02033329, + "balance_loss_mlp": 1.02387309, + "epoch": 0.37859612205020293, + "flos": 17310984226560.0, + "grad_norm": 1.923398765396373, + "language_loss": 0.76767504, + "learning_rate": 2.7448127372473793e-06, + "loss": 0.78876019, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4921875, + "step": 6297, + "time_per_iteration": 2.374267816543579 + }, + { + "auxiliary_loss_clip": 0.01073937, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.01524556, + "balance_loss_mlp": 1.02417874, + "epoch": 0.3786562453028709, + "flos": 18219494292480.0, + "grad_norm": 1.8864093205172274, + "language_loss": 0.79092687, + "learning_rate": 2.7444620784788887e-06, + "loss": 0.81197131, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49609375, + "step": 6298, + "time_per_iteration": 2.3878748416900635 + }, + { + "auxiliary_loss_clip": 0.01070902, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.01406288, + "balance_loss_mlp": 1.02275205, + "epoch": 0.37871636855553886, + "flos": 21213820114560.0, + "grad_norm": 1.4950767241682503, + "language_loss": 0.84254462, + "learning_rate": 2.744111393142462e-06, + "loss": 0.86354256, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.48242188, + "step": 6299, + "time_per_iteration": 2.3771533966064453 + }, + { + "auxiliary_loss_clip": 0.01073882, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.01631761, + "balance_loss_mlp": 1.02298355, + "epoch": 0.3787764918082068, + "flos": 20951867116800.0, + "grad_norm": 2.274343151011981, + "language_loss": 0.76291323, + "learning_rate": 2.743760681250613e-06, + "loss": 0.78397393, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5078125, + "step": 6300, + "time_per_iteration": 2.395735025405884 + }, + { + "auxiliary_loss_clip": 0.01076958, + "auxiliary_loss_mlp": 0.01036494, + "balance_loss_clip": 1.01796937, + "balance_loss_mlp": 1.02357352, + "epoch": 0.3788366150608748, + "flos": 17307143976960.0, + "grad_norm": 2.000230347571311, + "language_loss": 0.82237911, + "learning_rate": 2.743409942815859e-06, + "loss": 0.84351361, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.53515625, + "step": 6301, + "time_per_iteration": 3.9260175228118896 + }, + { + "auxiliary_loss_clip": 0.01072777, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.01597679, + "balance_loss_mlp": 1.0223999, + "epoch": 0.37889673831354276, + "flos": 24310092706560.0, + "grad_norm": 1.7917661272143974, + "language_loss": 0.67988241, + "learning_rate": 2.743059177850716e-06, + "loss": 0.70092809, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.50390625, + "step": 6302, + "time_per_iteration": 2.4174301624298096 + }, + { + "auxiliary_loss_clip": 0.01074916, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.01318622, + "balance_loss_mlp": 1.02632046, + "epoch": 0.3789568615662107, + "flos": 26682510746880.0, + "grad_norm": 1.8529723406615421, + "language_loss": 0.68552291, + "learning_rate": 2.7427083863677035e-06, + "loss": 0.70655102, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48632812, + "step": 6303, + "time_per_iteration": 3.881007194519043 + }, + { + "auxiliary_loss_clip": 0.01070759, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01091516, + "balance_loss_mlp": 1.02199697, + "epoch": 0.3790169848188787, + "flos": 23584108561920.0, + "grad_norm": 1.6044972558547639, + "language_loss": 0.77564681, + "learning_rate": 2.742357568379338e-06, + "loss": 0.79660559, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48828125, + "step": 6304, + "time_per_iteration": 2.4242565631866455 + }, + { + "auxiliary_loss_clip": 0.01076963, + "auxiliary_loss_mlp": 0.0102708, + "balance_loss_clip": 1.01080799, + "balance_loss_mlp": 1.02434444, + "epoch": 0.37907710807154665, + "flos": 18436584326400.0, + "grad_norm": 2.16826351294024, + "language_loss": 0.80214989, + "learning_rate": 2.7420067238981405e-06, + "loss": 0.82319027, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.52734375, + "step": 6305, + "time_per_iteration": 2.389779567718506 + }, + { + "auxiliary_loss_clip": 0.01011544, + "auxiliary_loss_mlp": 0.01006682, + "balance_loss_clip": 1.005198, + "balance_loss_mlp": 1.00198126, + "epoch": 0.3791372313242146, + "flos": 50104411799040.0, + "grad_norm": 0.9611161886767963, + "language_loss": 0.64461195, + "learning_rate": 2.741655852936632e-06, + "loss": 0.66479421, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.09570312, + "step": 6306, + "time_per_iteration": 2.95611834526062 + }, + { + "auxiliary_loss_clip": 0.01076049, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.02399743, + "balance_loss_mlp": 1.0246563, + "epoch": 0.3791973545768826, + "flos": 24315399233280.0, + "grad_norm": 1.4776211144927904, + "language_loss": 0.73504448, + "learning_rate": 2.741304955507334e-06, + "loss": 0.75621223, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.515625, + "step": 6307, + "time_per_iteration": 3.84859561920166 + }, + { + "auxiliary_loss_clip": 0.01076621, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02015483, + "balance_loss_mlp": 1.02474093, + "epoch": 0.3792574778295506, + "flos": 21578837045760.0, + "grad_norm": 1.5295517112865868, + "language_loss": 0.78396779, + "learning_rate": 2.7409540316227686e-06, + "loss": 0.80509782, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.51953125, + "step": 6308, + "time_per_iteration": 2.4184396266937256 + }, + { + "auxiliary_loss_clip": 0.01070955, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01374197, + "balance_loss_mlp": 1.02192664, + "epoch": 0.37931760108221857, + "flos": 22271653532160.0, + "grad_norm": 3.2319191580634827, + "language_loss": 0.73008668, + "learning_rate": 2.7406030812954596e-06, + "loss": 0.75109327, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.49023438, + "step": 6309, + "time_per_iteration": 2.3869950771331787 + }, + { + "auxiliary_loss_clip": 0.01072777, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.01474452, + "balance_loss_mlp": 1.02375412, + "epoch": 0.37937772433488653, + "flos": 19681970901120.0, + "grad_norm": 1.4278587239728795, + "language_loss": 0.78796041, + "learning_rate": 2.740252104537932e-06, + "loss": 0.8089909, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.4921875, + "step": 6310, + "time_per_iteration": 2.410391092300415 + }, + { + "auxiliary_loss_clip": 0.01073416, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.01381028, + "balance_loss_mlp": 1.02311683, + "epoch": 0.3794378475875545, + "flos": 19098362747520.0, + "grad_norm": 1.935800809432045, + "language_loss": 0.76142395, + "learning_rate": 2.7399011013627112e-06, + "loss": 0.78244656, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.50390625, + "step": 6311, + "time_per_iteration": 2.391130208969116 + }, + { + "auxiliary_loss_clip": 0.01073151, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.01680803, + "balance_loss_mlp": 1.0238483, + "epoch": 0.37949797084022246, + "flos": 20338617352320.0, + "grad_norm": 1.6251359330124207, + "language_loss": 0.70789325, + "learning_rate": 2.7395500717823233e-06, + "loss": 0.72894013, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.4921875, + "step": 6312, + "time_per_iteration": 2.395550489425659 + }, + { + "auxiliary_loss_clip": 0.01011316, + "auxiliary_loss_mlp": 0.01000873, + "balance_loss_clip": 0.99937081, + "balance_loss_mlp": 1.00167561, + "epoch": 0.37955809409289043, + "flos": 63969050430720.0, + "grad_norm": 0.7845213858642633, + "language_loss": 0.56086898, + "learning_rate": 2.739199015809296e-06, + "loss": 0.58099091, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.09667969, + "step": 6313, + "time_per_iteration": 3.005631923675537 + }, + { + "auxiliary_loss_clip": 0.01072487, + "auxiliary_loss_mlp": 0.01034646, + "balance_loss_clip": 1.01942253, + "balance_loss_mlp": 1.0229528, + "epoch": 0.3796182173455584, + "flos": 31539313157760.0, + "grad_norm": 1.9726428742886393, + "language_loss": 0.71231735, + "learning_rate": 2.738847933456156e-06, + "loss": 0.73338866, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49609375, + "step": 6314, + "time_per_iteration": 2.5943915843963623 + }, + { + "auxiliary_loss_clip": 0.0107584, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.01571691, + "balance_loss_mlp": 1.02327943, + "epoch": 0.37967834059822636, + "flos": 12129978130560.0, + "grad_norm": 1.7401836139182052, + "language_loss": 0.73164737, + "learning_rate": 2.738496824735435e-06, + "loss": 0.75272924, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5234375, + "step": 6315, + "time_per_iteration": 2.3773343563079834 + }, + { + "auxiliary_loss_clip": 0.01074051, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.01839304, + "balance_loss_mlp": 1.02460265, + "epoch": 0.3797384638508943, + "flos": 39347009792640.0, + "grad_norm": 1.8086305695487093, + "language_loss": 0.70935374, + "learning_rate": 2.738145689659661e-06, + "loss": 0.73043054, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49414062, + "step": 6316, + "time_per_iteration": 2.5793564319610596 + }, + { + "auxiliary_loss_clip": 0.01072395, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01746511, + "balance_loss_mlp": 1.02356768, + "epoch": 0.3797985871035623, + "flos": 34052710734720.0, + "grad_norm": 2.327006941177403, + "language_loss": 0.65011269, + "learning_rate": 2.737794528241367e-06, + "loss": 0.67115074, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48828125, + "step": 6317, + "time_per_iteration": 2.5013041496276855 + }, + { + "auxiliary_loss_clip": 0.0107017, + "auxiliary_loss_mlp": 0.01026863, + "balance_loss_clip": 1.01313019, + "balance_loss_mlp": 1.02251899, + "epoch": 0.37985871035623026, + "flos": 23221046666880.0, + "grad_norm": 2.5560401766138963, + "language_loss": 0.84915054, + "learning_rate": 2.737443340493084e-06, + "loss": 0.87012076, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4765625, + "step": 6318, + "time_per_iteration": 2.42867374420166 + }, + { + "auxiliary_loss_clip": 0.010729, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.01797414, + "balance_loss_mlp": 1.02248192, + "epoch": 0.3799188336088982, + "flos": 18113951652480.0, + "grad_norm": 2.073614079920041, + "language_loss": 0.76291788, + "learning_rate": 2.737092126427345e-06, + "loss": 0.78398788, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.50390625, + "step": 6319, + "time_per_iteration": 2.3749430179595947 + }, + { + "auxiliary_loss_clip": 0.01073521, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.02171898, + "balance_loss_mlp": 1.02473748, + "epoch": 0.3799789568615662, + "flos": 21870815679360.0, + "grad_norm": 1.8058167086776955, + "language_loss": 0.64246219, + "learning_rate": 2.736740886056684e-06, + "loss": 0.66355658, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48632812, + "step": 6320, + "time_per_iteration": 2.43791127204895 + }, + { + "auxiliary_loss_clip": 0.01072692, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.0207696, + "balance_loss_mlp": 1.02388418, + "epoch": 0.3800390801142342, + "flos": 32961570013440.0, + "grad_norm": 1.6897334924965235, + "language_loss": 0.70931941, + "learning_rate": 2.7363896193936356e-06, + "loss": 0.73040479, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48828125, + "step": 6321, + "time_per_iteration": 2.504054546356201 + }, + { + "auxiliary_loss_clip": 0.01073745, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.01347327, + "balance_loss_mlp": 1.02246118, + "epoch": 0.38009920336690217, + "flos": 26905849914240.0, + "grad_norm": 1.8903169460824751, + "language_loss": 0.74813843, + "learning_rate": 2.7360383264507364e-06, + "loss": 0.76916397, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.51171875, + "step": 6322, + "time_per_iteration": 2.4458911418914795 + }, + { + "auxiliary_loss_clip": 0.01071188, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.01799774, + "balance_loss_mlp": 1.02290261, + "epoch": 0.38015932661957014, + "flos": 22487905693440.0, + "grad_norm": 2.0829829938278053, + "language_loss": 0.85222924, + "learning_rate": 2.735687007240522e-06, + "loss": 0.87326807, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48242188, + "step": 6323, + "time_per_iteration": 2.3958194255828857 + }, + { + "auxiliary_loss_clip": 0.01073985, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.01593733, + "balance_loss_mlp": 1.0234499, + "epoch": 0.3802194498722381, + "flos": 21979919278080.0, + "grad_norm": 2.4657232571983982, + "language_loss": 0.73532248, + "learning_rate": 2.735335661775531e-06, + "loss": 0.75637954, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5078125, + "step": 6324, + "time_per_iteration": 2.4065213203430176 + }, + { + "auxiliary_loss_clip": 0.01075219, + "auxiliary_loss_mlp": 0.01033289, + "balance_loss_clip": 1.01786327, + "balance_loss_mlp": 1.02478135, + "epoch": 0.38027957312490607, + "flos": 21323796497280.0, + "grad_norm": 1.850138936990718, + "language_loss": 0.84612614, + "learning_rate": 2.734984290068302e-06, + "loss": 0.86721122, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.50390625, + "step": 6325, + "time_per_iteration": 2.402581214904785 + }, + { + "auxiliary_loss_clip": 0.01073272, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.01516056, + "balance_loss_mlp": 1.02385521, + "epoch": 0.38033969637757403, + "flos": 16690298342400.0, + "grad_norm": 2.2045871825243117, + "language_loss": 0.79654181, + "learning_rate": 2.734632892131374e-06, + "loss": 0.81756878, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4921875, + "step": 6326, + "time_per_iteration": 2.3796725273132324 + }, + { + "auxiliary_loss_clip": 0.0107083, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.01546466, + "balance_loss_mlp": 1.0218854, + "epoch": 0.380399819630242, + "flos": 36209365372800.0, + "grad_norm": 2.4240213340391743, + "language_loss": 0.73113871, + "learning_rate": 2.734281467977288e-06, + "loss": 0.75214136, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48828125, + "step": 6327, + "time_per_iteration": 2.565790891647339 + }, + { + "auxiliary_loss_clip": 0.01071873, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.01784277, + "balance_loss_mlp": 1.02409625, + "epoch": 0.38045994288290996, + "flos": 21287766107520.0, + "grad_norm": 1.494018950564544, + "language_loss": 0.83084941, + "learning_rate": 2.733930017618585e-06, + "loss": 0.8519057, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.4765625, + "step": 6328, + "time_per_iteration": 2.3930253982543945 + }, + { + "auxiliary_loss_clip": 0.01069988, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.01433659, + "balance_loss_mlp": 1.02169073, + "epoch": 0.38052006613557793, + "flos": 20921841480960.0, + "grad_norm": 1.4257515592237942, + "language_loss": 0.61200964, + "learning_rate": 2.733578541067808e-06, + "loss": 0.63300145, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.484375, + "step": 6329, + "time_per_iteration": 2.41375732421875 + }, + { + "auxiliary_loss_clip": 0.01072631, + "auxiliary_loss_mlp": 0.01033787, + "balance_loss_clip": 1.01839685, + "balance_loss_mlp": 1.02264071, + "epoch": 0.3805801893882459, + "flos": 20989817631360.0, + "grad_norm": 2.8991918272289494, + "language_loss": 0.56211734, + "learning_rate": 2.733227038337499e-06, + "loss": 0.58318144, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5, + "step": 6330, + "time_per_iteration": 2.3851916790008545 + }, + { + "auxiliary_loss_clip": 0.01070925, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01420426, + "balance_loss_mlp": 1.02431893, + "epoch": 0.38064031264091386, + "flos": 25557364494720.0, + "grad_norm": 2.134191527011971, + "language_loss": 0.65927303, + "learning_rate": 2.7328755094402036e-06, + "loss": 0.68025517, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46484375, + "step": 6331, + "time_per_iteration": 2.479475498199463 + }, + { + "auxiliary_loss_clip": 0.01073984, + "auxiliary_loss_mlp": 0.0104059, + "balance_loss_clip": 1.02493167, + "balance_loss_mlp": 1.0249294, + "epoch": 0.3807004358935818, + "flos": 15084956983680.0, + "grad_norm": 1.6180832428334229, + "language_loss": 0.75569254, + "learning_rate": 2.732523954388466e-06, + "loss": 0.7768383, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.49023438, + "step": 6332, + "time_per_iteration": 2.3726158142089844 + }, + { + "auxiliary_loss_clip": 0.01072455, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.01440954, + "balance_loss_mlp": 1.0220623, + "epoch": 0.3807605591462498, + "flos": 16398459354240.0, + "grad_norm": 2.0739452714131534, + "language_loss": 0.82257962, + "learning_rate": 2.732172373194834e-06, + "loss": 0.84360069, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.50390625, + "step": 6333, + "time_per_iteration": 2.3743832111358643 + }, + { + "auxiliary_loss_clip": 0.01069972, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.01314545, + "balance_loss_mlp": 1.02167153, + "epoch": 0.3808206823989178, + "flos": 29055871393920.0, + "grad_norm": 1.5541221219981671, + "language_loss": 0.86296082, + "learning_rate": 2.731820765871853e-06, + "loss": 0.88393587, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48242188, + "step": 6334, + "time_per_iteration": 2.452577590942383 + }, + { + "auxiliary_loss_clip": 0.01071537, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.01790214, + "balance_loss_mlp": 1.02261972, + "epoch": 0.3808808056515858, + "flos": 15704944640640.0, + "grad_norm": 7.015227393238082, + "language_loss": 0.79345757, + "learning_rate": 2.7314691324320705e-06, + "loss": 0.81450319, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.48828125, + "step": 6335, + "time_per_iteration": 2.3867104053497314 + }, + { + "auxiliary_loss_clip": 0.01073225, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01421452, + "balance_loss_mlp": 1.02300286, + "epoch": 0.38094092890425374, + "flos": 20703529549440.0, + "grad_norm": 2.5870340626875112, + "language_loss": 0.72556508, + "learning_rate": 2.7311174728880364e-06, + "loss": 0.74659771, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5, + "step": 6336, + "time_per_iteration": 3.906203508377075 + }, + { + "auxiliary_loss_clip": 0.01070224, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.01577568, + "balance_loss_mlp": 1.0226469, + "epoch": 0.3810010521569217, + "flos": 20666905666560.0, + "grad_norm": 1.8096865588641151, + "language_loss": 0.69679999, + "learning_rate": 2.730765787252301e-06, + "loss": 0.71780109, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4765625, + "step": 6337, + "time_per_iteration": 2.3771214485168457 + }, + { + "auxiliary_loss_clip": 0.0107268, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.01425695, + "balance_loss_mlp": 1.0236088, + "epoch": 0.38106117540958967, + "flos": 31826404200960.0, + "grad_norm": 2.406627064099211, + "language_loss": 0.63582182, + "learning_rate": 2.7304140755374137e-06, + "loss": 0.65684134, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49023438, + "step": 6338, + "time_per_iteration": 2.477311611175537 + }, + { + "auxiliary_loss_clip": 0.01072969, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.01634908, + "balance_loss_mlp": 1.02388, + "epoch": 0.38112129866225763, + "flos": 16902012026880.0, + "grad_norm": 2.919416394949127, + "language_loss": 0.69507802, + "learning_rate": 2.7300623377559273e-06, + "loss": 0.71612334, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.4921875, + "step": 6339, + "time_per_iteration": 2.360309600830078 + }, + { + "auxiliary_loss_clip": 0.0107491, + "auxiliary_loss_mlp": 0.01036389, + "balance_loss_clip": 1.02217925, + "balance_loss_mlp": 1.02416372, + "epoch": 0.3811814219149256, + "flos": 20886160204800.0, + "grad_norm": 3.2329403842978155, + "language_loss": 0.68271172, + "learning_rate": 2.729710573920394e-06, + "loss": 0.70382476, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.5078125, + "step": 6340, + "time_per_iteration": 2.4211418628692627 + }, + { + "auxiliary_loss_clip": 0.0107451, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.01581609, + "balance_loss_mlp": 1.0231539, + "epoch": 0.38124154516759357, + "flos": 16689879406080.0, + "grad_norm": 1.94844788594245, + "language_loss": 0.89741421, + "learning_rate": 2.729358784043367e-06, + "loss": 0.91847742, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.515625, + "step": 6341, + "time_per_iteration": 3.7907562255859375 + }, + { + "auxiliary_loss_clip": 0.01075309, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.01729822, + "balance_loss_mlp": 1.02402592, + "epoch": 0.38130166842026153, + "flos": 19680958471680.0, + "grad_norm": 1.6311095392505879, + "language_loss": 0.75363111, + "learning_rate": 2.7290069681374018e-06, + "loss": 0.77471656, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.515625, + "step": 6342, + "time_per_iteration": 2.39212703704834 + }, + { + "auxiliary_loss_clip": 0.01072937, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.01666236, + "balance_loss_mlp": 1.0224123, + "epoch": 0.3813617916729295, + "flos": 22197393336960.0, + "grad_norm": 1.6209662581670443, + "language_loss": 0.83417922, + "learning_rate": 2.7286551262150522e-06, + "loss": 0.85522187, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.50390625, + "step": 6343, + "time_per_iteration": 3.7917282581329346 + }, + { + "auxiliary_loss_clip": 0.01070836, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01746726, + "balance_loss_mlp": 1.02193069, + "epoch": 0.38142191492559746, + "flos": 19095953863680.0, + "grad_norm": 1.6678018005756956, + "language_loss": 0.76731622, + "learning_rate": 2.7283032582888763e-06, + "loss": 0.78834707, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48828125, + "step": 6344, + "time_per_iteration": 2.38700008392334 + }, + { + "auxiliary_loss_clip": 0.01075757, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01753259, + "balance_loss_mlp": 1.02496266, + "epoch": 0.3814820381782654, + "flos": 24096598542720.0, + "grad_norm": 2.4158382563911376, + "language_loss": 0.73242879, + "learning_rate": 2.7279513643714304e-06, + "loss": 0.75351268, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 6345, + "time_per_iteration": 2.4203360080718994 + }, + { + "auxiliary_loss_clip": 0.01070512, + "auxiliary_loss_mlp": 0.0102623, + "balance_loss_clip": 1.01216364, + "balance_loss_mlp": 1.02177739, + "epoch": 0.3815421614309334, + "flos": 15777598913280.0, + "grad_norm": 1.6191840928302095, + "language_loss": 0.69535041, + "learning_rate": 2.727599444475272e-06, + "loss": 0.71631777, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48828125, + "step": 6346, + "time_per_iteration": 3.7785873413085938 + }, + { + "auxiliary_loss_clip": 0.01073276, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.01497293, + "balance_loss_mlp": 1.02398419, + "epoch": 0.38160228468360136, + "flos": 19898781644160.0, + "grad_norm": 1.7791035163357487, + "language_loss": 0.74928927, + "learning_rate": 2.7272474986129622e-06, + "loss": 0.77032089, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4921875, + "step": 6347, + "time_per_iteration": 2.3902587890625 + }, + { + "auxiliary_loss_clip": 0.01071621, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.01610613, + "balance_loss_mlp": 1.02165222, + "epoch": 0.3816624079362694, + "flos": 19280050796160.0, + "grad_norm": 3.775672963559628, + "language_loss": 0.74341285, + "learning_rate": 2.7268955267970594e-06, + "loss": 0.76443052, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.5, + "step": 6348, + "time_per_iteration": 2.368314743041992 + }, + { + "auxiliary_loss_clip": 0.01070911, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.01357532, + "balance_loss_mlp": 1.02221847, + "epoch": 0.38172253118893734, + "flos": 21176532915840.0, + "grad_norm": 2.345112785963392, + "language_loss": 0.87265092, + "learning_rate": 2.726543529040125e-06, + "loss": 0.89363742, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48632812, + "step": 6349, + "time_per_iteration": 2.3773574829101562 + }, + { + "auxiliary_loss_clip": 0.01071461, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.01403427, + "balance_loss_mlp": 1.02234077, + "epoch": 0.3817826544416053, + "flos": 17528283728640.0, + "grad_norm": 1.6432091634171837, + "language_loss": 0.76380199, + "learning_rate": 2.7261915053547216e-06, + "loss": 0.78480566, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4921875, + "step": 6350, + "time_per_iteration": 2.348567485809326 + }, + { + "auxiliary_loss_clip": 0.01071636, + "auxiliary_loss_mlp": 0.01026879, + "balance_loss_clip": 1.01115584, + "balance_loss_mlp": 1.02230453, + "epoch": 0.38184277769427327, + "flos": 16325595613440.0, + "grad_norm": 1.9467085013459524, + "language_loss": 0.75772917, + "learning_rate": 2.7258394557534103e-06, + "loss": 0.77871436, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.4921875, + "step": 6351, + "time_per_iteration": 2.3566505908966064 + }, + { + "auxiliary_loss_clip": 0.01074607, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.01671815, + "balance_loss_mlp": 1.02312076, + "epoch": 0.38190290094694124, + "flos": 30442202593920.0, + "grad_norm": 1.752944037582217, + "language_loss": 0.74010742, + "learning_rate": 2.725487380248756e-06, + "loss": 0.76117879, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.515625, + "step": 6352, + "time_per_iteration": 2.478961944580078 + }, + { + "auxiliary_loss_clip": 0.01069502, + "auxiliary_loss_mlp": 0.01025978, + "balance_loss_clip": 1.01283479, + "balance_loss_mlp": 1.02186477, + "epoch": 0.3819630241996092, + "flos": 14209055994240.0, + "grad_norm": 1.8667879904088551, + "language_loss": 0.63989282, + "learning_rate": 2.7251352788533237e-06, + "loss": 0.66084754, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4765625, + "step": 6353, + "time_per_iteration": 2.390054941177368 + }, + { + "auxiliary_loss_clip": 0.01069347, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.01348531, + "balance_loss_mlp": 1.02158535, + "epoch": 0.38202314745227717, + "flos": 25008529921920.0, + "grad_norm": 1.5842716576676679, + "language_loss": 0.83193064, + "learning_rate": 2.7247831515796786e-06, + "loss": 0.85290504, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4765625, + "step": 6354, + "time_per_iteration": 2.4330496788024902 + }, + { + "auxiliary_loss_clip": 0.01071746, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01256669, + "balance_loss_mlp": 1.02348781, + "epoch": 0.38208327070494513, + "flos": 20813436109440.0, + "grad_norm": 1.7118059923719808, + "language_loss": 0.80289, + "learning_rate": 2.7244309984403865e-06, + "loss": 0.82387185, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.484375, + "step": 6355, + "time_per_iteration": 2.4107625484466553 + }, + { + "auxiliary_loss_clip": 0.01072404, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.01535213, + "balance_loss_mlp": 1.02283466, + "epoch": 0.3821433939576131, + "flos": 22636635552000.0, + "grad_norm": 1.8051748239290533, + "language_loss": 0.75453568, + "learning_rate": 2.7240788194480163e-06, + "loss": 0.77555686, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49609375, + "step": 6356, + "time_per_iteration": 2.426191806793213 + }, + { + "auxiliary_loss_clip": 0.01071852, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.01277745, + "balance_loss_mlp": 1.02296412, + "epoch": 0.38220351721028106, + "flos": 26868667449600.0, + "grad_norm": 2.743462649009737, + "language_loss": 0.81357545, + "learning_rate": 2.7237266146151357e-06, + "loss": 0.83456969, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.48828125, + "step": 6357, + "time_per_iteration": 2.4445321559906006 + }, + { + "auxiliary_loss_clip": 0.01077669, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.01894021, + "balance_loss_mlp": 1.02636087, + "epoch": 0.38226364046294903, + "flos": 23366355212160.0, + "grad_norm": 1.6343022709106763, + "language_loss": 0.77832496, + "learning_rate": 2.7233743839543135e-06, + "loss": 0.79946357, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.51171875, + "step": 6358, + "time_per_iteration": 2.4251880645751953 + }, + { + "auxiliary_loss_clip": 0.01073259, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.017591, + "balance_loss_mlp": 1.02269793, + "epoch": 0.382323763715617, + "flos": 19645207372800.0, + "grad_norm": 2.192897322384427, + "language_loss": 0.79104972, + "learning_rate": 2.7230221274781204e-06, + "loss": 0.81210864, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 6359, + "time_per_iteration": 2.3775861263275146 + }, + { + "auxiliary_loss_clip": 0.01072341, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.01232946, + "balance_loss_mlp": 1.02312422, + "epoch": 0.38238388696828496, + "flos": 54122776842240.0, + "grad_norm": 1.8630713747801777, + "language_loss": 0.69627303, + "learning_rate": 2.722669845199127e-06, + "loss": 0.71726561, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4921875, + "step": 6360, + "time_per_iteration": 2.6730566024780273 + }, + { + "auxiliary_loss_clip": 0.01071773, + "auxiliary_loss_mlp": 0.01024345, + "balance_loss_clip": 1.00970566, + "balance_loss_mlp": 1.02280796, + "epoch": 0.382444010220953, + "flos": 24935037776640.0, + "grad_norm": 1.584770867287631, + "language_loss": 0.78851461, + "learning_rate": 2.7223175371299062e-06, + "loss": 0.80947578, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48828125, + "step": 6361, + "time_per_iteration": 2.4543378353118896 + }, + { + "auxiliary_loss_clip": 0.01069438, + "auxiliary_loss_mlp": 0.01026662, + "balance_loss_clip": 1.01302397, + "balance_loss_mlp": 1.02275753, + "epoch": 0.38250413347362094, + "flos": 42335784708480.0, + "grad_norm": 1.3613893411603089, + "language_loss": 0.65544206, + "learning_rate": 2.72196520328303e-06, + "loss": 0.67640305, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46679688, + "step": 6362, + "time_per_iteration": 2.5893757343292236 + }, + { + "auxiliary_loss_clip": 0.01070188, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.01351023, + "balance_loss_mlp": 1.02255678, + "epoch": 0.3825642567262889, + "flos": 16288308414720.0, + "grad_norm": 1.7003943566148527, + "language_loss": 0.8184911, + "learning_rate": 2.7216128436710737e-06, + "loss": 0.83947611, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4765625, + "step": 6363, + "time_per_iteration": 2.398024320602417 + }, + { + "auxiliary_loss_clip": 0.01071189, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.01458287, + "balance_loss_mlp": 1.02377224, + "epoch": 0.3826243799789569, + "flos": 45653197052160.0, + "grad_norm": 1.8400656753131923, + "language_loss": 0.58986646, + "learning_rate": 2.7212604583066107e-06, + "loss": 0.61086464, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47460938, + "step": 6364, + "time_per_iteration": 2.6245498657226562 + }, + { + "auxiliary_loss_clip": 0.01072472, + "auxiliary_loss_mlp": 0.01029269, + "balance_loss_clip": 1.0138073, + "balance_loss_mlp": 1.02297175, + "epoch": 0.38268450323162484, + "flos": 25300403821440.0, + "grad_norm": 2.5557872774094137, + "language_loss": 0.69215167, + "learning_rate": 2.7209080472022174e-06, + "loss": 0.7131691, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49609375, + "step": 6365, + "time_per_iteration": 2.4672000408172607 + }, + { + "auxiliary_loss_clip": 0.01072812, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.01259947, + "balance_loss_mlp": 1.02237844, + "epoch": 0.3827446264842928, + "flos": 21834924935040.0, + "grad_norm": 2.041020458544721, + "language_loss": 0.73036033, + "learning_rate": 2.72055561037047e-06, + "loss": 0.75137186, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.50390625, + "step": 6366, + "time_per_iteration": 2.4111130237579346 + }, + { + "auxiliary_loss_clip": 0.01073342, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.01455986, + "balance_loss_mlp": 1.02303851, + "epoch": 0.38280474973696077, + "flos": 25733536548480.0, + "grad_norm": 2.2766831018131457, + "language_loss": 0.68395281, + "learning_rate": 2.720203147823947e-06, + "loss": 0.70499289, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.50390625, + "step": 6367, + "time_per_iteration": 2.4468324184417725 + }, + { + "auxiliary_loss_clip": 0.0107014, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.01766062, + "balance_loss_mlp": 1.0221231, + "epoch": 0.38286487298962874, + "flos": 24894887846400.0, + "grad_norm": 1.9425800311136414, + "language_loss": 0.63679582, + "learning_rate": 2.719850659575225e-06, + "loss": 0.65782344, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48046875, + "step": 6368, + "time_per_iteration": 2.399765968322754 + }, + { + "auxiliary_loss_clip": 0.01071034, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.01450181, + "balance_loss_mlp": 1.02185559, + "epoch": 0.3829249962422967, + "flos": 28542578451840.0, + "grad_norm": 1.3072999188014507, + "language_loss": 0.68166196, + "learning_rate": 2.7194981456368857e-06, + "loss": 0.70266342, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4921875, + "step": 6369, + "time_per_iteration": 2.4642443656921387 + }, + { + "auxiliary_loss_clip": 0.01072015, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.01568067, + "balance_loss_mlp": 1.02340841, + "epoch": 0.38298511949496467, + "flos": 21470117472000.0, + "grad_norm": 1.6192077323976262, + "language_loss": 0.78242338, + "learning_rate": 2.719145606021508e-06, + "loss": 0.8034364, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.484375, + "step": 6370, + "time_per_iteration": 2.387908935546875 + }, + { + "auxiliary_loss_clip": 0.01071998, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.01696777, + "balance_loss_mlp": 1.02367973, + "epoch": 0.38304524274763263, + "flos": 31678826417280.0, + "grad_norm": 3.2585058839607592, + "language_loss": 0.6452136, + "learning_rate": 2.7187930407416738e-06, + "loss": 0.66624951, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48242188, + "step": 6371, + "time_per_iteration": 2.4578778743743896 + }, + { + "auxiliary_loss_clip": 0.01074473, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.01361084, + "balance_loss_mlp": 1.02350307, + "epoch": 0.3831053660003006, + "flos": 25075807845120.0, + "grad_norm": 1.9348233168684754, + "language_loss": 0.72772771, + "learning_rate": 2.7184404498099644e-06, + "loss": 0.74877286, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5078125, + "step": 6372, + "time_per_iteration": 2.4492855072021484 + }, + { + "auxiliary_loss_clip": 0.0107258, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.01692247, + "balance_loss_mlp": 1.02249813, + "epoch": 0.38316548925296856, + "flos": 23257880017920.0, + "grad_norm": 1.7487877720047569, + "language_loss": 0.847821, + "learning_rate": 2.7180878332389638e-06, + "loss": 0.86887246, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5, + "step": 6373, + "time_per_iteration": 2.397027015686035 + }, + { + "auxiliary_loss_clip": 0.01075996, + "auxiliary_loss_mlp": 0.01036034, + "balance_loss_clip": 1.02008963, + "balance_loss_mlp": 1.0246737, + "epoch": 0.3832256125056366, + "flos": 34422021763200.0, + "grad_norm": 2.0572916232565457, + "language_loss": 0.74710399, + "learning_rate": 2.7177351910412553e-06, + "loss": 0.76822436, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.51171875, + "step": 6374, + "time_per_iteration": 2.4963696002960205 + }, + { + "auxiliary_loss_clip": 0.01075414, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01593471, + "balance_loss_mlp": 1.02460217, + "epoch": 0.38328573575830455, + "flos": 21761677169280.0, + "grad_norm": 2.384889744945214, + "language_loss": 0.76147139, + "learning_rate": 2.717382523229424e-06, + "loss": 0.78253639, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 6375, + "time_per_iteration": 2.3789172172546387 + }, + { + "auxiliary_loss_clip": 0.01071459, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.01647997, + "balance_loss_mlp": 1.0228107, + "epoch": 0.3833458590109725, + "flos": 17379169845120.0, + "grad_norm": 2.4970649556509303, + "language_loss": 0.72878611, + "learning_rate": 2.7170298298160558e-06, + "loss": 0.74980718, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48632812, + "step": 6376, + "time_per_iteration": 3.754441261291504 + }, + { + "auxiliary_loss_clip": 0.01068327, + "auxiliary_loss_mlp": 0.01027071, + "balance_loss_clip": 1.01203871, + "balance_loss_mlp": 1.02115405, + "epoch": 0.3834059822636405, + "flos": 29423262297600.0, + "grad_norm": 1.6191556308667414, + "language_loss": 0.67835015, + "learning_rate": 2.7166771108137373e-06, + "loss": 0.6993041, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.47265625, + "step": 6377, + "time_per_iteration": 2.451864242553711 + }, + { + "auxiliary_loss_clip": 0.01072518, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.01624131, + "balance_loss_mlp": 1.02335477, + "epoch": 0.38346610551630844, + "flos": 21469663624320.0, + "grad_norm": 1.8065877179990457, + "language_loss": 0.73104817, + "learning_rate": 2.7163243662350574e-06, + "loss": 0.75209504, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.4921875, + "step": 6378, + "time_per_iteration": 2.3680737018585205 + }, + { + "auxiliary_loss_clip": 0.01073427, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.01871037, + "balance_loss_mlp": 1.02270603, + "epoch": 0.3835262287689764, + "flos": 27560052570240.0, + "grad_norm": 1.7622105152330363, + "language_loss": 0.69400299, + "learning_rate": 2.7159715960926025e-06, + "loss": 0.71506906, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5078125, + "step": 6379, + "time_per_iteration": 2.443986415863037 + }, + { + "auxiliary_loss_clip": 0.01070451, + "auxiliary_loss_mlp": 0.01028671, + "balance_loss_clip": 1.01378214, + "balance_loss_mlp": 1.02287591, + "epoch": 0.3835863520216444, + "flos": 15522802744320.0, + "grad_norm": 1.7018887126676754, + "language_loss": 0.83296108, + "learning_rate": 2.715618800398963e-06, + "loss": 0.85395229, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.47460938, + "step": 6380, + "time_per_iteration": 3.7976813316345215 + }, + { + "auxiliary_loss_clip": 0.01070669, + "auxiliary_loss_mlp": 0.01024584, + "balance_loss_clip": 1.01063609, + "balance_loss_mlp": 1.02305818, + "epoch": 0.38364647527431234, + "flos": 21903948426240.0, + "grad_norm": 1.3731764596619855, + "language_loss": 0.81131947, + "learning_rate": 2.7152659791667296e-06, + "loss": 0.83227193, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4765625, + "step": 6381, + "time_per_iteration": 2.413869857788086 + }, + { + "auxiliary_loss_clip": 0.01011962, + "auxiliary_loss_mlp": 0.01016199, + "balance_loss_clip": 1.01488221, + "balance_loss_mlp": 1.00208092, + "epoch": 0.3837065985269803, + "flos": 65531902798080.0, + "grad_norm": 0.7940873470552489, + "language_loss": 0.60454381, + "learning_rate": 2.7149131324084925e-06, + "loss": 0.62482536, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.09863281, + "step": 6382, + "time_per_iteration": 4.399012565612793 + }, + { + "auxiliary_loss_clip": 0.01075227, + "auxiliary_loss_mlp": 0.01027514, + "balance_loss_clip": 1.01227927, + "balance_loss_mlp": 1.0229882, + "epoch": 0.38376672177964827, + "flos": 28255347763200.0, + "grad_norm": 4.7659232854408815, + "language_loss": 0.66728902, + "learning_rate": 2.714560260136846e-06, + "loss": 0.68831635, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5234375, + "step": 6383, + "time_per_iteration": 2.427001953125 + }, + { + "auxiliary_loss_clip": 0.01073185, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.01498294, + "balance_loss_mlp": 1.02327538, + "epoch": 0.38382684503231623, + "flos": 20630316695040.0, + "grad_norm": 1.6175670644657814, + "language_loss": 0.74437582, + "learning_rate": 2.714207362364381e-06, + "loss": 0.76540101, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.5, + "step": 6384, + "time_per_iteration": 2.3825740814208984 + }, + { + "auxiliary_loss_clip": 0.01071315, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.01310515, + "balance_loss_mlp": 1.02346826, + "epoch": 0.3838869682849842, + "flos": 19604917797120.0, + "grad_norm": 1.5585058862983203, + "language_loss": 0.76371676, + "learning_rate": 2.7138544391036925e-06, + "loss": 0.78470719, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47851562, + "step": 6385, + "time_per_iteration": 2.400175094604492 + }, + { + "auxiliary_loss_clip": 0.01011066, + "auxiliary_loss_mlp": 0.01001261, + "balance_loss_clip": 0.99994946, + "balance_loss_mlp": 1.00165391, + "epoch": 0.38394709153765216, + "flos": 56553428897280.0, + "grad_norm": 0.9083990369851577, + "language_loss": 0.67076528, + "learning_rate": 2.7135014903673748e-06, + "loss": 0.69088852, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.09423828, + "step": 6386, + "time_per_iteration": 4.288057327270508 + }, + { + "auxiliary_loss_clip": 0.01071762, + "auxiliary_loss_mlp": 0.01027539, + "balance_loss_clip": 1.01443815, + "balance_loss_mlp": 1.02398896, + "epoch": 0.3840072147903202, + "flos": 15887819675520.0, + "grad_norm": 1.6717493867185969, + "language_loss": 0.72432387, + "learning_rate": 2.713148516168025e-06, + "loss": 0.74531686, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.47851562, + "step": 6387, + "time_per_iteration": 2.398987293243408 + }, + { + "auxiliary_loss_clip": 0.01074301, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.0169003, + "balance_loss_mlp": 1.02620327, + "epoch": 0.38406733804298815, + "flos": 28216838666880.0, + "grad_norm": 1.5624960160758032, + "language_loss": 0.81052637, + "learning_rate": 2.712795516518239e-06, + "loss": 0.83158249, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 6388, + "time_per_iteration": 2.438119411468506 + }, + { + "auxiliary_loss_clip": 0.01069137, + "auxiliary_loss_mlp": 0.01025842, + "balance_loss_clip": 1.01230025, + "balance_loss_mlp": 1.02218246, + "epoch": 0.3841274612956561, + "flos": 18222601403520.0, + "grad_norm": 1.9034531866687259, + "language_loss": 0.76234365, + "learning_rate": 2.7124424914306143e-06, + "loss": 0.78329349, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47070312, + "step": 6389, + "time_per_iteration": 2.367206573486328 + }, + { + "auxiliary_loss_clip": 0.01075656, + "auxiliary_loss_mlp": 0.01037016, + "balance_loss_clip": 1.02145958, + "balance_loss_mlp": 1.02457583, + "epoch": 0.3841875845483241, + "flos": 19791842549760.0, + "grad_norm": 3.0480319906333055, + "language_loss": 0.76977909, + "learning_rate": 2.71208944091775e-06, + "loss": 0.79090583, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.51171875, + "step": 6390, + "time_per_iteration": 2.382983446121216 + }, + { + "auxiliary_loss_clip": 0.01073509, + "auxiliary_loss_mlp": 0.01039734, + "balance_loss_clip": 1.02360523, + "balance_loss_mlp": 1.02343667, + "epoch": 0.38424770780099204, + "flos": 29897522472960.0, + "grad_norm": 1.542899672739723, + "language_loss": 0.69253361, + "learning_rate": 2.7117363649922453e-06, + "loss": 0.71366596, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5, + "step": 6391, + "time_per_iteration": 2.4826838970184326 + }, + { + "auxiliary_loss_clip": 0.01072126, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.01970649, + "balance_loss_mlp": 1.0217371, + "epoch": 0.38430783105366, + "flos": 20812668059520.0, + "grad_norm": 1.7190900938160967, + "language_loss": 0.84027886, + "learning_rate": 2.7113832636667e-06, + "loss": 0.8613553, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.50390625, + "step": 6392, + "time_per_iteration": 2.3855299949645996 + }, + { + "auxiliary_loss_clip": 0.01071006, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.01841426, + "balance_loss_mlp": 1.02166486, + "epoch": 0.384367954306328, + "flos": 10997814695040.0, + "grad_norm": 2.2771003122968074, + "language_loss": 0.61486125, + "learning_rate": 2.7110301369537168e-06, + "loss": 0.63590455, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.49414062, + "step": 6393, + "time_per_iteration": 2.3465282917022705 + }, + { + "auxiliary_loss_clip": 0.01074606, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.01738417, + "balance_loss_mlp": 1.02258933, + "epoch": 0.38442807755899594, + "flos": 25336853147520.0, + "grad_norm": 2.2141847589341936, + "language_loss": 0.71160275, + "learning_rate": 2.7106769848658965e-06, + "loss": 0.73267913, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.51953125, + "step": 6394, + "time_per_iteration": 2.417149066925049 + }, + { + "auxiliary_loss_clip": 0.010775, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.01730824, + "balance_loss_mlp": 1.02492261, + "epoch": 0.3844882008116639, + "flos": 21068686126080.0, + "grad_norm": 1.971583195942539, + "language_loss": 0.80974996, + "learning_rate": 2.710323807415843e-06, + "loss": 0.83086514, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.52734375, + "step": 6395, + "time_per_iteration": 2.3835811614990234 + }, + { + "auxiliary_loss_clip": 0.01073392, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.01352715, + "balance_loss_mlp": 1.02477539, + "epoch": 0.38454832406433187, + "flos": 17962393973760.0, + "grad_norm": 1.8966165835568778, + "language_loss": 0.70925415, + "learning_rate": 2.7099706046161593e-06, + "loss": 0.73027331, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48632812, + "step": 6396, + "time_per_iteration": 2.3668510913848877 + }, + { + "auxiliary_loss_clip": 0.01011183, + "auxiliary_loss_mlp": 0.01005274, + "balance_loss_clip": 1.00379539, + "balance_loss_mlp": 1.00181806, + "epoch": 0.38460844731699984, + "flos": 67921392493440.0, + "grad_norm": 1.0275815071984948, + "language_loss": 0.5960207, + "learning_rate": 2.7096173764794514e-06, + "loss": 0.61618525, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.01477051, + "router_z_loss_mlp": 0.09375, + "step": 6397, + "time_per_iteration": 3.0761427879333496 + }, + { + "auxiliary_loss_clip": 0.0107341, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01259041, + "balance_loss_mlp": 1.02422559, + "epoch": 0.3846685705696678, + "flos": 25847876851200.0, + "grad_norm": 1.8712059024006888, + "language_loss": 0.84767878, + "learning_rate": 2.7092641230183243e-06, + "loss": 0.86868554, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4921875, + "step": 6398, + "time_per_iteration": 2.4344027042388916 + }, + { + "auxiliary_loss_clip": 0.01071551, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.01069093, + "balance_loss_mlp": 1.02309728, + "epoch": 0.38472869382233577, + "flos": 16289251021440.0, + "grad_norm": 2.3018006319989928, + "language_loss": 0.79428566, + "learning_rate": 2.7089108442453854e-06, + "loss": 0.81524754, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.484375, + "step": 6399, + "time_per_iteration": 2.3623061180114746 + }, + { + "auxiliary_loss_clip": 0.01073319, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.01175857, + "balance_loss_mlp": 1.02303982, + "epoch": 0.38478881707500373, + "flos": 19352146487040.0, + "grad_norm": 1.7496453168028605, + "language_loss": 0.66749483, + "learning_rate": 2.7085575401732423e-06, + "loss": 0.68850756, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.50390625, + "step": 6400, + "time_per_iteration": 2.3797361850738525 + }, + { + "auxiliary_loss_clip": 0.01074579, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.01956546, + "balance_loss_mlp": 1.02425206, + "epoch": 0.38484894032767175, + "flos": 24859765152000.0, + "grad_norm": 1.8224421794010612, + "language_loss": 0.73357236, + "learning_rate": 2.708204210814503e-06, + "loss": 0.75466013, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.50390625, + "step": 6401, + "time_per_iteration": 2.4101078510284424 + }, + { + "auxiliary_loss_clip": 0.01072633, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.01918507, + "balance_loss_mlp": 1.02406967, + "epoch": 0.3849090635803397, + "flos": 14500929893760.0, + "grad_norm": 1.9233321395908127, + "language_loss": 0.7154358, + "learning_rate": 2.707850856181777e-06, + "loss": 0.73649889, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48632812, + "step": 6402, + "time_per_iteration": 2.362643003463745 + }, + { + "auxiliary_loss_clip": 0.01069168, + "auxiliary_loss_mlp": 0.01026851, + "balance_loss_clip": 1.01277208, + "balance_loss_mlp": 1.02213025, + "epoch": 0.3849691868330077, + "flos": 18514859328000.0, + "grad_norm": 2.4061362794420322, + "language_loss": 0.83045423, + "learning_rate": 2.707497476287675e-06, + "loss": 0.85141438, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47070312, + "step": 6403, + "time_per_iteration": 2.3753793239593506 + }, + { + "auxiliary_loss_clip": 0.01071288, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01444423, + "balance_loss_mlp": 1.02305806, + "epoch": 0.38502931008567565, + "flos": 21615321283200.0, + "grad_norm": 1.9041392491294706, + "language_loss": 0.82957143, + "learning_rate": 2.7071440711448077e-06, + "loss": 0.85057902, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48242188, + "step": 6404, + "time_per_iteration": 2.403550148010254 + }, + { + "auxiliary_loss_clip": 0.01074332, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.0150671, + "balance_loss_mlp": 1.02420688, + "epoch": 0.3850894333383436, + "flos": 25414045896960.0, + "grad_norm": 1.4784060048485066, + "language_loss": 0.69566846, + "learning_rate": 2.7067906407657877e-06, + "loss": 0.71670413, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.5, + "step": 6405, + "time_per_iteration": 2.500673770904541 + }, + { + "auxiliary_loss_clip": 0.01069248, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.01427257, + "balance_loss_mlp": 1.02266669, + "epoch": 0.3851495565910116, + "flos": 20226895401600.0, + "grad_norm": 1.922006172058228, + "language_loss": 0.78940684, + "learning_rate": 2.706437185163228e-06, + "loss": 0.81037986, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46484375, + "step": 6406, + "time_per_iteration": 2.3813631534576416 + }, + { + "auxiliary_loss_clip": 0.01075237, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.01598334, + "balance_loss_mlp": 1.02550876, + "epoch": 0.38520967984367954, + "flos": 16507528041600.0, + "grad_norm": 2.819893938423056, + "language_loss": 0.84415394, + "learning_rate": 2.7060837043497416e-06, + "loss": 0.86521363, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.49609375, + "step": 6407, + "time_per_iteration": 2.3337297439575195 + }, + { + "auxiliary_loss_clip": 0.01012659, + "auxiliary_loss_mlp": 0.01004048, + "balance_loss_clip": 1.00264144, + "balance_loss_mlp": 1.00296116, + "epoch": 0.3852698030963475, + "flos": 61310553776640.0, + "grad_norm": 0.8243768979758257, + "language_loss": 0.64843804, + "learning_rate": 2.7057301983379452e-06, + "loss": 0.66860509, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.09667969, + "step": 6408, + "time_per_iteration": 3.0884344577789307 + }, + { + "auxiliary_loss_clip": 0.01072947, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01689124, + "balance_loss_mlp": 1.02335691, + "epoch": 0.3853299263490155, + "flos": 22891920480000.0, + "grad_norm": 2.2340149704786887, + "language_loss": 0.73693776, + "learning_rate": 2.705376667140452e-06, + "loss": 0.75798643, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49609375, + "step": 6409, + "time_per_iteration": 2.39340877532959 + }, + { + "auxiliary_loss_clip": 0.01078745, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.02677345, + "balance_loss_mlp": 1.02513075, + "epoch": 0.38539004960168344, + "flos": 20046464161920.0, + "grad_norm": 2.003972399415588, + "language_loss": 0.70190561, + "learning_rate": 2.705023110769881e-06, + "loss": 0.7231254, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.53515625, + "step": 6410, + "time_per_iteration": 2.3725991249084473 + }, + { + "auxiliary_loss_clip": 0.01010867, + "auxiliary_loss_mlp": 0.01001821, + "balance_loss_clip": 1.00045645, + "balance_loss_mlp": 1.00138807, + "epoch": 0.3854501728543514, + "flos": 68726978271360.0, + "grad_norm": 0.6712661248004854, + "language_loss": 0.60383004, + "learning_rate": 2.7046695292388485e-06, + "loss": 0.62395692, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.09472656, + "step": 6411, + "time_per_iteration": 3.1204044818878174 + }, + { + "auxiliary_loss_clip": 0.01070236, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01416492, + "balance_loss_mlp": 1.02227807, + "epoch": 0.38551029610701937, + "flos": 20483995720320.0, + "grad_norm": 1.7445958039108738, + "language_loss": 0.77560854, + "learning_rate": 2.7043159225599727e-06, + "loss": 0.79659081, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48046875, + "step": 6412, + "time_per_iteration": 2.3860766887664795 + }, + { + "auxiliary_loss_clip": 0.01074559, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01528811, + "balance_loss_mlp": 1.02346826, + "epoch": 0.38557041935968733, + "flos": 23470815600000.0, + "grad_norm": 2.3715598386389916, + "language_loss": 0.77797347, + "learning_rate": 2.703962290745874e-06, + "loss": 0.79903489, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.51171875, + "step": 6413, + "time_per_iteration": 2.4085443019866943 + }, + { + "auxiliary_loss_clip": 0.01010277, + "auxiliary_loss_mlp": 0.01000332, + "balance_loss_clip": 0.99906224, + "balance_loss_mlp": 1.00077569, + "epoch": 0.38563054261235535, + "flos": 63963639169920.0, + "grad_norm": 0.8244576745760341, + "language_loss": 0.61250973, + "learning_rate": 2.703608633809171e-06, + "loss": 0.6326158, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.09472656, + "step": 6414, + "time_per_iteration": 2.9032487869262695 + }, + { + "auxiliary_loss_clip": 0.01075195, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.01401854, + "balance_loss_mlp": 1.02491593, + "epoch": 0.3856906658650233, + "flos": 23986657071360.0, + "grad_norm": 2.068949848606914, + "language_loss": 0.7726903, + "learning_rate": 2.7032549517624865e-06, + "loss": 0.79373378, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.50390625, + "step": 6415, + "time_per_iteration": 2.40846586227417 + }, + { + "auxiliary_loss_clip": 0.01066083, + "auxiliary_loss_mlp": 0.01023546, + "balance_loss_clip": 1.01011753, + "balance_loss_mlp": 1.02223635, + "epoch": 0.3857507891176913, + "flos": 25006330506240.0, + "grad_norm": 1.6635642160114679, + "language_loss": 0.79608589, + "learning_rate": 2.702901244618442e-06, + "loss": 0.81698215, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43945312, + "step": 6416, + "time_per_iteration": 3.8112735748291016 + }, + { + "auxiliary_loss_clip": 0.01071455, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.01581168, + "balance_loss_mlp": 1.02252793, + "epoch": 0.38581091237035925, + "flos": 21535894206720.0, + "grad_norm": 1.768979154164975, + "language_loss": 0.7882784, + "learning_rate": 2.7025475123896597e-06, + "loss": 0.80928624, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.48828125, + "step": 6417, + "time_per_iteration": 2.4051363468170166 + }, + { + "auxiliary_loss_clip": 0.01070115, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01463938, + "balance_loss_mlp": 1.0216831, + "epoch": 0.3858710356230272, + "flos": 17382940272000.0, + "grad_norm": 2.0697801577287827, + "language_loss": 0.79743356, + "learning_rate": 2.702193755088764e-06, + "loss": 0.81841362, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.48242188, + "step": 6418, + "time_per_iteration": 2.3811984062194824 + }, + { + "auxiliary_loss_clip": 0.01069308, + "auxiliary_loss_mlp": 0.01025373, + "balance_loss_clip": 1.01234293, + "balance_loss_mlp": 1.02141714, + "epoch": 0.3859311588756952, + "flos": 20338547529600.0, + "grad_norm": 1.8091425112120791, + "language_loss": 0.79684544, + "learning_rate": 2.701839972728379e-06, + "loss": 0.8177923, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.48046875, + "step": 6419, + "time_per_iteration": 2.405202627182007 + }, + { + "auxiliary_loss_clip": 0.01067784, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.0134511, + "balance_loss_mlp": 1.02183938, + "epoch": 0.38599128212836314, + "flos": 26320007433600.0, + "grad_norm": 2.031902158674048, + "language_loss": 0.67538393, + "learning_rate": 2.7014861653211314e-06, + "loss": 0.6963498, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.45898438, + "step": 6420, + "time_per_iteration": 3.852813482284546 + }, + { + "auxiliary_loss_clip": 0.01070134, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.01369035, + "balance_loss_mlp": 1.02428031, + "epoch": 0.3860514053810311, + "flos": 13552968124800.0, + "grad_norm": 1.9556236432527907, + "language_loss": 0.81696451, + "learning_rate": 2.701132332879646e-06, + "loss": 0.83793354, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45898438, + "step": 6421, + "time_per_iteration": 3.7698209285736084 + }, + { + "auxiliary_loss_clip": 0.0107089, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01142216, + "balance_loss_mlp": 1.02289939, + "epoch": 0.3861115286336991, + "flos": 20953368305280.0, + "grad_norm": 2.181946551620686, + "language_loss": 0.71599036, + "learning_rate": 2.700778475416552e-06, + "loss": 0.73695374, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47851562, + "step": 6422, + "time_per_iteration": 2.382434129714966 + }, + { + "auxiliary_loss_clip": 0.01069288, + "auxiliary_loss_mlp": 0.01024515, + "balance_loss_clip": 1.01148498, + "balance_loss_mlp": 1.02364755, + "epoch": 0.38617165188636704, + "flos": 16361765648640.0, + "grad_norm": 1.5707566251050673, + "language_loss": 0.66728193, + "learning_rate": 2.7004245929444776e-06, + "loss": 0.6882199, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45703125, + "step": 6423, + "time_per_iteration": 2.401672124862671 + }, + { + "auxiliary_loss_clip": 0.01072334, + "auxiliary_loss_mlp": 0.01028594, + "balance_loss_clip": 1.01381183, + "balance_loss_mlp": 1.02393031, + "epoch": 0.386231775139035, + "flos": 34785851708160.0, + "grad_norm": 1.7924744273413715, + "language_loss": 0.68787992, + "learning_rate": 2.7000706854760504e-06, + "loss": 0.70888919, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.48242188, + "step": 6424, + "time_per_iteration": 2.5371901988983154 + }, + { + "auxiliary_loss_clip": 0.01068759, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.01734054, + "balance_loss_mlp": 1.02193975, + "epoch": 0.38629189839170297, + "flos": 21725088197760.0, + "grad_norm": 1.3597977567548862, + "language_loss": 0.72149193, + "learning_rate": 2.699716753023901e-06, + "loss": 0.74249816, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46875, + "step": 6425, + "time_per_iteration": 3.8290693759918213 + }, + { + "auxiliary_loss_clip": 0.01073253, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.01898479, + "balance_loss_mlp": 1.02292788, + "epoch": 0.38635202164437094, + "flos": 27922520972160.0, + "grad_norm": 1.8597590825559922, + "language_loss": 0.81127745, + "learning_rate": 2.69936279560066e-06, + "loss": 0.83234417, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.50390625, + "step": 6426, + "time_per_iteration": 2.444472312927246 + }, + { + "auxiliary_loss_clip": 0.01071708, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.01869082, + "balance_loss_mlp": 1.02344429, + "epoch": 0.38641214489703896, + "flos": 23585505016320.0, + "grad_norm": 1.9859570078916087, + "language_loss": 0.7460295, + "learning_rate": 2.699008813218961e-06, + "loss": 0.76707983, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48242188, + "step": 6427, + "time_per_iteration": 2.413057327270508 + }, + { + "auxiliary_loss_clip": 0.01069843, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.01843154, + "balance_loss_mlp": 1.02340186, + "epoch": 0.3864722681497069, + "flos": 12640408341120.0, + "grad_norm": 2.1221194787307676, + "language_loss": 0.70528924, + "learning_rate": 2.698654805891435e-06, + "loss": 0.72631478, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46484375, + "step": 6428, + "time_per_iteration": 2.3660902976989746 + }, + { + "auxiliary_loss_clip": 0.01072193, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.01941705, + "balance_loss_mlp": 1.02372837, + "epoch": 0.3865323914023749, + "flos": 17598075269760.0, + "grad_norm": 2.2501368626028895, + "language_loss": 0.84230453, + "learning_rate": 2.6983007736307158e-06, + "loss": 0.86335385, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.48632812, + "step": 6429, + "time_per_iteration": 2.353111743927002 + }, + { + "auxiliary_loss_clip": 0.01072221, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.01343548, + "balance_loss_mlp": 1.02398658, + "epoch": 0.38659251465504285, + "flos": 18477956154240.0, + "grad_norm": 2.1164698228429746, + "language_loss": 0.81232369, + "learning_rate": 2.6979467164494387e-06, + "loss": 0.8333202, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48046875, + "step": 6430, + "time_per_iteration": 2.406045913696289 + }, + { + "auxiliary_loss_clip": 0.01012101, + "auxiliary_loss_mlp": 0.01012191, + "balance_loss_clip": 1.01060581, + "balance_loss_mlp": 1.00226521, + "epoch": 0.3866526379077108, + "flos": 64162259228160.0, + "grad_norm": 0.7232263840275747, + "language_loss": 0.58855486, + "learning_rate": 2.697592634360238e-06, + "loss": 0.60879779, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.09814453, + "step": 6431, + "time_per_iteration": 3.0711395740509033 + }, + { + "auxiliary_loss_clip": 0.01072427, + "auxiliary_loss_mlp": 0.01028632, + "balance_loss_clip": 1.01259255, + "balance_loss_mlp": 1.02267051, + "epoch": 0.3867127611603788, + "flos": 14387532197760.0, + "grad_norm": 2.257020157116884, + "language_loss": 0.79567808, + "learning_rate": 2.6972385273757513e-06, + "loss": 0.81668866, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.49804688, + "step": 6432, + "time_per_iteration": 2.3641786575317383 + }, + { + "auxiliary_loss_clip": 0.0107401, + "auxiliary_loss_mlp": 0.01032301, + "balance_loss_clip": 1.01705456, + "balance_loss_mlp": 1.02310538, + "epoch": 0.38677288441304675, + "flos": 20009735544960.0, + "grad_norm": 2.141807986625703, + "language_loss": 0.80594218, + "learning_rate": 2.6968843955086155e-06, + "loss": 0.82700533, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5078125, + "step": 6433, + "time_per_iteration": 2.3678066730499268 + }, + { + "auxiliary_loss_clip": 0.01073231, + "auxiliary_loss_mlp": 0.01029923, + "balance_loss_clip": 1.01449776, + "balance_loss_mlp": 1.02351093, + "epoch": 0.3868330076657147, + "flos": 22235797699200.0, + "grad_norm": 1.5544772490871022, + "language_loss": 0.70720983, + "learning_rate": 2.696530238771467e-06, + "loss": 0.72824132, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49609375, + "step": 6434, + "time_per_iteration": 2.4034829139709473 + }, + { + "auxiliary_loss_clip": 0.01074369, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.01748013, + "balance_loss_mlp": 1.02348089, + "epoch": 0.3868931309183827, + "flos": 16726503288960.0, + "grad_norm": 1.6909865285431163, + "language_loss": 0.77458197, + "learning_rate": 2.696176057176947e-06, + "loss": 0.79564953, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 6435, + "time_per_iteration": 2.358459234237671 + }, + { + "auxiliary_loss_clip": 0.01071084, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.01889133, + "balance_loss_mlp": 1.02302909, + "epoch": 0.38695325417105064, + "flos": 22673608548480.0, + "grad_norm": 1.6545892669921791, + "language_loss": 0.79653662, + "learning_rate": 2.6958218507376936e-06, + "loss": 0.81758314, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48046875, + "step": 6436, + "time_per_iteration": 2.393425226211548 + }, + { + "auxiliary_loss_clip": 0.01069698, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.01666141, + "balance_loss_mlp": 1.02266824, + "epoch": 0.3870133774237186, + "flos": 23110930638720.0, + "grad_norm": 1.6240719441988032, + "language_loss": 0.76536834, + "learning_rate": 2.6954676194663486e-06, + "loss": 0.78636229, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.47070312, + "step": 6437, + "time_per_iteration": 2.3904120922088623 + }, + { + "auxiliary_loss_clip": 0.01070421, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.02124822, + "balance_loss_mlp": 1.02351093, + "epoch": 0.3870735006763866, + "flos": 17674744348800.0, + "grad_norm": 2.1502736092777988, + "language_loss": 0.81843197, + "learning_rate": 2.6951133633755538e-06, + "loss": 0.83948618, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46875, + "step": 6438, + "time_per_iteration": 2.3632795810699463 + }, + { + "auxiliary_loss_clip": 0.01072497, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.01767588, + "balance_loss_mlp": 1.02363777, + "epoch": 0.38713362392905454, + "flos": 23294643546240.0, + "grad_norm": 1.737557186528857, + "language_loss": 0.75216937, + "learning_rate": 2.6947590824779502e-06, + "loss": 0.77321649, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.48828125, + "step": 6439, + "time_per_iteration": 2.3971452713012695 + }, + { + "auxiliary_loss_clip": 0.01068407, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.01677775, + "balance_loss_mlp": 1.02272558, + "epoch": 0.38719374718172256, + "flos": 21030177029760.0, + "grad_norm": 1.4625146126776782, + "language_loss": 0.73726898, + "learning_rate": 2.694404776786182e-06, + "loss": 0.75825536, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45703125, + "step": 6440, + "time_per_iteration": 2.4049558639526367 + }, + { + "auxiliary_loss_clip": 0.01072184, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.01621962, + "balance_loss_mlp": 1.02272618, + "epoch": 0.3872538704343905, + "flos": 19608758046720.0, + "grad_norm": 1.9449260359710494, + "language_loss": 0.82023013, + "learning_rate": 2.6940504463128933e-06, + "loss": 0.8412624, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49414062, + "step": 6441, + "time_per_iteration": 2.3869669437408447 + }, + { + "auxiliary_loss_clip": 0.01072746, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.02196097, + "balance_loss_mlp": 1.02449954, + "epoch": 0.3873139936870585, + "flos": 17529086689920.0, + "grad_norm": 2.217150395137991, + "language_loss": 0.81612539, + "learning_rate": 2.6936960910707307e-06, + "loss": 0.8372103, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48242188, + "step": 6442, + "time_per_iteration": 2.3692736625671387 + }, + { + "auxiliary_loss_clip": 0.01070033, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.01205254, + "balance_loss_mlp": 1.02191114, + "epoch": 0.38737411693972645, + "flos": 17785558604160.0, + "grad_norm": 1.5835994959432471, + "language_loss": 0.83717436, + "learning_rate": 2.693341711072338e-06, + "loss": 0.85813987, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 6443, + "time_per_iteration": 2.3825783729553223 + }, + { + "auxiliary_loss_clip": 0.01010682, + "auxiliary_loss_mlp": 0.01003713, + "balance_loss_clip": 1.00215173, + "balance_loss_mlp": 1.00128007, + "epoch": 0.3874342401923944, + "flos": 58301984119680.0, + "grad_norm": 0.7545362952080468, + "language_loss": 0.50255579, + "learning_rate": 2.6929873063303634e-06, + "loss": 0.52269971, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.09375, + "step": 6444, + "time_per_iteration": 3.0356690883636475 + }, + { + "auxiliary_loss_clip": 0.01068063, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.01370442, + "balance_loss_mlp": 1.02287197, + "epoch": 0.3874943634450624, + "flos": 17710984206720.0, + "grad_norm": 1.6195469748190225, + "language_loss": 0.78630078, + "learning_rate": 2.6926328768574545e-06, + "loss": 0.80724943, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45117188, + "step": 6445, + "time_per_iteration": 2.3599650859832764 + }, + { + "auxiliary_loss_clip": 0.01069329, + "auxiliary_loss_mlp": 0.01025607, + "balance_loss_clip": 1.01224995, + "balance_loss_mlp": 1.02317977, + "epoch": 0.38755448669773035, + "flos": 19243845849600.0, + "grad_norm": 1.9021278803043031, + "language_loss": 0.80816722, + "learning_rate": 2.6922784226662595e-06, + "loss": 0.82911658, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4609375, + "step": 6446, + "time_per_iteration": 2.3811867237091064 + }, + { + "auxiliary_loss_clip": 0.01069717, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.01606715, + "balance_loss_mlp": 1.02237332, + "epoch": 0.3876146099503983, + "flos": 20593238964480.0, + "grad_norm": 1.8365217279326018, + "language_loss": 0.77433693, + "learning_rate": 2.6919239437694288e-06, + "loss": 0.79533482, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47265625, + "step": 6447, + "time_per_iteration": 2.3864388465881348 + }, + { + "auxiliary_loss_clip": 0.01070298, + "auxiliary_loss_mlp": 0.01027419, + "balance_loss_clip": 1.01415086, + "balance_loss_mlp": 1.02335882, + "epoch": 0.3876747332030663, + "flos": 19280120618880.0, + "grad_norm": 1.5373124043863238, + "language_loss": 0.7613622, + "learning_rate": 2.691569440179612e-06, + "loss": 0.78233933, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46875, + "step": 6448, + "time_per_iteration": 2.3873212337493896 + }, + { + "auxiliary_loss_clip": 0.01067967, + "auxiliary_loss_mlp": 0.01029386, + "balance_loss_clip": 1.0157125, + "balance_loss_mlp": 1.0216198, + "epoch": 0.38773485645573424, + "flos": 18945094412160.0, + "grad_norm": 1.6357830520393237, + "language_loss": 0.75671995, + "learning_rate": 2.691214911909461e-06, + "loss": 0.77769339, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 6449, + "time_per_iteration": 2.3578057289123535 + }, + { + "auxiliary_loss_clip": 0.01070564, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.01674914, + "balance_loss_mlp": 1.0209111, + "epoch": 0.3877949797084022, + "flos": 23070361772160.0, + "grad_norm": 1.6775319351474467, + "language_loss": 0.78557169, + "learning_rate": 2.690860358971628e-06, + "loss": 0.80659378, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49609375, + "step": 6450, + "time_per_iteration": 2.3988382816314697 + }, + { + "auxiliary_loss_clip": 0.01074259, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.01351023, + "balance_loss_mlp": 1.02337241, + "epoch": 0.3878551029610702, + "flos": 29094275756160.0, + "grad_norm": 2.2474928407061308, + "language_loss": 0.7726, + "learning_rate": 2.690505781378766e-06, + "loss": 0.79362583, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5078125, + "step": 6451, + "time_per_iteration": 2.4257657527923584 + }, + { + "auxiliary_loss_clip": 0.01067293, + "auxiliary_loss_mlp": 0.01025521, + "balance_loss_clip": 1.01206207, + "balance_loss_mlp": 1.02180934, + "epoch": 0.38791522621373814, + "flos": 20995333626240.0, + "grad_norm": 2.1197352035668127, + "language_loss": 0.76294684, + "learning_rate": 2.6901511791435286e-06, + "loss": 0.78387499, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45507812, + "step": 6452, + "time_per_iteration": 2.406665086746216 + }, + { + "auxiliary_loss_clip": 0.0107, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01923287, + "balance_loss_mlp": 1.02287245, + "epoch": 0.3879753494664061, + "flos": 15485934481920.0, + "grad_norm": 1.7227356610543463, + "language_loss": 0.79668105, + "learning_rate": 2.689796552278571e-06, + "loss": 0.81770849, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47070312, + "step": 6453, + "time_per_iteration": 2.3568108081817627 + }, + { + "auxiliary_loss_clip": 0.01075601, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.0161593, + "balance_loss_mlp": 1.02417755, + "epoch": 0.3880354727190741, + "flos": 22052887752960.0, + "grad_norm": 1.6979432927561684, + "language_loss": 0.7157777, + "learning_rate": 2.689441900796549e-06, + "loss": 0.73685527, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.515625, + "step": 6454, + "time_per_iteration": 2.392834186553955 + }, + { + "auxiliary_loss_clip": 0.01073756, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.01199269, + "balance_loss_mlp": 1.02341008, + "epoch": 0.3880955959717421, + "flos": 20339245756800.0, + "grad_norm": 2.4324141555355134, + "language_loss": 0.77854466, + "learning_rate": 2.689087224710119e-06, + "loss": 0.79955673, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.50390625, + "step": 6455, + "time_per_iteration": 3.828409433364868 + }, + { + "auxiliary_loss_clip": 0.01067146, + "auxiliary_loss_mlp": 0.01024271, + "balance_loss_clip": 1.0107286, + "balance_loss_mlp": 1.0212636, + "epoch": 0.38815571922441006, + "flos": 23074306755840.0, + "grad_norm": 1.4008675524529726, + "language_loss": 0.75463307, + "learning_rate": 2.688732524031938e-06, + "loss": 0.77554727, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45898438, + "step": 6456, + "time_per_iteration": 2.392615556716919 + }, + { + "auxiliary_loss_clip": 0.01073966, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.01632977, + "balance_loss_mlp": 1.02410126, + "epoch": 0.388215842477078, + "flos": 20775904531200.0, + "grad_norm": 2.467623895924287, + "language_loss": 0.59536481, + "learning_rate": 2.688377798774665e-06, + "loss": 0.61641669, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49804688, + "step": 6457, + "time_per_iteration": 2.3738536834716797 + }, + { + "auxiliary_loss_clip": 0.01073014, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.0152992, + "balance_loss_mlp": 1.02263951, + "epoch": 0.388275965729746, + "flos": 20447162369280.0, + "grad_norm": 2.0753033357475736, + "language_loss": 0.79797566, + "learning_rate": 2.688023048950959e-06, + "loss": 0.81901228, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.50390625, + "step": 6458, + "time_per_iteration": 2.3875813484191895 + }, + { + "auxiliary_loss_clip": 0.01072008, + "auxiliary_loss_mlp": 0.01025145, + "balance_loss_clip": 1.01094759, + "balance_loss_mlp": 1.0228076, + "epoch": 0.38833608898241395, + "flos": 27891133793280.0, + "grad_norm": 1.8802103691728687, + "language_loss": 0.81069863, + "learning_rate": 2.6876682745734807e-06, + "loss": 0.83167017, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4921875, + "step": 6459, + "time_per_iteration": 2.4388864040374756 + }, + { + "auxiliary_loss_clip": 0.01071504, + "auxiliary_loss_mlp": 0.01026047, + "balance_loss_clip": 1.0121057, + "balance_loss_mlp": 1.02348924, + "epoch": 0.3883962122350819, + "flos": 18075442556160.0, + "grad_norm": 1.7271235675480758, + "language_loss": 0.61615485, + "learning_rate": 2.6873134756548902e-06, + "loss": 0.63713038, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48046875, + "step": 6460, + "time_per_iteration": 3.7920982837677 + }, + { + "auxiliary_loss_clip": 0.0107027, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.01505256, + "balance_loss_mlp": 1.02309084, + "epoch": 0.3884563354877499, + "flos": 23621151381120.0, + "grad_norm": 1.603888331922925, + "language_loss": 0.86011147, + "learning_rate": 2.6869586522078494e-06, + "loss": 0.88109481, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.47265625, + "step": 6461, + "time_per_iteration": 3.8378467559814453 + }, + { + "auxiliary_loss_clip": 0.01071492, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01687813, + "balance_loss_mlp": 1.02263474, + "epoch": 0.38851645874041785, + "flos": 27452310514560.0, + "grad_norm": 2.155288372346121, + "language_loss": 0.70770979, + "learning_rate": 2.686603804245022e-06, + "loss": 0.72872674, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.48828125, + "step": 6462, + "time_per_iteration": 2.4351794719696045 + }, + { + "auxiliary_loss_clip": 0.01070858, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.01176691, + "balance_loss_mlp": 1.02240705, + "epoch": 0.3885765819930858, + "flos": 25226911676160.0, + "grad_norm": 2.0032015468710274, + "language_loss": 0.7602641, + "learning_rate": 2.6862489317790708e-06, + "loss": 0.78123057, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48632812, + "step": 6463, + "time_per_iteration": 2.415416717529297 + }, + { + "auxiliary_loss_clip": 0.01072443, + "auxiliary_loss_mlp": 0.01037744, + "balance_loss_clip": 1.02204442, + "balance_loss_mlp": 1.02330184, + "epoch": 0.3886367052457538, + "flos": 16945653093120.0, + "grad_norm": 2.460515244440767, + "language_loss": 0.70125246, + "learning_rate": 2.6858940348226606e-06, + "loss": 0.72235441, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.4921875, + "step": 6464, + "time_per_iteration": 2.359823703765869 + }, + { + "auxiliary_loss_clip": 0.01070411, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.0133487, + "balance_loss_mlp": 1.02377701, + "epoch": 0.38869682849842174, + "flos": 27153140140800.0, + "grad_norm": 2.167047065494318, + "language_loss": 0.69389498, + "learning_rate": 2.685539113388456e-06, + "loss": 0.71487164, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46679688, + "step": 6465, + "time_per_iteration": 3.7958550453186035 + }, + { + "auxiliary_loss_clip": 0.01071652, + "auxiliary_loss_mlp": 0.01037322, + "balance_loss_clip": 1.02180052, + "balance_loss_mlp": 1.02325916, + "epoch": 0.3887569517510897, + "flos": 21062716283520.0, + "grad_norm": 1.9154723107647353, + "language_loss": 0.77889287, + "learning_rate": 2.6851841674891242e-06, + "loss": 0.79998267, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.484375, + "step": 6466, + "time_per_iteration": 2.3764679431915283 + }, + { + "auxiliary_loss_clip": 0.01071944, + "auxiliary_loss_mlp": 0.01038414, + "balance_loss_clip": 1.0237987, + "balance_loss_mlp": 1.02218235, + "epoch": 0.38881707500375773, + "flos": 29496091127040.0, + "grad_norm": 1.5082812801305472, + "language_loss": 0.69736731, + "learning_rate": 2.6848291971373325e-06, + "loss": 0.71847093, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49804688, + "step": 6467, + "time_per_iteration": 2.4498167037963867 + }, + { + "auxiliary_loss_clip": 0.01070674, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02347827, + "balance_loss_mlp": 1.02192819, + "epoch": 0.3888771982564257, + "flos": 17487470482560.0, + "grad_norm": 2.673371480529278, + "language_loss": 0.83308017, + "learning_rate": 2.684474202345748e-06, + "loss": 0.85418105, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.48632812, + "step": 6468, + "time_per_iteration": 2.345813035964966 + }, + { + "auxiliary_loss_clip": 0.01069591, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.01799738, + "balance_loss_mlp": 1.02141571, + "epoch": 0.38893732150909366, + "flos": 21941410181760.0, + "grad_norm": 1.9098837271843332, + "language_loss": 0.84474897, + "learning_rate": 2.6841191831270394e-06, + "loss": 0.86576289, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48242188, + "step": 6469, + "time_per_iteration": 2.384739875793457 + }, + { + "auxiliary_loss_clip": 0.01068844, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.01747084, + "balance_loss_mlp": 1.0214411, + "epoch": 0.3889974447617616, + "flos": 24275319125760.0, + "grad_norm": 1.532286436761178, + "language_loss": 0.74667895, + "learning_rate": 2.683764139493878e-06, + "loss": 0.76768535, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47460938, + "step": 6470, + "time_per_iteration": 2.403606414794922 + }, + { + "auxiliary_loss_clip": 0.01072047, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.01594281, + "balance_loss_mlp": 1.02286077, + "epoch": 0.3890575680144296, + "flos": 25665909511680.0, + "grad_norm": 1.8054630740411863, + "language_loss": 0.74865347, + "learning_rate": 2.683409071458932e-06, + "loss": 0.76967847, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4921875, + "step": 6471, + "time_per_iteration": 2.4449217319488525 + }, + { + "auxiliary_loss_clip": 0.01070529, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.0143311, + "balance_loss_mlp": 1.02330923, + "epoch": 0.38911769126709755, + "flos": 22854214344960.0, + "grad_norm": 1.6527159984556832, + "language_loss": 0.6779635, + "learning_rate": 2.6830539790348755e-06, + "loss": 0.69895518, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47265625, + "step": 6472, + "time_per_iteration": 2.3853163719177246 + }, + { + "auxiliary_loss_clip": 0.01069038, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02073467, + "balance_loss_mlp": 1.02235472, + "epoch": 0.3891778145197655, + "flos": 25446340771200.0, + "grad_norm": 1.6312456325993039, + "language_loss": 0.76454026, + "learning_rate": 2.6826988622343783e-06, + "loss": 0.78557217, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.46679688, + "step": 6473, + "time_per_iteration": 2.413755178451538 + }, + { + "auxiliary_loss_clip": 0.01073302, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.01818395, + "balance_loss_mlp": 1.02502763, + "epoch": 0.3892379377724335, + "flos": 14027088654720.0, + "grad_norm": 2.1327613707513464, + "language_loss": 0.70278001, + "learning_rate": 2.6823437210701155e-06, + "loss": 0.72384632, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.48242188, + "step": 6474, + "time_per_iteration": 2.3581089973449707 + }, + { + "auxiliary_loss_clip": 0.01071001, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.01386678, + "balance_loss_mlp": 1.02233028, + "epoch": 0.38929806102510145, + "flos": 20156405633280.0, + "grad_norm": 1.9791197149879987, + "language_loss": 0.68668908, + "learning_rate": 2.68198855555476e-06, + "loss": 0.70767677, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48828125, + "step": 6475, + "time_per_iteration": 2.410979747772217 + }, + { + "auxiliary_loss_clip": 0.01076725, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.02451897, + "epoch": 0.3893581842777694, + "flos": 22162864135680.0, + "grad_norm": 1.846463898915937, + "language_loss": 0.76028711, + "learning_rate": 2.6816333657009876e-06, + "loss": 0.78139907, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5234375, + "step": 6476, + "time_per_iteration": 2.3859353065490723 + }, + { + "auxiliary_loss_clip": 0.01012144, + "auxiliary_loss_mlp": 0.01006416, + "balance_loss_clip": 1.00499177, + "balance_loss_mlp": 1.00233853, + "epoch": 0.3894183075304374, + "flos": 67298367548160.0, + "grad_norm": 0.7936274695501733, + "language_loss": 0.58226871, + "learning_rate": 2.6812781515214742e-06, + "loss": 0.6024543, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.09765625, + "step": 6477, + "time_per_iteration": 3.0317206382751465 + }, + { + "auxiliary_loss_clip": 0.01071261, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.01383948, + "balance_loss_mlp": 1.02180839, + "epoch": 0.38947843078310534, + "flos": 18546630531840.0, + "grad_norm": 2.5432454095304675, + "language_loss": 0.78060055, + "learning_rate": 2.680922913028895e-06, + "loss": 0.8016094, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.49414062, + "step": 6478, + "time_per_iteration": 2.367189884185791 + }, + { + "auxiliary_loss_clip": 0.01068925, + "auxiliary_loss_mlp": 0.01030842, + "balance_loss_clip": 1.01638794, + "balance_loss_mlp": 1.02023172, + "epoch": 0.3895385540357733, + "flos": 14605145902080.0, + "grad_norm": 2.1643034659257996, + "language_loss": 0.82361877, + "learning_rate": 2.680567650235929e-06, + "loss": 0.84461641, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48632812, + "step": 6479, + "time_per_iteration": 2.361206531524658 + }, + { + "auxiliary_loss_clip": 0.01068606, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.01347816, + "balance_loss_mlp": 1.02187848, + "epoch": 0.38959867728844133, + "flos": 19974159002880.0, + "grad_norm": 1.6593907654725017, + "language_loss": 0.79844761, + "learning_rate": 2.680212363155254e-06, + "loss": 0.81940711, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46679688, + "step": 6480, + "time_per_iteration": 2.387129545211792 + }, + { + "auxiliary_loss_clip": 0.01068032, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.01264453, + "balance_loss_mlp": 1.02230835, + "epoch": 0.3896588005411093, + "flos": 22671094930560.0, + "grad_norm": 1.6120260428091473, + "language_loss": 0.83225536, + "learning_rate": 2.6798570517995505e-06, + "loss": 0.85319334, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45703125, + "step": 6481, + "time_per_iteration": 2.4379022121429443 + }, + { + "auxiliary_loss_clip": 0.0107096, + "auxiliary_loss_mlp": 0.01029307, + "balance_loss_clip": 1.01559782, + "balance_loss_mlp": 1.02476382, + "epoch": 0.38971892379377726, + "flos": 20994984512640.0, + "grad_norm": 1.5549126313373527, + "language_loss": 0.75272417, + "learning_rate": 2.679501716181497e-06, + "loss": 0.77372682, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 6482, + "time_per_iteration": 2.407773733139038 + }, + { + "auxiliary_loss_clip": 0.01070952, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01332748, + "balance_loss_mlp": 1.02251124, + "epoch": 0.3897790470464452, + "flos": 22527392307840.0, + "grad_norm": 2.435021378516795, + "language_loss": 0.78798187, + "learning_rate": 2.6791463563137752e-06, + "loss": 0.8089667, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.484375, + "step": 6483, + "time_per_iteration": 2.4283652305603027 + }, + { + "auxiliary_loss_clip": 0.01069997, + "auxiliary_loss_mlp": 0.01024407, + "balance_loss_clip": 1.00904083, + "balance_loss_mlp": 1.02176023, + "epoch": 0.3898391702991132, + "flos": 26208809153280.0, + "grad_norm": 1.4183277940049122, + "language_loss": 0.80131119, + "learning_rate": 2.6787909722090667e-06, + "loss": 0.82225525, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.48242188, + "step": 6484, + "time_per_iteration": 2.443831205368042 + }, + { + "auxiliary_loss_clip": 0.01071077, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.01605761, + "balance_loss_mlp": 1.02364087, + "epoch": 0.38989929355178116, + "flos": 21064601496960.0, + "grad_norm": 1.627917343405308, + "language_loss": 0.78003567, + "learning_rate": 2.6784355638800545e-06, + "loss": 0.80106306, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.47460938, + "step": 6485, + "time_per_iteration": 2.4115326404571533 + }, + { + "auxiliary_loss_clip": 0.01071824, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.01825571, + "balance_loss_mlp": 1.02301431, + "epoch": 0.3899594168044491, + "flos": 25482929742720.0, + "grad_norm": 2.4352871616528615, + "language_loss": 0.84839928, + "learning_rate": 2.6780801313394225e-06, + "loss": 0.86945033, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48828125, + "step": 6486, + "time_per_iteration": 2.4192097187042236 + }, + { + "auxiliary_loss_clip": 0.01072469, + "auxiliary_loss_mlp": 0.01027385, + "balance_loss_clip": 1.01297235, + "balance_loss_mlp": 1.02232361, + "epoch": 0.3900195400571171, + "flos": 31138021457280.0, + "grad_norm": 1.8294198896871836, + "language_loss": 0.85289669, + "learning_rate": 2.677724674599854e-06, + "loss": 0.87389517, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5, + "step": 6487, + "time_per_iteration": 2.457425832748413 + }, + { + "auxiliary_loss_clip": 0.01071129, + "auxiliary_loss_mlp": 0.01026888, + "balance_loss_clip": 1.01184368, + "balance_loss_mlp": 1.02410769, + "epoch": 0.39007966330978505, + "flos": 20228885349120.0, + "grad_norm": 1.4305199433096794, + "language_loss": 0.72924948, + "learning_rate": 2.6773691936740357e-06, + "loss": 0.75022966, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.46875, + "step": 6488, + "time_per_iteration": 2.3979246616363525 + }, + { + "auxiliary_loss_clip": 0.01072582, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.01380038, + "balance_loss_mlp": 1.02384758, + "epoch": 0.390139786562453, + "flos": 22527636687360.0, + "grad_norm": 1.8267719893420327, + "language_loss": 0.68645287, + "learning_rate": 2.677013688574654e-06, + "loss": 0.70746076, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48828125, + "step": 6489, + "time_per_iteration": 2.415444850921631 + }, + { + "auxiliary_loss_clip": 0.01068445, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.01842117, + "balance_loss_mlp": 1.02287579, + "epoch": 0.390199909815121, + "flos": 26431694472960.0, + "grad_norm": 2.277746125402378, + "language_loss": 0.80612254, + "learning_rate": 2.6766581593143937e-06, + "loss": 0.82712239, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45703125, + "step": 6490, + "time_per_iteration": 2.4373369216918945 + }, + { + "auxiliary_loss_clip": 0.01072244, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.01412678, + "balance_loss_mlp": 1.0231123, + "epoch": 0.39026003306778895, + "flos": 17273627205120.0, + "grad_norm": 2.084094265430606, + "language_loss": 0.89501071, + "learning_rate": 2.6763026059059455e-06, + "loss": 0.91601658, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48828125, + "step": 6491, + "time_per_iteration": 2.3583133220672607 + }, + { + "auxiliary_loss_clip": 0.01069856, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01433182, + "balance_loss_mlp": 1.02214003, + "epoch": 0.3903201563204569, + "flos": 24531756128640.0, + "grad_norm": 1.6843933061451706, + "language_loss": 0.78731203, + "learning_rate": 2.675947028361996e-06, + "loss": 0.80829942, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4765625, + "step": 6492, + "time_per_iteration": 2.4168667793273926 + }, + { + "auxiliary_loss_clip": 0.0107176, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.01584113, + "balance_loss_mlp": 1.02330852, + "epoch": 0.39038027957312493, + "flos": 23766843951360.0, + "grad_norm": 1.692379312343689, + "language_loss": 0.74323332, + "learning_rate": 2.6755914266952365e-06, + "loss": 0.76424325, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.484375, + "step": 6493, + "time_per_iteration": 2.423197031021118 + }, + { + "auxiliary_loss_clip": 0.01073855, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.01586103, + "balance_loss_mlp": 1.02310073, + "epoch": 0.3904404028257929, + "flos": 14099742927360.0, + "grad_norm": 1.78031682394404, + "language_loss": 0.76241297, + "learning_rate": 2.675235800918357e-06, + "loss": 0.78346688, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5078125, + "step": 6494, + "time_per_iteration": 2.3686842918395996 + }, + { + "auxiliary_loss_clip": 0.01076157, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.02342701, + "balance_loss_mlp": 1.02274215, + "epoch": 0.39050052607846086, + "flos": 16909099032960.0, + "grad_norm": 7.283111660615988, + "language_loss": 0.74459386, + "learning_rate": 2.6748801510440484e-06, + "loss": 0.76575232, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.53515625, + "step": 6495, + "time_per_iteration": 3.778712511062622 + }, + { + "auxiliary_loss_clip": 0.01071609, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.01537085, + "balance_loss_mlp": 1.02238202, + "epoch": 0.39056064933112883, + "flos": 25914735838080.0, + "grad_norm": 1.683167137524962, + "language_loss": 0.67795867, + "learning_rate": 2.674524477085003e-06, + "loss": 0.69898045, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.4921875, + "step": 6496, + "time_per_iteration": 2.4624650478363037 + }, + { + "auxiliary_loss_clip": 0.01011279, + "auxiliary_loss_mlp": 0.0100104, + "balance_loss_clip": 0.99935949, + "balance_loss_mlp": 1.00167203, + "epoch": 0.3906207725837968, + "flos": 60025471119360.0, + "grad_norm": 0.7020153734480111, + "language_loss": 0.53949678, + "learning_rate": 2.674168779053914e-06, + "loss": 0.55962002, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.09619141, + "step": 6497, + "time_per_iteration": 3.1207151412963867 + }, + { + "auxiliary_loss_clip": 0.01072483, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.01607883, + "balance_loss_mlp": 1.02425528, + "epoch": 0.39068089583646476, + "flos": 21067638785280.0, + "grad_norm": 1.9295321526756164, + "language_loss": 0.68641782, + "learning_rate": 2.6738130569634763e-06, + "loss": 0.7074492, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48242188, + "step": 6498, + "time_per_iteration": 2.3912081718444824 + }, + { + "auxiliary_loss_clip": 0.010112, + "auxiliary_loss_mlp": 0.01000682, + "balance_loss_clip": 0.99929941, + "balance_loss_mlp": 1.00132179, + "epoch": 0.3907410190891327, + "flos": 70441911987840.0, + "grad_norm": 0.7281905591079403, + "language_loss": 0.51770073, + "learning_rate": 2.673457310826383e-06, + "loss": 0.53781956, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.09863281, + "step": 6499, + "time_per_iteration": 4.466989040374756 + }, + { + "auxiliary_loss_clip": 0.010724, + "auxiliary_loss_mlp": 0.01037923, + "balance_loss_clip": 1.02064979, + "balance_loss_mlp": 1.02197552, + "epoch": 0.3908011423418007, + "flos": 27961274448000.0, + "grad_norm": 1.6407594440968745, + "language_loss": 0.73890769, + "learning_rate": 2.673101540655331e-06, + "loss": 0.76001096, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.50390625, + "step": 6500, + "time_per_iteration": 2.4607081413269043 + }, + { + "auxiliary_loss_clip": 0.01071294, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.01432037, + "balance_loss_mlp": 1.02248883, + "epoch": 0.39086126559446865, + "flos": 24460952158080.0, + "grad_norm": 2.075601751373423, + "language_loss": 0.69014835, + "learning_rate": 2.6727457464630166e-06, + "loss": 0.71115398, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48828125, + "step": 6501, + "time_per_iteration": 2.419100522994995 + }, + { + "auxiliary_loss_clip": 0.01071081, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.01877975, + "balance_loss_mlp": 1.02292275, + "epoch": 0.3909213888471366, + "flos": 16940730591360.0, + "grad_norm": 1.660495213771565, + "language_loss": 0.78816283, + "learning_rate": 2.6723899282621363e-06, + "loss": 0.80920589, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48242188, + "step": 6502, + "time_per_iteration": 3.780056953430176 + }, + { + "auxiliary_loss_clip": 0.01070883, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.01530504, + "balance_loss_mlp": 1.02517021, + "epoch": 0.3909815120998046, + "flos": 29277115879680.0, + "grad_norm": 2.1044036887519217, + "language_loss": 0.80444592, + "learning_rate": 2.6720340860653894e-06, + "loss": 0.82543778, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.45703125, + "step": 6503, + "time_per_iteration": 2.4481537342071533 + }, + { + "auxiliary_loss_clip": 0.01067127, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.01439512, + "balance_loss_mlp": 1.02128088, + "epoch": 0.39104163535247255, + "flos": 18950296204800.0, + "grad_norm": 5.435741648370807, + "language_loss": 0.71616352, + "learning_rate": 2.671678219885475e-06, + "loss": 0.73711479, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45898438, + "step": 6504, + "time_per_iteration": 2.3827760219573975 + }, + { + "auxiliary_loss_clip": 0.01070296, + "auxiliary_loss_mlp": 0.01032959, + "balance_loss_clip": 1.01871371, + "balance_loss_mlp": 1.02241278, + "epoch": 0.3911017586051405, + "flos": 26322137026560.0, + "grad_norm": 1.510870588813075, + "language_loss": 0.83190632, + "learning_rate": 2.6713223297350926e-06, + "loss": 0.85293889, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48046875, + "step": 6505, + "time_per_iteration": 3.819749355316162 + }, + { + "auxiliary_loss_clip": 0.01070381, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.01857638, + "balance_loss_mlp": 1.02335227, + "epoch": 0.3911618818578085, + "flos": 21834680555520.0, + "grad_norm": 3.53475962100524, + "language_loss": 0.71180999, + "learning_rate": 2.6709664156269426e-06, + "loss": 0.73284805, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.47070312, + "step": 6506, + "time_per_iteration": 2.4021599292755127 + }, + { + "auxiliary_loss_clip": 0.01068826, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.01986074, + "balance_loss_mlp": 1.02240252, + "epoch": 0.3912220051104765, + "flos": 16358833094400.0, + "grad_norm": 2.1625273886153793, + "language_loss": 0.75075412, + "learning_rate": 2.670610477573727e-06, + "loss": 0.77177703, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46484375, + "step": 6507, + "time_per_iteration": 2.361372947692871 + }, + { + "auxiliary_loss_clip": 0.0101144, + "auxiliary_loss_mlp": 0.01009372, + "balance_loss_clip": 1.00803065, + "balance_loss_mlp": 1.00197387, + "epoch": 0.39128212836314447, + "flos": 71047620898560.0, + "grad_norm": 0.759223497609016, + "language_loss": 0.56506526, + "learning_rate": 2.670254515588149e-06, + "loss": 0.58527339, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.09472656, + "step": 6508, + "time_per_iteration": 3.1868019104003906 + }, + { + "auxiliary_loss_clip": 0.01070415, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.0200026, + "balance_loss_mlp": 1.02263308, + "epoch": 0.39134225161581243, + "flos": 20331146321280.0, + "grad_norm": 1.79345970067716, + "language_loss": 0.75974876, + "learning_rate": 2.6698985296829115e-06, + "loss": 0.78078967, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4765625, + "step": 6509, + "time_per_iteration": 2.3881888389587402 + }, + { + "auxiliary_loss_clip": 0.01070691, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.01974368, + "balance_loss_mlp": 1.02108085, + "epoch": 0.3914023748684804, + "flos": 17017469493120.0, + "grad_norm": 2.7341706514927857, + "language_loss": 0.82551944, + "learning_rate": 2.6695425198707187e-06, + "loss": 0.84657848, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49804688, + "step": 6510, + "time_per_iteration": 2.360872268676758 + }, + { + "auxiliary_loss_clip": 0.01070789, + "auxiliary_loss_mlp": 0.01026773, + "balance_loss_clip": 1.01234913, + "balance_loss_mlp": 1.02283633, + "epoch": 0.39146249812114836, + "flos": 18404254540800.0, + "grad_norm": 2.061524877530665, + "language_loss": 0.76299548, + "learning_rate": 2.669186486164276e-06, + "loss": 0.78397119, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 6511, + "time_per_iteration": 2.374729871749878 + }, + { + "auxiliary_loss_clip": 0.01010949, + "auxiliary_loss_mlp": 0.01003806, + "balance_loss_clip": 1.00247073, + "balance_loss_mlp": 1.00114608, + "epoch": 0.3915226213738163, + "flos": 67633638134400.0, + "grad_norm": 0.7227842401470164, + "language_loss": 0.57769883, + "learning_rate": 2.6688304285762878e-06, + "loss": 0.59784639, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.09814453, + "step": 6512, + "time_per_iteration": 3.1133248805999756 + }, + { + "auxiliary_loss_clip": 0.01072146, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.01521444, + "balance_loss_mlp": 1.02253473, + "epoch": 0.3915827446264843, + "flos": 26358132504960.0, + "grad_norm": 1.6924279689360266, + "language_loss": 0.69814038, + "learning_rate": 2.6684743471194627e-06, + "loss": 0.71916991, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.49609375, + "step": 6513, + "time_per_iteration": 2.425922393798828 + }, + { + "auxiliary_loss_clip": 0.01073337, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.01479053, + "balance_loss_mlp": 1.02243829, + "epoch": 0.39164286787915226, + "flos": 21942841547520.0, + "grad_norm": 4.158661308402621, + "language_loss": 0.75745928, + "learning_rate": 2.668118241806508e-06, + "loss": 0.77849996, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5078125, + "step": 6514, + "time_per_iteration": 2.3662989139556885 + }, + { + "auxiliary_loss_clip": 0.0107254, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.01482892, + "balance_loss_mlp": 1.02344418, + "epoch": 0.3917029911318202, + "flos": 16398878290560.0, + "grad_norm": 1.9568285385612725, + "language_loss": 0.77076769, + "learning_rate": 2.6677621126501316e-06, + "loss": 0.79178137, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4921875, + "step": 6515, + "time_per_iteration": 2.4456069469451904 + }, + { + "auxiliary_loss_clip": 0.01068817, + "auxiliary_loss_mlp": 0.01025696, + "balance_loss_clip": 1.0126667, + "balance_loss_mlp": 1.02213216, + "epoch": 0.3917631143844882, + "flos": 26210554721280.0, + "grad_norm": 1.3565878970265306, + "language_loss": 0.80078703, + "learning_rate": 2.667405959663043e-06, + "loss": 0.82173216, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46679688, + "step": 6516, + "time_per_iteration": 2.4280171394348145 + }, + { + "auxiliary_loss_clip": 0.01072937, + "auxiliary_loss_mlp": 0.01029463, + "balance_loss_clip": 1.01500273, + "balance_loss_mlp": 1.02349448, + "epoch": 0.39182323763715615, + "flos": 18547468404480.0, + "grad_norm": 2.2096037200594503, + "language_loss": 0.70288467, + "learning_rate": 2.667049782857952e-06, + "loss": 0.72390866, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49414062, + "step": 6517, + "time_per_iteration": 2.3704264163970947 + }, + { + "auxiliary_loss_clip": 0.01070886, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.01746893, + "balance_loss_mlp": 1.02152848, + "epoch": 0.3918833608898241, + "flos": 34312115203200.0, + "grad_norm": 1.809158599947749, + "language_loss": 0.7184099, + "learning_rate": 2.666693582247571e-06, + "loss": 0.73944485, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49414062, + "step": 6518, + "time_per_iteration": 2.4997715950012207 + }, + { + "auxiliary_loss_clip": 0.010734, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.01490474, + "balance_loss_mlp": 1.02406752, + "epoch": 0.3919434841424921, + "flos": 36938107514880.0, + "grad_norm": 1.5865486953082641, + "language_loss": 0.61552501, + "learning_rate": 2.66633735784461e-06, + "loss": 0.63655806, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4921875, + "step": 6519, + "time_per_iteration": 2.522475004196167 + }, + { + "auxiliary_loss_clip": 0.0107778, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.01977742, + "balance_loss_mlp": 1.02490759, + "epoch": 0.3920036073951601, + "flos": 23507963153280.0, + "grad_norm": 2.165308211753418, + "language_loss": 0.67055762, + "learning_rate": 2.665981109661784e-06, + "loss": 0.69169247, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.52734375, + "step": 6520, + "time_per_iteration": 2.4056036472320557 + }, + { + "auxiliary_loss_clip": 0.01070733, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.01427794, + "balance_loss_mlp": 1.02329373, + "epoch": 0.39206373064782807, + "flos": 18405092413440.0, + "grad_norm": 1.7036751181350607, + "language_loss": 0.78544468, + "learning_rate": 2.6656248377118043e-06, + "loss": 0.80643404, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.47265625, + "step": 6521, + "time_per_iteration": 2.388385534286499 + }, + { + "auxiliary_loss_clip": 0.0107627, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.01412439, + "balance_loss_mlp": 1.02421784, + "epoch": 0.39212385390049603, + "flos": 12312224760960.0, + "grad_norm": 2.1774890338730417, + "language_loss": 0.69795561, + "learning_rate": 2.6652685420073867e-06, + "loss": 0.71903753, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.51953125, + "step": 6522, + "time_per_iteration": 2.3660755157470703 + }, + { + "auxiliary_loss_clip": 0.01073926, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.01585519, + "balance_loss_mlp": 1.02330637, + "epoch": 0.392183977153164, + "flos": 19718140936320.0, + "grad_norm": 1.8100435821216008, + "language_loss": 0.7645672, + "learning_rate": 2.664912222561246e-06, + "loss": 0.78561014, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.5078125, + "step": 6523, + "time_per_iteration": 2.380765676498413 + }, + { + "auxiliary_loss_clip": 0.01073468, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.01300025, + "balance_loss_mlp": 1.02377224, + "epoch": 0.39224410040583196, + "flos": 33143537352960.0, + "grad_norm": 2.19565243827638, + "language_loss": 0.62009531, + "learning_rate": 2.664555879386098e-06, + "loss": 0.64111197, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49609375, + "step": 6524, + "time_per_iteration": 2.48087739944458 + }, + { + "auxiliary_loss_clip": 0.0107139, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.0127461, + "balance_loss_mlp": 1.02229893, + "epoch": 0.39230422365849993, + "flos": 27781192321920.0, + "grad_norm": 1.7799679359940686, + "language_loss": 0.74032056, + "learning_rate": 2.6641995124946606e-06, + "loss": 0.7613166, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.49023438, + "step": 6525, + "time_per_iteration": 2.4335601329803467 + }, + { + "auxiliary_loss_clip": 0.01072296, + "auxiliary_loss_mlp": 0.01030525, + "balance_loss_clip": 1.01657772, + "balance_loss_mlp": 1.02280092, + "epoch": 0.3923643469111679, + "flos": 17930657681280.0, + "grad_norm": 1.922923076222103, + "language_loss": 0.81455332, + "learning_rate": 2.6638431218996517e-06, + "loss": 0.83558154, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.49609375, + "step": 6526, + "time_per_iteration": 2.3506667613983154 + }, + { + "auxiliary_loss_clip": 0.01072488, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01310003, + "balance_loss_mlp": 1.02294397, + "epoch": 0.39242447016383586, + "flos": 24058438560000.0, + "grad_norm": 1.7242206333312153, + "language_loss": 0.69962192, + "learning_rate": 2.6634867076137886e-06, + "loss": 0.72061968, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.49609375, + "step": 6527, + "time_per_iteration": 2.414858818054199 + }, + { + "auxiliary_loss_clip": 0.01069636, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.01660883, + "balance_loss_mlp": 1.02246583, + "epoch": 0.3924845934165038, + "flos": 10663486715520.0, + "grad_norm": 3.0285435867831803, + "language_loss": 0.82174188, + "learning_rate": 2.663130269649792e-06, + "loss": 0.8427515, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47265625, + "step": 6528, + "time_per_iteration": 2.350456714630127 + }, + { + "auxiliary_loss_clip": 0.01071739, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.01494205, + "balance_loss_mlp": 1.02350736, + "epoch": 0.3925447166691718, + "flos": 31244646349440.0, + "grad_norm": 1.5841101196191865, + "language_loss": 0.68338889, + "learning_rate": 2.6627738080203817e-06, + "loss": 0.70440561, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48242188, + "step": 6529, + "time_per_iteration": 2.492323875427246 + }, + { + "auxiliary_loss_clip": 0.01074034, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.01621377, + "balance_loss_mlp": 1.02381659, + "epoch": 0.39260483992183975, + "flos": 29414010787200.0, + "grad_norm": 2.4145039296146105, + "language_loss": 0.8057965, + "learning_rate": 2.662417322738279e-06, + "loss": 0.82685471, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.50390625, + "step": 6530, + "time_per_iteration": 2.416611433029175 + }, + { + "auxiliary_loss_clip": 0.01070743, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.01610875, + "balance_loss_mlp": 1.02235818, + "epoch": 0.3926649631745077, + "flos": 22856658140160.0, + "grad_norm": 1.4002561573666452, + "language_loss": 0.75478733, + "learning_rate": 2.6620608138162055e-06, + "loss": 0.77578831, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.484375, + "step": 6531, + "time_per_iteration": 2.4023075103759766 + }, + { + "auxiliary_loss_clip": 0.01011892, + "auxiliary_loss_mlp": 0.01001146, + "balance_loss_clip": 0.99988264, + "balance_loss_mlp": 1.00243986, + "epoch": 0.3927250864271757, + "flos": 63890880163200.0, + "grad_norm": 0.8061435048317566, + "language_loss": 0.60290277, + "learning_rate": 2.6617042812668857e-06, + "loss": 0.62303311, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.09472656, + "step": 6532, + "time_per_iteration": 2.8868446350097656 + }, + { + "auxiliary_loss_clip": 0.01011789, + "auxiliary_loss_mlp": 0.01000566, + "balance_loss_clip": 0.99911773, + "balance_loss_mlp": 1.00211549, + "epoch": 0.3927852096798437, + "flos": 68906117790720.0, + "grad_norm": 0.7726829765055597, + "language_loss": 0.554878, + "learning_rate": 2.661347725103041e-06, + "loss": 0.57500154, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.01446533, + "router_z_loss_mlp": 0.09667969, + "step": 6533, + "time_per_iteration": 3.139435291290283 + }, + { + "auxiliary_loss_clip": 0.0107725, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.01665688, + "balance_loss_mlp": 1.02562857, + "epoch": 0.39284533293251167, + "flos": 29714682349440.0, + "grad_norm": 1.8939754286522168, + "language_loss": 0.76383758, + "learning_rate": 2.6609911453373978e-06, + "loss": 0.78493142, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.515625, + "step": 6534, + "time_per_iteration": 3.886058807373047 + }, + { + "auxiliary_loss_clip": 0.01074632, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.01887536, + "balance_loss_mlp": 1.02283192, + "epoch": 0.39290545618517964, + "flos": 18551029363200.0, + "grad_norm": 2.4519466383752313, + "language_loss": 0.78075075, + "learning_rate": 2.660634541982681e-06, + "loss": 0.80183721, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.51953125, + "step": 6535, + "time_per_iteration": 2.3610856533050537 + }, + { + "auxiliary_loss_clip": 0.0107189, + "auxiliary_loss_mlp": 0.01026046, + "balance_loss_clip": 1.01236725, + "balance_loss_mlp": 1.02398586, + "epoch": 0.3929655794378476, + "flos": 26248295767680.0, + "grad_norm": 1.9195857503071385, + "language_loss": 0.69086009, + "learning_rate": 2.6602779150516163e-06, + "loss": 0.7118395, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48046875, + "step": 6536, + "time_per_iteration": 2.4630112648010254 + }, + { + "auxiliary_loss_clip": 0.01068108, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_clip": 1.01433218, + "balance_loss_mlp": 1.02199149, + "epoch": 0.39302570269051557, + "flos": 29276662032000.0, + "grad_norm": 1.7973021879875843, + "language_loss": 0.69352496, + "learning_rate": 2.6599212645569316e-06, + "loss": 0.71448541, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 6537, + "time_per_iteration": 2.4424188137054443 + }, + { + "auxiliary_loss_clip": 0.01074702, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.01601124, + "balance_loss_mlp": 1.02404118, + "epoch": 0.39308582594318353, + "flos": 17346490945920.0, + "grad_norm": 1.6493351350649215, + "language_loss": 0.78746736, + "learning_rate": 2.6595645905113546e-06, + "loss": 0.80851877, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.5078125, + "step": 6538, + "time_per_iteration": 2.366671562194824 + }, + { + "auxiliary_loss_clip": 0.01011739, + "auxiliary_loss_mlp": 0.01002507, + "balance_loss_clip": 1.00125504, + "balance_loss_mlp": 1.00184202, + "epoch": 0.3931459491958515, + "flos": 61004296396800.0, + "grad_norm": 0.8007630957422047, + "language_loss": 0.61874413, + "learning_rate": 2.659207892927614e-06, + "loss": 0.63888663, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.09912109, + "step": 6539, + "time_per_iteration": 4.198698282241821 + }, + { + "auxiliary_loss_clip": 0.01073834, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.01349247, + "balance_loss_mlp": 1.0240736, + "epoch": 0.39320607244851946, + "flos": 39014567026560.0, + "grad_norm": 1.9675065114053225, + "language_loss": 0.68048406, + "learning_rate": 2.658851171818439e-06, + "loss": 0.70151436, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.49804688, + "step": 6540, + "time_per_iteration": 2.5468459129333496 + }, + { + "auxiliary_loss_clip": 0.0107159, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.01441133, + "balance_loss_mlp": 1.02308488, + "epoch": 0.3932661957011874, + "flos": 24678635685120.0, + "grad_norm": 1.7055337015864176, + "language_loss": 0.72762537, + "learning_rate": 2.65849442719656e-06, + "loss": 0.7486288, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.484375, + "step": 6541, + "time_per_iteration": 3.7921223640441895 + }, + { + "auxiliary_loss_clip": 0.01011855, + "auxiliary_loss_mlp": 0.0100478, + "balance_loss_clip": 1.00346291, + "balance_loss_mlp": 1.00177538, + "epoch": 0.3933263189538554, + "flos": 70093375084800.0, + "grad_norm": 0.8597354786466657, + "language_loss": 0.60322255, + "learning_rate": 2.65813765907471e-06, + "loss": 0.62338883, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.10058594, + "step": 6542, + "time_per_iteration": 2.870962619781494 + }, + { + "auxiliary_loss_clip": 0.01072311, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.01869273, + "balance_loss_mlp": 1.02378333, + "epoch": 0.39338644220652336, + "flos": 22927985781120.0, + "grad_norm": 1.4538670705019117, + "language_loss": 0.82157886, + "learning_rate": 2.657780867465619e-06, + "loss": 0.84263325, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.484375, + "step": 6543, + "time_per_iteration": 2.408843994140625 + }, + { + "auxiliary_loss_clip": 0.0107018, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01434934, + "balance_loss_mlp": 1.02141893, + "epoch": 0.3934465654591913, + "flos": 30846810873600.0, + "grad_norm": 1.4830334299248502, + "language_loss": 0.68879461, + "learning_rate": 2.6574240523820214e-06, + "loss": 0.70978415, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48828125, + "step": 6544, + "time_per_iteration": 3.8891849517822266 + }, + { + "auxiliary_loss_clip": 0.01074671, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.01855075, + "balance_loss_mlp": 1.02308834, + "epoch": 0.3935066887118593, + "flos": 29235394938240.0, + "grad_norm": 2.233016669922964, + "language_loss": 0.75380683, + "learning_rate": 2.6570672138366503e-06, + "loss": 0.77490222, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.515625, + "step": 6545, + "time_per_iteration": 2.4544827938079834 + }, + { + "auxiliary_loss_clip": 0.01070354, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.01915145, + "balance_loss_mlp": 1.02438796, + "epoch": 0.3935668119645273, + "flos": 19134288403200.0, + "grad_norm": 1.375636981667089, + "language_loss": 0.74318087, + "learning_rate": 2.65671035184224e-06, + "loss": 0.76420653, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4609375, + "step": 6546, + "time_per_iteration": 2.3854589462280273 + }, + { + "auxiliary_loss_clip": 0.01074191, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.01900089, + "balance_loss_mlp": 1.02254164, + "epoch": 0.3936269352171953, + "flos": 18515103707520.0, + "grad_norm": 2.160252578989639, + "language_loss": 0.81589425, + "learning_rate": 2.656353466411527e-06, + "loss": 0.83697367, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.515625, + "step": 6547, + "time_per_iteration": 2.370854139328003 + }, + { + "auxiliary_loss_clip": 0.01071497, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.01473093, + "balance_loss_mlp": 1.02171779, + "epoch": 0.39368705846986324, + "flos": 15631906343040.0, + "grad_norm": 2.0570765177892136, + "language_loss": 0.84051442, + "learning_rate": 2.6559965575572475e-06, + "loss": 0.86151642, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.49609375, + "step": 6548, + "time_per_iteration": 2.3534698486328125 + }, + { + "auxiliary_loss_clip": 0.01068807, + "auxiliary_loss_mlp": 0.01024729, + "balance_loss_clip": 1.01131225, + "balance_loss_mlp": 1.02213502, + "epoch": 0.3937471817225312, + "flos": 21324739104000.0, + "grad_norm": 1.456044065726379, + "language_loss": 0.72929382, + "learning_rate": 2.6556396252921375e-06, + "loss": 0.75022912, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46679688, + "step": 6549, + "time_per_iteration": 2.414046049118042 + }, + { + "auxiliary_loss_clip": 0.01074092, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.01500392, + "balance_loss_mlp": 1.0246824, + "epoch": 0.39380730497519917, + "flos": 20775660151680.0, + "grad_norm": 1.8810851407891063, + "language_loss": 0.7762537, + "learning_rate": 2.6552826696289363e-06, + "loss": 0.79730314, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.4921875, + "step": 6550, + "time_per_iteration": 2.393656015396118 + }, + { + "auxiliary_loss_clip": 0.01070584, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.01179183, + "balance_loss_mlp": 1.02308702, + "epoch": 0.39386742822786713, + "flos": 21608897592960.0, + "grad_norm": 1.8924948801528723, + "language_loss": 0.81007159, + "learning_rate": 2.6549256905803815e-06, + "loss": 0.83104104, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.47460938, + "step": 6551, + "time_per_iteration": 2.3955979347229004 + }, + { + "auxiliary_loss_clip": 0.0107367, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.01774752, + "balance_loss_mlp": 1.02270997, + "epoch": 0.3939275514805351, + "flos": 12414031885440.0, + "grad_norm": 2.706361055961179, + "language_loss": 0.79950058, + "learning_rate": 2.654568688159214e-06, + "loss": 0.82056135, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.5078125, + "step": 6552, + "time_per_iteration": 2.3435208797454834 + }, + { + "auxiliary_loss_clip": 0.01072476, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.01882601, + "balance_loss_mlp": 1.02324486, + "epoch": 0.39398767473320306, + "flos": 18551029363200.0, + "grad_norm": 2.3230211960388516, + "language_loss": 0.71203274, + "learning_rate": 2.6542116623781736e-06, + "loss": 0.73308933, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4921875, + "step": 6553, + "time_per_iteration": 2.4161274433135986 + }, + { + "auxiliary_loss_clip": 0.01072341, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.01913667, + "balance_loss_mlp": 1.02345705, + "epoch": 0.39404779798587103, + "flos": 29307769920000.0, + "grad_norm": 2.607946335846458, + "language_loss": 0.6435535, + "learning_rate": 2.6538546132500023e-06, + "loss": 0.66461289, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48828125, + "step": 6554, + "time_per_iteration": 2.4407150745391846 + }, + { + "auxiliary_loss_clip": 0.01073041, + "auxiliary_loss_mlp": 0.01037871, + "balance_loss_clip": 1.02342331, + "balance_loss_mlp": 1.02415478, + "epoch": 0.394107921238539, + "flos": 34895618622720.0, + "grad_norm": 1.8755209423884256, + "language_loss": 0.79221523, + "learning_rate": 2.6534975407874417e-06, + "loss": 0.81332433, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49023438, + "step": 6555, + "time_per_iteration": 2.5411691665649414 + }, + { + "auxiliary_loss_clip": 0.01075385, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.01912856, + "balance_loss_mlp": 1.02499366, + "epoch": 0.39416804449120696, + "flos": 25080276499200.0, + "grad_norm": 1.7085186996768669, + "language_loss": 0.71265376, + "learning_rate": 2.653140445003234e-06, + "loss": 0.73375487, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.50390625, + "step": 6556, + "time_per_iteration": 2.4155921936035156 + }, + { + "auxiliary_loss_clip": 0.01072876, + "auxiliary_loss_mlp": 0.01023499, + "balance_loss_clip": 1.00914669, + "balance_loss_mlp": 1.02310085, + "epoch": 0.3942281677438749, + "flos": 32305272675840.0, + "grad_norm": 1.7635482576620043, + "language_loss": 0.83707261, + "learning_rate": 2.652783325910125e-06, + "loss": 0.85803634, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49804688, + "step": 6557, + "time_per_iteration": 2.5170037746429443 + }, + { + "auxiliary_loss_clip": 0.01073957, + "auxiliary_loss_mlp": 0.01026951, + "balance_loss_clip": 1.01318264, + "balance_loss_mlp": 1.02437162, + "epoch": 0.3942882909965429, + "flos": 24935456712960.0, + "grad_norm": 4.2244640702949, + "language_loss": 0.80184871, + "learning_rate": 2.652426183520857e-06, + "loss": 0.82285774, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.49609375, + "step": 6558, + "time_per_iteration": 2.408504009246826 + }, + { + "auxiliary_loss_clip": 0.01070894, + "auxiliary_loss_mlp": 0.01028135, + "balance_loss_clip": 1.01507616, + "balance_loss_mlp": 1.02291012, + "epoch": 0.39434841424921085, + "flos": 11873994975360.0, + "grad_norm": 1.7397184575560158, + "language_loss": 0.70887214, + "learning_rate": 2.652069017848178e-06, + "loss": 0.72986245, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.48046875, + "step": 6559, + "time_per_iteration": 2.380168914794922 + }, + { + "auxiliary_loss_clip": 0.01074489, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.01749015, + "balance_loss_mlp": 1.02278757, + "epoch": 0.3944085375018789, + "flos": 16360718307840.0, + "grad_norm": 1.9244022930595914, + "language_loss": 0.80489105, + "learning_rate": 2.651711828904833e-06, + "loss": 0.82596755, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.515625, + "step": 6560, + "time_per_iteration": 2.408808708190918 + }, + { + "auxiliary_loss_clip": 0.01074947, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.01499486, + "balance_loss_mlp": 1.02466321, + "epoch": 0.39446866075454684, + "flos": 10632623207040.0, + "grad_norm": 2.068328291681275, + "language_loss": 0.83086205, + "learning_rate": 2.6513546167035687e-06, + "loss": 0.85191661, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.50390625, + "step": 6561, + "time_per_iteration": 2.393876314163208 + }, + { + "auxiliary_loss_clip": 0.01073276, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.01291418, + "balance_loss_mlp": 1.02332056, + "epoch": 0.3945287840072148, + "flos": 18186501191040.0, + "grad_norm": 2.4164623338452085, + "language_loss": 0.79811245, + "learning_rate": 2.6509973812571336e-06, + "loss": 0.81912267, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.5, + "step": 6562, + "time_per_iteration": 2.3570621013641357 + }, + { + "auxiliary_loss_clip": 0.01069398, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_clip": 1.01935077, + "balance_loss_mlp": 1.02289522, + "epoch": 0.39458890725988277, + "flos": 23038765125120.0, + "grad_norm": 1.5110629373540265, + "language_loss": 0.81414276, + "learning_rate": 2.6506401225782763e-06, + "loss": 0.83516896, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 6563, + "time_per_iteration": 2.4207420349121094 + }, + { + "auxiliary_loss_clip": 0.0107185, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.01910758, + "balance_loss_mlp": 1.02302861, + "epoch": 0.39464903051255074, + "flos": 17158274472960.0, + "grad_norm": 3.81212947711729, + "language_loss": 0.70077831, + "learning_rate": 2.650282840679747e-06, + "loss": 0.72183955, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.48828125, + "step": 6564, + "time_per_iteration": 2.3709397315979004 + }, + { + "auxiliary_loss_clip": 0.01074149, + "auxiliary_loss_mlp": 0.0103804, + "balance_loss_clip": 1.02231073, + "balance_loss_mlp": 1.02338731, + "epoch": 0.3947091537652187, + "flos": 15888064055040.0, + "grad_norm": 2.6621807915574287, + "language_loss": 0.83101928, + "learning_rate": 2.6499255355742966e-06, + "loss": 0.85214114, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.5078125, + "step": 6565, + "time_per_iteration": 2.383371114730835 + }, + { + "auxiliary_loss_clip": 0.01072819, + "auxiliary_loss_mlp": 0.01031104, + "balance_loss_clip": 1.01696622, + "balance_loss_mlp": 1.02392125, + "epoch": 0.39476927701788667, + "flos": 18544675495680.0, + "grad_norm": 1.7362731446266715, + "language_loss": 0.83571386, + "learning_rate": 2.649568207274674e-06, + "loss": 0.85675311, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48828125, + "step": 6566, + "time_per_iteration": 2.390942335128784 + }, + { + "auxiliary_loss_clip": 0.01076011, + "auxiliary_loss_mlp": 0.01033674, + "balance_loss_clip": 1.01796269, + "balance_loss_mlp": 1.02457607, + "epoch": 0.39482940027055463, + "flos": 22274551175040.0, + "grad_norm": 1.628242175835704, + "language_loss": 0.77361, + "learning_rate": 2.649210855793634e-06, + "loss": 0.79470682, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.515625, + "step": 6567, + "time_per_iteration": 2.4575908184051514 + }, + { + "auxiliary_loss_clip": 0.01068418, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.01729381, + "balance_loss_mlp": 1.02343106, + "epoch": 0.3948895235232226, + "flos": 14756738492160.0, + "grad_norm": 1.8477315175351015, + "language_loss": 0.80487537, + "learning_rate": 2.648853481143928e-06, + "loss": 0.82586837, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 6568, + "time_per_iteration": 2.340587615966797 + }, + { + "auxiliary_loss_clip": 0.01072536, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.01344717, + "balance_loss_mlp": 1.02351689, + "epoch": 0.39494964677589056, + "flos": 22564644595200.0, + "grad_norm": 1.6683448632107267, + "language_loss": 0.84761685, + "learning_rate": 2.648496083338311e-06, + "loss": 0.86862463, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49023438, + "step": 6569, + "time_per_iteration": 2.4029245376586914 + }, + { + "auxiliary_loss_clip": 0.01073176, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.0162673, + "balance_loss_mlp": 1.02544475, + "epoch": 0.3950097700285585, + "flos": 22962165868800.0, + "grad_norm": 2.29149063381115, + "language_loss": 0.74519515, + "learning_rate": 2.648138662389537e-06, + "loss": 0.76622492, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.47851562, + "step": 6570, + "time_per_iteration": 2.4220478534698486 + }, + { + "auxiliary_loss_clip": 0.01072221, + "auxiliary_loss_mlp": 0.01026709, + "balance_loss_clip": 1.01253557, + "balance_loss_mlp": 1.02289999, + "epoch": 0.3950698932812265, + "flos": 20594181571200.0, + "grad_norm": 2.5870159572169644, + "language_loss": 0.79812562, + "learning_rate": 2.6477812183103606e-06, + "loss": 0.81911492, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.49414062, + "step": 6571, + "time_per_iteration": 2.3925352096557617 + }, + { + "auxiliary_loss_clip": 0.01071571, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.02297211, + "balance_loss_mlp": 1.02379191, + "epoch": 0.39513001653389446, + "flos": 20374752476160.0, + "grad_norm": 1.6882012571418306, + "language_loss": 0.77988535, + "learning_rate": 2.647423751113539e-06, + "loss": 0.80097646, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4765625, + "step": 6572, + "time_per_iteration": 2.3814303874969482 + }, + { + "auxiliary_loss_clip": 0.01073321, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.01555371, + "balance_loss_mlp": 1.02300763, + "epoch": 0.3951901397865625, + "flos": 26462592892800.0, + "grad_norm": 1.6707871094968614, + "language_loss": 0.74629748, + "learning_rate": 2.6470662608118294e-06, + "loss": 0.7673437, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.50390625, + "step": 6573, + "time_per_iteration": 3.8870034217834473 + }, + { + "auxiliary_loss_clip": 0.01069885, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.01440132, + "balance_loss_mlp": 1.02291834, + "epoch": 0.39525026303923044, + "flos": 43836595856640.0, + "grad_norm": 1.6465136559072577, + "language_loss": 0.68008959, + "learning_rate": 2.64670874741799e-06, + "loss": 0.70106161, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.46875, + "step": 6574, + "time_per_iteration": 2.599647045135498 + }, + { + "auxiliary_loss_clip": 0.01073978, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.01165223, + "balance_loss_mlp": 1.02387023, + "epoch": 0.3953103862918984, + "flos": 18039831102720.0, + "grad_norm": 2.3783593159200893, + "language_loss": 0.71899128, + "learning_rate": 2.6463512109447776e-06, + "loss": 0.74000162, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5, + "step": 6575, + "time_per_iteration": 2.3828279972076416 + }, + { + "auxiliary_loss_clip": 0.01073082, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.01983726, + "balance_loss_mlp": 1.02329433, + "epoch": 0.3953705095445664, + "flos": 16975259792640.0, + "grad_norm": 1.7801197347859303, + "language_loss": 0.73074341, + "learning_rate": 2.645993651404954e-06, + "loss": 0.75181937, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49804688, + "step": 6576, + "time_per_iteration": 2.353875160217285 + }, + { + "auxiliary_loss_clip": 0.01071715, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.01729655, + "balance_loss_mlp": 1.02338433, + "epoch": 0.39543063279723434, + "flos": 17410452289920.0, + "grad_norm": 2.196600370720259, + "language_loss": 0.83569044, + "learning_rate": 2.6456360688112785e-06, + "loss": 0.85671371, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.484375, + "step": 6577, + "time_per_iteration": 2.3597140312194824 + }, + { + "auxiliary_loss_clip": 0.01071332, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.0148046, + "balance_loss_mlp": 1.02414203, + "epoch": 0.3954907560499023, + "flos": 22783096172160.0, + "grad_norm": 2.4305163458615993, + "language_loss": 0.89513612, + "learning_rate": 2.6452784631765117e-06, + "loss": 0.91614127, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47265625, + "step": 6578, + "time_per_iteration": 2.3956236839294434 + }, + { + "auxiliary_loss_clip": 0.01074793, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.01591635, + "balance_loss_mlp": 1.02413046, + "epoch": 0.39555087930257027, + "flos": 21943330306560.0, + "grad_norm": 1.8394691599252675, + "language_loss": 0.78009337, + "learning_rate": 2.6449208345134174e-06, + "loss": 0.80115438, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.5078125, + "step": 6579, + "time_per_iteration": 3.8065104484558105 + }, + { + "auxiliary_loss_clip": 0.01073089, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.01633966, + "balance_loss_mlp": 1.02277017, + "epoch": 0.39561100255523823, + "flos": 20403800593920.0, + "grad_norm": 1.9642206333441918, + "language_loss": 0.68220782, + "learning_rate": 2.6445631828347566e-06, + "loss": 0.70325017, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.50390625, + "step": 6580, + "time_per_iteration": 3.7811038494110107 + }, + { + "auxiliary_loss_clip": 0.01071221, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.01605809, + "balance_loss_mlp": 1.02261019, + "epoch": 0.3956711258079062, + "flos": 27963334218240.0, + "grad_norm": 2.2522349701773847, + "language_loss": 0.58720434, + "learning_rate": 2.644205508153295e-06, + "loss": 0.60822201, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48632812, + "step": 6581, + "time_per_iteration": 2.4350225925445557 + }, + { + "auxiliary_loss_clip": 0.01076462, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.01679897, + "balance_loss_mlp": 1.02452028, + "epoch": 0.39573124906057416, + "flos": 14427437748480.0, + "grad_norm": 1.775692491099468, + "language_loss": 0.69364727, + "learning_rate": 2.6438478104817953e-06, + "loss": 0.7147364, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.51953125, + "step": 6582, + "time_per_iteration": 2.395512580871582 + }, + { + "auxiliary_loss_clip": 0.0107273, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01527953, + "balance_loss_mlp": 1.02370632, + "epoch": 0.39579137231324213, + "flos": 18732717411840.0, + "grad_norm": 3.289534970259823, + "language_loss": 0.75627226, + "learning_rate": 2.643490089833023e-06, + "loss": 0.77730334, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.49023438, + "step": 6583, + "time_per_iteration": 2.370741367340088 + }, + { + "auxiliary_loss_clip": 0.01072094, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.0186677, + "balance_loss_mlp": 1.02330935, + "epoch": 0.3958514955659101, + "flos": 17675442576000.0, + "grad_norm": 1.8348948649214971, + "language_loss": 0.65808529, + "learning_rate": 2.6431323462197453e-06, + "loss": 0.67913473, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48828125, + "step": 6584, + "time_per_iteration": 3.7506649494171143 + }, + { + "auxiliary_loss_clip": 0.01074744, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.02268934, + "balance_loss_mlp": 1.02272248, + "epoch": 0.39591161881857806, + "flos": 29307979388160.0, + "grad_norm": 1.9879930838699553, + "language_loss": 0.72674608, + "learning_rate": 2.642774579654728e-06, + "loss": 0.74788117, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.51953125, + "step": 6585, + "time_per_iteration": 2.459266424179077 + }, + { + "auxiliary_loss_clip": 0.01073069, + "auxiliary_loss_mlp": 0.01027575, + "balance_loss_clip": 1.01352, + "balance_loss_mlp": 1.02397621, + "epoch": 0.3959717420712461, + "flos": 25770753924480.0, + "grad_norm": 1.7278749965790197, + "language_loss": 0.73095381, + "learning_rate": 2.6424167901507393e-06, + "loss": 0.75196028, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49023438, + "step": 6586, + "time_per_iteration": 2.4353652000427246 + }, + { + "auxiliary_loss_clip": 0.01072521, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.01594853, + "balance_loss_mlp": 1.02377987, + "epoch": 0.39603186532391405, + "flos": 20922714264960.0, + "grad_norm": 1.6781280423681042, + "language_loss": 0.71794808, + "learning_rate": 2.6420589777205483e-06, + "loss": 0.73897719, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48632812, + "step": 6587, + "time_per_iteration": 2.405137777328491 + }, + { + "auxiliary_loss_clip": 0.01015345, + "auxiliary_loss_mlp": 0.01003776, + "balance_loss_clip": 1.00241685, + "balance_loss_mlp": 1.00530243, + "epoch": 0.396091988576582, + "flos": 54878261086080.0, + "grad_norm": 0.8987258930778897, + "language_loss": 0.61268282, + "learning_rate": 2.641701142376924e-06, + "loss": 0.63287407, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.01361084, + "router_z_loss_mlp": 0.10058594, + "step": 6588, + "time_per_iteration": 2.8463587760925293 + }, + { + "auxiliary_loss_clip": 0.01072132, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.01319623, + "balance_loss_mlp": 1.02277255, + "epoch": 0.39615211182925, + "flos": 20701888715520.0, + "grad_norm": 1.6636573203544305, + "language_loss": 0.8132534, + "learning_rate": 2.6413432841326364e-06, + "loss": 0.83425158, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4921875, + "step": 6589, + "time_per_iteration": 2.4195492267608643 + }, + { + "auxiliary_loss_clip": 0.01072344, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.01224494, + "balance_loss_mlp": 1.0235672, + "epoch": 0.39621223508191794, + "flos": 20993308767360.0, + "grad_norm": 4.030522095439193, + "language_loss": 0.7032398, + "learning_rate": 2.6409854030004564e-06, + "loss": 0.72423589, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48632812, + "step": 6590, + "time_per_iteration": 2.3825924396514893 + }, + { + "auxiliary_loss_clip": 0.01073648, + "auxiliary_loss_mlp": 0.01029856, + "balance_loss_clip": 1.01562881, + "balance_loss_mlp": 1.0243094, + "epoch": 0.3962723583345859, + "flos": 23367681843840.0, + "grad_norm": 1.7927683030698112, + "language_loss": 0.76021945, + "learning_rate": 2.640627498993157e-06, + "loss": 0.78125453, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4921875, + "step": 6591, + "time_per_iteration": 2.378952741622925 + }, + { + "auxiliary_loss_clip": 0.0107514, + "auxiliary_loss_mlp": 0.01039716, + "balance_loss_clip": 1.0240761, + "balance_loss_mlp": 1.02498293, + "epoch": 0.39633248158725387, + "flos": 25114526409600.0, + "grad_norm": 2.0173311176285207, + "language_loss": 0.78368616, + "learning_rate": 2.6402695721235094e-06, + "loss": 0.80483472, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.5, + "step": 6592, + "time_per_iteration": 2.427351474761963 + }, + { + "auxiliary_loss_clip": 0.01068367, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.01445425, + "balance_loss_mlp": 1.02219844, + "epoch": 0.39639260483992184, + "flos": 39786007628160.0, + "grad_norm": 2.0917031031685687, + "language_loss": 0.69307292, + "learning_rate": 2.6399116224042875e-06, + "loss": 0.71403217, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4609375, + "step": 6593, + "time_per_iteration": 2.5655195713043213 + }, + { + "auxiliary_loss_clip": 0.01076396, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.01721835, + "balance_loss_mlp": 1.02364969, + "epoch": 0.3964527280925898, + "flos": 17346106920960.0, + "grad_norm": 1.6529949247531577, + "language_loss": 0.77677226, + "learning_rate": 2.6395536498482666e-06, + "loss": 0.79786396, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.52734375, + "step": 6594, + "time_per_iteration": 2.421786069869995 + }, + { + "auxiliary_loss_clip": 0.01013301, + "auxiliary_loss_mlp": 0.01002876, + "balance_loss_clip": 1.00164258, + "balance_loss_mlp": 1.00343752, + "epoch": 0.39651285134525777, + "flos": 71714182176000.0, + "grad_norm": 0.9507616657036345, + "language_loss": 0.63048959, + "learning_rate": 2.6391956544682205e-06, + "loss": 0.6506514, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.09863281, + "step": 6595, + "time_per_iteration": 3.070241689682007 + }, + { + "auxiliary_loss_clip": 0.01077312, + "auxiliary_loss_mlp": 0.01042543, + "balance_loss_clip": 1.02587748, + "balance_loss_mlp": 1.02509665, + "epoch": 0.39657297459792573, + "flos": 25774524351360.0, + "grad_norm": 2.095917998493346, + "language_loss": 0.63511324, + "learning_rate": 2.6388376362769258e-06, + "loss": 0.65631175, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.5234375, + "step": 6596, + "time_per_iteration": 2.433924436569214 + }, + { + "auxiliary_loss_clip": 0.01070177, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.02105105, + "balance_loss_mlp": 1.0235101, + "epoch": 0.3966330978505937, + "flos": 20265090295680.0, + "grad_norm": 2.288815085915132, + "language_loss": 0.78737879, + "learning_rate": 2.638479595287159e-06, + "loss": 0.80843472, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46679688, + "step": 6597, + "time_per_iteration": 2.441993474960327 + }, + { + "auxiliary_loss_clip": 0.01075419, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.01642215, + "balance_loss_mlp": 1.02461541, + "epoch": 0.39669322110326166, + "flos": 20630142138240.0, + "grad_norm": 2.048859630911311, + "language_loss": 0.67515361, + "learning_rate": 2.638121531511698e-06, + "loss": 0.69622266, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.5078125, + "step": 6598, + "time_per_iteration": 2.5750367641448975 + }, + { + "auxiliary_loss_clip": 0.01073735, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.01866269, + "balance_loss_mlp": 1.02414203, + "epoch": 0.3967533443559297, + "flos": 21724983463680.0, + "grad_norm": 1.665831349375174, + "language_loss": 0.78801513, + "learning_rate": 2.637763444963321e-06, + "loss": 0.80908036, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.49609375, + "step": 6599, + "time_per_iteration": 2.4718222618103027 + }, + { + "auxiliary_loss_clip": 0.01073288, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.01309323, + "balance_loss_mlp": 1.02418017, + "epoch": 0.39681346760859765, + "flos": 25482964654080.0, + "grad_norm": 2.4949119101660076, + "language_loss": 0.72558117, + "learning_rate": 2.637405335654807e-06, + "loss": 0.74660027, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.4921875, + "step": 6600, + "time_per_iteration": 2.468285083770752 + }, + { + "auxiliary_loss_clip": 0.01072041, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.01596189, + "balance_loss_mlp": 1.02299666, + "epoch": 0.3968735908612656, + "flos": 20958535186560.0, + "grad_norm": 2.1987392358751072, + "language_loss": 0.84941244, + "learning_rate": 2.6370472035989367e-06, + "loss": 0.87043887, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49023438, + "step": 6601, + "time_per_iteration": 2.467297077178955 + }, + { + "auxiliary_loss_clip": 0.01079397, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.02028847, + "balance_loss_mlp": 1.02517986, + "epoch": 0.3969337141139336, + "flos": 10706324820480.0, + "grad_norm": 12.416557919366461, + "language_loss": 0.70184731, + "learning_rate": 2.6366890488084897e-06, + "loss": 0.72301674, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.54296875, + "step": 6602, + "time_per_iteration": 2.425172805786133 + }, + { + "auxiliary_loss_clip": 0.01073373, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02181971, + "balance_loss_mlp": 1.02396977, + "epoch": 0.39699383736660154, + "flos": 17593013122560.0, + "grad_norm": 2.2917741666065705, + "language_loss": 0.83603591, + "learning_rate": 2.636330871296249e-06, + "loss": 0.85713363, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49414062, + "step": 6603, + "time_per_iteration": 2.417780637741089 + }, + { + "auxiliary_loss_clip": 0.01071579, + "auxiliary_loss_mlp": 0.01028203, + "balance_loss_clip": 1.01385045, + "balance_loss_mlp": 1.02388239, + "epoch": 0.3970539606192695, + "flos": 17784965111040.0, + "grad_norm": 1.535697167451568, + "language_loss": 0.7330876, + "learning_rate": 2.635972671074996e-06, + "loss": 0.75408542, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4765625, + "step": 6604, + "time_per_iteration": 2.4277050495147705 + }, + { + "auxiliary_loss_clip": 0.01069355, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.01400399, + "balance_loss_mlp": 1.02336073, + "epoch": 0.3971140838719375, + "flos": 24788367688320.0, + "grad_norm": 2.023280842428495, + "language_loss": 0.82218075, + "learning_rate": 2.6356144481575144e-06, + "loss": 0.84316093, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4609375, + "step": 6605, + "time_per_iteration": 2.429494857788086 + }, + { + "auxiliary_loss_clip": 0.010693, + "auxiliary_loss_mlp": 0.01025142, + "balance_loss_clip": 1.01207614, + "balance_loss_mlp": 1.02218819, + "epoch": 0.39717420712460544, + "flos": 24242430758400.0, + "grad_norm": 1.5420859583299282, + "language_loss": 0.73923743, + "learning_rate": 2.6352562025565885e-06, + "loss": 0.7601819, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.47070312, + "step": 6606, + "time_per_iteration": 2.4811911582946777 + }, + { + "auxiliary_loss_clip": 0.0107577, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.01478803, + "balance_loss_mlp": 1.02628827, + "epoch": 0.3972343303772734, + "flos": 25883523216000.0, + "grad_norm": 2.0636722669012992, + "language_loss": 0.73940217, + "learning_rate": 2.634897934285002e-06, + "loss": 0.76046103, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.49414062, + "step": 6607, + "time_per_iteration": 2.4875147342681885 + }, + { + "auxiliary_loss_clip": 0.01074223, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.01935089, + "balance_loss_mlp": 1.02472949, + "epoch": 0.39729445362994137, + "flos": 45621984430080.0, + "grad_norm": 2.1273720981107083, + "language_loss": 0.67869693, + "learning_rate": 2.6345396433555415e-06, + "loss": 0.69977665, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49414062, + "step": 6608, + "time_per_iteration": 2.6384451389312744 + }, + { + "auxiliary_loss_clip": 0.0107368, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.01509643, + "balance_loss_mlp": 1.0232302, + "epoch": 0.39735457688260933, + "flos": 20192924782080.0, + "grad_norm": 1.9367330876541697, + "language_loss": 0.74092335, + "learning_rate": 2.6341813297809937e-06, + "loss": 0.76197106, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.50390625, + "step": 6609, + "time_per_iteration": 2.423671245574951 + }, + { + "auxiliary_loss_clip": 0.01074604, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.0122726, + "balance_loss_mlp": 1.02458334, + "epoch": 0.3974147001352773, + "flos": 23330045531520.0, + "grad_norm": 1.9670779569865173, + "language_loss": 0.71762049, + "learning_rate": 2.633822993574145e-06, + "loss": 0.73864102, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5, + "step": 6610, + "time_per_iteration": 2.447556972503662 + }, + { + "auxiliary_loss_clip": 0.0106819, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01582897, + "balance_loss_mlp": 1.02203941, + "epoch": 0.39747482338794526, + "flos": 21687591530880.0, + "grad_norm": 1.4957365240001774, + "language_loss": 0.78553832, + "learning_rate": 2.633464634747785e-06, + "loss": 0.8065182, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4609375, + "step": 6611, + "time_per_iteration": 2.4243054389953613 + }, + { + "auxiliary_loss_clip": 0.01074559, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.01669502, + "balance_loss_mlp": 1.02427948, + "epoch": 0.3975349466406133, + "flos": 30987511119360.0, + "grad_norm": 2.294494954631813, + "language_loss": 0.79610085, + "learning_rate": 2.6331062533147002e-06, + "loss": 0.81715554, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.50390625, + "step": 6612, + "time_per_iteration": 2.50923752784729 + }, + { + "auxiliary_loss_clip": 0.01012531, + "auxiliary_loss_mlp": 0.00999866, + "balance_loss_clip": 0.99842912, + "balance_loss_mlp": 1.00267804, + "epoch": 0.39759506989328125, + "flos": 63680702578560.0, + "grad_norm": 0.8377082850529332, + "language_loss": 0.6494652, + "learning_rate": 2.632747849287683e-06, + "loss": 0.66958922, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.09863281, + "step": 6613, + "time_per_iteration": 4.366408109664917 + }, + { + "auxiliary_loss_clip": 0.01072996, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.01753378, + "balance_loss_mlp": 1.02373576, + "epoch": 0.3976551931459492, + "flos": 23694713349120.0, + "grad_norm": 2.3277149819818406, + "language_loss": 0.71182394, + "learning_rate": 2.632389422679523e-06, + "loss": 0.73287964, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4921875, + "step": 6614, + "time_per_iteration": 2.449972152709961 + }, + { + "auxiliary_loss_clip": 0.01074282, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.01391792, + "balance_loss_mlp": 1.02487075, + "epoch": 0.3977153163986172, + "flos": 15668739694080.0, + "grad_norm": 2.2532170211440516, + "language_loss": 0.81478083, + "learning_rate": 2.63203097350301e-06, + "loss": 0.83581364, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49414062, + "step": 6615, + "time_per_iteration": 2.4869959354400635 + }, + { + "auxiliary_loss_clip": 0.01070477, + "auxiliary_loss_mlp": 0.01027059, + "balance_loss_clip": 1.01311135, + "balance_loss_mlp": 1.02264977, + "epoch": 0.39777543965128515, + "flos": 14063817271680.0, + "grad_norm": 1.7476188459762432, + "language_loss": 0.66889179, + "learning_rate": 2.631672501770938e-06, + "loss": 0.68986714, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47851562, + "step": 6616, + "time_per_iteration": 2.449387311935425 + }, + { + "auxiliary_loss_clip": 0.01074761, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.01970732, + "balance_loss_mlp": 1.02320814, + "epoch": 0.3978355629039531, + "flos": 23366355212160.0, + "grad_norm": 2.75083159245793, + "language_loss": 0.8351993, + "learning_rate": 2.631314007496099e-06, + "loss": 0.85629982, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.515625, + "step": 6617, + "time_per_iteration": 2.4497227668762207 + }, + { + "auxiliary_loss_clip": 0.01069706, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.01942551, + "balance_loss_mlp": 1.02357531, + "epoch": 0.3978956861566211, + "flos": 19061773776000.0, + "grad_norm": 1.4966178189274915, + "language_loss": 0.71953106, + "learning_rate": 2.6309554906912873e-06, + "loss": 0.74054861, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4609375, + "step": 6618, + "time_per_iteration": 3.8436081409454346 + }, + { + "auxiliary_loss_clip": 0.01073663, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.01286101, + "balance_loss_mlp": 1.02433228, + "epoch": 0.39795580940928904, + "flos": 30226369368960.0, + "grad_norm": 1.8691823070633389, + "language_loss": 0.65258539, + "learning_rate": 2.6305969513692965e-06, + "loss": 0.67360455, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4921875, + "step": 6619, + "time_per_iteration": 2.5060930252075195 + }, + { + "auxiliary_loss_clip": 0.01012081, + "auxiliary_loss_mlp": 0.01003052, + "balance_loss_clip": 1.00174701, + "balance_loss_mlp": 1.00220847, + "epoch": 0.398015932661957, + "flos": 69843885442560.0, + "grad_norm": 0.8225231916317525, + "language_loss": 0.58208799, + "learning_rate": 2.630238389542924e-06, + "loss": 0.60223931, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.09863281, + "step": 6620, + "time_per_iteration": 4.43730354309082 + }, + { + "auxiliary_loss_clip": 0.01072964, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.01952314, + "balance_loss_mlp": 1.02452278, + "epoch": 0.39807605591462497, + "flos": 20156719835520.0, + "grad_norm": 1.6378975711741601, + "language_loss": 0.78978765, + "learning_rate": 2.629879805224964e-06, + "loss": 0.81084305, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.484375, + "step": 6621, + "time_per_iteration": 2.4630956649780273 + }, + { + "auxiliary_loss_clip": 0.01071045, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.01306832, + "balance_loss_mlp": 1.0235759, + "epoch": 0.39813617916729294, + "flos": 21140711994240.0, + "grad_norm": 2.657284676416563, + "language_loss": 0.78505963, + "learning_rate": 2.629521198428213e-06, + "loss": 0.80603814, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47460938, + "step": 6622, + "time_per_iteration": 2.426262855529785 + }, + { + "auxiliary_loss_clip": 0.01071193, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.01594281, + "balance_loss_mlp": 1.0229032, + "epoch": 0.3981963024199609, + "flos": 18987513580800.0, + "grad_norm": 1.6999053600728438, + "language_loss": 0.75029296, + "learning_rate": 2.6291625691654702e-06, + "loss": 0.77131546, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.484375, + "step": 6623, + "time_per_iteration": 3.8371999263763428 + }, + { + "auxiliary_loss_clip": 0.01071046, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.01858592, + "balance_loss_mlp": 1.02266097, + "epoch": 0.39825642567262887, + "flos": 16574352117120.0, + "grad_norm": 1.7613877439509331, + "language_loss": 0.79156911, + "learning_rate": 2.6288039174495334e-06, + "loss": 0.81261551, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.484375, + "step": 6624, + "time_per_iteration": 2.4280402660369873 + }, + { + "auxiliary_loss_clip": 0.01075203, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.02346134, + "balance_loss_mlp": 1.02439928, + "epoch": 0.39831654892529683, + "flos": 22198754880000.0, + "grad_norm": 1.8766172744924776, + "language_loss": 0.83392131, + "learning_rate": 2.6284452432932034e-06, + "loss": 0.85507667, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.5078125, + "step": 6625, + "time_per_iteration": 2.477288007736206 + }, + { + "auxiliary_loss_clip": 0.01069912, + "auxiliary_loss_mlp": 0.01030699, + "balance_loss_clip": 1.01625037, + "balance_loss_mlp": 1.02270007, + "epoch": 0.39837667217796485, + "flos": 10487209927680.0, + "grad_norm": 2.031089026264865, + "language_loss": 0.86226273, + "learning_rate": 2.6280865467092787e-06, + "loss": 0.88326883, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47265625, + "step": 6626, + "time_per_iteration": 2.441574811935425 + }, + { + "auxiliary_loss_clip": 0.01073493, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.0135802, + "balance_loss_mlp": 1.02473068, + "epoch": 0.3984367954306328, + "flos": 17964383921280.0, + "grad_norm": 2.4560288013611933, + "language_loss": 0.83296955, + "learning_rate": 2.6277278277105604e-06, + "loss": 0.85399616, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.48828125, + "step": 6627, + "time_per_iteration": 2.47760272026062 + }, + { + "auxiliary_loss_clip": 0.01069432, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.0210712, + "balance_loss_mlp": 1.02272487, + "epoch": 0.3984969186833008, + "flos": 22709953140480.0, + "grad_norm": 1.539132968554749, + "language_loss": 0.82766044, + "learning_rate": 2.627369086309851e-06, + "loss": 0.84870476, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46679688, + "step": 6628, + "time_per_iteration": 2.5086874961853027 + }, + { + "auxiliary_loss_clip": 0.01073103, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.02126884, + "balance_loss_mlp": 1.02380443, + "epoch": 0.39855704193596875, + "flos": 23404619928960.0, + "grad_norm": 1.6556971975652655, + "language_loss": 0.71342582, + "learning_rate": 2.6270103225199524e-06, + "loss": 0.73451662, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4921875, + "step": 6629, + "time_per_iteration": 2.491774082183838 + }, + { + "auxiliary_loss_clip": 0.010727, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.01768541, + "balance_loss_mlp": 1.02521884, + "epoch": 0.3986171651886367, + "flos": 21250862933760.0, + "grad_norm": 1.7014086988520736, + "language_loss": 0.80730289, + "learning_rate": 2.626651536353668e-06, + "loss": 0.82835281, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47460938, + "step": 6630, + "time_per_iteration": 2.4837019443511963 + }, + { + "auxiliary_loss_clip": 0.01074594, + "auxiliary_loss_mlp": 0.01022779, + "balance_loss_clip": 1.00870633, + "balance_loss_mlp": 1.02513838, + "epoch": 0.3986772884413047, + "flos": 12457882419840.0, + "grad_norm": 1.8134919079533278, + "language_loss": 0.74934614, + "learning_rate": 2.6262927278238032e-06, + "loss": 0.77031994, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.49414062, + "step": 6631, + "time_per_iteration": 2.435589551925659 + }, + { + "auxiliary_loss_clip": 0.0107291, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.01608586, + "balance_loss_mlp": 1.0240016, + "epoch": 0.39873741169397264, + "flos": 19645102638720.0, + "grad_norm": 2.0932890864924847, + "language_loss": 0.7138592, + "learning_rate": 2.6259338969431613e-06, + "loss": 0.73490262, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.48828125, + "step": 6632, + "time_per_iteration": 2.533177137374878 + }, + { + "auxiliary_loss_clip": 0.01068979, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.01622534, + "balance_loss_mlp": 1.0216732, + "epoch": 0.3987975349466406, + "flos": 21683821104000.0, + "grad_norm": 1.7749245307922714, + "language_loss": 0.75056088, + "learning_rate": 2.6255750437245487e-06, + "loss": 0.77155167, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.47460938, + "step": 6633, + "time_per_iteration": 2.5153300762176514 + }, + { + "auxiliary_loss_clip": 0.01070432, + "auxiliary_loss_mlp": 0.01024752, + "balance_loss_clip": 1.01084614, + "balance_loss_mlp": 1.02195311, + "epoch": 0.3988576581993086, + "flos": 23912955457920.0, + "grad_norm": 1.7628208212191583, + "language_loss": 0.81495905, + "learning_rate": 2.625216168180772e-06, + "loss": 0.83591092, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.484375, + "step": 6634, + "time_per_iteration": 2.4775242805480957 + }, + { + "auxiliary_loss_clip": 0.01072265, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01656055, + "balance_loss_mlp": 1.02456665, + "epoch": 0.39891778145197654, + "flos": 18148934701440.0, + "grad_norm": 1.671007261610527, + "language_loss": 0.74796832, + "learning_rate": 2.624857270324639e-06, + "loss": 0.76899719, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4765625, + "step": 6635, + "time_per_iteration": 2.4950473308563232 + }, + { + "auxiliary_loss_clip": 0.01076285, + "auxiliary_loss_mlp": 0.01035496, + "balance_loss_clip": 1.0191946, + "balance_loss_mlp": 1.02347112, + "epoch": 0.3989779047046445, + "flos": 22594356028800.0, + "grad_norm": 2.9618247065671524, + "language_loss": 0.66572481, + "learning_rate": 2.6244983501689574e-06, + "loss": 0.68684262, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.52734375, + "step": 6636, + "time_per_iteration": 2.53560209274292 + }, + { + "auxiliary_loss_clip": 0.01071293, + "auxiliary_loss_mlp": 0.01029443, + "balance_loss_clip": 1.01565683, + "balance_loss_mlp": 1.02332687, + "epoch": 0.39903802795731247, + "flos": 18076245517440.0, + "grad_norm": 2.7573624566813977, + "language_loss": 0.81554866, + "learning_rate": 2.6241394077265352e-06, + "loss": 0.83655596, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48046875, + "step": 6637, + "time_per_iteration": 2.467667579650879 + }, + { + "auxiliary_loss_clip": 0.01011698, + "auxiliary_loss_mlp": 0.01002102, + "balance_loss_clip": 1.00095117, + "balance_loss_mlp": 1.00210905, + "epoch": 0.39909815120998043, + "flos": 70437722624640.0, + "grad_norm": 0.7138867261636817, + "language_loss": 0.53198743, + "learning_rate": 2.6237804430101835e-06, + "loss": 0.55212545, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.09570312, + "step": 6638, + "time_per_iteration": 3.1159331798553467 + }, + { + "auxiliary_loss_clip": 0.01072711, + "auxiliary_loss_mlp": 0.0102694, + "balance_loss_clip": 1.01229489, + "balance_loss_mlp": 1.02373385, + "epoch": 0.39915827446264845, + "flos": 18548341188480.0, + "grad_norm": 1.6126167806358793, + "language_loss": 0.75386608, + "learning_rate": 2.623421456032712e-06, + "loss": 0.77486265, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49023438, + "step": 6639, + "time_per_iteration": 2.4533066749572754 + }, + { + "auxiliary_loss_clip": 0.01069403, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.01736772, + "balance_loss_mlp": 1.02293527, + "epoch": 0.3992183977153164, + "flos": 29895986373120.0, + "grad_norm": 5.721938861641512, + "language_loss": 0.76002169, + "learning_rate": 2.6230624468069326e-06, + "loss": 0.78102523, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46484375, + "step": 6640, + "time_per_iteration": 2.5051097869873047 + }, + { + "auxiliary_loss_clip": 0.01073513, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.0161314, + "balance_loss_mlp": 1.02374053, + "epoch": 0.3992785209679844, + "flos": 22563981279360.0, + "grad_norm": 2.46462815576201, + "language_loss": 0.8213411, + "learning_rate": 2.6227034153456573e-06, + "loss": 0.84238803, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49804688, + "step": 6641, + "time_per_iteration": 2.5071916580200195 + }, + { + "auxiliary_loss_clip": 0.01073203, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.01683319, + "balance_loss_mlp": 1.02350163, + "epoch": 0.39933864422065235, + "flos": 19681656698880.0, + "grad_norm": 2.5993960764360624, + "language_loss": 0.76370227, + "learning_rate": 2.6223443616616985e-06, + "loss": 0.78475332, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49804688, + "step": 6642, + "time_per_iteration": 2.466419219970703 + }, + { + "auxiliary_loss_clip": 0.0107244, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.01634574, + "balance_loss_mlp": 1.02371955, + "epoch": 0.3993987674733203, + "flos": 23037403582080.0, + "grad_norm": 1.8469196847473455, + "language_loss": 0.72247189, + "learning_rate": 2.621985285767871e-06, + "loss": 0.74350661, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48828125, + "step": 6643, + "time_per_iteration": 2.4636638164520264 + }, + { + "auxiliary_loss_clip": 0.01072605, + "auxiliary_loss_mlp": 0.01028365, + "balance_loss_clip": 1.01352334, + "balance_loss_mlp": 1.02277112, + "epoch": 0.3994588907259883, + "flos": 19389817710720.0, + "grad_norm": 1.7385154055731078, + "language_loss": 0.66287923, + "learning_rate": 2.621626187676988e-06, + "loss": 0.68388891, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49804688, + "step": 6644, + "time_per_iteration": 2.4885880947113037 + }, + { + "auxiliary_loss_clip": 0.01071197, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.01815891, + "balance_loss_mlp": 1.02299833, + "epoch": 0.39951901397865625, + "flos": 13733573921280.0, + "grad_norm": 2.2930590810623346, + "language_loss": 0.78312695, + "learning_rate": 2.6212670674018657e-06, + "loss": 0.80417103, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48242188, + "step": 6645, + "time_per_iteration": 2.4476451873779297 + }, + { + "auxiliary_loss_clip": 0.01072112, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.02000928, + "balance_loss_mlp": 1.02392793, + "epoch": 0.3995791372313242, + "flos": 23585330459520.0, + "grad_norm": 3.9877109267143065, + "language_loss": 0.70151591, + "learning_rate": 2.6209079249553195e-06, + "loss": 0.72258914, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.48046875, + "step": 6646, + "time_per_iteration": 2.4883899688720703 + }, + { + "auxiliary_loss_clip": 0.01070617, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.01739049, + "balance_loss_mlp": 1.02275634, + "epoch": 0.3996392604839922, + "flos": 21354974208000.0, + "grad_norm": 2.344451303145302, + "language_loss": 0.71729481, + "learning_rate": 2.6205487603501672e-06, + "loss": 0.73832202, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47851562, + "step": 6647, + "time_per_iteration": 2.4864189624786377 + }, + { + "auxiliary_loss_clip": 0.01068436, + "auxiliary_loss_mlp": 0.01030012, + "balance_loss_clip": 1.01629114, + "balance_loss_mlp": 1.0223968, + "epoch": 0.39969938373666014, + "flos": 26031031176960.0, + "grad_norm": 1.5577673638209417, + "language_loss": 0.74076974, + "learning_rate": 2.6201895735992255e-06, + "loss": 0.76175427, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 6648, + "time_per_iteration": 2.5327861309051514 + }, + { + "auxiliary_loss_clip": 0.01073442, + "auxiliary_loss_mlp": 0.01028694, + "balance_loss_clip": 1.01361442, + "balance_loss_mlp": 1.02304292, + "epoch": 0.3997595069893281, + "flos": 20115452741760.0, + "grad_norm": 3.683464481059715, + "language_loss": 0.66093767, + "learning_rate": 2.6198303647153133e-06, + "loss": 0.68195903, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.50390625, + "step": 6649, + "time_per_iteration": 2.4660842418670654 + }, + { + "auxiliary_loss_clip": 0.01075217, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.01812959, + "balance_loss_mlp": 1.02606964, + "epoch": 0.39981963024199607, + "flos": 27782134928640.0, + "grad_norm": 1.503133258693868, + "language_loss": 0.75100648, + "learning_rate": 2.61947113371125e-06, + "loss": 0.77208591, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.49023438, + "step": 6650, + "time_per_iteration": 2.4915287494659424 + }, + { + "auxiliary_loss_clip": 0.01072812, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.01808834, + "balance_loss_mlp": 1.02349114, + "epoch": 0.39987975349466404, + "flos": 21943365217920.0, + "grad_norm": 1.5056294246379212, + "language_loss": 0.72204274, + "learning_rate": 2.6191118805998547e-06, + "loss": 0.74309278, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4921875, + "step": 6651, + "time_per_iteration": 2.467857837677002 + }, + { + "auxiliary_loss_clip": 0.01072487, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.01676691, + "balance_loss_mlp": 1.02346241, + "epoch": 0.39993987674733206, + "flos": 20703354992640.0, + "grad_norm": 1.9292758740306415, + "language_loss": 0.72752506, + "learning_rate": 2.6187526053939497e-06, + "loss": 0.74856591, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49023438, + "step": 6652, + "time_per_iteration": 2.442746877670288 + }, + { + "auxiliary_loss_clip": 0.01011719, + "auxiliary_loss_mlp": 0.01000631, + "balance_loss_clip": 0.99931401, + "balance_loss_mlp": 1.00189078, + "epoch": 0.4, + "flos": 61522407106560.0, + "grad_norm": 0.8483957684346403, + "language_loss": 0.60638869, + "learning_rate": 2.6183933081063556e-06, + "loss": 0.62651217, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.09863281, + "step": 6653, + "time_per_iteration": 4.38383674621582 + }, + { + "auxiliary_loss_clip": 0.010707, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.0166986, + "balance_loss_mlp": 1.02513754, + "epoch": 0.400060123252668, + "flos": 14501418652800.0, + "grad_norm": 2.0324203839737622, + "language_loss": 0.67108071, + "learning_rate": 2.6180339887498946e-06, + "loss": 0.69209385, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45507812, + "step": 6654, + "time_per_iteration": 2.475163698196411 + }, + { + "auxiliary_loss_clip": 0.0107101, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.01865411, + "balance_loss_mlp": 1.02303898, + "epoch": 0.40012024650533595, + "flos": 19092462727680.0, + "grad_norm": 2.1088906145080926, + "language_loss": 0.89763999, + "learning_rate": 2.617674647337391e-06, + "loss": 0.9186728, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48046875, + "step": 6655, + "time_per_iteration": 2.4557688236236572 + }, + { + "auxiliary_loss_clip": 0.01069039, + "auxiliary_loss_mlp": 0.01026128, + "balance_loss_clip": 1.01370692, + "balance_loss_mlp": 1.02378738, + "epoch": 0.4001803697580039, + "flos": 29349735240960.0, + "grad_norm": 1.5996997307484297, + "language_loss": 0.73221993, + "learning_rate": 2.6173152838816673e-06, + "loss": 0.75317162, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.453125, + "step": 6656, + "time_per_iteration": 3.932459831237793 + }, + { + "auxiliary_loss_clip": 0.01072523, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01851773, + "balance_loss_mlp": 1.02328372, + "epoch": 0.4002404930106719, + "flos": 20919083483520.0, + "grad_norm": 2.208990144459969, + "language_loss": 0.73409355, + "learning_rate": 2.6169558983955496e-06, + "loss": 0.75514948, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4921875, + "step": 6657, + "time_per_iteration": 2.5025126934051514 + }, + { + "auxiliary_loss_clip": 0.01072914, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.02296913, + "balance_loss_mlp": 1.02331233, + "epoch": 0.40030061626333985, + "flos": 28404391824000.0, + "grad_norm": 1.9332346053758316, + "language_loss": 0.79681247, + "learning_rate": 2.6165964908918624e-06, + "loss": 0.81793272, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.49609375, + "step": 6658, + "time_per_iteration": 3.922158718109131 + }, + { + "auxiliary_loss_clip": 0.01071624, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.01455569, + "balance_loss_mlp": 1.02253807, + "epoch": 0.4003607395160078, + "flos": 25920426389760.0, + "grad_norm": 2.467380417348607, + "language_loss": 0.77468818, + "learning_rate": 2.6162370613834333e-06, + "loss": 0.79568946, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.49023438, + "step": 6659, + "time_per_iteration": 2.4807848930358887 + }, + { + "auxiliary_loss_clip": 0.01065468, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.0157218, + "balance_loss_mlp": 1.02249718, + "epoch": 0.4004208627686758, + "flos": 20767840007040.0, + "grad_norm": 1.8210449046187138, + "language_loss": 0.73279178, + "learning_rate": 2.6158776098830884e-06, + "loss": 0.75372428, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.4296875, + "step": 6660, + "time_per_iteration": 2.406367063522339 + }, + { + "auxiliary_loss_clip": 0.01068811, + "auxiliary_loss_mlp": 0.0102922, + "balance_loss_clip": 1.01598752, + "balance_loss_mlp": 1.02269816, + "epoch": 0.40048098602134374, + "flos": 24680067050880.0, + "grad_norm": 2.0564316469125328, + "language_loss": 0.70958173, + "learning_rate": 2.6155181364036556e-06, + "loss": 0.73056197, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4609375, + "step": 6661, + "time_per_iteration": 2.4488096237182617 + }, + { + "auxiliary_loss_clip": 0.010726, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.01752281, + "balance_loss_mlp": 1.02385426, + "epoch": 0.4005411092740117, + "flos": 23184562429440.0, + "grad_norm": 1.5958757689363112, + "language_loss": 0.75307924, + "learning_rate": 2.615158640957964e-06, + "loss": 0.77412432, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48632812, + "step": 6662, + "time_per_iteration": 3.8144590854644775 + }, + { + "auxiliary_loss_clip": 0.01070916, + "auxiliary_loss_mlp": 0.01030744, + "balance_loss_clip": 1.01624227, + "balance_loss_mlp": 1.02195811, + "epoch": 0.4006012325266797, + "flos": 17521580747520.0, + "grad_norm": 2.186676963015024, + "language_loss": 0.66001856, + "learning_rate": 2.614799123558842e-06, + "loss": 0.68103516, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49023438, + "step": 6663, + "time_per_iteration": 2.3775179386138916 + }, + { + "auxiliary_loss_clip": 0.0106955, + "auxiliary_loss_mlp": 0.01035124, + "balance_loss_clip": 1.02148616, + "balance_loss_mlp": 1.02212155, + "epoch": 0.40066135577934764, + "flos": 19856397386880.0, + "grad_norm": 2.013399279056383, + "language_loss": 0.85741991, + "learning_rate": 2.6144395842191227e-06, + "loss": 0.87846673, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47460938, + "step": 6664, + "time_per_iteration": 2.407982110977173 + }, + { + "auxiliary_loss_clip": 0.01071457, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01349354, + "balance_loss_mlp": 1.02353013, + "epoch": 0.40072147903201566, + "flos": 18149039435520.0, + "grad_norm": 2.5997109183150773, + "language_loss": 0.8480159, + "learning_rate": 2.6140800229516337e-06, + "loss": 0.86900139, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47851562, + "step": 6665, + "time_per_iteration": 2.403843879699707 + }, + { + "auxiliary_loss_clip": 0.01074358, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.01738453, + "balance_loss_mlp": 1.0249989, + "epoch": 0.4007816022846836, + "flos": 18660272607360.0, + "grad_norm": 1.7772292949845703, + "language_loss": 0.7839129, + "learning_rate": 2.613720439769208e-06, + "loss": 0.80497932, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49414062, + "step": 6666, + "time_per_iteration": 2.4071192741394043 + }, + { + "auxiliary_loss_clip": 0.01073634, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01496553, + "balance_loss_mlp": 1.024297, + "epoch": 0.4008417255373516, + "flos": 25701974812800.0, + "grad_norm": 1.813945554725285, + "language_loss": 0.7348913, + "learning_rate": 2.6133608346846794e-06, + "loss": 0.75591874, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.49414062, + "step": 6667, + "time_per_iteration": 2.465135335922241 + }, + { + "auxiliary_loss_clip": 0.01073435, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.01924539, + "balance_loss_mlp": 1.02399898, + "epoch": 0.40090184879001955, + "flos": 22857461101440.0, + "grad_norm": 1.6007119961425718, + "language_loss": 0.76171446, + "learning_rate": 2.61300120771088e-06, + "loss": 0.78279269, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.49414062, + "step": 6668, + "time_per_iteration": 2.4447526931762695 + }, + { + "auxiliary_loss_clip": 0.01074418, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.01482928, + "balance_loss_mlp": 1.02609324, + "epoch": 0.4009619720426875, + "flos": 29058559568640.0, + "grad_norm": 1.7174291347845285, + "language_loss": 0.82202959, + "learning_rate": 2.6126415588606443e-06, + "loss": 0.84306639, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.484375, + "step": 6669, + "time_per_iteration": 2.478872537612915 + }, + { + "auxiliary_loss_clip": 0.01011841, + "auxiliary_loss_mlp": 0.01007281, + "balance_loss_clip": 1.0061183, + "balance_loss_mlp": 1.00223196, + "epoch": 0.4010220952953555, + "flos": 66056332464000.0, + "grad_norm": 0.7088749526354309, + "language_loss": 0.53364331, + "learning_rate": 2.6122818881468072e-06, + "loss": 0.55383456, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.09570312, + "step": 6670, + "time_per_iteration": 3.2194297313690186 + }, + { + "auxiliary_loss_clip": 0.01070723, + "auxiliary_loss_mlp": 0.01027227, + "balance_loss_clip": 1.01416135, + "balance_loss_mlp": 1.0237366, + "epoch": 0.40108221854802345, + "flos": 29641539317760.0, + "grad_norm": 1.9171775027729765, + "language_loss": 0.88869125, + "learning_rate": 2.6119221955822044e-06, + "loss": 0.90967071, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46875, + "step": 6671, + "time_per_iteration": 2.4665517807006836 + }, + { + "auxiliary_loss_clip": 0.01069136, + "auxiliary_loss_mlp": 0.01026449, + "balance_loss_clip": 1.01287687, + "balance_loss_mlp": 1.02240705, + "epoch": 0.4011423418006914, + "flos": 19928772368640.0, + "grad_norm": 1.8419208081134786, + "language_loss": 0.79202813, + "learning_rate": 2.611562481179673e-06, + "loss": 0.81298399, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46679688, + "step": 6672, + "time_per_iteration": 2.396230459213257 + }, + { + "auxiliary_loss_clip": 0.01067293, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.01437819, + "balance_loss_mlp": 1.02174556, + "epoch": 0.4012024650533594, + "flos": 20083262601600.0, + "grad_norm": 1.9460511721003255, + "language_loss": 0.62819916, + "learning_rate": 2.611202744952049e-06, + "loss": 0.64914984, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45507812, + "step": 6673, + "time_per_iteration": 2.4075849056243896 + }, + { + "auxiliary_loss_clip": 0.01069589, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.01459777, + "balance_loss_mlp": 1.02258945, + "epoch": 0.40126258830602735, + "flos": 21694469068800.0, + "grad_norm": 1.439803454479585, + "language_loss": 0.80129451, + "learning_rate": 2.610842986912172e-06, + "loss": 0.82226515, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.47070312, + "step": 6674, + "time_per_iteration": 2.463350296020508 + }, + { + "auxiliary_loss_clip": 0.01072746, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.01654673, + "balance_loss_mlp": 1.02420938, + "epoch": 0.4013227115586953, + "flos": 12019582811520.0, + "grad_norm": 2.117495968570646, + "language_loss": 0.82048941, + "learning_rate": 2.61048320707288e-06, + "loss": 0.84152317, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48632812, + "step": 6675, + "time_per_iteration": 2.385601043701172 + }, + { + "auxiliary_loss_clip": 0.01068514, + "auxiliary_loss_mlp": 0.01024907, + "balance_loss_clip": 1.01112652, + "balance_loss_mlp": 1.02222598, + "epoch": 0.4013828348113633, + "flos": 25446340771200.0, + "grad_norm": 1.597855349312592, + "language_loss": 0.9331007, + "learning_rate": 2.6101234054470118e-06, + "loss": 0.95403492, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4609375, + "step": 6676, + "time_per_iteration": 2.4667534828186035 + }, + { + "auxiliary_loss_clip": 0.01076217, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.01878047, + "balance_loss_mlp": 1.02512443, + "epoch": 0.40144295806403124, + "flos": 18582102339840.0, + "grad_norm": 1.9830255229769822, + "language_loss": 0.79591095, + "learning_rate": 2.6097635820474095e-06, + "loss": 0.81701994, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.51171875, + "step": 6677, + "time_per_iteration": 2.4285614490509033 + }, + { + "auxiliary_loss_clip": 0.01071441, + "auxiliary_loss_mlp": 0.01026554, + "balance_loss_clip": 1.01296997, + "balance_loss_mlp": 1.02400756, + "epoch": 0.4015030813166992, + "flos": 22929102944640.0, + "grad_norm": 1.7369708307660194, + "language_loss": 0.83837557, + "learning_rate": 2.609403736886913e-06, + "loss": 0.85935557, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.47460938, + "step": 6678, + "time_per_iteration": 2.4638166427612305 + }, + { + "auxiliary_loss_clip": 0.01070308, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.0181005, + "balance_loss_mlp": 1.02403545, + "epoch": 0.4015632045693672, + "flos": 20594007014400.0, + "grad_norm": 2.1482870711743103, + "language_loss": 0.6387248, + "learning_rate": 2.6090438699783655e-06, + "loss": 0.65975285, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46289062, + "step": 6679, + "time_per_iteration": 2.4286577701568604 + }, + { + "auxiliary_loss_clip": 0.01067283, + "auxiliary_loss_mlp": 0.01024836, + "balance_loss_clip": 1.01230705, + "balance_loss_mlp": 1.02210546, + "epoch": 0.4016233278220352, + "flos": 23437857409920.0, + "grad_norm": 1.8790551960261805, + "language_loss": 0.78690499, + "learning_rate": 2.608683981334608e-06, + "loss": 0.80782622, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.453125, + "step": 6680, + "time_per_iteration": 2.4915313720703125 + }, + { + "auxiliary_loss_clip": 0.01074811, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_clip": 1.02789307, + "balance_loss_mlp": 1.02533209, + "epoch": 0.40168345107470316, + "flos": 21430072275840.0, + "grad_norm": 1.6913264656852987, + "language_loss": 0.75566924, + "learning_rate": 2.6083240709684856e-06, + "loss": 0.77683914, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.49609375, + "step": 6681, + "time_per_iteration": 2.432297468185425 + }, + { + "auxiliary_loss_clip": 0.01070729, + "auxiliary_loss_mlp": 0.01025996, + "balance_loss_clip": 1.01310337, + "balance_loss_mlp": 1.02344656, + "epoch": 0.4017435743273711, + "flos": 22856099558400.0, + "grad_norm": 1.7316322769375432, + "language_loss": 0.64014649, + "learning_rate": 2.6079641388928417e-06, + "loss": 0.66111374, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.47265625, + "step": 6682, + "time_per_iteration": 2.4512219429016113 + }, + { + "auxiliary_loss_clip": 0.01071963, + "auxiliary_loss_mlp": 0.0102591, + "balance_loss_clip": 1.01230252, + "balance_loss_mlp": 1.02329099, + "epoch": 0.4018036975800391, + "flos": 28621028010240.0, + "grad_norm": 1.7153399579539625, + "language_loss": 0.72643971, + "learning_rate": 2.6076041851205214e-06, + "loss": 0.74741846, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48632812, + "step": 6683, + "time_per_iteration": 2.468168258666992 + }, + { + "auxiliary_loss_clip": 0.01069401, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.01783955, + "balance_loss_mlp": 1.02231419, + "epoch": 0.40186382083270705, + "flos": 26650006404480.0, + "grad_norm": 1.6024236367783344, + "language_loss": 0.72975457, + "learning_rate": 2.6072442096643707e-06, + "loss": 0.75075835, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.47070312, + "step": 6684, + "time_per_iteration": 2.4754834175109863 + }, + { + "auxiliary_loss_clip": 0.01012945, + "auxiliary_loss_mlp": 0.01001054, + "balance_loss_clip": 0.9999398, + "balance_loss_mlp": 1.00330806, + "epoch": 0.401923944085375, + "flos": 59257102717440.0, + "grad_norm": 0.8043368043515626, + "language_loss": 0.60420907, + "learning_rate": 2.606884212537236e-06, + "loss": 0.62434906, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.09667969, + "step": 6685, + "time_per_iteration": 3.1802947521209717 + }, + { + "auxiliary_loss_clip": 0.0107141, + "auxiliary_loss_mlp": 0.01023657, + "balance_loss_clip": 1.01019835, + "balance_loss_mlp": 1.02383602, + "epoch": 0.401984067338043, + "flos": 16981858039680.0, + "grad_norm": 1.7565249440447923, + "language_loss": 0.70044386, + "learning_rate": 2.6065241937519653e-06, + "loss": 0.72139448, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4765625, + "step": 6686, + "time_per_iteration": 2.4383654594421387 + }, + { + "auxiliary_loss_clip": 0.01067459, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.01584387, + "balance_loss_mlp": 1.02106762, + "epoch": 0.40204419059071095, + "flos": 24971347457280.0, + "grad_norm": 1.4921462980280609, + "language_loss": 0.74266863, + "learning_rate": 2.6061641533214062e-06, + "loss": 0.76363391, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.46484375, + "step": 6687, + "time_per_iteration": 2.4394235610961914 + }, + { + "auxiliary_loss_clip": 0.01072378, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.01434183, + "balance_loss_mlp": 1.02389669, + "epoch": 0.4021043138433789, + "flos": 23476331594880.0, + "grad_norm": 1.7180136118505265, + "language_loss": 0.70519584, + "learning_rate": 2.6058040912584075e-06, + "loss": 0.72620559, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.484375, + "step": 6688, + "time_per_iteration": 2.4209628105163574 + }, + { + "auxiliary_loss_clip": 0.01073572, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.0166322, + "balance_loss_mlp": 1.02350402, + "epoch": 0.4021644370960469, + "flos": 25994581850880.0, + "grad_norm": 1.5527280071691902, + "language_loss": 0.64241159, + "learning_rate": 2.605444007575819e-06, + "loss": 0.6634658, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5, + "step": 6689, + "time_per_iteration": 2.4474921226501465 + }, + { + "auxiliary_loss_clip": 0.01072437, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.01178002, + "balance_loss_mlp": 1.02360165, + "epoch": 0.40222456034871484, + "flos": 13587183123840.0, + "grad_norm": 1.9864078873715083, + "language_loss": 0.72876096, + "learning_rate": 2.605083902286491e-06, + "loss": 0.74974155, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.49023438, + "step": 6690, + "time_per_iteration": 2.415900230407715 + }, + { + "auxiliary_loss_clip": 0.01068603, + "auxiliary_loss_mlp": 0.01035205, + "balance_loss_clip": 1.02109075, + "balance_loss_mlp": 1.02151752, + "epoch": 0.4022846836013828, + "flos": 24276925048320.0, + "grad_norm": 1.5168222905538338, + "language_loss": 0.74897945, + "learning_rate": 2.6047237754032755e-06, + "loss": 0.7700175, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47265625, + "step": 6691, + "time_per_iteration": 2.448310375213623 + }, + { + "auxiliary_loss_clip": 0.0107207, + "auxiliary_loss_mlp": 0.01035402, + "balance_loss_clip": 1.0207746, + "balance_loss_mlp": 1.02322388, + "epoch": 0.40234480685405083, + "flos": 20150715081600.0, + "grad_norm": 1.5330730443518472, + "language_loss": 0.722238, + "learning_rate": 2.6043636269390245e-06, + "loss": 0.74331272, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.48828125, + "step": 6692, + "time_per_iteration": 3.872291326522827 + }, + { + "auxiliary_loss_clip": 0.01070258, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.01770055, + "balance_loss_mlp": 1.02222133, + "epoch": 0.4024049301067188, + "flos": 22929102944640.0, + "grad_norm": 4.448279726317034, + "language_loss": 0.8407445, + "learning_rate": 2.6040034569065893e-06, + "loss": 0.8617605, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48046875, + "step": 6693, + "time_per_iteration": 2.437262535095215 + }, + { + "auxiliary_loss_clip": 0.01074944, + "auxiliary_loss_mlp": 0.01033527, + "balance_loss_clip": 1.02009821, + "balance_loss_mlp": 1.02642143, + "epoch": 0.40246505335938676, + "flos": 36026944185600.0, + "grad_norm": 1.8117827715724335, + "language_loss": 0.70784926, + "learning_rate": 2.6036432653188254e-06, + "loss": 0.72893405, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.484375, + "step": 6694, + "time_per_iteration": 2.507660150527954 + }, + { + "auxiliary_loss_clip": 0.01069602, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.0155015, + "balance_loss_mlp": 1.02231407, + "epoch": 0.4025251766120547, + "flos": 20593273875840.0, + "grad_norm": 1.928745401754457, + "language_loss": 0.67619473, + "learning_rate": 2.603283052188585e-06, + "loss": 0.69718772, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47265625, + "step": 6695, + "time_per_iteration": 2.3780624866485596 + }, + { + "auxiliary_loss_clip": 0.01069616, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.0158323, + "balance_loss_mlp": 1.02201307, + "epoch": 0.4025852998647227, + "flos": 64521657296640.0, + "grad_norm": 2.334044444320753, + "language_loss": 0.7956413, + "learning_rate": 2.602922817528725e-06, + "loss": 0.81662679, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4765625, + "step": 6696, + "time_per_iteration": 4.180573463439941 + }, + { + "auxiliary_loss_clip": 0.01070499, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.01756024, + "balance_loss_mlp": 1.02340341, + "epoch": 0.40264542311739066, + "flos": 20885252509440.0, + "grad_norm": 2.0792906409399152, + "language_loss": 0.71069521, + "learning_rate": 2.6025625613521005e-06, + "loss": 0.73171937, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47070312, + "step": 6697, + "time_per_iteration": 2.3770360946655273 + }, + { + "auxiliary_loss_clip": 0.01071334, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.02132845, + "balance_loss_mlp": 1.02374947, + "epoch": 0.4027055463700586, + "flos": 26248993994880.0, + "grad_norm": 2.3430267384703494, + "language_loss": 0.61649925, + "learning_rate": 2.602202283671568e-06, + "loss": 0.63755822, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4765625, + "step": 6698, + "time_per_iteration": 3.7839009761810303 + }, + { + "auxiliary_loss_clip": 0.01071402, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.01565266, + "balance_loss_mlp": 1.02305651, + "epoch": 0.4027656696227266, + "flos": 20630351606400.0, + "grad_norm": 1.9616400572586652, + "language_loss": 0.76578283, + "learning_rate": 2.601841984499985e-06, + "loss": 0.7867955, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.484375, + "step": 6699, + "time_per_iteration": 2.394257068634033 + }, + { + "auxiliary_loss_clip": 0.01067991, + "auxiliary_loss_mlp": 0.01024252, + "balance_loss_clip": 1.01131821, + "balance_loss_mlp": 1.02313197, + "epoch": 0.40282579287539455, + "flos": 22345180588800.0, + "grad_norm": 2.017549463584276, + "language_loss": 0.80040705, + "learning_rate": 2.6014816638502094e-06, + "loss": 0.82132953, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44726562, + "step": 6700, + "time_per_iteration": 2.3827855587005615 + }, + { + "auxiliary_loss_clip": 0.01072089, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.01889122, + "balance_loss_mlp": 1.02261591, + "epoch": 0.4028859161280625, + "flos": 29273799300480.0, + "grad_norm": 1.8515921259079118, + "language_loss": 0.74803984, + "learning_rate": 2.601121321735101e-06, + "loss": 0.76909435, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49414062, + "step": 6701, + "time_per_iteration": 2.4392647743225098 + }, + { + "auxiliary_loss_clip": 0.01070019, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.01623273, + "balance_loss_mlp": 1.02227938, + "epoch": 0.4029460393807305, + "flos": 28621028010240.0, + "grad_norm": 1.660724997344224, + "language_loss": 0.66196728, + "learning_rate": 2.6007609581675183e-06, + "loss": 0.68296623, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47851562, + "step": 6702, + "time_per_iteration": 3.80287766456604 + }, + { + "auxiliary_loss_clip": 0.01071172, + "auxiliary_loss_mlp": 0.01030132, + "balance_loss_clip": 1.01527286, + "balance_loss_mlp": 1.02262247, + "epoch": 0.40300616263339845, + "flos": 22600814630400.0, + "grad_norm": 1.4799584018381888, + "language_loss": 0.66093421, + "learning_rate": 2.600400573160323e-06, + "loss": 0.68194729, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.484375, + "step": 6703, + "time_per_iteration": 2.3769912719726562 + }, + { + "auxiliary_loss_clip": 0.01072155, + "auxiliary_loss_mlp": 0.01031354, + "balance_loss_clip": 1.01748967, + "balance_loss_mlp": 1.02441585, + "epoch": 0.4030662858860664, + "flos": 25519134689280.0, + "grad_norm": 1.7806189183954881, + "language_loss": 0.81785989, + "learning_rate": 2.6000401667263755e-06, + "loss": 0.83889496, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4765625, + "step": 6704, + "time_per_iteration": 2.4478447437286377 + }, + { + "auxiliary_loss_clip": 0.01072017, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.01579034, + "balance_loss_mlp": 1.02311778, + "epoch": 0.40312640913873443, + "flos": 23585574839040.0, + "grad_norm": 1.6247923818113437, + "language_loss": 0.73448586, + "learning_rate": 2.5996797388785373e-06, + "loss": 0.75549853, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.48828125, + "step": 6705, + "time_per_iteration": 2.393566131591797 + }, + { + "auxiliary_loss_clip": 0.0106812, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.01889992, + "balance_loss_mlp": 1.02267659, + "epoch": 0.4031865323914024, + "flos": 20010014835840.0, + "grad_norm": 1.8807125494388137, + "language_loss": 0.6590941, + "learning_rate": 2.5993192896296727e-06, + "loss": 0.68009305, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.453125, + "step": 6706, + "time_per_iteration": 2.4063916206359863 + }, + { + "auxiliary_loss_clip": 0.01071017, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.01715446, + "balance_loss_mlp": 1.02327776, + "epoch": 0.40324665564407036, + "flos": 21870361831680.0, + "grad_norm": 1.281786852473995, + "language_loss": 0.712466, + "learning_rate": 2.5989588189926433e-06, + "loss": 0.73348594, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4765625, + "step": 6707, + "time_per_iteration": 2.3874239921569824 + }, + { + "auxiliary_loss_clip": 0.01067508, + "auxiliary_loss_mlp": 0.01024969, + "balance_loss_clip": 1.01161218, + "balance_loss_mlp": 1.02194262, + "epoch": 0.4033067788967383, + "flos": 23877588384000.0, + "grad_norm": 1.8442680950979915, + "language_loss": 0.80019307, + "learning_rate": 2.598598326980315e-06, + "loss": 0.82111788, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45507812, + "step": 6708, + "time_per_iteration": 2.4280776977539062 + }, + { + "auxiliary_loss_clip": 0.01071436, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.01838112, + "balance_loss_mlp": 1.02153969, + "epoch": 0.4033669021494063, + "flos": 17418970661760.0, + "grad_norm": 2.7220186023396606, + "language_loss": 0.83391029, + "learning_rate": 2.5982378136055525e-06, + "loss": 0.85494852, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.5, + "step": 6709, + "time_per_iteration": 2.3736016750335693 + }, + { + "auxiliary_loss_clip": 0.01073416, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.01469088, + "balance_loss_mlp": 1.02389145, + "epoch": 0.40342702540207426, + "flos": 29599434351360.0, + "grad_norm": 1.4620937803003349, + "language_loss": 0.71417767, + "learning_rate": 2.597877278881221e-06, + "loss": 0.73520362, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49414062, + "step": 6710, + "time_per_iteration": 2.4869158267974854 + }, + { + "auxiliary_loss_clip": 0.01069928, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.01523805, + "balance_loss_mlp": 1.02250266, + "epoch": 0.4034871486547422, + "flos": 11283998042880.0, + "grad_norm": 2.225492428183902, + "language_loss": 0.85168844, + "learning_rate": 2.5975167228201875e-06, + "loss": 0.87268341, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47460938, + "step": 6711, + "time_per_iteration": 2.353952646255493 + }, + { + "auxiliary_loss_clip": 0.01072298, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.01657414, + "balance_loss_mlp": 1.02441168, + "epoch": 0.4035472719074102, + "flos": 15552130152960.0, + "grad_norm": 1.9709441492648438, + "language_loss": 0.76532757, + "learning_rate": 2.5971561454353185e-06, + "loss": 0.78636324, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48046875, + "step": 6712, + "time_per_iteration": 2.370089054107666 + }, + { + "auxiliary_loss_clip": 0.01070584, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.0172013, + "balance_loss_mlp": 1.02368808, + "epoch": 0.40360739516007815, + "flos": 24673398981120.0, + "grad_norm": 2.9071080106476224, + "language_loss": 0.78645885, + "learning_rate": 2.596795546739483e-06, + "loss": 0.80747724, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46875, + "step": 6713, + "time_per_iteration": 2.44400954246521 + }, + { + "auxiliary_loss_clip": 0.01070429, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01604331, + "balance_loss_mlp": 1.02221429, + "epoch": 0.4036675184127461, + "flos": 17303338638720.0, + "grad_norm": 2.4461264597185037, + "language_loss": 0.88258076, + "learning_rate": 2.59643492674555e-06, + "loss": 0.90358531, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48242188, + "step": 6714, + "time_per_iteration": 2.3925466537475586 + }, + { + "auxiliary_loss_clip": 0.01068928, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.01762033, + "balance_loss_mlp": 1.02164114, + "epoch": 0.4037276416654141, + "flos": 19863030545280.0, + "grad_norm": 1.8112846271295442, + "language_loss": 0.69164318, + "learning_rate": 2.596074285466388e-06, + "loss": 0.71265471, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47265625, + "step": 6715, + "time_per_iteration": 2.4749972820281982 + }, + { + "auxiliary_loss_clip": 0.01072327, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.01725125, + "balance_loss_mlp": 1.02351642, + "epoch": 0.40378776491808205, + "flos": 18295290587520.0, + "grad_norm": 2.376784938497402, + "language_loss": 0.77109265, + "learning_rate": 2.595713622914869e-06, + "loss": 0.79213452, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48828125, + "step": 6716, + "time_per_iteration": 2.393552303314209 + }, + { + "auxiliary_loss_clip": 0.01069994, + "auxiliary_loss_mlp": 0.01027648, + "balance_loss_clip": 1.01429677, + "balance_loss_mlp": 1.02290905, + "epoch": 0.40384788817075, + "flos": 15048472746240.0, + "grad_norm": 2.352356310420093, + "language_loss": 0.77081996, + "learning_rate": 2.595352939103862e-06, + "loss": 0.79179645, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.47070312, + "step": 6717, + "time_per_iteration": 2.402529716491699 + }, + { + "auxiliary_loss_clip": 0.01014213, + "auxiliary_loss_mlp": 0.01000202, + "balance_loss_clip": 0.99909949, + "balance_loss_mlp": 1.00465059, + "epoch": 0.40390801142341803, + "flos": 61926805918080.0, + "grad_norm": 0.9129433166890172, + "language_loss": 0.60651308, + "learning_rate": 2.594992234046241e-06, + "loss": 0.62665725, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.09570312, + "step": 6718, + "time_per_iteration": 2.866379976272583 + }, + { + "auxiliary_loss_clip": 0.01074617, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.01578546, + "balance_loss_mlp": 1.02404618, + "epoch": 0.403968134676086, + "flos": 22737919006080.0, + "grad_norm": 1.8173189573366864, + "language_loss": 0.74836361, + "learning_rate": 2.594631507754877e-06, + "loss": 0.76942575, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5078125, + "step": 6719, + "time_per_iteration": 2.360084295272827 + }, + { + "auxiliary_loss_clip": 0.01071384, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.01801682, + "balance_loss_mlp": 1.02309501, + "epoch": 0.40402825792875396, + "flos": 19783603468800.0, + "grad_norm": 1.9485673682580325, + "language_loss": 0.80936158, + "learning_rate": 2.594270760242644e-06, + "loss": 0.83040458, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48242188, + "step": 6720, + "time_per_iteration": 2.3561317920684814 + }, + { + "auxiliary_loss_clip": 0.01069354, + "auxiliary_loss_mlp": 0.01025538, + "balance_loss_clip": 1.01182866, + "balance_loss_mlp": 1.02155221, + "epoch": 0.40408838118142193, + "flos": 19608269287680.0, + "grad_norm": 1.8002104065484328, + "language_loss": 0.7651999, + "learning_rate": 2.593909991522417e-06, + "loss": 0.78614879, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47851562, + "step": 6721, + "time_per_iteration": 2.3579611778259277 + }, + { + "auxiliary_loss_clip": 0.01067206, + "auxiliary_loss_mlp": 0.01025878, + "balance_loss_clip": 1.01400495, + "balance_loss_mlp": 1.0231992, + "epoch": 0.4041485044340899, + "flos": 24424886856960.0, + "grad_norm": 1.5892809462479252, + "language_loss": 0.722139, + "learning_rate": 2.5935492016070697e-06, + "loss": 0.74306977, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.44140625, + "step": 6722, + "time_per_iteration": 2.399749755859375 + }, + { + "auxiliary_loss_clip": 0.01069189, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.01621616, + "balance_loss_mlp": 1.02270103, + "epoch": 0.40420862768675786, + "flos": 16759356744960.0, + "grad_norm": 1.8281146777268826, + "language_loss": 0.65610337, + "learning_rate": 2.593188390509478e-06, + "loss": 0.67708313, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.46484375, + "step": 6723, + "time_per_iteration": 2.3577027320861816 + }, + { + "auxiliary_loss_clip": 0.01070093, + "auxiliary_loss_mlp": 0.01038623, + "balance_loss_clip": 1.02346516, + "balance_loss_mlp": 1.02265847, + "epoch": 0.4042687509394258, + "flos": 22490489134080.0, + "grad_norm": 1.37953587085212, + "language_loss": 0.7358411, + "learning_rate": 2.5928275582425184e-06, + "loss": 0.75692827, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.47460938, + "step": 6724, + "time_per_iteration": 2.383662223815918 + }, + { + "auxiliary_loss_clip": 0.01065074, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.01624799, + "balance_loss_mlp": 1.02067518, + "epoch": 0.4043288741920938, + "flos": 30334844563200.0, + "grad_norm": 1.8780420398644666, + "language_loss": 0.74605525, + "learning_rate": 2.5924667048190687e-06, + "loss": 0.76699317, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4453125, + "step": 6725, + "time_per_iteration": 2.4324820041656494 + }, + { + "auxiliary_loss_clip": 0.01067035, + "auxiliary_loss_mlp": 0.01030584, + "balance_loss_clip": 1.01525974, + "balance_loss_mlp": 1.02033603, + "epoch": 0.40438899744476176, + "flos": 46346711765760.0, + "grad_norm": 1.5561101468494758, + "language_loss": 0.76378453, + "learning_rate": 2.5921058302520066e-06, + "loss": 0.78476071, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.46679688, + "step": 6726, + "time_per_iteration": 2.570880651473999 + }, + { + "auxiliary_loss_clip": 0.01071763, + "auxiliary_loss_mlp": 0.01026249, + "balance_loss_clip": 1.01211643, + "balance_loss_mlp": 1.023664, + "epoch": 0.4044491206974297, + "flos": 13332701157120.0, + "grad_norm": 2.0118351270839487, + "language_loss": 0.81811047, + "learning_rate": 2.5917449345542093e-06, + "loss": 0.83909053, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48046875, + "step": 6727, + "time_per_iteration": 2.369816303253174 + }, + { + "auxiliary_loss_clip": 0.01068221, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.01461923, + "balance_loss_mlp": 1.02186322, + "epoch": 0.4045092439500977, + "flos": 12092935311360.0, + "grad_norm": 3.2268447428090274, + "language_loss": 0.85171485, + "learning_rate": 2.5913840177385588e-06, + "loss": 0.87268019, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46484375, + "step": 6728, + "time_per_iteration": 2.3561768531799316 + }, + { + "auxiliary_loss_clip": 0.01070637, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.0158999, + "balance_loss_mlp": 1.02347755, + "epoch": 0.40456936720276565, + "flos": 21178592686080.0, + "grad_norm": 2.4173578809663794, + "language_loss": 0.8171019, + "learning_rate": 2.5910230798179325e-06, + "loss": 0.83810103, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 6729, + "time_per_iteration": 2.3980486392974854 + }, + { + "auxiliary_loss_clip": 0.01069041, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01732254, + "balance_loss_mlp": 1.0222826, + "epoch": 0.4046294904554336, + "flos": 23914142444160.0, + "grad_norm": 6.34302156480861, + "language_loss": 0.7326926, + "learning_rate": 2.590662120805214e-06, + "loss": 0.75368631, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46679688, + "step": 6730, + "time_per_iteration": 2.3998124599456787 + }, + { + "auxiliary_loss_clip": 0.01068215, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.01616216, + "balance_loss_mlp": 1.02154875, + "epoch": 0.4046896137081016, + "flos": 38069712368640.0, + "grad_norm": 2.38652263509956, + "language_loss": 0.67787206, + "learning_rate": 2.5903011407132834e-06, + "loss": 0.69885516, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46679688, + "step": 6731, + "time_per_iteration": 3.923856496810913 + }, + { + "auxiliary_loss_clip": 0.01066961, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.01971674, + "balance_loss_mlp": 1.02077353, + "epoch": 0.4047497369607696, + "flos": 23616298702080.0, + "grad_norm": 1.5392470929226152, + "language_loss": 0.73836839, + "learning_rate": 2.589940139555023e-06, + "loss": 0.75936949, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4609375, + "step": 6732, + "time_per_iteration": 2.394658088684082 + }, + { + "auxiliary_loss_clip": 0.01068289, + "auxiliary_loss_mlp": 0.01027315, + "balance_loss_clip": 1.01486373, + "balance_loss_mlp": 1.02236748, + "epoch": 0.40480986021343757, + "flos": 12822759705600.0, + "grad_norm": 1.6900086931393503, + "language_loss": 0.77402937, + "learning_rate": 2.589579117343317e-06, + "loss": 0.79498541, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.45898438, + "step": 6733, + "time_per_iteration": 2.365248441696167 + }, + { + "auxiliary_loss_clip": 0.01072237, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.0202086, + "balance_loss_mlp": 1.0233736, + "epoch": 0.40486998346610553, + "flos": 23767646912640.0, + "grad_norm": 6.665058146746988, + "language_loss": 0.76779372, + "learning_rate": 2.5892180740910487e-06, + "loss": 0.78887093, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.48828125, + "step": 6734, + "time_per_iteration": 2.3903045654296875 + }, + { + "auxiliary_loss_clip": 0.01072332, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.01951933, + "balance_loss_mlp": 1.02279019, + "epoch": 0.4049301067187735, + "flos": 22855715533440.0, + "grad_norm": 2.612601511799963, + "language_loss": 0.64894855, + "learning_rate": 2.5888570098111028e-06, + "loss": 0.6700145, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49609375, + "step": 6735, + "time_per_iteration": 3.7507734298706055 + }, + { + "auxiliary_loss_clip": 0.01066196, + "auxiliary_loss_mlp": 0.01027212, + "balance_loss_clip": 1.0140394, + "balance_loss_mlp": 1.0211128, + "epoch": 0.40499022997144146, + "flos": 22782886704000.0, + "grad_norm": 1.6473127383223372, + "language_loss": 0.83260894, + "learning_rate": 2.5884959245163656e-06, + "loss": 0.85354304, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.45117188, + "step": 6736, + "time_per_iteration": 2.39146089553833 + }, + { + "auxiliary_loss_clip": 0.01071421, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.01618218, + "balance_loss_mlp": 1.02221251, + "epoch": 0.4050503532241094, + "flos": 23038241454720.0, + "grad_norm": 1.6542446425362878, + "language_loss": 0.75193226, + "learning_rate": 2.588134818219722e-06, + "loss": 0.77295369, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4921875, + "step": 6737, + "time_per_iteration": 3.7925775051116943 + }, + { + "auxiliary_loss_clip": 0.01069569, + "auxiliary_loss_mlp": 0.01029331, + "balance_loss_clip": 1.01472771, + "balance_loss_mlp": 1.02312136, + "epoch": 0.4051104764767774, + "flos": 16647006389760.0, + "grad_norm": 2.235296063509325, + "language_loss": 0.7187497, + "learning_rate": 2.5877736909340597e-06, + "loss": 0.7397387, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46484375, + "step": 6738, + "time_per_iteration": 2.354199171066284 + }, + { + "auxiliary_loss_clip": 0.0106831, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.01401877, + "balance_loss_mlp": 1.02158296, + "epoch": 0.40517059972944536, + "flos": 16358134867200.0, + "grad_norm": 1.963273455085497, + "language_loss": 0.74832803, + "learning_rate": 2.587412542672267e-06, + "loss": 0.76928312, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.46679688, + "step": 6739, + "time_per_iteration": 2.3229143619537354 + }, + { + "auxiliary_loss_clip": 0.01071431, + "auxiliary_loss_mlp": 0.01024907, + "balance_loss_clip": 1.01092374, + "balance_loss_mlp": 1.02297068, + "epoch": 0.4052307229821133, + "flos": 28802122565760.0, + "grad_norm": 1.9205291727747411, + "language_loss": 0.73877335, + "learning_rate": 2.587051373447231e-06, + "loss": 0.75973678, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.484375, + "step": 6740, + "time_per_iteration": 2.4274768829345703 + }, + { + "auxiliary_loss_clip": 0.01068012, + "auxiliary_loss_mlp": 0.01028526, + "balance_loss_clip": 1.01373243, + "balance_loss_mlp": 1.02207065, + "epoch": 0.4052908462347813, + "flos": 21396799883520.0, + "grad_norm": 1.569536957793962, + "language_loss": 0.7720961, + "learning_rate": 2.586690183271842e-06, + "loss": 0.79306144, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.45898438, + "step": 6741, + "time_per_iteration": 3.797069549560547 + }, + { + "auxiliary_loss_clip": 0.01071242, + "auxiliary_loss_mlp": 0.01024997, + "balance_loss_clip": 1.01076961, + "balance_loss_mlp": 1.0228771, + "epoch": 0.40535096948744925, + "flos": 22417974506880.0, + "grad_norm": 1.8714597145174598, + "language_loss": 0.71167606, + "learning_rate": 2.5863289721589887e-06, + "loss": 0.73263842, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.484375, + "step": 6742, + "time_per_iteration": 2.3611204624176025 + }, + { + "auxiliary_loss_clip": 0.01069952, + "auxiliary_loss_mlp": 0.01024507, + "balance_loss_clip": 1.01088738, + "balance_loss_mlp": 1.02258253, + "epoch": 0.4054110927401172, + "flos": 17010068284800.0, + "grad_norm": 2.146259162742394, + "language_loss": 0.72295761, + "learning_rate": 2.585967740121564e-06, + "loss": 0.74390221, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47460938, + "step": 6743, + "time_per_iteration": 2.325887441635132 + }, + { + "auxiliary_loss_clip": 0.01071192, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.01570058, + "balance_loss_mlp": 1.02182496, + "epoch": 0.4054712159927852, + "flos": 21613820094720.0, + "grad_norm": 2.1146632487652015, + "language_loss": 0.70260417, + "learning_rate": 2.5856064871724565e-06, + "loss": 0.72362161, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49414062, + "step": 6744, + "time_per_iteration": 2.350968599319458 + }, + { + "auxiliary_loss_clip": 0.01067624, + "auxiliary_loss_mlp": 0.01026136, + "balance_loss_clip": 1.01228428, + "balance_loss_mlp": 1.02179658, + "epoch": 0.4055313392454532, + "flos": 25811357702400.0, + "grad_norm": 1.6548231298351643, + "language_loss": 0.80011898, + "learning_rate": 2.58524521332456e-06, + "loss": 0.8210566, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45898438, + "step": 6745, + "time_per_iteration": 2.430224657058716 + }, + { + "auxiliary_loss_clip": 0.01070046, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.01520145, + "balance_loss_mlp": 1.02342439, + "epoch": 0.40559146249812117, + "flos": 14136227164800.0, + "grad_norm": 1.649382215657964, + "language_loss": 0.66812259, + "learning_rate": 2.5848839185907673e-06, + "loss": 0.68911928, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46679688, + "step": 6746, + "time_per_iteration": 2.325143337249756 + }, + { + "auxiliary_loss_clip": 0.01066229, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.01588964, + "balance_loss_mlp": 1.02101254, + "epoch": 0.40565158575078913, + "flos": 41353852320000.0, + "grad_norm": 1.510991387101237, + "language_loss": 0.57044923, + "learning_rate": 2.584522602983973e-06, + "loss": 0.59140575, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 6747, + "time_per_iteration": 2.5534520149230957 + }, + { + "auxiliary_loss_clip": 0.01071799, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.0181911, + "balance_loss_mlp": 1.02396011, + "epoch": 0.4057117090034571, + "flos": 28543381413120.0, + "grad_norm": 1.725194172916797, + "language_loss": 0.84455717, + "learning_rate": 2.58416126651707e-06, + "loss": 0.86559165, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47851562, + "step": 6748, + "time_per_iteration": 2.4420316219329834 + }, + { + "auxiliary_loss_clip": 0.01071958, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.01401615, + "balance_loss_mlp": 1.02387619, + "epoch": 0.40577183225612506, + "flos": 18003102485760.0, + "grad_norm": 2.7078638294695736, + "language_loss": 0.81360143, + "learning_rate": 2.5837999092029535e-06, + "loss": 0.83460081, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48046875, + "step": 6749, + "time_per_iteration": 2.368690013885498 + }, + { + "auxiliary_loss_clip": 0.01071793, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.01493549, + "balance_loss_mlp": 1.02431774, + "epoch": 0.40583195550879303, + "flos": 19535719749120.0, + "grad_norm": 1.5669932607726222, + "language_loss": 0.75566363, + "learning_rate": 2.5834385310545208e-06, + "loss": 0.77667248, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47460938, + "step": 6750, + "time_per_iteration": 2.3974595069885254 + }, + { + "auxiliary_loss_clip": 0.01070745, + "auxiliary_loss_mlp": 0.01031957, + "balance_loss_clip": 1.01707363, + "balance_loss_mlp": 1.02173162, + "epoch": 0.405892078761461, + "flos": 22308382149120.0, + "grad_norm": 2.271652943940136, + "language_loss": 0.73575765, + "learning_rate": 2.583077132084667e-06, + "loss": 0.75678462, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4921875, + "step": 6751, + "time_per_iteration": 2.3680331707000732 + }, + { + "auxiliary_loss_clip": 0.01070803, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.01363993, + "balance_loss_mlp": 1.02279353, + "epoch": 0.40595220201412896, + "flos": 25483209033600.0, + "grad_norm": 1.621435904569357, + "language_loss": 0.6691432, + "learning_rate": 2.5827157123062906e-06, + "loss": 0.69013143, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 6752, + "time_per_iteration": 2.405547618865967 + }, + { + "auxiliary_loss_clip": 0.01074214, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.01480222, + "balance_loss_mlp": 1.0239408, + "epoch": 0.4060123252667969, + "flos": 49854155973120.0, + "grad_norm": 1.8033550961089702, + "language_loss": 0.70984793, + "learning_rate": 2.5823542717322895e-06, + "loss": 0.73088729, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.50390625, + "step": 6753, + "time_per_iteration": 2.642747640609741 + }, + { + "auxiliary_loss_clip": 0.01070738, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.01443887, + "balance_loss_mlp": 1.02150321, + "epoch": 0.4060724485194649, + "flos": 21134602506240.0, + "grad_norm": 2.446515928901878, + "language_loss": 0.71824443, + "learning_rate": 2.5819928103755625e-06, + "loss": 0.7392469, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49023438, + "step": 6754, + "time_per_iteration": 2.3631203174591064 + }, + { + "auxiliary_loss_clip": 0.01070819, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.01506948, + "balance_loss_mlp": 1.02187312, + "epoch": 0.40613257177213286, + "flos": 21757103781120.0, + "grad_norm": 5.258864490308463, + "language_loss": 0.82849824, + "learning_rate": 2.581631328249009e-06, + "loss": 0.84950328, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48828125, + "step": 6755, + "time_per_iteration": 2.38019061088562 + }, + { + "auxiliary_loss_clip": 0.01073747, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.01836395, + "balance_loss_mlp": 1.02485871, + "epoch": 0.4061926950248008, + "flos": 25553943181440.0, + "grad_norm": 1.5158367527827938, + "language_loss": 0.70678288, + "learning_rate": 2.5812698253655293e-06, + "loss": 0.72785819, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.48828125, + "step": 6756, + "time_per_iteration": 2.4420454502105713 + }, + { + "auxiliary_loss_clip": 0.01074339, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.01815474, + "balance_loss_mlp": 1.02314186, + "epoch": 0.4062528182774688, + "flos": 23694678437760.0, + "grad_norm": 2.307787551172105, + "language_loss": 0.79158831, + "learning_rate": 2.580908301738025e-06, + "loss": 0.81266063, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.51171875, + "step": 6757, + "time_per_iteration": 2.3936681747436523 + }, + { + "auxiliary_loss_clip": 0.01070793, + "auxiliary_loss_mlp": 0.0102805, + "balance_loss_clip": 1.01329195, + "balance_loss_mlp": 1.02315772, + "epoch": 0.4063129415301368, + "flos": 21724948552320.0, + "grad_norm": 6.229924784215153, + "language_loss": 0.78622389, + "learning_rate": 2.5805467573793977e-06, + "loss": 0.80721241, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4765625, + "step": 6758, + "time_per_iteration": 2.389322280883789 + }, + { + "auxiliary_loss_clip": 0.01070337, + "auxiliary_loss_mlp": 0.01026695, + "balance_loss_clip": 1.0123899, + "balance_loss_mlp": 1.02250147, + "epoch": 0.40637306478280477, + "flos": 12786729315840.0, + "grad_norm": 1.9563186481092167, + "language_loss": 0.88513279, + "learning_rate": 2.5801851923025495e-06, + "loss": 0.90610313, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47851562, + "step": 6759, + "time_per_iteration": 2.332597255706787 + }, + { + "auxiliary_loss_clip": 0.01071991, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01352835, + "balance_loss_mlp": 1.02373266, + "epoch": 0.40643318803547274, + "flos": 24023350776960.0, + "grad_norm": 2.8638156739429177, + "language_loss": 0.6379143, + "learning_rate": 2.579823606520385e-06, + "loss": 0.65890747, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48242188, + "step": 6760, + "time_per_iteration": 2.408644199371338 + }, + { + "auxiliary_loss_clip": 0.01069924, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.01132274, + "balance_loss_mlp": 1.02163565, + "epoch": 0.4064933112881407, + "flos": 25591265291520.0, + "grad_norm": 1.8474983053500185, + "language_loss": 0.74860454, + "learning_rate": 2.5794620000458065e-06, + "loss": 0.76956928, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.484375, + "step": 6761, + "time_per_iteration": 2.4151394367218018 + }, + { + "auxiliary_loss_clip": 0.01066547, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.01594543, + "balance_loss_mlp": 1.02272248, + "epoch": 0.40655343454080867, + "flos": 22053236866560.0, + "grad_norm": 1.5202499675454852, + "language_loss": 0.69843274, + "learning_rate": 2.5791003728917204e-06, + "loss": 0.71939313, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4375, + "step": 6762, + "time_per_iteration": 2.390000343322754 + }, + { + "auxiliary_loss_clip": 0.01069426, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.01882243, + "balance_loss_mlp": 1.02268732, + "epoch": 0.40661355779347663, + "flos": 26467689951360.0, + "grad_norm": 1.6965544758820772, + "language_loss": 0.73104417, + "learning_rate": 2.578738725071032e-06, + "loss": 0.75206369, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46875, + "step": 6763, + "time_per_iteration": 2.421066999435425 + }, + { + "auxiliary_loss_clip": 0.01072847, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.01150203, + "balance_loss_mlp": 1.022995, + "epoch": 0.4066736810461446, + "flos": 13260291264000.0, + "grad_norm": 1.9264072443478988, + "language_loss": 0.69346821, + "learning_rate": 2.578377056596646e-06, + "loss": 0.71445888, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.49804688, + "step": 6764, + "time_per_iteration": 2.331397294998169 + }, + { + "auxiliary_loss_clip": 0.01072961, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01559722, + "balance_loss_mlp": 1.02315533, + "epoch": 0.40673380429881256, + "flos": 28802366945280.0, + "grad_norm": 2.16049075504269, + "language_loss": 0.91001081, + "learning_rate": 2.5780153674814714e-06, + "loss": 0.93105006, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49804688, + "step": 6765, + "time_per_iteration": 2.416363477706909 + }, + { + "auxiliary_loss_clip": 0.01072599, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.01482642, + "balance_loss_mlp": 1.02258444, + "epoch": 0.4067939275514805, + "flos": 12494506302720.0, + "grad_norm": 2.332640814172266, + "language_loss": 0.78810829, + "learning_rate": 2.5776536577384148e-06, + "loss": 0.80913723, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.5, + "step": 6766, + "time_per_iteration": 2.356525182723999 + }, + { + "auxiliary_loss_clip": 0.01069771, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.01681316, + "balance_loss_mlp": 1.0209806, + "epoch": 0.4068540508041485, + "flos": 18769515851520.0, + "grad_norm": 1.9313717383806484, + "language_loss": 0.76252061, + "learning_rate": 2.5772919273803855e-06, + "loss": 0.78354037, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.48828125, + "step": 6767, + "time_per_iteration": 2.341477870941162 + }, + { + "auxiliary_loss_clip": 0.01069338, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.01598525, + "balance_loss_mlp": 1.02414203, + "epoch": 0.40691417405681646, + "flos": 28511540386560.0, + "grad_norm": 1.8061546337580698, + "language_loss": 0.77520949, + "learning_rate": 2.576930176420292e-06, + "loss": 0.79620254, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.453125, + "step": 6768, + "time_per_iteration": 2.42618989944458 + }, + { + "auxiliary_loss_clip": 0.01071368, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.01725197, + "balance_loss_mlp": 1.02340436, + "epoch": 0.4069742973094844, + "flos": 20812982261760.0, + "grad_norm": 2.0131359970069735, + "language_loss": 0.86878633, + "learning_rate": 2.5765684048710452e-06, + "loss": 0.88981277, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48046875, + "step": 6769, + "time_per_iteration": 2.353557825088501 + }, + { + "auxiliary_loss_clip": 0.01068167, + "auxiliary_loss_mlp": 0.01026992, + "balance_loss_clip": 1.0141238, + "balance_loss_mlp": 1.0231986, + "epoch": 0.4070344205621524, + "flos": 21469209776640.0, + "grad_norm": 2.1222402394798796, + "language_loss": 0.80614281, + "learning_rate": 2.5762066127455544e-06, + "loss": 0.82709438, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44921875, + "step": 6770, + "time_per_iteration": 2.398536443710327 + }, + { + "auxiliary_loss_clip": 0.01072589, + "auxiliary_loss_mlp": 0.01027818, + "balance_loss_clip": 1.01242781, + "balance_loss_mlp": 1.02253175, + "epoch": 0.4070945438148204, + "flos": 26828936455680.0, + "grad_norm": 9.86414748276642, + "language_loss": 0.80358911, + "learning_rate": 2.5758448000567324e-06, + "loss": 0.82459325, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5, + "step": 6771, + "time_per_iteration": 3.7735135555267334 + }, + { + "auxiliary_loss_clip": 0.01067928, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.01835465, + "balance_loss_mlp": 1.02166486, + "epoch": 0.4071546670674884, + "flos": 26353105269120.0, + "grad_norm": 1.4044224238419387, + "language_loss": 0.75963652, + "learning_rate": 2.57548296681749e-06, + "loss": 0.78063172, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4609375, + "step": 6772, + "time_per_iteration": 2.4337310791015625 + }, + { + "auxiliary_loss_clip": 0.01069332, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.01946521, + "balance_loss_mlp": 1.02102971, + "epoch": 0.40721479032015634, + "flos": 17894417823360.0, + "grad_norm": 1.792144494549322, + "language_loss": 0.81155348, + "learning_rate": 2.5751211130407414e-06, + "loss": 0.83259058, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.484375, + "step": 6773, + "time_per_iteration": 2.3488004207611084 + }, + { + "auxiliary_loss_clip": 0.01012613, + "auxiliary_loss_mlp": 0.00999505, + "balance_loss_clip": 0.99858737, + "balance_loss_mlp": 1.00278783, + "epoch": 0.4072749135728243, + "flos": 49851745223040.0, + "grad_norm": 0.8647044492081482, + "language_loss": 0.64313209, + "learning_rate": 2.5747592387393993e-06, + "loss": 0.66325319, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.09863281, + "step": 6774, + "time_per_iteration": 4.307700872421265 + }, + { + "auxiliary_loss_clip": 0.01070768, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.0112195, + "balance_loss_mlp": 1.02242064, + "epoch": 0.40733503682549227, + "flos": 27562391631360.0, + "grad_norm": 3.502446291548673, + "language_loss": 0.72698224, + "learning_rate": 2.574397343926379e-06, + "loss": 0.74794436, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48242188, + "step": 6775, + "time_per_iteration": 2.4629786014556885 + }, + { + "auxiliary_loss_clip": 0.01070367, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.02105296, + "balance_loss_mlp": 1.02162981, + "epoch": 0.40739516007816023, + "flos": 22125891139200.0, + "grad_norm": 1.5483394020475754, + "language_loss": 0.77811998, + "learning_rate": 2.5740354286145936e-06, + "loss": 0.7991758, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48828125, + "step": 6776, + "time_per_iteration": 2.3791956901550293 + }, + { + "auxiliary_loss_clip": 0.01072612, + "auxiliary_loss_mlp": 0.01026947, + "balance_loss_clip": 1.01214111, + "balance_loss_mlp": 1.02327108, + "epoch": 0.4074552833308282, + "flos": 23841104146560.0, + "grad_norm": 2.7660797328815065, + "language_loss": 0.79176748, + "learning_rate": 2.5736734928169616e-06, + "loss": 0.81276298, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4921875, + "step": 6777, + "time_per_iteration": 3.747183084487915 + }, + { + "auxiliary_loss_clip": 0.01012629, + "auxiliary_loss_mlp": 0.01001792, + "balance_loss_clip": 1.00079668, + "balance_loss_mlp": 1.00278544, + "epoch": 0.40751540658349616, + "flos": 57996702391680.0, + "grad_norm": 0.6812871602903141, + "language_loss": 0.53173167, + "learning_rate": 2.5733115365463976e-06, + "loss": 0.55187589, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.09863281, + "step": 6778, + "time_per_iteration": 3.0530123710632324 + }, + { + "auxiliary_loss_clip": 0.01070583, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01610172, + "balance_loss_mlp": 1.02306414, + "epoch": 0.40757552983616413, + "flos": 21213610646400.0, + "grad_norm": 2.007535243672699, + "language_loss": 0.81935978, + "learning_rate": 2.5729495598158205e-06, + "loss": 0.8403728, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47460938, + "step": 6779, + "time_per_iteration": 2.379159450531006 + }, + { + "auxiliary_loss_clip": 0.01069527, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.01241446, + "balance_loss_mlp": 1.02196705, + "epoch": 0.4076356530888321, + "flos": 26832322857600.0, + "grad_norm": 2.6829506857287853, + "language_loss": 0.76185381, + "learning_rate": 2.572587562638147e-06, + "loss": 0.78282416, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.47460938, + "step": 6780, + "time_per_iteration": 3.8712751865386963 + }, + { + "auxiliary_loss_clip": 0.01068903, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.01744866, + "balance_loss_mlp": 1.02311921, + "epoch": 0.40769577634150006, + "flos": 12202213466880.0, + "grad_norm": 1.9864026402658193, + "language_loss": 0.6612184, + "learning_rate": 2.572225545026296e-06, + "loss": 0.68221343, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.45898438, + "step": 6781, + "time_per_iteration": 2.333235263824463 + }, + { + "auxiliary_loss_clip": 0.01067373, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.02075613, + "balance_loss_mlp": 1.02118492, + "epoch": 0.407755899594168, + "flos": 33653897740800.0, + "grad_norm": 1.5871166968702548, + "language_loss": 0.71699488, + "learning_rate": 2.5718635069931875e-06, + "loss": 0.7380181, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46289062, + "step": 6782, + "time_per_iteration": 2.4750659465789795 + }, + { + "auxiliary_loss_clip": 0.01071825, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.01522338, + "balance_loss_mlp": 1.02348185, + "epoch": 0.407816022846836, + "flos": 20156300899200.0, + "grad_norm": 1.7085130866774696, + "language_loss": 0.82700086, + "learning_rate": 2.571501448551741e-06, + "loss": 0.84802032, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.484375, + "step": 6783, + "time_per_iteration": 2.368363380432129 + }, + { + "auxiliary_loss_clip": 0.01071065, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.01560211, + "balance_loss_mlp": 1.023103, + "epoch": 0.40787614609950396, + "flos": 21177754813440.0, + "grad_norm": 1.8994056122495198, + "language_loss": 0.80821627, + "learning_rate": 2.5711393697148787e-06, + "loss": 0.8292321, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48046875, + "step": 6784, + "time_per_iteration": 2.39919376373291 + }, + { + "auxiliary_loss_clip": 0.01073518, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.0168556, + "balance_loss_mlp": 1.02292514, + "epoch": 0.407936269352172, + "flos": 20519642085120.0, + "grad_norm": 1.803321441780094, + "language_loss": 0.65450573, + "learning_rate": 2.5707772704955214e-06, + "loss": 0.67557347, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.50390625, + "step": 6785, + "time_per_iteration": 2.372666835784912 + }, + { + "auxiliary_loss_clip": 0.01070182, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.01544905, + "balance_loss_mlp": 1.02303791, + "epoch": 0.40799639260483994, + "flos": 20117826714240.0, + "grad_norm": 2.374723674940677, + "language_loss": 0.79911333, + "learning_rate": 2.570415150906591e-06, + "loss": 0.82011133, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47265625, + "step": 6786, + "time_per_iteration": 2.3486101627349854 + }, + { + "auxiliary_loss_clip": 0.01069525, + "auxiliary_loss_mlp": 0.01028746, + "balance_loss_clip": 1.0150193, + "balance_loss_mlp": 1.02234197, + "epoch": 0.4080565158575079, + "flos": 20996241321600.0, + "grad_norm": 1.9913624457611243, + "language_loss": 0.81865668, + "learning_rate": 2.570053010961011e-06, + "loss": 0.83963943, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47265625, + "step": 6787, + "time_per_iteration": 2.369250774383545 + }, + { + "auxiliary_loss_clip": 0.01067796, + "auxiliary_loss_mlp": 0.01024637, + "balance_loss_clip": 1.01029015, + "balance_loss_mlp": 1.02279854, + "epoch": 0.40811663911017587, + "flos": 19316709590400.0, + "grad_norm": 1.7124272564759941, + "language_loss": 0.67843306, + "learning_rate": 2.5696908506717054e-06, + "loss": 0.69935733, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.44921875, + "step": 6788, + "time_per_iteration": 2.3602333068847656 + }, + { + "auxiliary_loss_clip": 0.01068048, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.01533437, + "balance_loss_mlp": 1.02178621, + "epoch": 0.40817676236284384, + "flos": 40623783546240.0, + "grad_norm": 2.0211163942417323, + "language_loss": 0.65949368, + "learning_rate": 2.5693286700515993e-06, + "loss": 0.68047202, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46289062, + "step": 6789, + "time_per_iteration": 2.5659029483795166 + }, + { + "auxiliary_loss_clip": 0.01070439, + "auxiliary_loss_mlp": 0.01025824, + "balance_loss_clip": 1.01178074, + "balance_loss_mlp": 1.02277291, + "epoch": 0.4082368856155118, + "flos": 20521038539520.0, + "grad_norm": 1.7128437784241677, + "language_loss": 0.6015017, + "learning_rate": 2.568966469113617e-06, + "loss": 0.6224643, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4765625, + "step": 6790, + "time_per_iteration": 2.3918235301971436 + }, + { + "auxiliary_loss_clip": 0.01071197, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01661563, + "balance_loss_mlp": 1.02349114, + "epoch": 0.40829700886817977, + "flos": 11427211906560.0, + "grad_norm": 3.4423362186287156, + "language_loss": 0.69535053, + "learning_rate": 2.568604247870685e-06, + "loss": 0.71637297, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4765625, + "step": 6791, + "time_per_iteration": 2.349632740020752 + }, + { + "auxiliary_loss_clip": 0.01011949, + "auxiliary_loss_mlp": 0.01007222, + "balance_loss_clip": 1.00610769, + "balance_loss_mlp": 1.0022645, + "epoch": 0.40835713212084773, + "flos": 67327380754560.0, + "grad_norm": 0.7419159836389468, + "language_loss": 0.59673309, + "learning_rate": 2.5682420063357308e-06, + "loss": 0.61692476, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.09667969, + "step": 6792, + "time_per_iteration": 3.003708600997925 + }, + { + "auxiliary_loss_clip": 0.0107319, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.02005732, + "balance_loss_mlp": 1.02407932, + "epoch": 0.4084172553735157, + "flos": 21760944030720.0, + "grad_norm": 1.6989987123418613, + "language_loss": 0.80104542, + "learning_rate": 2.5678797445216798e-06, + "loss": 0.82213306, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.4921875, + "step": 6793, + "time_per_iteration": 2.375223159790039 + }, + { + "auxiliary_loss_clip": 0.01071659, + "auxiliary_loss_mlp": 0.01030176, + "balance_loss_clip": 1.01501274, + "balance_loss_mlp": 1.02324581, + "epoch": 0.40847737862618366, + "flos": 27416035745280.0, + "grad_norm": 1.8826417374156947, + "language_loss": 0.84705305, + "learning_rate": 2.5675174624414626e-06, + "loss": 0.86807138, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.484375, + "step": 6794, + "time_per_iteration": 2.436815023422241 + }, + { + "auxiliary_loss_clip": 0.01072372, + "auxiliary_loss_mlp": 0.01028621, + "balance_loss_clip": 1.01453662, + "balance_loss_mlp": 1.02419114, + "epoch": 0.4085375018788516, + "flos": 18586291703040.0, + "grad_norm": 1.6081441511663996, + "language_loss": 0.80222994, + "learning_rate": 2.5671551601080057e-06, + "loss": 0.8232398, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48046875, + "step": 6795, + "time_per_iteration": 2.3763954639434814 + }, + { + "auxiliary_loss_clip": 0.01073548, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.01538432, + "balance_loss_mlp": 1.02244473, + "epoch": 0.4085976251315196, + "flos": 15410941148160.0, + "grad_norm": 2.2366101498858746, + "language_loss": 0.70252764, + "learning_rate": 2.5667928375342414e-06, + "loss": 0.72357225, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.51171875, + "step": 6796, + "time_per_iteration": 2.327960968017578 + }, + { + "auxiliary_loss_clip": 0.0106897, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.01612711, + "balance_loss_mlp": 1.02222121, + "epoch": 0.40865774838418756, + "flos": 21251142224640.0, + "grad_norm": 1.8229554235251837, + "language_loss": 0.77677798, + "learning_rate": 2.5664304947330985e-06, + "loss": 0.79777002, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46875, + "step": 6797, + "time_per_iteration": 2.381263017654419 + }, + { + "auxiliary_loss_clip": 0.010702, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.01741314, + "balance_loss_mlp": 1.0219605, + "epoch": 0.4087178716368556, + "flos": 13771384790400.0, + "grad_norm": 1.803033884575777, + "language_loss": 0.85323894, + "learning_rate": 2.5660681317175076e-06, + "loss": 0.87425327, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48242188, + "step": 6798, + "time_per_iteration": 2.3375985622406006 + }, + { + "auxiliary_loss_clip": 0.01069059, + "auxiliary_loss_mlp": 0.01028965, + "balance_loss_clip": 1.01584625, + "balance_loss_mlp": 1.02386451, + "epoch": 0.40877799488952354, + "flos": 23620662622080.0, + "grad_norm": 1.5170671942072367, + "language_loss": 0.8429361, + "learning_rate": 2.5657057485004016e-06, + "loss": 0.86391634, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45117188, + "step": 6799, + "time_per_iteration": 2.3903050422668457 + }, + { + "auxiliary_loss_clip": 0.01073691, + "auxiliary_loss_mlp": 0.01038829, + "balance_loss_clip": 1.02390385, + "balance_loss_mlp": 1.02479386, + "epoch": 0.4088381181421915, + "flos": 20917861585920.0, + "grad_norm": 2.695034570479182, + "language_loss": 0.83554947, + "learning_rate": 2.565343345094712e-06, + "loss": 0.85667461, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48828125, + "step": 6800, + "time_per_iteration": 2.3522610664367676 + }, + { + "auxiliary_loss_clip": 0.01071051, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.01513517, + "balance_loss_mlp": 1.02247679, + "epoch": 0.4088982413948595, + "flos": 13296740590080.0, + "grad_norm": 1.8535662763196286, + "language_loss": 0.74267709, + "learning_rate": 2.5649809215133737e-06, + "loss": 0.76368845, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48632812, + "step": 6801, + "time_per_iteration": 2.4040329456329346 + }, + { + "auxiliary_loss_clip": 0.01070094, + "auxiliary_loss_mlp": 0.01033049, + "balance_loss_clip": 1.01894641, + "balance_loss_mlp": 1.02243233, + "epoch": 0.40895836464752744, + "flos": 15266784677760.0, + "grad_norm": 1.9815218020442402, + "language_loss": 0.80010808, + "learning_rate": 2.5646184777693193e-06, + "loss": 0.82113951, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4765625, + "step": 6802, + "time_per_iteration": 2.3442373275756836 + }, + { + "auxiliary_loss_clip": 0.01071792, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.01989901, + "balance_loss_mlp": 1.02280021, + "epoch": 0.4090184879001954, + "flos": 14500545868800.0, + "grad_norm": 2.0520559510038083, + "language_loss": 0.76117265, + "learning_rate": 2.5642560138754833e-06, + "loss": 0.78225052, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.49023438, + "step": 6803, + "time_per_iteration": 2.3382906913757324 + }, + { + "auxiliary_loss_clip": 0.0107129, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.02088523, + "balance_loss_mlp": 1.02294755, + "epoch": 0.40907861115286337, + "flos": 13880732768640.0, + "grad_norm": 9.559075136923987, + "language_loss": 0.78819716, + "learning_rate": 2.5638935298448017e-06, + "loss": 0.8092646, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.484375, + "step": 6804, + "time_per_iteration": 2.345945358276367 + }, + { + "auxiliary_loss_clip": 0.01074361, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01582348, + "balance_loss_mlp": 1.0239377, + "epoch": 0.40913873440553133, + "flos": 28036372515840.0, + "grad_norm": 1.8771511241250898, + "language_loss": 0.74873304, + "learning_rate": 2.5635310256902106e-06, + "loss": 0.76978493, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.50390625, + "step": 6805, + "time_per_iteration": 2.423391580581665 + }, + { + "auxiliary_loss_clip": 0.01073318, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.01929247, + "balance_loss_mlp": 1.02551174, + "epoch": 0.4091988576581993, + "flos": 21617066851200.0, + "grad_norm": 1.6115646215200137, + "language_loss": 0.79510528, + "learning_rate": 2.563168501424647e-06, + "loss": 0.81617373, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47851562, + "step": 6806, + "time_per_iteration": 2.403144121170044 + }, + { + "auxiliary_loss_clip": 0.01070482, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.01314974, + "balance_loss_mlp": 1.02242827, + "epoch": 0.40925898091086726, + "flos": 25223036515200.0, + "grad_norm": 1.8786747263081227, + "language_loss": 0.72319138, + "learning_rate": 2.562805957061048e-06, + "loss": 0.74418348, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.48046875, + "step": 6807, + "time_per_iteration": 2.404571056365967 + }, + { + "auxiliary_loss_clip": 0.01011993, + "auxiliary_loss_mlp": 0.01002756, + "balance_loss_clip": 1.00172484, + "balance_loss_mlp": 1.00204647, + "epoch": 0.40931910416353523, + "flos": 68927380675200.0, + "grad_norm": 0.8157209897836727, + "language_loss": 0.58783925, + "learning_rate": 2.5624433926123524e-06, + "loss": 0.60798669, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.09960938, + "step": 6808, + "time_per_iteration": 3.105621814727783 + }, + { + "auxiliary_loss_clip": 0.01069361, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.02371693, + "balance_loss_mlp": 1.02266181, + "epoch": 0.4093792274162032, + "flos": 20188630684800.0, + "grad_norm": 1.82038170123016, + "language_loss": 0.54140264, + "learning_rate": 2.5620808080914985e-06, + "loss": 0.56246978, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46484375, + "step": 6809, + "time_per_iteration": 2.3663761615753174 + }, + { + "auxiliary_loss_clip": 0.01072927, + "auxiliary_loss_mlp": 0.01021696, + "balance_loss_clip": 1.00825477, + "balance_loss_mlp": 1.02399659, + "epoch": 0.40943935066887116, + "flos": 25227574992000.0, + "grad_norm": 1.6906560948523253, + "language_loss": 0.8330425, + "learning_rate": 2.5617182035114262e-06, + "loss": 0.85398865, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.48828125, + "step": 6810, + "time_per_iteration": 3.8594319820404053 + }, + { + "auxiliary_loss_clip": 0.0107353, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.01626301, + "balance_loss_mlp": 1.02441025, + "epoch": 0.4094994739215392, + "flos": 23254284147840.0, + "grad_norm": 2.2016167394010644, + "language_loss": 0.77551007, + "learning_rate": 2.5613555788850768e-06, + "loss": 0.79656231, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49023438, + "step": 6811, + "time_per_iteration": 2.368163824081421 + }, + { + "auxiliary_loss_clip": 0.01069804, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01275134, + "balance_loss_mlp": 1.02206635, + "epoch": 0.40955959717420715, + "flos": 17381264526720.0, + "grad_norm": 1.6958132592292081, + "language_loss": 0.70275563, + "learning_rate": 2.5609929342253905e-06, + "loss": 0.72373545, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4765625, + "step": 6812, + "time_per_iteration": 2.3505349159240723 + }, + { + "auxiliary_loss_clip": 0.01068639, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.02175927, + "balance_loss_mlp": 1.02166402, + "epoch": 0.4096197204268751, + "flos": 25081254017280.0, + "grad_norm": 2.857927966384935, + "language_loss": 0.66329747, + "learning_rate": 2.5606302695453093e-06, + "loss": 0.68434501, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47070312, + "step": 6813, + "time_per_iteration": 2.398864507675171 + }, + { + "auxiliary_loss_clip": 0.0107143, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.01430202, + "balance_loss_mlp": 1.02415061, + "epoch": 0.4096798436795431, + "flos": 30585591014400.0, + "grad_norm": 1.9361590789125729, + "language_loss": 0.73674417, + "learning_rate": 2.5602675848577763e-06, + "loss": 0.75775045, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.47265625, + "step": 6814, + "time_per_iteration": 3.8620433807373047 + }, + { + "auxiliary_loss_clip": 0.01072751, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.01883304, + "balance_loss_mlp": 1.02361798, + "epoch": 0.40973996693221104, + "flos": 24132489287040.0, + "grad_norm": 2.8713217961728046, + "language_loss": 0.66395146, + "learning_rate": 2.5599048801757345e-06, + "loss": 0.68501759, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49023438, + "step": 6815, + "time_per_iteration": 2.4038314819335938 + }, + { + "auxiliary_loss_clip": 0.01070512, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.01833582, + "balance_loss_mlp": 1.02434206, + "epoch": 0.409800090184879, + "flos": 23987809146240.0, + "grad_norm": 3.764289264157004, + "language_loss": 0.76538342, + "learning_rate": 2.559542155512127e-06, + "loss": 0.78640544, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4609375, + "step": 6816, + "time_per_iteration": 2.401986598968506 + }, + { + "auxiliary_loss_clip": 0.01071186, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.01695514, + "balance_loss_mlp": 1.02314579, + "epoch": 0.40986021343754697, + "flos": 16142755489920.0, + "grad_norm": 2.016740668090845, + "language_loss": 0.82343602, + "learning_rate": 2.5591794108798996e-06, + "loss": 0.84446204, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 6817, + "time_per_iteration": 3.6796512603759766 + }, + { + "auxiliary_loss_clip": 0.01071686, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.01209259, + "balance_loss_mlp": 1.02188432, + "epoch": 0.40992033669021494, + "flos": 24789659408640.0, + "grad_norm": 2.258737455133246, + "language_loss": 0.80263257, + "learning_rate": 2.5588166462919977e-06, + "loss": 0.8236326, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.49804688, + "step": 6818, + "time_per_iteration": 2.4060869216918945 + }, + { + "auxiliary_loss_clip": 0.01069163, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.01607454, + "balance_loss_mlp": 1.02205515, + "epoch": 0.4099804599428829, + "flos": 29640631622400.0, + "grad_norm": 1.5559221649418766, + "language_loss": 0.66237116, + "learning_rate": 2.558453861761367e-06, + "loss": 0.68335891, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47070312, + "step": 6819, + "time_per_iteration": 3.7800543308258057 + }, + { + "auxiliary_loss_clip": 0.01075855, + "auxiliary_loss_mlp": 0.01034944, + "balance_loss_clip": 1.01849318, + "balance_loss_mlp": 1.02339292, + "epoch": 0.41004058319555087, + "flos": 24825445418880.0, + "grad_norm": 1.536465283800061, + "language_loss": 0.85190713, + "learning_rate": 2.5580910573009544e-06, + "loss": 0.87301517, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.5234375, + "step": 6820, + "time_per_iteration": 2.396991491317749 + }, + { + "auxiliary_loss_clip": 0.01070754, + "auxiliary_loss_mlp": 0.01028904, + "balance_loss_clip": 1.01470041, + "balance_loss_mlp": 1.023633, + "epoch": 0.41010070644821883, + "flos": 25736329457280.0, + "grad_norm": 1.6480613882278838, + "language_loss": 0.7125718, + "learning_rate": 2.5577282329237072e-06, + "loss": 0.73356831, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47265625, + "step": 6821, + "time_per_iteration": 2.4129202365875244 + }, + { + "auxiliary_loss_clip": 0.01072815, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.01744914, + "balance_loss_mlp": 1.02468276, + "epoch": 0.4101608297008868, + "flos": 22344971120640.0, + "grad_norm": 1.7418445181984965, + "language_loss": 0.7443831, + "learning_rate": 2.5573653886425745e-06, + "loss": 0.76542413, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48046875, + "step": 6822, + "time_per_iteration": 2.3635151386260986 + }, + { + "auxiliary_loss_clip": 0.01070819, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01843667, + "balance_loss_mlp": 1.02254891, + "epoch": 0.41022095295355476, + "flos": 21943993622400.0, + "grad_norm": 2.0891889050682666, + "language_loss": 0.75656128, + "learning_rate": 2.5570025244705044e-06, + "loss": 0.77759135, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.48242188, + "step": 6823, + "time_per_iteration": 2.3817458152770996 + }, + { + "auxiliary_loss_clip": 0.01075934, + "auxiliary_loss_mlp": 0.01035287, + "balance_loss_clip": 1.01926553, + "balance_loss_mlp": 1.02365136, + "epoch": 0.4102810762062228, + "flos": 27449377960320.0, + "grad_norm": 1.6950894858580572, + "language_loss": 0.73033452, + "learning_rate": 2.5566396404204473e-06, + "loss": 0.75144678, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.5234375, + "step": 6824, + "time_per_iteration": 2.4197685718536377 + }, + { + "auxiliary_loss_clip": 0.01072068, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.01565242, + "balance_loss_mlp": 1.02210569, + "epoch": 0.41034119945889075, + "flos": 24498099711360.0, + "grad_norm": 2.1456375577122215, + "language_loss": 0.71785915, + "learning_rate": 2.556276736505353e-06, + "loss": 0.73888934, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.5, + "step": 6825, + "time_per_iteration": 2.395307779312134 + }, + { + "auxiliary_loss_clip": 0.0107073, + "auxiliary_loss_mlp": 0.01030405, + "balance_loss_clip": 1.01626658, + "balance_loss_mlp": 1.02358508, + "epoch": 0.4104013227115587, + "flos": 24351499445760.0, + "grad_norm": 1.8738583528423634, + "language_loss": 0.84817088, + "learning_rate": 2.555913812738173e-06, + "loss": 0.86918229, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47070312, + "step": 6826, + "time_per_iteration": 2.3874599933624268 + }, + { + "auxiliary_loss_clip": 0.01068102, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.01406074, + "balance_loss_mlp": 1.02299333, + "epoch": 0.4104614459642267, + "flos": 23728299943680.0, + "grad_norm": 2.310350944628638, + "language_loss": 0.66125596, + "learning_rate": 2.555550869131859e-06, + "loss": 0.68221939, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.45117188, + "step": 6827, + "time_per_iteration": 2.4118964672088623 + }, + { + "auxiliary_loss_clip": 0.01069144, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.01433277, + "balance_loss_mlp": 1.02299356, + "epoch": 0.41052156921689464, + "flos": 22126868657280.0, + "grad_norm": 1.417022427089946, + "language_loss": 0.72067219, + "learning_rate": 2.555187905699364e-06, + "loss": 0.74164188, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4609375, + "step": 6828, + "time_per_iteration": 2.366746425628662 + }, + { + "auxiliary_loss_clip": 0.01073594, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.01841283, + "balance_loss_mlp": 1.0238508, + "epoch": 0.4105816924695626, + "flos": 20083332424320.0, + "grad_norm": 1.8276712385745477, + "language_loss": 0.82953995, + "learning_rate": 2.5548249224536404e-06, + "loss": 0.85060787, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49609375, + "step": 6829, + "time_per_iteration": 2.356093406677246 + }, + { + "auxiliary_loss_clip": 0.01067777, + "auxiliary_loss_mlp": 0.01026991, + "balance_loss_clip": 1.01352, + "balance_loss_mlp": 1.02243114, + "epoch": 0.4106418157222306, + "flos": 18075826581120.0, + "grad_norm": 1.4619835389990454, + "language_loss": 0.89773607, + "learning_rate": 2.5544619194076425e-06, + "loss": 0.91868377, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 6830, + "time_per_iteration": 2.375152826309204 + }, + { + "auxiliary_loss_clip": 0.01071174, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.01268494, + "balance_loss_mlp": 1.0227474, + "epoch": 0.41070193897489854, + "flos": 21646917930240.0, + "grad_norm": 2.062887251507835, + "language_loss": 0.65535003, + "learning_rate": 2.5540988965743252e-06, + "loss": 0.67633456, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.484375, + "step": 6831, + "time_per_iteration": 2.3642213344573975 + }, + { + "auxiliary_loss_clip": 0.01069794, + "auxiliary_loss_mlp": 0.01026971, + "balance_loss_clip": 1.01292205, + "balance_loss_mlp": 1.02318001, + "epoch": 0.4107620622275665, + "flos": 26647073850240.0, + "grad_norm": 1.7134508303130769, + "language_loss": 0.69318336, + "learning_rate": 2.553735853966643e-06, + "loss": 0.71415102, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46484375, + "step": 6832, + "time_per_iteration": 2.413144588470459 + }, + { + "auxiliary_loss_clip": 0.01068391, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.0169723, + "balance_loss_mlp": 1.02204156, + "epoch": 0.41082218548023447, + "flos": 18733310904960.0, + "grad_norm": 2.7807444521760973, + "language_loss": 0.73383456, + "learning_rate": 2.553372791597553e-06, + "loss": 0.75482619, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46289062, + "step": 6833, + "time_per_iteration": 2.3391172885894775 + }, + { + "auxiliary_loss_clip": 0.01075022, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01432288, + "balance_loss_mlp": 1.0244509, + "epoch": 0.41088230873290243, + "flos": 22892653618560.0, + "grad_norm": 2.4050882123985686, + "language_loss": 0.72357494, + "learning_rate": 2.553009709480011e-06, + "loss": 0.74462968, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.5078125, + "step": 6834, + "time_per_iteration": 2.379518747329712 + }, + { + "auxiliary_loss_clip": 0.01071632, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.01927161, + "balance_loss_mlp": 1.02214825, + "epoch": 0.4109424319855704, + "flos": 24275912618880.0, + "grad_norm": 2.010593019270302, + "language_loss": 0.7126807, + "learning_rate": 2.5526466076269756e-06, + "loss": 0.73374051, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.49609375, + "step": 6835, + "time_per_iteration": 2.4197404384613037 + }, + { + "auxiliary_loss_clip": 0.0107166, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.01407242, + "balance_loss_mlp": 1.02339292, + "epoch": 0.41100255523823837, + "flos": 12968312630400.0, + "grad_norm": 1.8611482051274517, + "language_loss": 0.87306398, + "learning_rate": 2.5522834860514044e-06, + "loss": 0.89407998, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.484375, + "step": 6836, + "time_per_iteration": 2.356461524963379 + }, + { + "auxiliary_loss_clip": 0.01070379, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01754606, + "balance_loss_mlp": 1.02415276, + "epoch": 0.4110626784909064, + "flos": 23144621967360.0, + "grad_norm": 1.9842992557298158, + "language_loss": 0.68503505, + "learning_rate": 2.5519203447662554e-06, + "loss": 0.7060582, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4609375, + "step": 6837, + "time_per_iteration": 2.3727900981903076 + }, + { + "auxiliary_loss_clip": 0.01072299, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.01713347, + "balance_loss_mlp": 1.02330422, + "epoch": 0.41112280174357435, + "flos": 22746297732480.0, + "grad_norm": 2.015972773582062, + "language_loss": 0.75307292, + "learning_rate": 2.5515571837844902e-06, + "loss": 0.77411914, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49023438, + "step": 6838, + "time_per_iteration": 2.3702847957611084 + }, + { + "auxiliary_loss_clip": 0.01071439, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.01401162, + "balance_loss_mlp": 1.0233146, + "epoch": 0.4111829249962423, + "flos": 21101434848000.0, + "grad_norm": 2.067495810041452, + "language_loss": 0.8241576, + "learning_rate": 2.5511940031190663e-06, + "loss": 0.84516901, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.48046875, + "step": 6839, + "time_per_iteration": 2.3549976348876953 + }, + { + "auxiliary_loss_clip": 0.01068582, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.01493526, + "balance_loss_mlp": 1.02148759, + "epoch": 0.4112430482489103, + "flos": 21504751407360.0, + "grad_norm": 1.6671552403740855, + "language_loss": 0.80184591, + "learning_rate": 2.550830802782948e-06, + "loss": 0.8228153, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47070312, + "step": 6840, + "time_per_iteration": 2.3657047748565674 + }, + { + "auxiliary_loss_clip": 0.01066901, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.01166511, + "balance_loss_mlp": 1.02098584, + "epoch": 0.41130317150157825, + "flos": 19569096875520.0, + "grad_norm": 2.368709233661815, + "language_loss": 0.68344259, + "learning_rate": 2.5504675827890945e-06, + "loss": 0.70435691, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45898438, + "step": 6841, + "time_per_iteration": 2.3337619304656982 + }, + { + "auxiliary_loss_clip": 0.01012352, + "auxiliary_loss_mlp": 0.01004627, + "balance_loss_clip": 1.0035243, + "balance_loss_mlp": 1.00284743, + "epoch": 0.4113632947542462, + "flos": 62379593740800.0, + "grad_norm": 0.7580120656161936, + "language_loss": 0.59648132, + "learning_rate": 2.5501043431504683e-06, + "loss": 0.61665106, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.09521484, + "step": 6842, + "time_per_iteration": 3.1008148193359375 + }, + { + "auxiliary_loss_clip": 0.01068663, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.01871252, + "balance_loss_mlp": 1.02369237, + "epoch": 0.4114234180069142, + "flos": 13917740676480.0, + "grad_norm": 1.8287325055888215, + "language_loss": 0.69865274, + "learning_rate": 2.5497410838800337e-06, + "loss": 0.71966267, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44921875, + "step": 6843, + "time_per_iteration": 2.3353593349456787 + }, + { + "auxiliary_loss_clip": 0.01011783, + "auxiliary_loss_mlp": 0.01007415, + "balance_loss_clip": 1.00641966, + "balance_loss_mlp": 1.00212193, + "epoch": 0.41148354125958214, + "flos": 64925111635200.0, + "grad_norm": 0.7231191446166241, + "language_loss": 0.53649282, + "learning_rate": 2.5493778049907537e-06, + "loss": 0.55668479, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.09667969, + "step": 6844, + "time_per_iteration": 3.0286238193511963 + }, + { + "auxiliary_loss_clip": 0.01069234, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01722527, + "balance_loss_mlp": 1.02270865, + "epoch": 0.4115436645122501, + "flos": 18727934555520.0, + "grad_norm": 1.9775248227070374, + "language_loss": 0.65918547, + "learning_rate": 2.549014506495594e-06, + "loss": 0.6801815, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46484375, + "step": 6845, + "time_per_iteration": 2.3410146236419678 + }, + { + "auxiliary_loss_clip": 0.01068917, + "auxiliary_loss_mlp": 0.01026672, + "balance_loss_clip": 1.01334405, + "balance_loss_mlp": 1.02350223, + "epoch": 0.41160378776491807, + "flos": 16251998734080.0, + "grad_norm": 1.9355034246388638, + "language_loss": 0.70224738, + "learning_rate": 2.5486511884075184e-06, + "loss": 0.72320324, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.453125, + "step": 6846, + "time_per_iteration": 2.343996286392212 + }, + { + "auxiliary_loss_clip": 0.01068888, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01241255, + "balance_loss_mlp": 1.02190018, + "epoch": 0.41166391101758604, + "flos": 27968640744960.0, + "grad_norm": 1.3450028987154108, + "language_loss": 0.64688534, + "learning_rate": 2.5482878507394924e-06, + "loss": 0.6678409, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47070312, + "step": 6847, + "time_per_iteration": 2.444783926010132 + }, + { + "auxiliary_loss_clip": 0.01070099, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.01937973, + "balance_loss_mlp": 1.02273917, + "epoch": 0.411724034270254, + "flos": 34129868572800.0, + "grad_norm": 1.4414618138540842, + "language_loss": 0.70569962, + "learning_rate": 2.547924493504484e-06, + "loss": 0.72673595, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47460938, + "step": 6848, + "time_per_iteration": 2.487396478652954 + }, + { + "auxiliary_loss_clip": 0.01011877, + "auxiliary_loss_mlp": 0.01000431, + "balance_loss_clip": 0.99916697, + "balance_loss_mlp": 1.00210762, + "epoch": 0.41178415752292197, + "flos": 67920100773120.0, + "grad_norm": 0.8911961049552051, + "language_loss": 0.5627166, + "learning_rate": 2.5475611167154595e-06, + "loss": 0.58283967, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.09765625, + "step": 6849, + "time_per_iteration": 3.1032485961914062 + }, + { + "auxiliary_loss_clip": 0.01071439, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.01685905, + "balance_loss_mlp": 1.02453279, + "epoch": 0.41184428077558993, + "flos": 34312499228160.0, + "grad_norm": 1.8787123760002953, + "language_loss": 0.75819302, + "learning_rate": 2.5471977203853874e-06, + "loss": 0.77922344, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47070312, + "step": 6850, + "time_per_iteration": 3.8857243061065674 + }, + { + "auxiliary_loss_clip": 0.01068778, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.0128839, + "balance_loss_mlp": 1.02322304, + "epoch": 0.41190440402825795, + "flos": 35442672716160.0, + "grad_norm": 1.5038294018365757, + "language_loss": 0.67830467, + "learning_rate": 2.5468343045272363e-06, + "loss": 0.69926298, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.45703125, + "step": 6851, + "time_per_iteration": 2.5105338096618652 + }, + { + "auxiliary_loss_clip": 0.01073882, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.01739216, + "balance_loss_mlp": 1.02394629, + "epoch": 0.4119645272809259, + "flos": 20848838094720.0, + "grad_norm": 3.8281248623780777, + "language_loss": 0.79207462, + "learning_rate": 2.546470869153975e-06, + "loss": 0.81314695, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.5, + "step": 6852, + "time_per_iteration": 2.3704447746276855 + }, + { + "auxiliary_loss_clip": 0.01071755, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.01627851, + "balance_loss_mlp": 1.02286184, + "epoch": 0.4120246505335939, + "flos": 27560855531520.0, + "grad_norm": 1.8388770280494082, + "language_loss": 0.77698433, + "learning_rate": 2.546107414278575e-06, + "loss": 0.7980274, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.49023438, + "step": 6853, + "time_per_iteration": 3.795747756958008 + }, + { + "auxiliary_loss_clip": 0.01074249, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.01435411, + "balance_loss_mlp": 1.02394128, + "epoch": 0.41208477378626185, + "flos": 37813938681600.0, + "grad_norm": 1.7022927992939405, + "language_loss": 0.6559599, + "learning_rate": 2.545743939914005e-06, + "loss": 0.67700815, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.50390625, + "step": 6854, + "time_per_iteration": 2.526571750640869 + }, + { + "auxiliary_loss_clip": 0.01069016, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.01628757, + "balance_loss_mlp": 1.0228883, + "epoch": 0.4121448970389298, + "flos": 23439637889280.0, + "grad_norm": 1.727478368259255, + "language_loss": 0.8348484, + "learning_rate": 2.5453804460732385e-06, + "loss": 0.85583568, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4609375, + "step": 6855, + "time_per_iteration": 2.384894371032715 + }, + { + "auxiliary_loss_clip": 0.01068311, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.01935518, + "balance_loss_mlp": 1.02248383, + "epoch": 0.4122050202915978, + "flos": 21324215433600.0, + "grad_norm": 1.6595783020814412, + "language_loss": 0.78943467, + "learning_rate": 2.5450169327692463e-06, + "loss": 0.81045401, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45898438, + "step": 6856, + "time_per_iteration": 3.771777629852295 + }, + { + "auxiliary_loss_clip": 0.01071575, + "auxiliary_loss_mlp": 0.01030036, + "balance_loss_clip": 1.01519394, + "balance_loss_mlp": 1.02389359, + "epoch": 0.41226514354426574, + "flos": 17305468231680.0, + "grad_norm": 2.2416487882310854, + "language_loss": 0.79722559, + "learning_rate": 2.5446534000150017e-06, + "loss": 0.81824172, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4765625, + "step": 6857, + "time_per_iteration": 2.3123586177825928 + }, + { + "auxiliary_loss_clip": 0.01072373, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.01749527, + "balance_loss_mlp": 1.02334023, + "epoch": 0.4123252667969337, + "flos": 17637910997760.0, + "grad_norm": 2.44548798878122, + "language_loss": 0.70619226, + "learning_rate": 2.5442898478234787e-06, + "loss": 0.72723985, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49023438, + "step": 6858, + "time_per_iteration": 3.71753191947937 + }, + { + "auxiliary_loss_clip": 0.0106866, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.01616871, + "balance_loss_mlp": 1.02245998, + "epoch": 0.4123853900496017, + "flos": 46423101553920.0, + "grad_norm": 1.663753114059785, + "language_loss": 0.59596658, + "learning_rate": 2.543926276207651e-06, + "loss": 0.61696661, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.4609375, + "step": 6859, + "time_per_iteration": 2.5819568634033203 + }, + { + "auxiliary_loss_clip": 0.01069241, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.01690066, + "balance_loss_mlp": 1.02198029, + "epoch": 0.41244551330226964, + "flos": 17674220678400.0, + "grad_norm": 2.3938394101490688, + "language_loss": 0.73232591, + "learning_rate": 2.543562685180494e-06, + "loss": 0.75333512, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47265625, + "step": 6860, + "time_per_iteration": 2.362062931060791 + }, + { + "auxiliary_loss_clip": 0.01068226, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.01447058, + "balance_loss_mlp": 1.02192235, + "epoch": 0.4125056365549376, + "flos": 18692846772480.0, + "grad_norm": 1.5815520376026873, + "language_loss": 0.74218154, + "learning_rate": 2.543199074754982e-06, + "loss": 0.76314366, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46289062, + "step": 6861, + "time_per_iteration": 2.3416614532470703 + }, + { + "auxiliary_loss_clip": 0.01068689, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.01396155, + "balance_loss_mlp": 1.02361131, + "epoch": 0.41256575980760557, + "flos": 17894313089280.0, + "grad_norm": 2.4410594830428742, + "language_loss": 0.79583317, + "learning_rate": 2.542835444944093e-06, + "loss": 0.81679285, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45117188, + "step": 6862, + "time_per_iteration": 2.347041606903076 + }, + { + "auxiliary_loss_clip": 0.01068642, + "auxiliary_loss_mlp": 0.01029536, + "balance_loss_clip": 1.01602972, + "balance_loss_mlp": 1.02265716, + "epoch": 0.41262588306027354, + "flos": 21980233480320.0, + "grad_norm": 1.677679154614065, + "language_loss": 0.76022917, + "learning_rate": 2.5424717957608034e-06, + "loss": 0.78121096, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45898438, + "step": 6863, + "time_per_iteration": 2.3615684509277344 + }, + { + "auxiliary_loss_clip": 0.01012979, + "auxiliary_loss_mlp": 0.01003684, + "balance_loss_clip": 1.00253367, + "balance_loss_mlp": 1.00320458, + "epoch": 0.41268600631294156, + "flos": 53347284656640.0, + "grad_norm": 0.6971947331299172, + "language_loss": 0.52666748, + "learning_rate": 2.5421081272180904e-06, + "loss": 0.54683411, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.09765625, + "step": 6864, + "time_per_iteration": 2.9822375774383545 + }, + { + "auxiliary_loss_clip": 0.01070952, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.01760113, + "balance_loss_mlp": 1.02184701, + "epoch": 0.4127461295656095, + "flos": 24384317990400.0, + "grad_norm": 1.7255229803240557, + "language_loss": 0.72592151, + "learning_rate": 2.541744439328933e-06, + "loss": 0.74695575, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4921875, + "step": 6865, + "time_per_iteration": 2.393799066543579 + }, + { + "auxiliary_loss_clip": 0.01012023, + "auxiliary_loss_mlp": 0.01000226, + "balance_loss_clip": 0.99904633, + "balance_loss_mlp": 1.00230956, + "epoch": 0.4128062528182775, + "flos": 71701928288640.0, + "grad_norm": 0.9227695354665939, + "language_loss": 0.66565537, + "learning_rate": 2.5413807321063097e-06, + "loss": 0.68577784, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.09716797, + "step": 6866, + "time_per_iteration": 2.9189491271972656 + }, + { + "auxiliary_loss_clip": 0.01067022, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.01614738, + "balance_loss_mlp": 1.02054, + "epoch": 0.41286637607094545, + "flos": 17848402784640.0, + "grad_norm": 2.010408554505244, + "language_loss": 0.80965149, + "learning_rate": 2.5410170055632016e-06, + "loss": 0.83062184, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 6867, + "time_per_iteration": 2.3265223503112793 + }, + { + "auxiliary_loss_clip": 0.010723, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.01968098, + "balance_loss_mlp": 1.02360642, + "epoch": 0.4129264993236134, + "flos": 25548566832000.0, + "grad_norm": 2.0721998316815817, + "language_loss": 0.77530539, + "learning_rate": 2.5406532597125873e-06, + "loss": 0.79637361, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.48828125, + "step": 6868, + "time_per_iteration": 2.3932623863220215 + }, + { + "auxiliary_loss_clip": 0.0101045, + "auxiliary_loss_mlp": 0.01002441, + "balance_loss_clip": 1.00149298, + "balance_loss_mlp": 1.00081182, + "epoch": 0.4129866225762814, + "flos": 65411732298240.0, + "grad_norm": 0.8435280446741441, + "language_loss": 0.57782769, + "learning_rate": 2.5402894945674492e-06, + "loss": 0.59795666, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.09667969, + "step": 6869, + "time_per_iteration": 2.792311906814575 + }, + { + "auxiliary_loss_clip": 0.01067729, + "auxiliary_loss_mlp": 0.01025933, + "balance_loss_clip": 1.01215792, + "balance_loss_mlp": 1.02173519, + "epoch": 0.41304674582894935, + "flos": 28875719445120.0, + "grad_norm": 1.7975514993647659, + "language_loss": 0.75488734, + "learning_rate": 2.539925710140769e-06, + "loss": 0.77582395, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.45898438, + "step": 6870, + "time_per_iteration": 2.4398751258850098 + }, + { + "auxiliary_loss_clip": 0.01069449, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.01914275, + "balance_loss_mlp": 1.02248693, + "epoch": 0.4131068690816173, + "flos": 22890908050560.0, + "grad_norm": 1.8127118759821814, + "language_loss": 0.82982624, + "learning_rate": 2.539561906445528e-06, + "loss": 0.85085857, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.47070312, + "step": 6871, + "time_per_iteration": 2.3807919025421143 + }, + { + "auxiliary_loss_clip": 0.01073742, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.01984346, + "balance_loss_mlp": 1.02515697, + "epoch": 0.4131669923342853, + "flos": 26064059189760.0, + "grad_norm": 2.0286335667093547, + "language_loss": 0.6846633, + "learning_rate": 2.5391980834947122e-06, + "loss": 0.70574164, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48632812, + "step": 6872, + "time_per_iteration": 2.4135138988494873 + }, + { + "auxiliary_loss_clip": 0.0107022, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.01573944, + "balance_loss_mlp": 1.02320158, + "epoch": 0.41322711558695324, + "flos": 19243566558720.0, + "grad_norm": 1.8967614193894282, + "language_loss": 0.75692809, + "learning_rate": 2.538834241301303e-06, + "loss": 0.77792156, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.47070312, + "step": 6873, + "time_per_iteration": 2.3587610721588135 + }, + { + "auxiliary_loss_clip": 0.01072848, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.01672339, + "balance_loss_mlp": 1.02409291, + "epoch": 0.4132872388396212, + "flos": 22673364168960.0, + "grad_norm": 1.7846737962064347, + "language_loss": 0.82785654, + "learning_rate": 2.5384703798782852e-06, + "loss": 0.84890449, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.48632812, + "step": 6874, + "time_per_iteration": 2.3966686725616455 + }, + { + "auxiliary_loss_clip": 0.01070741, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_clip": 1.01557314, + "balance_loss_mlp": 1.02292681, + "epoch": 0.4133473620922892, + "flos": 20149353538560.0, + "grad_norm": 2.155580930700582, + "language_loss": 0.71793103, + "learning_rate": 2.538106499238646e-06, + "loss": 0.73895025, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.47851562, + "step": 6875, + "time_per_iteration": 2.3536081314086914 + }, + { + "auxiliary_loss_clip": 0.01067786, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.01760316, + "balance_loss_mlp": 1.02386785, + "epoch": 0.41340748534495714, + "flos": 24241627797120.0, + "grad_norm": 1.6807572701389986, + "language_loss": 0.83115685, + "learning_rate": 2.537742599395369e-06, + "loss": 0.85214686, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43945312, + "step": 6876, + "time_per_iteration": 2.407388210296631 + }, + { + "auxiliary_loss_clip": 0.01072134, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.01393366, + "balance_loss_mlp": 1.02250552, + "epoch": 0.41346760859762516, + "flos": 14391302624640.0, + "grad_norm": 2.101781922256585, + "language_loss": 0.65916669, + "learning_rate": 2.5373786803614423e-06, + "loss": 0.68017972, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49609375, + "step": 6877, + "time_per_iteration": 2.341932773590088 + }, + { + "auxiliary_loss_clip": 0.01071445, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.01748204, + "balance_loss_mlp": 1.0243988, + "epoch": 0.4135277318502931, + "flos": 22490908070400.0, + "grad_norm": 1.8084752249565887, + "language_loss": 0.76241839, + "learning_rate": 2.5370147421498523e-06, + "loss": 0.78345102, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47070312, + "step": 6878, + "time_per_iteration": 2.3955705165863037 + }, + { + "auxiliary_loss_clip": 0.01069272, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.01486921, + "balance_loss_mlp": 1.02298582, + "epoch": 0.4135878551029611, + "flos": 22417660304640.0, + "grad_norm": 2.108621196741715, + "language_loss": 0.80344343, + "learning_rate": 2.536650784773588e-06, + "loss": 0.82442212, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 6879, + "time_per_iteration": 2.371194362640381 + }, + { + "auxiliary_loss_clip": 0.01067736, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.01577842, + "balance_loss_mlp": 1.02167034, + "epoch": 0.41364797835562905, + "flos": 27051996332160.0, + "grad_norm": 2.1988095798575236, + "language_loss": 0.84831059, + "learning_rate": 2.5362868082456376e-06, + "loss": 0.86928862, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4609375, + "step": 6880, + "time_per_iteration": 2.423956871032715 + }, + { + "auxiliary_loss_clip": 0.01070997, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.01410687, + "balance_loss_mlp": 1.02231896, + "epoch": 0.413708101608297, + "flos": 22966459966080.0, + "grad_norm": 1.6110607262116827, + "language_loss": 0.7045061, + "learning_rate": 2.53592281257899e-06, + "loss": 0.72550005, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48828125, + "step": 6881, + "time_per_iteration": 2.3745036125183105 + }, + { + "auxiliary_loss_clip": 0.01069749, + "auxiliary_loss_mlp": 0.01027963, + "balance_loss_clip": 1.01460564, + "balance_loss_mlp": 1.02367949, + "epoch": 0.413768224860965, + "flos": 13333155004800.0, + "grad_norm": 1.882246247338197, + "language_loss": 0.81391317, + "learning_rate": 2.5355587977866364e-06, + "loss": 0.83489031, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4609375, + "step": 6882, + "time_per_iteration": 2.357234477996826 + }, + { + "auxiliary_loss_clip": 0.0107311, + "auxiliary_loss_mlp": 0.01032803, + "balance_loss_clip": 1.01653051, + "balance_loss_mlp": 1.02473712, + "epoch": 0.41382834811363295, + "flos": 20812912439040.0, + "grad_norm": 1.9969192215909495, + "language_loss": 0.69726086, + "learning_rate": 2.5351947638815665e-06, + "loss": 0.71832001, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.48242188, + "step": 6883, + "time_per_iteration": 2.361564874649048 + }, + { + "auxiliary_loss_clip": 0.01068501, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.01835752, + "balance_loss_mlp": 1.02198982, + "epoch": 0.4138884713663009, + "flos": 20666102705280.0, + "grad_norm": 1.7736629574126175, + "language_loss": 0.7536509, + "learning_rate": 2.5348307108767724e-06, + "loss": 0.77465385, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46484375, + "step": 6884, + "time_per_iteration": 2.3790128231048584 + }, + { + "auxiliary_loss_clip": 0.01070793, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.01886511, + "balance_loss_mlp": 1.0231998, + "epoch": 0.4139485946189689, + "flos": 29055417546240.0, + "grad_norm": 1.5348164143346925, + "language_loss": 0.7566309, + "learning_rate": 2.534466638785246e-06, + "loss": 0.77767235, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4765625, + "step": 6885, + "time_per_iteration": 2.4264075756073 + }, + { + "auxiliary_loss_clip": 0.01069107, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.01414847, + "balance_loss_mlp": 1.02134645, + "epoch": 0.41400871787163684, + "flos": 24424572654720.0, + "grad_norm": 2.1886749906289764, + "language_loss": 0.69634664, + "learning_rate": 2.5341025476199795e-06, + "loss": 0.71732134, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4765625, + "step": 6886, + "time_per_iteration": 2.3978052139282227 + }, + { + "auxiliary_loss_clip": 0.01069323, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.01777518, + "balance_loss_mlp": 1.02160656, + "epoch": 0.4140688411243048, + "flos": 19463030565120.0, + "grad_norm": 1.4882578942278777, + "language_loss": 0.75442976, + "learning_rate": 2.5337384373939677e-06, + "loss": 0.77544308, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4765625, + "step": 6887, + "time_per_iteration": 2.395981788635254 + }, + { + "auxiliary_loss_clip": 0.01068928, + "auxiliary_loss_mlp": 0.01025702, + "balance_loss_clip": 1.01175499, + "balance_loss_mlp": 1.02234733, + "epoch": 0.4141289643769728, + "flos": 19312764606720.0, + "grad_norm": 1.849879062463816, + "language_loss": 0.70081753, + "learning_rate": 2.5333743081202034e-06, + "loss": 0.72176385, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46484375, + "step": 6888, + "time_per_iteration": 3.8609752655029297 + }, + { + "auxiliary_loss_clip": 0.0106752, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.0144937, + "balance_loss_mlp": 1.02146375, + "epoch": 0.41418908762964074, + "flos": 16725979618560.0, + "grad_norm": 1.871361834688514, + "language_loss": 0.76135111, + "learning_rate": 2.5330101598116823e-06, + "loss": 0.78231168, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4609375, + "step": 6889, + "time_per_iteration": 2.337906837463379 + }, + { + "auxiliary_loss_clip": 0.0106871, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.0140568, + "balance_loss_mlp": 1.02305377, + "epoch": 0.41424921088230876, + "flos": 25295795521920.0, + "grad_norm": 1.6240062384273304, + "language_loss": 0.80121958, + "learning_rate": 2.5326459924814007e-06, + "loss": 0.82218856, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.45507812, + "step": 6890, + "time_per_iteration": 2.416794538497925 + }, + { + "auxiliary_loss_clip": 0.0106816, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.01472902, + "balance_loss_mlp": 1.02259409, + "epoch": 0.4143093341349767, + "flos": 20959442881920.0, + "grad_norm": 1.593745837010182, + "language_loss": 0.74089086, + "learning_rate": 2.532281806142352e-06, + "loss": 0.76186299, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45703125, + "step": 6891, + "time_per_iteration": 2.3640666007995605 + }, + { + "auxiliary_loss_clip": 0.01071724, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.0190866, + "balance_loss_mlp": 1.02335048, + "epoch": 0.4143694573876447, + "flos": 22016612983680.0, + "grad_norm": 2.1038990597922473, + "language_loss": 0.84374863, + "learning_rate": 2.531917600807536e-06, + "loss": 0.86480534, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.484375, + "step": 6892, + "time_per_iteration": 3.7833666801452637 + }, + { + "auxiliary_loss_clip": 0.01067902, + "auxiliary_loss_mlp": 0.01025515, + "balance_loss_clip": 1.01187134, + "balance_loss_mlp": 1.02278852, + "epoch": 0.41442958064031266, + "flos": 35696002608000.0, + "grad_norm": 1.7225767092629618, + "language_loss": 0.70555174, + "learning_rate": 2.5315533764899487e-06, + "loss": 0.72648591, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45117188, + "step": 6893, + "time_per_iteration": 2.502600908279419 + }, + { + "auxiliary_loss_clip": 0.01070734, + "auxiliary_loss_mlp": 0.01027342, + "balance_loss_clip": 1.0139128, + "balance_loss_mlp": 1.02385855, + "epoch": 0.4144897038929806, + "flos": 28292495316480.0, + "grad_norm": 1.4387975161548954, + "language_loss": 0.72929025, + "learning_rate": 2.5311891332025886e-06, + "loss": 0.75027096, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46875, + "step": 6894, + "time_per_iteration": 2.4353291988372803 + }, + { + "auxiliary_loss_clip": 0.01068955, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.01437557, + "balance_loss_mlp": 1.02234054, + "epoch": 0.4145498271456486, + "flos": 11647513785600.0, + "grad_norm": 2.200737693144487, + "language_loss": 0.6248489, + "learning_rate": 2.530824870958455e-06, + "loss": 0.64581758, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46679688, + "step": 6895, + "time_per_iteration": 3.739891767501831 + }, + { + "auxiliary_loss_clip": 0.01067839, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.01764214, + "balance_loss_mlp": 1.0219276, + "epoch": 0.41460995039831655, + "flos": 27234382608000.0, + "grad_norm": 1.375875337276067, + "language_loss": 0.70392108, + "learning_rate": 2.5304605897705465e-06, + "loss": 0.72491264, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45898438, + "step": 6896, + "time_per_iteration": 2.422586441040039 + }, + { + "auxiliary_loss_clip": 0.0106858, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.01635122, + "balance_loss_mlp": 1.02134109, + "epoch": 0.4146700736509845, + "flos": 25921159528320.0, + "grad_norm": 2.2045435743424235, + "language_loss": 0.73269612, + "learning_rate": 2.530096289651865e-06, + "loss": 0.7536884, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47265625, + "step": 6897, + "time_per_iteration": 2.412428140640259 + }, + { + "auxiliary_loss_clip": 0.01067971, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.01852083, + "balance_loss_mlp": 1.02290523, + "epoch": 0.4147301969036525, + "flos": 26832043566720.0, + "grad_norm": 1.5841233921464402, + "language_loss": 0.84477496, + "learning_rate": 2.5297319706154095e-06, + "loss": 0.86577326, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45117188, + "step": 6898, + "time_per_iteration": 3.789635181427002 + }, + { + "auxiliary_loss_clip": 0.01068277, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.01627636, + "balance_loss_mlp": 1.02116716, + "epoch": 0.41479032015632045, + "flos": 20297385169920.0, + "grad_norm": 1.688137598646365, + "language_loss": 0.74455142, + "learning_rate": 2.5293676326741838e-06, + "loss": 0.76553875, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47265625, + "step": 6899, + "time_per_iteration": 2.3669254779815674 + }, + { + "auxiliary_loss_clip": 0.01072105, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01627135, + "balance_loss_mlp": 1.02260315, + "epoch": 0.4148504434089884, + "flos": 25263814849920.0, + "grad_norm": 2.1393141555323534, + "language_loss": 0.75716382, + "learning_rate": 2.529003275841188e-06, + "loss": 0.77820528, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.49609375, + "step": 6900, + "time_per_iteration": 2.401484727859497 + }, + { + "auxiliary_loss_clip": 0.01068228, + "auxiliary_loss_mlp": 0.01029209, + "balance_loss_clip": 1.0146594, + "balance_loss_mlp": 1.021842, + "epoch": 0.4149105666616564, + "flos": 12821502896640.0, + "grad_norm": 2.1990332437089957, + "language_loss": 0.80243123, + "learning_rate": 2.5286389001294265e-06, + "loss": 0.82340556, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46484375, + "step": 6901, + "time_per_iteration": 2.361698865890503 + }, + { + "auxiliary_loss_clip": 0.01067012, + "auxiliary_loss_mlp": 0.01027138, + "balance_loss_clip": 1.01345873, + "balance_loss_mlp": 1.02166414, + "epoch": 0.41497068991432434, + "flos": 16762952615040.0, + "grad_norm": 1.6721896802291731, + "language_loss": 0.69527543, + "learning_rate": 2.5282745055519027e-06, + "loss": 0.71621692, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.453125, + "step": 6902, + "time_per_iteration": 2.3608453273773193 + }, + { + "auxiliary_loss_clip": 0.01067081, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.0193193, + "balance_loss_mlp": 1.02184772, + "epoch": 0.4150308131669923, + "flos": 18000030286080.0, + "grad_norm": 1.5659887209480605, + "language_loss": 0.83665156, + "learning_rate": 2.5279100921216197e-06, + "loss": 0.85765129, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.453125, + "step": 6903, + "time_per_iteration": 2.3410301208496094 + }, + { + "auxiliary_loss_clip": 0.01073396, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.01587343, + "balance_loss_mlp": 1.02333283, + "epoch": 0.41509093641966033, + "flos": 30043459422720.0, + "grad_norm": 2.044817019834815, + "language_loss": 0.68054205, + "learning_rate": 2.5275456598515846e-06, + "loss": 0.70159924, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.5, + "step": 6904, + "time_per_iteration": 2.4396564960479736 + }, + { + "auxiliary_loss_clip": 0.01071128, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.01346135, + "balance_loss_mlp": 1.02358174, + "epoch": 0.4151510596723283, + "flos": 24278845173120.0, + "grad_norm": 2.5272537761682776, + "language_loss": 0.77553606, + "learning_rate": 2.5271812087548014e-06, + "loss": 0.79653335, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.47460938, + "step": 6905, + "time_per_iteration": 2.401001453399658 + }, + { + "auxiliary_loss_clip": 0.010693, + "auxiliary_loss_mlp": 0.01024531, + "balance_loss_clip": 1.01028585, + "balance_loss_mlp": 1.02263117, + "epoch": 0.41521118292499626, + "flos": 23475109697280.0, + "grad_norm": 1.5898709947524627, + "language_loss": 0.68472552, + "learning_rate": 2.526816738844277e-06, + "loss": 0.7056638, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46875, + "step": 6906, + "time_per_iteration": 2.3974807262420654 + }, + { + "auxiliary_loss_clip": 0.01066409, + "auxiliary_loss_mlp": 0.01023359, + "balance_loss_clip": 1.01015699, + "balance_loss_mlp": 1.02123809, + "epoch": 0.4152713061776642, + "flos": 22124459773440.0, + "grad_norm": 2.275018169416303, + "language_loss": 0.67392218, + "learning_rate": 2.5264522501330186e-06, + "loss": 0.69481981, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.453125, + "step": 6907, + "time_per_iteration": 2.383169412612915 + }, + { + "auxiliary_loss_clip": 0.01069759, + "auxiliary_loss_mlp": 0.01027465, + "balance_loss_clip": 1.01426852, + "balance_loss_mlp": 1.02312016, + "epoch": 0.4153314294303322, + "flos": 21250339263360.0, + "grad_norm": 1.8036987811334955, + "language_loss": 0.77010524, + "learning_rate": 2.5260877426340326e-06, + "loss": 0.79107749, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.46679688, + "step": 6908, + "time_per_iteration": 2.382509231567383 + }, + { + "auxiliary_loss_clip": 0.0101166, + "auxiliary_loss_mlp": 0.0100172, + "balance_loss_clip": 1.00078464, + "balance_loss_mlp": 1.00254261, + "epoch": 0.41539155268300015, + "flos": 57917554606080.0, + "grad_norm": 0.8822804973238411, + "language_loss": 0.64858544, + "learning_rate": 2.5257232163603297e-06, + "loss": 0.66871929, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.09082031, + "step": 6909, + "time_per_iteration": 2.988574981689453 + }, + { + "auxiliary_loss_clip": 0.01069702, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.01381195, + "balance_loss_mlp": 1.02303529, + "epoch": 0.4154516759356681, + "flos": 21902726528640.0, + "grad_norm": 1.44546443999055, + "language_loss": 0.86864883, + "learning_rate": 2.5253586713249164e-06, + "loss": 0.88962978, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46679688, + "step": 6910, + "time_per_iteration": 2.3863494396209717 + }, + { + "auxiliary_loss_clip": 0.01011207, + "auxiliary_loss_mlp": 0.00999742, + "balance_loss_clip": 0.99875849, + "balance_loss_mlp": 1.00199461, + "epoch": 0.4155117991883361, + "flos": 67830584313600.0, + "grad_norm": 0.8156419659066633, + "language_loss": 0.62087607, + "learning_rate": 2.524994107540804e-06, + "loss": 0.64098549, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.09179688, + "step": 6911, + "time_per_iteration": 3.0026540756225586 + }, + { + "auxiliary_loss_clip": 0.01073823, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.01537216, + "balance_loss_mlp": 1.02399874, + "epoch": 0.41557192244100405, + "flos": 14281815000960.0, + "grad_norm": 4.537509830699755, + "language_loss": 0.87886655, + "learning_rate": 2.5246295250210024e-06, + "loss": 0.89991164, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.49804688, + "step": 6912, + "time_per_iteration": 2.3631138801574707 + }, + { + "auxiliary_loss_clip": 0.01069571, + "auxiliary_loss_mlp": 0.01026466, + "balance_loss_clip": 1.01358509, + "balance_loss_mlp": 1.02397084, + "epoch": 0.415632045693672, + "flos": 24460812512640.0, + "grad_norm": 2.204223300052811, + "language_loss": 0.62790143, + "learning_rate": 2.5242649237785224e-06, + "loss": 0.64886177, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45703125, + "step": 6913, + "time_per_iteration": 2.397660255432129 + }, + { + "auxiliary_loss_clip": 0.01067976, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.01392365, + "balance_loss_mlp": 1.02311456, + "epoch": 0.41569216894634, + "flos": 20114405400960.0, + "grad_norm": 1.8370935021328485, + "language_loss": 0.75378853, + "learning_rate": 2.5239003038263764e-06, + "loss": 0.77473587, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44921875, + "step": 6914, + "time_per_iteration": 2.386561393737793 + }, + { + "auxiliary_loss_clip": 0.01070482, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.01814067, + "balance_loss_mlp": 1.02228892, + "epoch": 0.41575229219900794, + "flos": 23797882016640.0, + "grad_norm": 1.705434970635847, + "language_loss": 0.87367904, + "learning_rate": 2.523535665177575e-06, + "loss": 0.89471698, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.48242188, + "step": 6915, + "time_per_iteration": 2.3936846256256104 + }, + { + "auxiliary_loss_clip": 0.01070788, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01637614, + "balance_loss_mlp": 1.02343774, + "epoch": 0.4158124154516759, + "flos": 23107230034560.0, + "grad_norm": 2.29093687766695, + "language_loss": 0.71647096, + "learning_rate": 2.5231710078451333e-06, + "loss": 0.73747927, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47265625, + "step": 6916, + "time_per_iteration": 2.3700342178344727 + }, + { + "auxiliary_loss_clip": 0.01072428, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.01535654, + "balance_loss_mlp": 1.0233041, + "epoch": 0.41587253870434393, + "flos": 24241837265280.0, + "grad_norm": 1.3678364360375688, + "language_loss": 0.71828145, + "learning_rate": 2.522806331842064e-06, + "loss": 0.7393111, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.49023438, + "step": 6917, + "time_per_iteration": 2.4047839641571045 + }, + { + "auxiliary_loss_clip": 0.01069933, + "auxiliary_loss_mlp": 0.01025933, + "balance_loss_clip": 1.0122478, + "balance_loss_mlp": 1.02257729, + "epoch": 0.4159326619570119, + "flos": 23880381292800.0, + "grad_norm": 1.5283939853373432, + "language_loss": 0.81073976, + "learning_rate": 2.522441637181381e-06, + "loss": 0.83169842, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47460938, + "step": 6918, + "time_per_iteration": 2.402900218963623 + }, + { + "auxiliary_loss_clip": 0.01072815, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.013906, + "balance_loss_mlp": 1.02446496, + "epoch": 0.41599278520967986, + "flos": 40624900709760.0, + "grad_norm": 1.6384704178836476, + "language_loss": 0.70108664, + "learning_rate": 2.5220769238761008e-06, + "loss": 0.72210085, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.484375, + "step": 6919, + "time_per_iteration": 2.5616559982299805 + }, + { + "auxiliary_loss_clip": 0.01073757, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01608777, + "balance_loss_mlp": 1.02468348, + "epoch": 0.4160529084623478, + "flos": 18221972999040.0, + "grad_norm": 1.9634989473163877, + "language_loss": 0.81060797, + "learning_rate": 2.5217121919392378e-06, + "loss": 0.83166111, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.49023438, + "step": 6920, + "time_per_iteration": 2.3551957607269287 + }, + { + "auxiliary_loss_clip": 0.01073213, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01538873, + "balance_loss_mlp": 1.02393925, + "epoch": 0.4161130317150158, + "flos": 13661129116800.0, + "grad_norm": 2.7474691649606067, + "language_loss": 0.65050244, + "learning_rate": 2.521347441383808e-06, + "loss": 0.67154646, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.4921875, + "step": 6921, + "time_per_iteration": 2.346557140350342 + }, + { + "auxiliary_loss_clip": 0.01071451, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.0182091, + "balance_loss_mlp": 1.02330613, + "epoch": 0.41617315496768376, + "flos": 16177633804800.0, + "grad_norm": 2.030024822624344, + "language_loss": 0.8066771, + "learning_rate": 2.5209826722228293e-06, + "loss": 0.82771194, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48046875, + "step": 6922, + "time_per_iteration": 2.33309006690979 + }, + { + "auxiliary_loss_clip": 0.01070447, + "auxiliary_loss_mlp": 0.01026626, + "balance_loss_clip": 1.01255322, + "balance_loss_mlp": 1.02283621, + "epoch": 0.4162332782203517, + "flos": 26212125732480.0, + "grad_norm": 1.6145357130431925, + "language_loss": 0.67709816, + "learning_rate": 2.5206178844693195e-06, + "loss": 0.69806886, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4765625, + "step": 6923, + "time_per_iteration": 2.4180209636688232 + }, + { + "auxiliary_loss_clip": 0.01072083, + "auxiliary_loss_mlp": 0.01031601, + "balance_loss_clip": 1.01634216, + "balance_loss_mlp": 1.02357793, + "epoch": 0.4162934014730197, + "flos": 28182728401920.0, + "grad_norm": 1.7328529094657736, + "language_loss": 0.75655949, + "learning_rate": 2.5202530781362966e-06, + "loss": 0.77759629, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.484375, + "step": 6924, + "time_per_iteration": 2.429680347442627 + }, + { + "auxiliary_loss_clip": 0.01070482, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.01693177, + "balance_loss_mlp": 1.022331, + "epoch": 0.41635352472568765, + "flos": 19864287354240.0, + "grad_norm": 1.8677185955646807, + "language_loss": 0.7321403, + "learning_rate": 2.51988825323678e-06, + "loss": 0.75315666, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48046875, + "step": 6925, + "time_per_iteration": 2.369055986404419 + }, + { + "auxiliary_loss_clip": 0.01067116, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.01334262, + "balance_loss_mlp": 1.02082062, + "epoch": 0.4164136479783556, + "flos": 14934586291200.0, + "grad_norm": 1.9042066174062526, + "language_loss": 0.83785653, + "learning_rate": 2.5195234097837883e-06, + "loss": 0.85879385, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46289062, + "step": 6926, + "time_per_iteration": 2.341604471206665 + }, + { + "auxiliary_loss_clip": 0.0106783, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.01337743, + "balance_loss_mlp": 1.02353275, + "epoch": 0.4164737712310236, + "flos": 21756649933440.0, + "grad_norm": 2.6172876657908484, + "language_loss": 0.86816251, + "learning_rate": 2.5191585477903423e-06, + "loss": 0.88910294, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44335938, + "step": 6927, + "time_per_iteration": 2.377873420715332 + }, + { + "auxiliary_loss_clip": 0.01069476, + "auxiliary_loss_mlp": 0.01028029, + "balance_loss_clip": 1.01369965, + "balance_loss_mlp": 1.02179217, + "epoch": 0.41653389448369155, + "flos": 20739106091520.0, + "grad_norm": 2.0507787608344583, + "language_loss": 0.72090262, + "learning_rate": 2.5187936672694636e-06, + "loss": 0.74187768, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4765625, + "step": 6928, + "time_per_iteration": 3.8046388626098633 + }, + { + "auxiliary_loss_clip": 0.01067177, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.01402712, + "balance_loss_mlp": 1.02116847, + "epoch": 0.4165940177363595, + "flos": 24971731482240.0, + "grad_norm": 2.795764903946246, + "language_loss": 0.78340334, + "learning_rate": 2.5184287682341733e-06, + "loss": 0.80435294, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4609375, + "step": 6929, + "time_per_iteration": 2.39945912361145 + }, + { + "auxiliary_loss_clip": 0.01069639, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.01525974, + "balance_loss_mlp": 1.02248168, + "epoch": 0.41665414098902753, + "flos": 20520689425920.0, + "grad_norm": 1.682548214023034, + "language_loss": 0.71017277, + "learning_rate": 2.5180638506974935e-06, + "loss": 0.7311576, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.47265625, + "step": 6930, + "time_per_iteration": 2.352583408355713 + }, + { + "auxiliary_loss_clip": 0.01067792, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.01366365, + "balance_loss_mlp": 1.02065015, + "epoch": 0.4167142642416955, + "flos": 19681901078400.0, + "grad_norm": 1.68155508883522, + "language_loss": 0.81029385, + "learning_rate": 2.517698914672448e-06, + "loss": 0.83124602, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47265625, + "step": 6931, + "time_per_iteration": 2.39155650138855 + }, + { + "auxiliary_loss_clip": 0.01066484, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.01448131, + "balance_loss_mlp": 1.02020812, + "epoch": 0.41677438749436346, + "flos": 23762759322240.0, + "grad_norm": 1.9957844666241185, + "language_loss": 0.78916872, + "learning_rate": 2.5173339601720595e-06, + "loss": 0.81012237, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46289062, + "step": 6932, + "time_per_iteration": 3.8012776374816895 + }, + { + "auxiliary_loss_clip": 0.01067099, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.0151093, + "balance_loss_mlp": 1.02106476, + "epoch": 0.41683451074703143, + "flos": 30408720733440.0, + "grad_norm": 2.591031638802521, + "language_loss": 0.81475532, + "learning_rate": 2.516968987209353e-06, + "loss": 0.83572274, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4609375, + "step": 6933, + "time_per_iteration": 2.436124086380005 + }, + { + "auxiliary_loss_clip": 0.01070102, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.01567888, + "balance_loss_mlp": 1.02202344, + "epoch": 0.4168946339996994, + "flos": 21505694014080.0, + "grad_norm": 2.5647937133311434, + "language_loss": 0.77257097, + "learning_rate": 2.516603995797353e-06, + "loss": 0.79358262, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.48046875, + "step": 6934, + "time_per_iteration": 2.3651375770568848 + }, + { + "auxiliary_loss_clip": 0.01070758, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.01504087, + "balance_loss_mlp": 1.02326035, + "epoch": 0.41695475725236736, + "flos": 17637736440960.0, + "grad_norm": 1.6959229695960676, + "language_loss": 0.66897106, + "learning_rate": 2.5162389859490856e-06, + "loss": 0.68997133, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4765625, + "step": 6935, + "time_per_iteration": 3.798678398132324 + }, + { + "auxiliary_loss_clip": 0.0106984, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.01690841, + "balance_loss_mlp": 1.02212238, + "epoch": 0.4170148805050353, + "flos": 15668006555520.0, + "grad_norm": 3.6165368659135253, + "language_loss": 0.73397833, + "learning_rate": 2.5158739576775766e-06, + "loss": 0.75499249, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4765625, + "step": 6936, + "time_per_iteration": 2.3512656688690186 + }, + { + "auxiliary_loss_clip": 0.01070453, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.01411152, + "balance_loss_mlp": 1.02267337, + "epoch": 0.4170750037577033, + "flos": 15158239660800.0, + "grad_norm": 1.786209863619034, + "language_loss": 0.74137294, + "learning_rate": 2.5155089109958526e-06, + "loss": 0.76235348, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.47851562, + "step": 6937, + "time_per_iteration": 2.3711459636688232 + }, + { + "auxiliary_loss_clip": 0.01066036, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.01795101, + "balance_loss_mlp": 1.02076495, + "epoch": 0.41713512701037125, + "flos": 28766999871360.0, + "grad_norm": 1.4583295292483844, + "language_loss": 0.84297037, + "learning_rate": 2.5151438459169424e-06, + "loss": 0.8639468, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.453125, + "step": 6938, + "time_per_iteration": 3.834561824798584 + }, + { + "auxiliary_loss_clip": 0.01069252, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.01309156, + "balance_loss_mlp": 1.02149081, + "epoch": 0.4171952502630392, + "flos": 28255731788160.0, + "grad_norm": 2.0600361200989292, + "language_loss": 0.74061006, + "learning_rate": 2.514778762453873e-06, + "loss": 0.76157922, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.47851562, + "step": 6939, + "time_per_iteration": 2.419184923171997 + }, + { + "auxiliary_loss_clip": 0.01070857, + "auxiliary_loss_mlp": 0.01029762, + "balance_loss_clip": 1.01560557, + "balance_loss_mlp": 1.02300465, + "epoch": 0.4172553735157072, + "flos": 24570544515840.0, + "grad_norm": 1.6836038351724936, + "language_loss": 0.7469027, + "learning_rate": 2.5144136606196732e-06, + "loss": 0.76790881, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47851562, + "step": 6940, + "time_per_iteration": 2.400230646133423 + }, + { + "auxiliary_loss_clip": 0.01067506, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01681209, + "balance_loss_mlp": 1.02177453, + "epoch": 0.41731549676837515, + "flos": 27044769680640.0, + "grad_norm": 1.7755436853065534, + "language_loss": 0.71399987, + "learning_rate": 2.5140485404273737e-06, + "loss": 0.73497683, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45703125, + "step": 6941, + "time_per_iteration": 2.40478253364563 + }, + { + "auxiliary_loss_clip": 0.01071416, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.02314699, + "epoch": 0.4173756200210431, + "flos": 28393045632000.0, + "grad_norm": 2.2607128972403117, + "language_loss": 0.73729891, + "learning_rate": 2.5136834018900038e-06, + "loss": 0.75836056, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48242188, + "step": 6942, + "time_per_iteration": 2.415416717529297 + }, + { + "auxiliary_loss_clip": 0.01073552, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.01955891, + "balance_loss_mlp": 1.02341151, + "epoch": 0.41743574327371114, + "flos": 22520654415360.0, + "grad_norm": 3.235912242183001, + "language_loss": 0.72655326, + "learning_rate": 2.513318245020595e-06, + "loss": 0.74763858, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.5, + "step": 6943, + "time_per_iteration": 2.3561487197875977 + }, + { + "auxiliary_loss_clip": 0.01070393, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01549864, + "balance_loss_mlp": 1.02205813, + "epoch": 0.4174958665263791, + "flos": 30112238534400.0, + "grad_norm": 1.555266867180129, + "language_loss": 0.63699955, + "learning_rate": 2.5129530698321775e-06, + "loss": 0.65800071, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.484375, + "step": 6944, + "time_per_iteration": 2.436521291732788 + }, + { + "auxiliary_loss_clip": 0.01070711, + "auxiliary_loss_mlp": 0.0102798, + "balance_loss_clip": 1.01464677, + "balance_loss_mlp": 1.02313888, + "epoch": 0.41755598977904707, + "flos": 25262313661440.0, + "grad_norm": 1.4259272096798161, + "language_loss": 0.70192814, + "learning_rate": 2.5125878763377857e-06, + "loss": 0.72291505, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4765625, + "step": 6945, + "time_per_iteration": 2.4245893955230713 + }, + { + "auxiliary_loss_clip": 0.01067873, + "auxiliary_loss_mlp": 0.01033137, + "balance_loss_clip": 1.01792622, + "balance_loss_mlp": 1.02115893, + "epoch": 0.41761611303171503, + "flos": 19827558737280.0, + "grad_norm": 1.5616174183416704, + "language_loss": 0.80878049, + "learning_rate": 2.5122226645504506e-06, + "loss": 0.82979059, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.46679688, + "step": 6946, + "time_per_iteration": 2.3573198318481445 + }, + { + "auxiliary_loss_clip": 0.01068385, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.01203132, + "balance_loss_mlp": 1.02154446, + "epoch": 0.417676236284383, + "flos": 15522348896640.0, + "grad_norm": 2.111054605953402, + "language_loss": 0.77842045, + "learning_rate": 2.5118574344832056e-06, + "loss": 0.79936188, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46875, + "step": 6947, + "time_per_iteration": 2.3477113246917725 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.01629639, + "balance_loss_mlp": 1.02225077, + "epoch": 0.41773635953705096, + "flos": 32523130759680.0, + "grad_norm": 1.4925485093919824, + "language_loss": 0.71666503, + "learning_rate": 2.5114921861490865e-06, + "loss": 0.73766804, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4765625, + "step": 6948, + "time_per_iteration": 2.469848155975342 + }, + { + "auxiliary_loss_clip": 0.01066155, + "auxiliary_loss_mlp": 0.01025771, + "balance_loss_clip": 1.01199007, + "balance_loss_mlp": 1.02069831, + "epoch": 0.4177964827897189, + "flos": 23439812446080.0, + "grad_norm": 1.4428936479056789, + "language_loss": 0.76838762, + "learning_rate": 2.511126919561126e-06, + "loss": 0.78930688, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.45507812, + "step": 6949, + "time_per_iteration": 2.3792507648468018 + }, + { + "auxiliary_loss_clip": 0.01072143, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.0160352, + "balance_loss_mlp": 1.02313447, + "epoch": 0.4178566060423869, + "flos": 22747764009600.0, + "grad_norm": 1.6489411351506302, + "language_loss": 0.81967115, + "learning_rate": 2.5107616347323617e-06, + "loss": 0.84069639, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.49023438, + "step": 6950, + "time_per_iteration": 2.3954927921295166 + }, + { + "auxiliary_loss_clip": 0.0106571, + "auxiliary_loss_mlp": 0.01024351, + "balance_loss_clip": 1.01142228, + "balance_loss_mlp": 1.02163815, + "epoch": 0.41791672929505486, + "flos": 26031554847360.0, + "grad_norm": 2.1076168308391305, + "language_loss": 0.7585417, + "learning_rate": 2.510396331675828e-06, + "loss": 0.77944231, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43945312, + "step": 6951, + "time_per_iteration": 2.423574686050415 + }, + { + "auxiliary_loss_clip": 0.01071439, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01261663, + "balance_loss_mlp": 1.02237225, + "epoch": 0.4179768525477228, + "flos": 19567805155200.0, + "grad_norm": 1.7928283678614756, + "language_loss": 0.73144007, + "learning_rate": 2.5100310104045613e-06, + "loss": 0.75243092, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48828125, + "step": 6952, + "time_per_iteration": 2.3543052673339844 + }, + { + "auxiliary_loss_clip": 0.01068966, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.019346, + "balance_loss_mlp": 1.02228987, + "epoch": 0.4180369758003908, + "flos": 17782905340800.0, + "grad_norm": 2.2384751517877186, + "language_loss": 0.71743965, + "learning_rate": 2.5096656709316008e-06, + "loss": 0.738455, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46679688, + "step": 6953, + "time_per_iteration": 2.3334155082702637 + }, + { + "auxiliary_loss_clip": 0.01068545, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.014678, + "balance_loss_mlp": 1.02271414, + "epoch": 0.41809709905305875, + "flos": 20959582527360.0, + "grad_norm": 2.583621919595002, + "language_loss": 0.76306111, + "learning_rate": 2.509300313269983e-06, + "loss": 0.78404534, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.45898438, + "step": 6954, + "time_per_iteration": 2.4177088737487793 + }, + { + "auxiliary_loss_clip": 0.01070716, + "auxiliary_loss_mlp": 0.01036332, + "balance_loss_clip": 1.02056074, + "balance_loss_mlp": 1.02198839, + "epoch": 0.4181572223057267, + "flos": 22016543160960.0, + "grad_norm": 2.2586516458305734, + "language_loss": 0.84701991, + "learning_rate": 2.5089349374327472e-06, + "loss": 0.86809045, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.48828125, + "step": 6955, + "time_per_iteration": 2.358339309692383 + }, + { + "auxiliary_loss_clip": 0.01068752, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.01660061, + "balance_loss_mlp": 1.02198434, + "epoch": 0.4182173455583947, + "flos": 26244455518080.0, + "grad_norm": 1.5174728557227655, + "language_loss": 0.82866532, + "learning_rate": 2.5085695434329327e-06, + "loss": 0.8496542, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46679688, + "step": 6956, + "time_per_iteration": 2.4133174419403076 + }, + { + "auxiliary_loss_clip": 0.01070747, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.01546395, + "balance_loss_mlp": 1.02305341, + "epoch": 0.4182774688110627, + "flos": 14790778934400.0, + "grad_norm": 2.1926289281246194, + "language_loss": 0.72179937, + "learning_rate": 2.5082041312835792e-06, + "loss": 0.74281466, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.4765625, + "step": 6957, + "time_per_iteration": 2.3237624168395996 + }, + { + "auxiliary_loss_clip": 0.01070405, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.02303934, + "balance_loss_mlp": 1.02254236, + "epoch": 0.41833759206373067, + "flos": 20410992334080.0, + "grad_norm": 1.894762508776966, + "language_loss": 0.75695431, + "learning_rate": 2.507838700997728e-06, + "loss": 0.77802485, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47851562, + "step": 6958, + "time_per_iteration": 2.368889570236206 + }, + { + "auxiliary_loss_clip": 0.01070732, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.01603174, + "balance_loss_mlp": 1.02196407, + "epoch": 0.41839771531639863, + "flos": 23655296557440.0, + "grad_norm": 1.5942086911349371, + "language_loss": 0.77149034, + "learning_rate": 2.5074732525884186e-06, + "loss": 0.79250228, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48828125, + "step": 6959, + "time_per_iteration": 2.369788408279419 + }, + { + "auxiliary_loss_clip": 0.01070503, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.01665914, + "balance_loss_mlp": 1.02184391, + "epoch": 0.4184578385690666, + "flos": 19753158896640.0, + "grad_norm": 1.6440110659344211, + "language_loss": 0.7701236, + "learning_rate": 2.5071077860686954e-06, + "loss": 0.79114306, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.48828125, + "step": 6960, + "time_per_iteration": 2.367098093032837 + }, + { + "auxiliary_loss_clip": 0.01066434, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.01464581, + "balance_loss_mlp": 1.02206016, + "epoch": 0.41851796182173456, + "flos": 20192366200320.0, + "grad_norm": 1.769625992111527, + "language_loss": 0.75745523, + "learning_rate": 2.5067423014515995e-06, + "loss": 0.77839601, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4453125, + "step": 6961, + "time_per_iteration": 2.4013307094573975 + }, + { + "auxiliary_loss_clip": 0.01012024, + "auxiliary_loss_mlp": 0.01006314, + "balance_loss_clip": 1.00459719, + "balance_loss_mlp": 1.00236082, + "epoch": 0.41857808507440253, + "flos": 59015537953920.0, + "grad_norm": 0.781383726497082, + "language_loss": 0.54580557, + "learning_rate": 2.5063767987501745e-06, + "loss": 0.5659889, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.09667969, + "step": 6962, + "time_per_iteration": 2.9663476943969727 + }, + { + "auxiliary_loss_clip": 0.01074077, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.01220894, + "balance_loss_mlp": 1.02509284, + "epoch": 0.4186382083270705, + "flos": 18477816508800.0, + "grad_norm": 1.7248243147436755, + "language_loss": 0.85095686, + "learning_rate": 2.506011277977464e-06, + "loss": 0.87197006, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49023438, + "step": 6963, + "time_per_iteration": 2.3413503170013428 + }, + { + "auxiliary_loss_clip": 0.01070363, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.01577902, + "balance_loss_mlp": 1.02355838, + "epoch": 0.41869833157973846, + "flos": 21577719882240.0, + "grad_norm": 1.5200699809471974, + "language_loss": 0.70421785, + "learning_rate": 2.5056457391465123e-06, + "loss": 0.72522229, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46875, + "step": 6964, + "time_per_iteration": 2.370666265487671 + }, + { + "auxiliary_loss_clip": 0.01066968, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.01586676, + "balance_loss_mlp": 1.02218485, + "epoch": 0.4187584548324064, + "flos": 35515955393280.0, + "grad_norm": 1.4856459792166319, + "language_loss": 0.71294975, + "learning_rate": 2.505280182270365e-06, + "loss": 0.73391831, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44726562, + "step": 6965, + "time_per_iteration": 2.4996140003204346 + }, + { + "auxiliary_loss_clip": 0.01068787, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02073658, + "balance_loss_mlp": 1.02276516, + "epoch": 0.4188185780850744, + "flos": 18655035903360.0, + "grad_norm": 1.5283080310033095, + "language_loss": 0.72445834, + "learning_rate": 2.504914607362068e-06, + "loss": 0.74548799, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4609375, + "step": 6966, + "time_per_iteration": 2.350865602493286 + }, + { + "auxiliary_loss_clip": 0.01071901, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.01636672, + "balance_loss_mlp": 1.02285051, + "epoch": 0.41887870133774235, + "flos": 40331839824000.0, + "grad_norm": 1.8889127293117212, + "language_loss": 0.70942706, + "learning_rate": 2.5045490144346673e-06, + "loss": 0.73045397, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.49023438, + "step": 6967, + "time_per_iteration": 2.5310304164886475 + }, + { + "auxiliary_loss_clip": 0.01011927, + "auxiliary_loss_mlp": 0.01004635, + "balance_loss_clip": 1.00327015, + "balance_loss_mlp": 1.00215864, + "epoch": 0.4189388245904103, + "flos": 61368545278080.0, + "grad_norm": 0.7856195556592521, + "language_loss": 0.46207666, + "learning_rate": 2.5041834035012103e-06, + "loss": 0.48224229, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.09765625, + "step": 6968, + "time_per_iteration": 4.432485342025757 + }, + { + "auxiliary_loss_clip": 0.0107258, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.01825333, + "balance_loss_mlp": 1.02291763, + "epoch": 0.4189989478430783, + "flos": 28214499605760.0, + "grad_norm": 1.7022849953948012, + "language_loss": 0.748891, + "learning_rate": 2.503817774574744e-06, + "loss": 0.76995265, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.49609375, + "step": 6969, + "time_per_iteration": 2.4819388389587402 + }, + { + "auxiliary_loss_clip": 0.01068355, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.01473522, + "balance_loss_mlp": 1.02149463, + "epoch": 0.4190590710957463, + "flos": 24564888875520.0, + "grad_norm": 1.601943558377932, + "language_loss": 0.80647337, + "learning_rate": 2.503452127668318e-06, + "loss": 0.82743829, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46875, + "step": 6970, + "time_per_iteration": 2.4321630001068115 + }, + { + "auxiliary_loss_clip": 0.01067681, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.01920438, + "balance_loss_mlp": 1.02179086, + "epoch": 0.41911919434841427, + "flos": 21724948552320.0, + "grad_norm": 5.438662525862278, + "language_loss": 0.83042163, + "learning_rate": 2.50308646279498e-06, + "loss": 0.85143024, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.45898438, + "step": 6971, + "time_per_iteration": 3.7370216846466064 + }, + { + "auxiliary_loss_clip": 0.01069173, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.01629162, + "balance_loss_mlp": 1.02401686, + "epoch": 0.41917931760108224, + "flos": 17600623799040.0, + "grad_norm": 1.625364252422733, + "language_loss": 0.74935466, + "learning_rate": 2.50272077996778e-06, + "loss": 0.77035201, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45117188, + "step": 6972, + "time_per_iteration": 2.350653886795044 + }, + { + "auxiliary_loss_clip": 0.01065788, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_clip": 1.01389837, + "balance_loss_mlp": 1.02041173, + "epoch": 0.4192394408537502, + "flos": 37815160579200.0, + "grad_norm": 1.551742226952976, + "language_loss": 0.72768927, + "learning_rate": 2.5023550791997695e-06, + "loss": 0.74861932, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.453125, + "step": 6973, + "time_per_iteration": 2.519556999206543 + }, + { + "auxiliary_loss_clip": 0.01075686, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.01655412, + "balance_loss_mlp": 1.02491808, + "epoch": 0.41929956410641817, + "flos": 23106741275520.0, + "grad_norm": 1.8751771001880975, + "language_loss": 0.74938506, + "learning_rate": 2.5019893605039976e-06, + "loss": 0.77045476, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.5078125, + "step": 6974, + "time_per_iteration": 2.376729965209961 + }, + { + "auxiliary_loss_clip": 0.01069707, + "auxiliary_loss_mlp": 0.01026209, + "balance_loss_clip": 1.01201105, + "balance_loss_mlp": 1.02292371, + "epoch": 0.41935968735908613, + "flos": 22523552058240.0, + "grad_norm": 2.7525470360889552, + "language_loss": 0.72754717, + "learning_rate": 2.501623623893517e-06, + "loss": 0.74850631, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46679688, + "step": 6975, + "time_per_iteration": 3.734311580657959 + }, + { + "auxiliary_loss_clip": 0.01070309, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.01694465, + "balance_loss_mlp": 1.02328563, + "epoch": 0.4194198106117541, + "flos": 26869226031360.0, + "grad_norm": 1.5333423125985457, + "language_loss": 0.81014729, + "learning_rate": 2.5012578693813796e-06, + "loss": 0.83115995, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46875, + "step": 6976, + "time_per_iteration": 2.4140875339508057 + }, + { + "auxiliary_loss_clip": 0.01067269, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.01508498, + "balance_loss_mlp": 1.02034354, + "epoch": 0.41947993386442206, + "flos": 19901365084800.0, + "grad_norm": 2.067211251210026, + "language_loss": 0.67750865, + "learning_rate": 2.5008920969806386e-06, + "loss": 0.69846481, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46875, + "step": 6977, + "time_per_iteration": 2.408724546432495 + }, + { + "auxiliary_loss_clip": 0.01071052, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01443708, + "balance_loss_mlp": 1.02253222, + "epoch": 0.41954005711709, + "flos": 17382940272000.0, + "grad_norm": 2.6575293256356525, + "language_loss": 0.80874556, + "learning_rate": 2.5005263067043464e-06, + "loss": 0.82973719, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.484375, + "step": 6978, + "time_per_iteration": 3.6917779445648193 + }, + { + "auxiliary_loss_clip": 0.01069045, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.01521981, + "balance_loss_mlp": 1.0209229, + "epoch": 0.419600180369758, + "flos": 25002315699840.0, + "grad_norm": 1.7866235985726986, + "language_loss": 0.825912, + "learning_rate": 2.500160498565558e-06, + "loss": 0.84690213, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48046875, + "step": 6979, + "time_per_iteration": 2.4065709114074707 + }, + { + "auxiliary_loss_clip": 0.01069528, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.0153923, + "balance_loss_mlp": 1.02223241, + "epoch": 0.41966030362242596, + "flos": 17382835537920.0, + "grad_norm": 1.7937898570371682, + "language_loss": 0.83443779, + "learning_rate": 2.499794672577329e-06, + "loss": 0.85543561, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.47265625, + "step": 6980, + "time_per_iteration": 2.3307483196258545 + }, + { + "auxiliary_loss_clip": 0.01070856, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.01511919, + "balance_loss_mlp": 1.02273846, + "epoch": 0.4197204268750939, + "flos": 22155288370560.0, + "grad_norm": 7.03170080630423, + "language_loss": 0.7913779, + "learning_rate": 2.4994288287527126e-06, + "loss": 0.81237471, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48046875, + "step": 6981, + "time_per_iteration": 2.354881525039673 + }, + { + "auxiliary_loss_clip": 0.01066439, + "auxiliary_loss_mlp": 0.01026111, + "balance_loss_clip": 1.01333153, + "balance_loss_mlp": 1.0212338, + "epoch": 0.4197805501277619, + "flos": 22083227591040.0, + "grad_norm": 7.968779026924264, + "language_loss": 0.84596753, + "learning_rate": 2.499062967104766e-06, + "loss": 0.86689299, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.453125, + "step": 6982, + "time_per_iteration": 2.3704073429107666 + }, + { + "auxiliary_loss_clip": 0.0107006, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.01959705, + "balance_loss_mlp": 1.02175009, + "epoch": 0.4198406733804299, + "flos": 26430996245760.0, + "grad_norm": 1.9966018314493397, + "language_loss": 0.56760621, + "learning_rate": 2.498697087646546e-06, + "loss": 0.58864439, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48242188, + "step": 6983, + "time_per_iteration": 2.404977321624756 + }, + { + "auxiliary_loss_clip": 0.01068364, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01835227, + "balance_loss_mlp": 1.02250648, + "epoch": 0.4199007966330979, + "flos": 12530222490240.0, + "grad_norm": 1.8593185763439286, + "language_loss": 0.82240087, + "learning_rate": 2.49833119039111e-06, + "loss": 0.8434059, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.45898438, + "step": 6984, + "time_per_iteration": 2.3424839973449707 + }, + { + "auxiliary_loss_clip": 0.01067564, + "auxiliary_loss_mlp": 0.01025034, + "balance_loss_clip": 1.01151013, + "balance_loss_mlp": 1.02211368, + "epoch": 0.41996091988576584, + "flos": 21761851726080.0, + "grad_norm": 1.95431454357706, + "language_loss": 0.78885239, + "learning_rate": 2.497965275351516e-06, + "loss": 0.80977839, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 6985, + "time_per_iteration": 2.3781180381774902 + }, + { + "auxiliary_loss_clip": 0.01071121, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.01309896, + "balance_loss_mlp": 1.02181423, + "epoch": 0.4200210431384338, + "flos": 26540728248960.0, + "grad_norm": 1.6593250132939774, + "language_loss": 0.7860918, + "learning_rate": 2.4975993425408216e-06, + "loss": 0.8070876, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4921875, + "step": 6986, + "time_per_iteration": 2.4019339084625244 + }, + { + "auxiliary_loss_clip": 0.01065846, + "auxiliary_loss_mlp": 0.01028602, + "balance_loss_clip": 1.01569819, + "balance_loss_mlp": 1.021752, + "epoch": 0.42008116639110177, + "flos": 26794651633920.0, + "grad_norm": 1.3969686221308772, + "language_loss": 0.72438449, + "learning_rate": 2.497233391972087e-06, + "loss": 0.74532896, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44140625, + "step": 6987, + "time_per_iteration": 2.4226670265197754 + }, + { + "auxiliary_loss_clip": 0.01070802, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.01939559, + "balance_loss_mlp": 1.02337337, + "epoch": 0.42014128964376973, + "flos": 32085983226240.0, + "grad_norm": 1.659126801584171, + "language_loss": 0.75381434, + "learning_rate": 2.496867423658371e-06, + "loss": 0.77486169, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.47460938, + "step": 6988, + "time_per_iteration": 2.472352981567383 + }, + { + "auxiliary_loss_clip": 0.01071073, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.01966155, + "balance_loss_mlp": 1.02257502, + "epoch": 0.4202014128964377, + "flos": 26465979294720.0, + "grad_norm": 1.772043771352536, + "language_loss": 0.7428543, + "learning_rate": 2.496501437612735e-06, + "loss": 0.76390576, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.484375, + "step": 6989, + "time_per_iteration": 2.423642635345459 + }, + { + "auxiliary_loss_clip": 0.01069517, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.0155257, + "balance_loss_mlp": 1.02186203, + "epoch": 0.42026153614910566, + "flos": 13400537662080.0, + "grad_norm": 3.1323352490745786, + "language_loss": 0.71082687, + "learning_rate": 2.4961354338482406e-06, + "loss": 0.73181987, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4765625, + "step": 6990, + "time_per_iteration": 2.3522121906280518 + }, + { + "auxiliary_loss_clip": 0.0101136, + "auxiliary_loss_mlp": 0.01002144, + "balance_loss_clip": 1.00077879, + "balance_loss_mlp": 1.00175357, + "epoch": 0.42032165940177363, + "flos": 60247413832320.0, + "grad_norm": 0.8328467511641501, + "language_loss": 0.60500485, + "learning_rate": 2.4957694123779477e-06, + "loss": 0.62513989, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.09619141, + "step": 6991, + "time_per_iteration": 2.9300758838653564 + }, + { + "auxiliary_loss_clip": 0.01072615, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.01627636, + "balance_loss_mlp": 1.02420759, + "epoch": 0.4203817826544416, + "flos": 24534060278400.0, + "grad_norm": 1.4958611060371199, + "language_loss": 0.83245087, + "learning_rate": 2.4954033732149208e-06, + "loss": 0.8534801, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.484375, + "step": 6992, + "time_per_iteration": 2.3917934894561768 + }, + { + "auxiliary_loss_clip": 0.01011648, + "auxiliary_loss_mlp": 0.01003257, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00192344, + "epoch": 0.42044190590710956, + "flos": 58817965236480.0, + "grad_norm": 0.8132453250053372, + "language_loss": 0.65543461, + "learning_rate": 2.495037316372221e-06, + "loss": 0.6755836, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.09765625, + "step": 6993, + "time_per_iteration": 3.0050182342529297 + }, + { + "auxiliary_loss_clip": 0.01073478, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.01304948, + "balance_loss_mlp": 1.0230372, + "epoch": 0.4205020291597775, + "flos": 16435118148480.0, + "grad_norm": 2.0246374198952215, + "language_loss": 0.83558172, + "learning_rate": 2.4946712418629133e-06, + "loss": 0.85660398, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.50390625, + "step": 6994, + "time_per_iteration": 2.328662395477295 + }, + { + "auxiliary_loss_clip": 0.01069019, + "auxiliary_loss_mlp": 0.01025561, + "balance_loss_clip": 1.01195979, + "balance_loss_mlp": 1.02431417, + "epoch": 0.4205621524124455, + "flos": 18404673477120.0, + "grad_norm": 1.9794158358691067, + "language_loss": 0.78257751, + "learning_rate": 2.4943051497000616e-06, + "loss": 0.8035233, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44726562, + "step": 6995, + "time_per_iteration": 2.366546154022217 + }, + { + "auxiliary_loss_clip": 0.0106608, + "auxiliary_loss_mlp": 0.01026579, + "balance_loss_clip": 1.01361465, + "balance_loss_mlp": 1.02138782, + "epoch": 0.4206222756651135, + "flos": 25518925221120.0, + "grad_norm": 1.5386380426991597, + "language_loss": 0.80001485, + "learning_rate": 2.4939390398967303e-06, + "loss": 0.82094139, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4453125, + "step": 6996, + "time_per_iteration": 2.404432535171509 + }, + { + "auxiliary_loss_clip": 0.01072325, + "auxiliary_loss_mlp": 0.01022658, + "balance_loss_clip": 1.00875199, + "balance_loss_mlp": 1.02378774, + "epoch": 0.4206823989177815, + "flos": 15303443472000.0, + "grad_norm": 3.3245932105926252, + "language_loss": 0.75639623, + "learning_rate": 2.493572912465985e-06, + "loss": 0.77734607, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.484375, + "step": 6997, + "time_per_iteration": 2.358534574508667 + }, + { + "auxiliary_loss_clip": 0.01069289, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.02113032, + "balance_loss_mlp": 1.02161193, + "epoch": 0.42074252217044944, + "flos": 15553352050560.0, + "grad_norm": 1.8143910278602928, + "language_loss": 0.75353694, + "learning_rate": 2.493206767420892e-06, + "loss": 0.77458078, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4765625, + "step": 6998, + "time_per_iteration": 2.346398115158081 + }, + { + "auxiliary_loss_clip": 0.01073388, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.01396394, + "balance_loss_mlp": 1.02406979, + "epoch": 0.4208026454231174, + "flos": 26144533607040.0, + "grad_norm": 1.9376161271694685, + "language_loss": 0.76293278, + "learning_rate": 2.492840604774519e-06, + "loss": 0.78396153, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.4921875, + "step": 6999, + "time_per_iteration": 2.4521055221557617 + }, + { + "auxiliary_loss_clip": 0.0106985, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.02081466, + "balance_loss_mlp": 1.02311087, + "epoch": 0.42086276867578537, + "flos": 23548985867520.0, + "grad_norm": 3.5150728853095554, + "language_loss": 0.6482327, + "learning_rate": 2.492474424539932e-06, + "loss": 0.66928136, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46679688, + "step": 7000, + "time_per_iteration": 2.3886754512786865 + }, + { + "auxiliary_loss_clip": 0.01068577, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.01419914, + "balance_loss_mlp": 1.02188277, + "epoch": 0.42092289192845334, + "flos": 18112450464000.0, + "grad_norm": 1.4979345807365263, + "language_loss": 0.76265794, + "learning_rate": 2.4921082267301994e-06, + "loss": 0.78362793, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46679688, + "step": 7001, + "time_per_iteration": 2.365105390548706 + }, + { + "auxiliary_loss_clip": 0.01069443, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.01621032, + "balance_loss_mlp": 1.02274466, + "epoch": 0.4209830151811213, + "flos": 20005685827200.0, + "grad_norm": 1.6425589035641484, + "language_loss": 0.86717111, + "learning_rate": 2.49174201135839e-06, + "loss": 0.88816154, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.46875, + "step": 7002, + "time_per_iteration": 2.355429172515869 + }, + { + "auxiliary_loss_clip": 0.01067213, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.01407671, + "balance_loss_mlp": 1.02119946, + "epoch": 0.42104313843378927, + "flos": 21977929330560.0, + "grad_norm": 1.9562396919995368, + "language_loss": 0.67967856, + "learning_rate": 2.491375778437573e-06, + "loss": 0.70062846, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 7003, + "time_per_iteration": 2.378953456878662 + }, + { + "auxiliary_loss_clip": 0.01070311, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.01769197, + "balance_loss_mlp": 1.02195048, + "epoch": 0.42110326168645723, + "flos": 25442884546560.0, + "grad_norm": 1.7080576443747515, + "language_loss": 0.77331436, + "learning_rate": 2.491009527980819e-06, + "loss": 0.79434049, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.484375, + "step": 7004, + "time_per_iteration": 2.402089834213257 + }, + { + "auxiliary_loss_clip": 0.01069252, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.01429319, + "balance_loss_mlp": 1.02397037, + "epoch": 0.4211633849391252, + "flos": 17821588993920.0, + "grad_norm": 1.670718372497601, + "language_loss": 0.68627906, + "learning_rate": 2.4906432600011983e-06, + "loss": 0.70726097, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.453125, + "step": 7005, + "time_per_iteration": 2.425891160964966 + }, + { + "auxiliary_loss_clip": 0.01068527, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.01223958, + "balance_loss_mlp": 1.02202368, + "epoch": 0.42122350819179316, + "flos": 16281710167680.0, + "grad_norm": 1.8096274493022777, + "language_loss": 0.72627544, + "learning_rate": 2.4902769745117805e-06, + "loss": 0.7472136, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46484375, + "step": 7006, + "time_per_iteration": 2.3313686847686768 + }, + { + "auxiliary_loss_clip": 0.0107037, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01475787, + "balance_loss_mlp": 1.02301991, + "epoch": 0.4212836314444611, + "flos": 19644858259200.0, + "grad_norm": 1.6536435504095255, + "language_loss": 0.71075231, + "learning_rate": 2.4899106715256394e-06, + "loss": 0.73174304, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47265625, + "step": 7007, + "time_per_iteration": 3.7965524196624756 + }, + { + "auxiliary_loss_clip": 0.01069614, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.0168016, + "balance_loss_mlp": 1.02268386, + "epoch": 0.4213437546971291, + "flos": 18368049594240.0, + "grad_norm": 2.3077340664301595, + "language_loss": 0.8251152, + "learning_rate": 2.4895443510558467e-06, + "loss": 0.84611791, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46875, + "step": 7008, + "time_per_iteration": 2.345515489578247 + }, + { + "auxiliary_loss_clip": 0.01068296, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.01726544, + "balance_loss_mlp": 1.02069831, + "epoch": 0.42140387794979706, + "flos": 27703406079360.0, + "grad_norm": 1.7302723178740909, + "language_loss": 0.72060621, + "learning_rate": 2.489178013115475e-06, + "loss": 0.74160087, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4765625, + "step": 7009, + "time_per_iteration": 2.4318649768829346 + }, + { + "auxiliary_loss_clip": 0.01067864, + "auxiliary_loss_mlp": 0.01024488, + "balance_loss_clip": 1.01120234, + "balance_loss_mlp": 1.02099776, + "epoch": 0.4214640012024651, + "flos": 28145825228160.0, + "grad_norm": 6.076891758886555, + "language_loss": 0.66243422, + "learning_rate": 2.4888116577175987e-06, + "loss": 0.68335778, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46875, + "step": 7010, + "time_per_iteration": 3.8358306884765625 + }, + { + "auxiliary_loss_clip": 0.0106702, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.01471138, + "balance_loss_mlp": 1.02122772, + "epoch": 0.42152412445513304, + "flos": 22996311045120.0, + "grad_norm": 1.4995133510943734, + "language_loss": 0.72547913, + "learning_rate": 2.4884452848752918e-06, + "loss": 0.74643183, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45703125, + "step": 7011, + "time_per_iteration": 2.3793771266937256 + }, + { + "auxiliary_loss_clip": 0.01011611, + "auxiliary_loss_mlp": 0.01001112, + "balance_loss_clip": 0.99970484, + "balance_loss_mlp": 1.00179482, + "epoch": 0.421584247707801, + "flos": 63746549136000.0, + "grad_norm": 1.0151995836113952, + "language_loss": 0.54393584, + "learning_rate": 2.4880788946016287e-06, + "loss": 0.56406307, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.09765625, + "step": 7012, + "time_per_iteration": 2.9065093994140625 + }, + { + "auxiliary_loss_clip": 0.01069781, + "auxiliary_loss_mlp": 0.01026299, + "balance_loss_clip": 1.01188064, + "balance_loss_mlp": 1.02189267, + "epoch": 0.421644370960469, + "flos": 24313514019840.0, + "grad_norm": 1.3699483117619788, + "language_loss": 0.78454578, + "learning_rate": 2.4877124869096855e-06, + "loss": 0.80550659, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47851562, + "step": 7013, + "time_per_iteration": 2.398491382598877 + }, + { + "auxiliary_loss_clip": 0.01068601, + "auxiliary_loss_mlp": 0.01026259, + "balance_loss_clip": 1.01228178, + "balance_loss_mlp": 1.02087092, + "epoch": 0.42170449421313694, + "flos": 23439568066560.0, + "grad_norm": 1.952591849821917, + "language_loss": 0.8134802, + "learning_rate": 2.487346061812538e-06, + "loss": 0.83442879, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4765625, + "step": 7014, + "time_per_iteration": 3.7374212741851807 + }, + { + "auxiliary_loss_clip": 0.01070501, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.0149976, + "balance_loss_mlp": 1.02270997, + "epoch": 0.4217646174658049, + "flos": 23694364235520.0, + "grad_norm": 1.4502929590216511, + "language_loss": 0.77684277, + "learning_rate": 2.4869796193232633e-06, + "loss": 0.7978394, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47851562, + "step": 7015, + "time_per_iteration": 2.427457809448242 + }, + { + "auxiliary_loss_clip": 0.0106806, + "auxiliary_loss_mlp": 0.01033497, + "balance_loss_clip": 1.0186317, + "balance_loss_mlp": 1.02129924, + "epoch": 0.42182474071847287, + "flos": 24970439761920.0, + "grad_norm": 1.313137804981797, + "language_loss": 0.82173902, + "learning_rate": 2.4866131594549385e-06, + "loss": 0.8427546, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.46875, + "step": 7016, + "time_per_iteration": 2.4172892570495605 + }, + { + "auxiliary_loss_clip": 0.01070806, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.0130558, + "balance_loss_mlp": 1.02321148, + "epoch": 0.42188486397114083, + "flos": 22855540976640.0, + "grad_norm": 1.857401645495164, + "language_loss": 0.67554474, + "learning_rate": 2.4862466822206425e-06, + "loss": 0.69652653, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4765625, + "step": 7017, + "time_per_iteration": 2.3720948696136475 + }, + { + "auxiliary_loss_clip": 0.01012663, + "auxiliary_loss_mlp": 0.00999398, + "balance_loss_clip": 0.99801558, + "balance_loss_mlp": 1.00285387, + "epoch": 0.4219449872238088, + "flos": 66972139004160.0, + "grad_norm": 0.7032057271156663, + "language_loss": 0.59454155, + "learning_rate": 2.485880187633452e-06, + "loss": 0.61466217, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.09814453, + "step": 7018, + "time_per_iteration": 4.41826605796814 + }, + { + "auxiliary_loss_clip": 0.01069574, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.01693499, + "balance_loss_mlp": 1.02250338, + "epoch": 0.42200511047647676, + "flos": 13114528871040.0, + "grad_norm": 1.8904872337160952, + "language_loss": 0.75045532, + "learning_rate": 2.4855136757064487e-06, + "loss": 0.7714622, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47070312, + "step": 7019, + "time_per_iteration": 2.350626230239868 + }, + { + "auxiliary_loss_clip": 0.010687, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.01763582, + "balance_loss_mlp": 1.02243078, + "epoch": 0.42206523372914473, + "flos": 13990325126400.0, + "grad_norm": 2.707574144119525, + "language_loss": 0.69124407, + "learning_rate": 2.4851471464527097e-06, + "loss": 0.71225893, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.46289062, + "step": 7020, + "time_per_iteration": 2.348933458328247 + }, + { + "auxiliary_loss_clip": 0.01068056, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.01431644, + "balance_loss_mlp": 1.02389717, + "epoch": 0.4221253569818127, + "flos": 21941305447680.0, + "grad_norm": 1.5997839225167194, + "language_loss": 0.80967414, + "learning_rate": 2.4847805998853184e-06, + "loss": 0.83062124, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.44140625, + "step": 7021, + "time_per_iteration": 2.372657537460327 + }, + { + "auxiliary_loss_clip": 0.01069472, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.01597798, + "balance_loss_mlp": 1.02271414, + "epoch": 0.42218548023448066, + "flos": 32191351309440.0, + "grad_norm": 1.9893867653940793, + "language_loss": 0.69857383, + "learning_rate": 2.484414036017354e-06, + "loss": 0.71956569, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46875, + "step": 7022, + "time_per_iteration": 2.447925090789795 + }, + { + "auxiliary_loss_clip": 0.01066422, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.01413643, + "balance_loss_mlp": 1.02422714, + "epoch": 0.4222456034871487, + "flos": 30117614883840.0, + "grad_norm": 1.5535349702729038, + "language_loss": 0.67644322, + "learning_rate": 2.4840474548618986e-06, + "loss": 0.69736296, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.421875, + "step": 7023, + "time_per_iteration": 2.4660451412200928 + }, + { + "auxiliary_loss_clip": 0.01068172, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.01326334, + "balance_loss_mlp": 1.02352428, + "epoch": 0.42230572673981664, + "flos": 22126798834560.0, + "grad_norm": 1.5333989007542057, + "language_loss": 0.69259018, + "learning_rate": 2.4836808564320347e-06, + "loss": 0.71354449, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44726562, + "step": 7024, + "time_per_iteration": 2.365180730819702 + }, + { + "auxiliary_loss_clip": 0.0106819, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.01775956, + "balance_loss_mlp": 1.02209651, + "epoch": 0.4223658499924846, + "flos": 22053970005120.0, + "grad_norm": 1.765977328527133, + "language_loss": 0.76683837, + "learning_rate": 2.4833142407408455e-06, + "loss": 0.78783071, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4609375, + "step": 7025, + "time_per_iteration": 2.406040668487549 + }, + { + "auxiliary_loss_clip": 0.01064142, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.01732671, + "balance_loss_mlp": 1.02150285, + "epoch": 0.4224259732451526, + "flos": 20409735525120.0, + "grad_norm": 1.772165333035464, + "language_loss": 0.79897892, + "learning_rate": 2.4829476078014143e-06, + "loss": 0.81993157, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.42578125, + "step": 7026, + "time_per_iteration": 2.370894432067871 + }, + { + "auxiliary_loss_clip": 0.01011905, + "auxiliary_loss_mlp": 0.01011877, + "balance_loss_clip": 1.01048219, + "balance_loss_mlp": 1.00165749, + "epoch": 0.42248609649782054, + "flos": 62843380508160.0, + "grad_norm": 0.723354595410686, + "language_loss": 0.61862296, + "learning_rate": 2.4825809576268247e-06, + "loss": 0.63886076, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.10253906, + "step": 7027, + "time_per_iteration": 3.0958189964294434 + }, + { + "auxiliary_loss_clip": 0.01069118, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.0155499, + "balance_loss_mlp": 1.02349615, + "epoch": 0.4225462197504885, + "flos": 26248749615360.0, + "grad_norm": 1.8698258942905561, + "language_loss": 0.71256566, + "learning_rate": 2.4822142902301622e-06, + "loss": 0.73354816, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45703125, + "step": 7028, + "time_per_iteration": 2.408419132232666 + }, + { + "auxiliary_loss_clip": 0.01069947, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.02302623, + "balance_loss_mlp": 1.02292669, + "epoch": 0.42260634300315647, + "flos": 20520898894080.0, + "grad_norm": 1.9576664540416198, + "language_loss": 0.82189894, + "learning_rate": 2.481847605624512e-06, + "loss": 0.84296823, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47070312, + "step": 7029, + "time_per_iteration": 2.389857769012451 + }, + { + "auxiliary_loss_clip": 0.01073114, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.02172613, + "balance_loss_mlp": 1.02457345, + "epoch": 0.42266646625582444, + "flos": 24315573790080.0, + "grad_norm": 1.8629669107898534, + "language_loss": 0.75121748, + "learning_rate": 2.481480903822961e-06, + "loss": 0.77230978, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.484375, + "step": 7030, + "time_per_iteration": 2.4038922786712646 + }, + { + "auxiliary_loss_clip": 0.01069032, + "auxiliary_loss_mlp": 0.01034105, + "balance_loss_clip": 1.01902533, + "balance_loss_mlp": 1.02207184, + "epoch": 0.4227265895084924, + "flos": 24203083789440.0, + "grad_norm": 2.4261046304473393, + "language_loss": 0.86878246, + "learning_rate": 2.4811141848385944e-06, + "loss": 0.88981384, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.46875, + "step": 7031, + "time_per_iteration": 2.403911590576172 + }, + { + "auxiliary_loss_clip": 0.01069129, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01546586, + "balance_loss_mlp": 1.02271223, + "epoch": 0.42278671276116037, + "flos": 16908819742080.0, + "grad_norm": 1.8676226049537268, + "language_loss": 0.73278934, + "learning_rate": 2.4807474486844996e-06, + "loss": 0.75377548, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46484375, + "step": 7032, + "time_per_iteration": 2.343925952911377 + }, + { + "auxiliary_loss_clip": 0.01068931, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.02338982, + "balance_loss_mlp": 1.0217582, + "epoch": 0.42284683601382833, + "flos": 25409891445120.0, + "grad_norm": 1.5597863768671352, + "language_loss": 0.70381868, + "learning_rate": 2.480380695373766e-06, + "loss": 0.72487974, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.47070312, + "step": 7033, + "time_per_iteration": 2.4300765991210938 + }, + { + "auxiliary_loss_clip": 0.01069944, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.01425672, + "balance_loss_mlp": 1.02306604, + "epoch": 0.4229069592664963, + "flos": 23039184061440.0, + "grad_norm": 2.015389909177172, + "language_loss": 0.89796388, + "learning_rate": 2.480013924919481e-06, + "loss": 0.91894716, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46875, + "step": 7034, + "time_per_iteration": 2.3808624744415283 + }, + { + "auxiliary_loss_clip": 0.01067358, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.01606929, + "balance_loss_mlp": 1.02172542, + "epoch": 0.42296708251916426, + "flos": 26066258605440.0, + "grad_norm": 2.5295941283424357, + "language_loss": 0.77010691, + "learning_rate": 2.479647137334733e-06, + "loss": 0.79107666, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45703125, + "step": 7035, + "time_per_iteration": 2.4020743370056152 + }, + { + "auxiliary_loss_clip": 0.01070532, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.01952577, + "balance_loss_mlp": 1.0239166, + "epoch": 0.4230272057718323, + "flos": 19457514570240.0, + "grad_norm": 1.8900723294226578, + "language_loss": 0.82292205, + "learning_rate": 2.479280332632613e-06, + "loss": 0.84396744, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46484375, + "step": 7036, + "time_per_iteration": 2.3434605598449707 + }, + { + "auxiliary_loss_clip": 0.01067539, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.01760995, + "balance_loss_mlp": 1.02266741, + "epoch": 0.42308732902450025, + "flos": 22382188496640.0, + "grad_norm": 2.3460400416321883, + "language_loss": 0.70308745, + "learning_rate": 2.4789135108262105e-06, + "loss": 0.72407061, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44921875, + "step": 7037, + "time_per_iteration": 2.366260528564453 + }, + { + "auxiliary_loss_clip": 0.01069287, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.01615524, + "balance_loss_mlp": 1.02233565, + "epoch": 0.4231474522771682, + "flos": 20994391019520.0, + "grad_norm": 1.7662887422006817, + "language_loss": 0.77891326, + "learning_rate": 2.478546671928617e-06, + "loss": 0.79991138, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46875, + "step": 7038, + "time_per_iteration": 2.3783888816833496 + }, + { + "auxiliary_loss_clip": 0.01074894, + "auxiliary_loss_mlp": 0.01038643, + "balance_loss_clip": 1.02285969, + "balance_loss_mlp": 1.02485716, + "epoch": 0.4232075755298362, + "flos": 14974980600960.0, + "grad_norm": 3.6423179021063787, + "language_loss": 0.66889167, + "learning_rate": 2.4781798159529235e-06, + "loss": 0.69002712, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5, + "step": 7039, + "time_per_iteration": 2.3622679710388184 + }, + { + "auxiliary_loss_clip": 0.01072996, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.02200639, + "balance_loss_mlp": 1.02320671, + "epoch": 0.42326769878250414, + "flos": 24531581571840.0, + "grad_norm": 1.5252277530062146, + "language_loss": 0.7601409, + "learning_rate": 2.477812942912223e-06, + "loss": 0.78124171, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49804688, + "step": 7040, + "time_per_iteration": 2.4262492656707764 + }, + { + "auxiliary_loss_clip": 0.01068008, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.01457596, + "balance_loss_mlp": 1.02150702, + "epoch": 0.4233278220351721, + "flos": 26869086385920.0, + "grad_norm": 1.4660526156403406, + "language_loss": 0.65149498, + "learning_rate": 2.4774460528196073e-06, + "loss": 0.67246056, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46484375, + "step": 7041, + "time_per_iteration": 2.4047229290008545 + }, + { + "auxiliary_loss_clip": 0.01070622, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.01326418, + "balance_loss_mlp": 1.02188492, + "epoch": 0.4233879452878401, + "flos": 42813256728960.0, + "grad_norm": 1.6984253567447611, + "language_loss": 0.74125391, + "learning_rate": 2.47707914568817e-06, + "loss": 0.76223916, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48828125, + "step": 7042, + "time_per_iteration": 2.556972026824951 + }, + { + "auxiliary_loss_clip": 0.01067275, + "auxiliary_loss_mlp": 0.01027866, + "balance_loss_clip": 1.01499772, + "balance_loss_mlp": 1.02195311, + "epoch": 0.42344806854050804, + "flos": 25227819371520.0, + "grad_norm": 1.2676076224413264, + "language_loss": 0.82246459, + "learning_rate": 2.476712221531005e-06, + "loss": 0.84341598, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.453125, + "step": 7043, + "time_per_iteration": 2.419922113418579 + }, + { + "auxiliary_loss_clip": 0.01070389, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.02016544, + "balance_loss_mlp": 1.02415931, + "epoch": 0.423508191793176, + "flos": 22777859468160.0, + "grad_norm": 2.5860364164885756, + "language_loss": 0.64762211, + "learning_rate": 2.4763452803612077e-06, + "loss": 0.66867459, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46289062, + "step": 7044, + "time_per_iteration": 2.360199213027954 + }, + { + "auxiliary_loss_clip": 0.01073222, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.02257657, + "balance_loss_mlp": 1.02348316, + "epoch": 0.42356831504584397, + "flos": 34636179242880.0, + "grad_norm": 1.7695807671865544, + "language_loss": 0.75281847, + "learning_rate": 2.4759783221918716e-06, + "loss": 0.77392656, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49609375, + "step": 7045, + "time_per_iteration": 2.483752965927124 + }, + { + "auxiliary_loss_clip": 0.01071538, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.02020359, + "balance_loss_mlp": 1.02464867, + "epoch": 0.42362843829851193, + "flos": 17595980588160.0, + "grad_norm": 1.9194829777886564, + "language_loss": 0.81081939, + "learning_rate": 2.4756113470360944e-06, + "loss": 0.83187342, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46875, + "step": 7046, + "time_per_iteration": 2.3332128524780273 + }, + { + "auxiliary_loss_clip": 0.01070978, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.01771379, + "balance_loss_mlp": 1.02249134, + "epoch": 0.4236885615511799, + "flos": 22564574772480.0, + "grad_norm": 1.7358059715463545, + "language_loss": 0.76649433, + "learning_rate": 2.4752443549069713e-06, + "loss": 0.78753161, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.484375, + "step": 7047, + "time_per_iteration": 3.7749459743499756 + }, + { + "auxiliary_loss_clip": 0.01071992, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.01527131, + "balance_loss_mlp": 1.02502489, + "epoch": 0.42374868480384786, + "flos": 26468004153600.0, + "grad_norm": 1.5875397307845611, + "language_loss": 0.67857015, + "learning_rate": 2.4748773458176e-06, + "loss": 0.69958019, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46875, + "step": 7048, + "time_per_iteration": 2.4188876152038574 + }, + { + "auxiliary_loss_clip": 0.01070814, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.014588, + "balance_loss_mlp": 1.0228641, + "epoch": 0.4238088080565159, + "flos": 20369341215360.0, + "grad_norm": 3.0557670304387092, + "language_loss": 0.78473657, + "learning_rate": 2.4745103197810775e-06, + "loss": 0.80574399, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.48046875, + "step": 7049, + "time_per_iteration": 2.371249198913574 + }, + { + "auxiliary_loss_clip": 0.01072672, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.01823378, + "balance_loss_mlp": 1.02498412, + "epoch": 0.42386893130918385, + "flos": 21171226389120.0, + "grad_norm": 1.6553339289930455, + "language_loss": 0.827555, + "learning_rate": 2.474143276810502e-06, + "loss": 0.84861809, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.4765625, + "step": 7050, + "time_per_iteration": 3.820026159286499 + }, + { + "auxiliary_loss_clip": 0.01069591, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.01576781, + "balance_loss_mlp": 1.02124763, + "epoch": 0.4239290545618518, + "flos": 17674674526080.0, + "grad_norm": 2.060700430244388, + "language_loss": 0.73495746, + "learning_rate": 2.4737762169189728e-06, + "loss": 0.75595617, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.484375, + "step": 7051, + "time_per_iteration": 2.3488595485687256 + }, + { + "auxiliary_loss_clip": 0.01068993, + "auxiliary_loss_mlp": 0.01028287, + "balance_loss_clip": 1.01424408, + "balance_loss_mlp": 1.02285254, + "epoch": 0.4239891778145198, + "flos": 24313409285760.0, + "grad_norm": 1.5695425662036258, + "language_loss": 0.86173338, + "learning_rate": 2.473409140119589e-06, + "loss": 0.88270617, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4609375, + "step": 7052, + "time_per_iteration": 2.389087438583374 + }, + { + "auxiliary_loss_clip": 0.01070245, + "auxiliary_loss_mlp": 0.01036547, + "balance_loss_clip": 1.02167535, + "balance_loss_mlp": 1.02265811, + "epoch": 0.42404930106718774, + "flos": 20557383131520.0, + "grad_norm": 1.3745147116572254, + "language_loss": 0.72180009, + "learning_rate": 2.4730420464254512e-06, + "loss": 0.74286807, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4765625, + "step": 7053, + "time_per_iteration": 3.746805429458618 + }, + { + "auxiliary_loss_clip": 0.01070261, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.01625276, + "balance_loss_mlp": 1.0243268, + "epoch": 0.4241094243198557, + "flos": 22307020606080.0, + "grad_norm": 1.4947943338219138, + "language_loss": 0.8234539, + "learning_rate": 2.472674935849659e-06, + "loss": 0.84445953, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45898438, + "step": 7054, + "time_per_iteration": 2.3735029697418213 + }, + { + "auxiliary_loss_clip": 0.01012567, + "auxiliary_loss_mlp": 0.01000849, + "balance_loss_clip": 0.9994778, + "balance_loss_mlp": 1.00256729, + "epoch": 0.4241695475725237, + "flos": 70609111822080.0, + "grad_norm": 0.7840854030412264, + "language_loss": 0.60405338, + "learning_rate": 2.4723078084053154e-06, + "loss": 0.62418759, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.09960938, + "step": 7055, + "time_per_iteration": 2.969431161880493 + }, + { + "auxiliary_loss_clip": 0.01070568, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.01898348, + "balance_loss_mlp": 1.0228343, + "epoch": 0.42422967082519164, + "flos": 14026599895680.0, + "grad_norm": 2.202547399103567, + "language_loss": 0.76371002, + "learning_rate": 2.4719406641055197e-06, + "loss": 0.78476763, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.4765625, + "step": 7056, + "time_per_iteration": 2.346412420272827 + }, + { + "auxiliary_loss_clip": 0.01073833, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.01738572, + "balance_loss_mlp": 1.02328157, + "epoch": 0.4242897940778596, + "flos": 22344447450240.0, + "grad_norm": 1.9298768766090957, + "language_loss": 0.84448254, + "learning_rate": 2.471573502963376e-06, + "loss": 0.86556029, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.5078125, + "step": 7057, + "time_per_iteration": 3.725433349609375 + }, + { + "auxiliary_loss_clip": 0.01071997, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.01877713, + "balance_loss_mlp": 1.02235508, + "epoch": 0.42434991733052757, + "flos": 22594914610560.0, + "grad_norm": 2.068615897536432, + "language_loss": 0.82657212, + "learning_rate": 2.4712063249919876e-06, + "loss": 0.84763837, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.49609375, + "step": 7058, + "time_per_iteration": 2.356221914291382 + }, + { + "auxiliary_loss_clip": 0.01070705, + "auxiliary_loss_mlp": 0.01026318, + "balance_loss_clip": 1.01334763, + "balance_loss_mlp": 1.02393317, + "epoch": 0.42441004058319554, + "flos": 20010398860800.0, + "grad_norm": 1.7543601859464888, + "language_loss": 0.79971129, + "learning_rate": 2.470839130204457e-06, + "loss": 0.82068157, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.46679688, + "step": 7059, + "time_per_iteration": 2.3708670139312744 + }, + { + "auxiliary_loss_clip": 0.01070185, + "auxiliary_loss_mlp": 0.01029252, + "balance_loss_clip": 1.01299787, + "balance_loss_mlp": 1.02266955, + "epoch": 0.4244701638358635, + "flos": 11144205492480.0, + "grad_norm": 2.150036488344683, + "language_loss": 0.70313585, + "learning_rate": 2.4704719186138887e-06, + "loss": 0.72413015, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.47460938, + "step": 7060, + "time_per_iteration": 2.340439558029175 + }, + { + "auxiliary_loss_clip": 0.01069268, + "auxiliary_loss_mlp": 0.01028148, + "balance_loss_clip": 1.01372933, + "balance_loss_mlp": 1.0223763, + "epoch": 0.42453028708853147, + "flos": 23986622160000.0, + "grad_norm": 1.4402058521178394, + "language_loss": 0.76030648, + "learning_rate": 2.4701046902333886e-06, + "loss": 0.78128064, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46875, + "step": 7061, + "time_per_iteration": 2.3947763442993164 + }, + { + "auxiliary_loss_clip": 0.01072975, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.01682568, + "balance_loss_mlp": 1.02413607, + "epoch": 0.42459041034119943, + "flos": 18405336792960.0, + "grad_norm": 1.802002645471642, + "language_loss": 0.72542024, + "learning_rate": 2.4697374450760606e-06, + "loss": 0.74647367, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.48828125, + "step": 7062, + "time_per_iteration": 2.380218744277954 + }, + { + "auxiliary_loss_clip": 0.01070564, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.01911902, + "balance_loss_mlp": 1.02232051, + "epoch": 0.42465053359386745, + "flos": 20956999086720.0, + "grad_norm": 1.806167940763372, + "language_loss": 0.66185647, + "learning_rate": 2.469370183155012e-06, + "loss": 0.68289697, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48242188, + "step": 7063, + "time_per_iteration": 2.349761486053467 + }, + { + "auxiliary_loss_clip": 0.01066684, + "auxiliary_loss_mlp": 0.01027558, + "balance_loss_clip": 1.01418281, + "balance_loss_mlp": 1.02205205, + "epoch": 0.4247106568465354, + "flos": 33104888611200.0, + "grad_norm": 1.5018832444117989, + "language_loss": 0.78519362, + "learning_rate": 2.4690029044833483e-06, + "loss": 0.80613607, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44726562, + "step": 7064, + "time_per_iteration": 2.4796528816223145 + }, + { + "auxiliary_loss_clip": 0.01069827, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.01385736, + "balance_loss_mlp": 1.02225614, + "epoch": 0.4247707800992034, + "flos": 20045905580160.0, + "grad_norm": 1.7897542221891278, + "language_loss": 0.75084126, + "learning_rate": 2.468635609074178e-06, + "loss": 0.77182549, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.4765625, + "step": 7065, + "time_per_iteration": 2.3712353706359863 + }, + { + "auxiliary_loss_clip": 0.0106986, + "auxiliary_loss_mlp": 0.01026795, + "balance_loss_clip": 1.01262665, + "balance_loss_mlp": 1.02293658, + "epoch": 0.42483090335187135, + "flos": 22383968976000.0, + "grad_norm": 1.2634282328969684, + "language_loss": 0.72600126, + "learning_rate": 2.468268296940608e-06, + "loss": 0.74696779, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46875, + "step": 7066, + "time_per_iteration": 2.4198970794677734 + }, + { + "auxiliary_loss_clip": 0.01071938, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.01855505, + "balance_loss_mlp": 1.02290058, + "epoch": 0.4248910266045393, + "flos": 21355881903360.0, + "grad_norm": 1.8892572199920357, + "language_loss": 0.80213773, + "learning_rate": 2.467900968095747e-06, + "loss": 0.82319987, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.49023438, + "step": 7067, + "time_per_iteration": 2.378579616546631 + }, + { + "auxiliary_loss_clip": 0.01011582, + "auxiliary_loss_mlp": 0.0100339, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00136638, + "epoch": 0.4249511498572073, + "flos": 64004976086400.0, + "grad_norm": 0.9129518227948051, + "language_loss": 0.63437092, + "learning_rate": 2.4675336225527045e-06, + "loss": 0.65452063, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.10205078, + "step": 7068, + "time_per_iteration": 2.9422662258148193 + }, + { + "auxiliary_loss_clip": 0.01069764, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.01331735, + "balance_loss_mlp": 1.02218437, + "epoch": 0.42501127310987524, + "flos": 19606104783360.0, + "grad_norm": 1.5256174735628485, + "language_loss": 0.72279102, + "learning_rate": 2.4671662603245892e-06, + "loss": 0.74377799, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.47460938, + "step": 7069, + "time_per_iteration": 2.3670334815979004 + }, + { + "auxiliary_loss_clip": 0.01070833, + "auxiliary_loss_mlp": 0.01033346, + "balance_loss_clip": 1.01676404, + "balance_loss_mlp": 1.02217317, + "epoch": 0.4250713963625432, + "flos": 19461354819840.0, + "grad_norm": 1.964309339168105, + "language_loss": 0.79635334, + "learning_rate": 2.4667988814245116e-06, + "loss": 0.81739515, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.48828125, + "step": 7070, + "time_per_iteration": 2.3519017696380615 + }, + { + "auxiliary_loss_clip": 0.01069132, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.01293635, + "balance_loss_mlp": 1.02194047, + "epoch": 0.4251315196152112, + "flos": 25336538945280.0, + "grad_norm": 1.6639705657858446, + "language_loss": 0.70497555, + "learning_rate": 2.466431485865584e-06, + "loss": 0.72595119, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.47265625, + "step": 7071, + "time_per_iteration": 2.3949882984161377 + }, + { + "auxiliary_loss_clip": 0.01070257, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.01784027, + "balance_loss_mlp": 1.02263677, + "epoch": 0.42519164286787914, + "flos": 26357992859520.0, + "grad_norm": 5.000409874060265, + "language_loss": 0.71318698, + "learning_rate": 2.466064073660915e-06, + "loss": 0.73421705, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4765625, + "step": 7072, + "time_per_iteration": 2.3953468799591064 + }, + { + "auxiliary_loss_clip": 0.01070947, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.01425803, + "balance_loss_mlp": 1.02310133, + "epoch": 0.4252517661205471, + "flos": 26029879102080.0, + "grad_norm": 6.936292697705833, + "language_loss": 0.81688344, + "learning_rate": 2.465696644823619e-06, + "loss": 0.83788455, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.47851562, + "step": 7073, + "time_per_iteration": 2.4243381023406982 + }, + { + "auxiliary_loss_clip": 0.01068935, + "auxiliary_loss_mlp": 0.01028268, + "balance_loss_clip": 1.01406384, + "balance_loss_mlp": 1.02207434, + "epoch": 0.42531188937321507, + "flos": 12712818234240.0, + "grad_norm": 2.214419149542741, + "language_loss": 0.91003996, + "learning_rate": 2.465329199366806e-06, + "loss": 0.93101197, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46875, + "step": 7074, + "time_per_iteration": 2.3354265689849854 + }, + { + "auxiliary_loss_clip": 0.01072316, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.01526821, + "balance_loss_mlp": 1.02236867, + "epoch": 0.42537201262588303, + "flos": 22090558976640.0, + "grad_norm": 2.4236082709047575, + "language_loss": 0.63888335, + "learning_rate": 2.4649617373035924e-06, + "loss": 0.659922, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.5, + "step": 7075, + "time_per_iteration": 2.382066249847412 + }, + { + "auxiliary_loss_clip": 0.01064641, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.0127337, + "balance_loss_mlp": 1.02074254, + "epoch": 0.42543213587855105, + "flos": 23002001596800.0, + "grad_norm": 1.645838291860392, + "language_loss": 0.73855734, + "learning_rate": 2.4645942586470898e-06, + "loss": 0.75946462, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43945312, + "step": 7076, + "time_per_iteration": 2.3940985202789307 + }, + { + "auxiliary_loss_clip": 0.01071425, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.01747417, + "balance_loss_mlp": 1.02282071, + "epoch": 0.425492259131219, + "flos": 25081288928640.0, + "grad_norm": 1.9123978019773624, + "language_loss": 0.78310043, + "learning_rate": 2.4642267634104136e-06, + "loss": 0.80413389, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.484375, + "step": 7077, + "time_per_iteration": 2.3948001861572266 + }, + { + "auxiliary_loss_clip": 0.01066403, + "auxiliary_loss_mlp": 0.01024935, + "balance_loss_clip": 1.01229918, + "balance_loss_mlp": 1.02197206, + "epoch": 0.425552382383887, + "flos": 22815844894080.0, + "grad_norm": 1.66724171714771, + "language_loss": 0.7302103, + "learning_rate": 2.4638592516066784e-06, + "loss": 0.75112367, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 7078, + "time_per_iteration": 2.3719089031219482 + }, + { + "auxiliary_loss_clip": 0.01071019, + "auxiliary_loss_mlp": 0.01034206, + "balance_loss_clip": 1.01953757, + "balance_loss_mlp": 1.02454066, + "epoch": 0.42561250563655495, + "flos": 13552723745280.0, + "grad_norm": 2.0752941012152775, + "language_loss": 0.79521108, + "learning_rate": 2.4634917232489993e-06, + "loss": 0.81626338, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46484375, + "step": 7079, + "time_per_iteration": 2.3593320846557617 + }, + { + "auxiliary_loss_clip": 0.0107101, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.01951909, + "balance_loss_mlp": 1.02551746, + "epoch": 0.4256726288892229, + "flos": 46976404780800.0, + "grad_norm": 1.5233207854753479, + "language_loss": 0.72000873, + "learning_rate": 2.463124178350493e-06, + "loss": 0.74104905, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45507812, + "step": 7080, + "time_per_iteration": 2.620838165283203 + }, + { + "auxiliary_loss_clip": 0.01067985, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.01442075, + "balance_loss_mlp": 1.0218128, + "epoch": 0.4257327521418909, + "flos": 23585330459520.0, + "grad_norm": 2.026083835297168, + "language_loss": 0.80883479, + "learning_rate": 2.4627566169242757e-06, + "loss": 0.82979482, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4609375, + "step": 7081, + "time_per_iteration": 2.38944935798645 + }, + { + "auxiliary_loss_clip": 0.01065356, + "auxiliary_loss_mlp": 0.01024104, + "balance_loss_clip": 1.01069903, + "balance_loss_mlp": 1.02138436, + "epoch": 0.42579287539455885, + "flos": 18988979857920.0, + "grad_norm": 1.626176084576001, + "language_loss": 0.81705868, + "learning_rate": 2.4623890389834656e-06, + "loss": 0.83795327, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43945312, + "step": 7082, + "time_per_iteration": 2.3462979793548584 + }, + { + "auxiliary_loss_clip": 0.01069652, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.0156666, + "balance_loss_mlp": 1.02358532, + "epoch": 0.4258529986472268, + "flos": 25190741640960.0, + "grad_norm": 2.5213769278045586, + "language_loss": 0.69876128, + "learning_rate": 2.46202144454118e-06, + "loss": 0.7197566, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4609375, + "step": 7083, + "time_per_iteration": 2.409213066101074 + }, + { + "auxiliary_loss_clip": 0.01011668, + "auxiliary_loss_mlp": 0.01000565, + "balance_loss_clip": 0.99930722, + "balance_loss_mlp": 1.00144708, + "epoch": 0.4259131218998948, + "flos": 69964614524160.0, + "grad_norm": 0.8676444553078081, + "language_loss": 0.67096782, + "learning_rate": 2.4616538336105373e-06, + "loss": 0.69109017, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.10205078, + "step": 7084, + "time_per_iteration": 3.096841335296631 + }, + { + "auxiliary_loss_clip": 0.01068526, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.01808286, + "balance_loss_mlp": 1.02347624, + "epoch": 0.42597324515256274, + "flos": 18003975269760.0, + "grad_norm": 2.2090338819755377, + "language_loss": 0.78599966, + "learning_rate": 2.461286206204657e-06, + "loss": 0.80701077, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.45117188, + "step": 7085, + "time_per_iteration": 2.3501226902008057 + }, + { + "auxiliary_loss_clip": 0.01072454, + "auxiliary_loss_mlp": 0.01035365, + "balance_loss_clip": 1.01977861, + "balance_loss_mlp": 1.02315199, + "epoch": 0.4260333684052307, + "flos": 15157890547200.0, + "grad_norm": 2.5866805617807205, + "language_loss": 0.75643808, + "learning_rate": 2.460918562336659e-06, + "loss": 0.77751625, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.4921875, + "step": 7086, + "time_per_iteration": 3.786205530166626 + }, + { + "auxiliary_loss_clip": 0.01068453, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.01854181, + "balance_loss_mlp": 1.02170014, + "epoch": 0.42609349165789867, + "flos": 14938461452160.0, + "grad_norm": 2.264415124416107, + "language_loss": 0.803895, + "learning_rate": 2.460550902019663e-06, + "loss": 0.82490504, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46875, + "step": 7087, + "time_per_iteration": 2.4022862911224365 + }, + { + "auxiliary_loss_clip": 0.01069055, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.01639271, + "balance_loss_mlp": 1.02374351, + "epoch": 0.42615361491056664, + "flos": 23730848472960.0, + "grad_norm": 1.7160146693422558, + "language_loss": 0.65681148, + "learning_rate": 2.4601832252667893e-06, + "loss": 0.67780155, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.453125, + "step": 7088, + "time_per_iteration": 2.398263692855835 + }, + { + "auxiliary_loss_clip": 0.01072026, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.01874948, + "balance_loss_mlp": 1.02378917, + "epoch": 0.42621373816323466, + "flos": 24935282156160.0, + "grad_norm": 2.0293250146728905, + "language_loss": 0.69506109, + "learning_rate": 2.4598155320911604e-06, + "loss": 0.71611047, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48242188, + "step": 7089, + "time_per_iteration": 2.4621808528900146 + }, + { + "auxiliary_loss_clip": 0.01068685, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01749396, + "balance_loss_mlp": 1.02126682, + "epoch": 0.4262738614159026, + "flos": 13552130252160.0, + "grad_norm": 3.96983832639365, + "language_loss": 0.73637015, + "learning_rate": 2.459447822505898e-06, + "loss": 0.75736493, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.47460938, + "step": 7090, + "time_per_iteration": 3.7896735668182373 + }, + { + "auxiliary_loss_clip": 0.01071374, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.01749539, + "balance_loss_mlp": 1.02217126, + "epoch": 0.4263339846685706, + "flos": 29747605628160.0, + "grad_norm": 1.7586336293285827, + "language_loss": 0.74735034, + "learning_rate": 2.459080096524124e-06, + "loss": 0.76838243, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4921875, + "step": 7091, + "time_per_iteration": 2.4141931533813477 + }, + { + "auxiliary_loss_clip": 0.0106575, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.01457, + "balance_loss_mlp": 1.02125371, + "epoch": 0.42639410792123855, + "flos": 16833337649280.0, + "grad_norm": 1.755023689834625, + "language_loss": 0.82678878, + "learning_rate": 2.458712354158963e-06, + "loss": 0.8477155, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4453125, + "step": 7092, + "time_per_iteration": 2.3711578845977783 + }, + { + "auxiliary_loss_clip": 0.01072356, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.0200206, + "balance_loss_mlp": 1.02185655, + "epoch": 0.4264542311739065, + "flos": 28761972635520.0, + "grad_norm": 2.1049874984121715, + "language_loss": 0.7377851, + "learning_rate": 2.4583445954235384e-06, + "loss": 0.75885785, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.50390625, + "step": 7093, + "time_per_iteration": 3.8053674697875977 + }, + { + "auxiliary_loss_clip": 0.01068573, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.01746905, + "balance_loss_mlp": 1.02213454, + "epoch": 0.4265143544265745, + "flos": 24712571393280.0, + "grad_norm": 2.4185605502261938, + "language_loss": 0.6955238, + "learning_rate": 2.4579768203309733e-06, + "loss": 0.71652955, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46484375, + "step": 7094, + "time_per_iteration": 2.4055802822113037 + }, + { + "auxiliary_loss_clip": 0.01068696, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.01652849, + "balance_loss_mlp": 1.02197719, + "epoch": 0.42657447767924245, + "flos": 21865055304960.0, + "grad_norm": 2.6689601555659825, + "language_loss": 0.81273216, + "learning_rate": 2.457609028894394e-06, + "loss": 0.83372939, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46875, + "step": 7095, + "time_per_iteration": 2.3796017169952393 + }, + { + "auxiliary_loss_clip": 0.01067936, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.01629639, + "balance_loss_mlp": 1.02174497, + "epoch": 0.4266346009319104, + "flos": 21469174865280.0, + "grad_norm": 2.874638319341862, + "language_loss": 0.80327594, + "learning_rate": 2.457241221126925e-06, + "loss": 0.82426107, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4609375, + "step": 7096, + "time_per_iteration": 3.7613332271575928 + }, + { + "auxiliary_loss_clip": 0.01071252, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01694763, + "balance_loss_mlp": 1.02305126, + "epoch": 0.4266947241845784, + "flos": 25518226993920.0, + "grad_norm": 2.05768644435347, + "language_loss": 0.65609407, + "learning_rate": 2.4568733970416936e-06, + "loss": 0.67711216, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48242188, + "step": 7097, + "time_per_iteration": 2.414677143096924 + }, + { + "auxiliary_loss_clip": 0.01068125, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.0153482, + "balance_loss_mlp": 1.02200484, + "epoch": 0.42675484743724634, + "flos": 26540030021760.0, + "grad_norm": 1.7329434418956413, + "language_loss": 0.74309623, + "learning_rate": 2.4565055566518252e-06, + "loss": 0.76406491, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4609375, + "step": 7098, + "time_per_iteration": 2.4218549728393555 + }, + { + "auxiliary_loss_clip": 0.01067158, + "auxiliary_loss_mlp": 0.01024515, + "balance_loss_clip": 1.01037693, + "balance_loss_mlp": 1.0222652, + "epoch": 0.4268149706899143, + "flos": 23111593954560.0, + "grad_norm": 1.6262771216819958, + "language_loss": 0.74955463, + "learning_rate": 2.4561376999704488e-06, + "loss": 0.77047145, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.44921875, + "step": 7099, + "time_per_iteration": 2.377621650695801 + }, + { + "auxiliary_loss_clip": 0.010706, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.01597738, + "balance_loss_mlp": 1.02349043, + "epoch": 0.4268750939425823, + "flos": 22705554309120.0, + "grad_norm": 2.3886003922511865, + "language_loss": 0.82776904, + "learning_rate": 2.4557698270106906e-06, + "loss": 0.84877443, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47265625, + "step": 7100, + "time_per_iteration": 2.3703346252441406 + }, + { + "auxiliary_loss_clip": 0.01069652, + "auxiliary_loss_mlp": 0.01024114, + "balance_loss_clip": 1.01086402, + "balance_loss_mlp": 1.02236986, + "epoch": 0.42693521719525024, + "flos": 25373686498560.0, + "grad_norm": 1.3085188717528546, + "language_loss": 0.82132638, + "learning_rate": 2.45540193778568e-06, + "loss": 0.84226406, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 7101, + "time_per_iteration": 2.413560628890991 + }, + { + "auxiliary_loss_clip": 0.01071183, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.01319873, + "balance_loss_mlp": 1.02313423, + "epoch": 0.42699534044791826, + "flos": 18149702751360.0, + "grad_norm": 2.4754702840961067, + "language_loss": 0.7244978, + "learning_rate": 2.4550340323085453e-06, + "loss": 0.74548626, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 7102, + "time_per_iteration": 2.367746114730835 + }, + { + "auxiliary_loss_clip": 0.01068045, + "auxiliary_loss_mlp": 0.01025164, + "balance_loss_clip": 1.01129365, + "balance_loss_mlp": 1.02259195, + "epoch": 0.4270554637005862, + "flos": 13697578442880.0, + "grad_norm": 1.8738049302464486, + "language_loss": 0.8284843, + "learning_rate": 2.4546661105924166e-06, + "loss": 0.84941638, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.453125, + "step": 7103, + "time_per_iteration": 2.3552534580230713 + }, + { + "auxiliary_loss_clip": 0.01070471, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.01874185, + "balance_loss_mlp": 1.02246463, + "epoch": 0.4271155869532542, + "flos": 17492637363840.0, + "grad_norm": 1.9484688173862654, + "language_loss": 0.7403264, + "learning_rate": 2.454298172650424e-06, + "loss": 0.76136249, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.48046875, + "step": 7104, + "time_per_iteration": 2.3729352951049805 + }, + { + "auxiliary_loss_clip": 0.01068332, + "auxiliary_loss_mlp": 0.01022599, + "balance_loss_clip": 1.00992167, + "balance_loss_mlp": 1.02206111, + "epoch": 0.42717571020592215, + "flos": 32450930334720.0, + "grad_norm": 2.0408393363728115, + "language_loss": 0.75003988, + "learning_rate": 2.4539302184956986e-06, + "loss": 0.77094924, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.46289062, + "step": 7105, + "time_per_iteration": 2.458487033843994 + }, + { + "auxiliary_loss_clip": 0.01070934, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.01782918, + "balance_loss_mlp": 1.02433515, + "epoch": 0.4272358334585901, + "flos": 16252138379520.0, + "grad_norm": 1.8438429506575298, + "language_loss": 0.77049285, + "learning_rate": 2.45356224814137e-06, + "loss": 0.79151106, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46484375, + "step": 7106, + "time_per_iteration": 2.425480842590332 + }, + { + "auxiliary_loss_clip": 0.01067633, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.01951945, + "balance_loss_mlp": 1.0217489, + "epoch": 0.4272959567112581, + "flos": 24199138805760.0, + "grad_norm": 1.725877626739418, + "language_loss": 0.80046242, + "learning_rate": 2.453194261600573e-06, + "loss": 0.8214668, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45898438, + "step": 7107, + "time_per_iteration": 2.4100077152252197 + }, + { + "auxiliary_loss_clip": 0.01068173, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.01187432, + "balance_loss_mlp": 1.02305651, + "epoch": 0.42735607996392605, + "flos": 27962286877440.0, + "grad_norm": 2.076557928633098, + "language_loss": 0.67938316, + "learning_rate": 2.4528262588864376e-06, + "loss": 0.70032263, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45117188, + "step": 7108, + "time_per_iteration": 2.5545856952667236 + }, + { + "auxiliary_loss_clip": 0.01068209, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.01552749, + "balance_loss_mlp": 1.0216167, + "epoch": 0.427416203216594, + "flos": 20294766817920.0, + "grad_norm": 1.70225687498764, + "language_loss": 0.71362209, + "learning_rate": 2.452458240012098e-06, + "loss": 0.73459804, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 7109, + "time_per_iteration": 2.5048344135284424 + }, + { + "auxiliary_loss_clip": 0.01069367, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.01709557, + "balance_loss_mlp": 1.02279019, + "epoch": 0.427476326469262, + "flos": 26942718176640.0, + "grad_norm": 1.9980434034698997, + "language_loss": 0.64426458, + "learning_rate": 2.4520902049906883e-06, + "loss": 0.66528404, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.46484375, + "step": 7110, + "time_per_iteration": 2.414036989212036 + }, + { + "auxiliary_loss_clip": 0.01068349, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.01307988, + "balance_loss_mlp": 1.02197278, + "epoch": 0.42753644972192995, + "flos": 25701660610560.0, + "grad_norm": 1.592178383579147, + "language_loss": 0.76632714, + "learning_rate": 2.4517221538353413e-06, + "loss": 0.78728044, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46289062, + "step": 7111, + "time_per_iteration": 2.4149322509765625 + }, + { + "auxiliary_loss_clip": 0.01069638, + "auxiliary_loss_mlp": 0.01027602, + "balance_loss_clip": 1.01326144, + "balance_loss_mlp": 1.02318597, + "epoch": 0.4275965729745979, + "flos": 18766513474560.0, + "grad_norm": 2.0401846750446713, + "language_loss": 0.72783685, + "learning_rate": 2.4513540865591934e-06, + "loss": 0.74880928, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46484375, + "step": 7112, + "time_per_iteration": 2.3611984252929688 + }, + { + "auxiliary_loss_clip": 0.0106559, + "auxiliary_loss_mlp": 0.01024686, + "balance_loss_clip": 1.0116384, + "balance_loss_mlp": 1.02179015, + "epoch": 0.4276566962272659, + "flos": 23763422638080.0, + "grad_norm": 1.9157133973676337, + "language_loss": 0.71585971, + "learning_rate": 2.450986003175378e-06, + "loss": 0.73676252, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4375, + "step": 7113, + "time_per_iteration": 2.3880555629730225 + }, + { + "auxiliary_loss_clip": 0.01072181, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.01761711, + "balance_loss_mlp": 1.02473152, + "epoch": 0.42771681947993384, + "flos": 22491396829440.0, + "grad_norm": 2.336637374382169, + "language_loss": 0.61411488, + "learning_rate": 2.4506179036970333e-06, + "loss": 0.63516164, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4765625, + "step": 7114, + "time_per_iteration": 2.4181065559387207 + }, + { + "auxiliary_loss_clip": 0.01072404, + "auxiliary_loss_mlp": 0.01038793, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.02407742, + "epoch": 0.42777694273260186, + "flos": 25043582793600.0, + "grad_norm": 1.6826444287774003, + "language_loss": 0.65015781, + "learning_rate": 2.4502497881372943e-06, + "loss": 0.67126977, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.484375, + "step": 7115, + "time_per_iteration": 2.3916237354278564 + }, + { + "auxiliary_loss_clip": 0.01065878, + "auxiliary_loss_mlp": 0.01026131, + "balance_loss_clip": 1.0136919, + "balance_loss_mlp": 1.02030587, + "epoch": 0.4278370659852698, + "flos": 18660516986880.0, + "grad_norm": 1.5396066359297698, + "language_loss": 0.73793101, + "learning_rate": 2.449881656509299e-06, + "loss": 0.75885111, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.45703125, + "step": 7116, + "time_per_iteration": 2.3693411350250244 + }, + { + "auxiliary_loss_clip": 0.01066267, + "auxiliary_loss_mlp": 0.01026339, + "balance_loss_clip": 1.01421499, + "balance_loss_mlp": 1.02196622, + "epoch": 0.4278971892379378, + "flos": 27307036880640.0, + "grad_norm": 1.6598786607281502, + "language_loss": 0.69475919, + "learning_rate": 2.4495135088261844e-06, + "loss": 0.71568525, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.44140625, + "step": 7117, + "time_per_iteration": 2.406435012817383 + }, + { + "auxiliary_loss_clip": 0.01069524, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.01497746, + "balance_loss_mlp": 1.02348304, + "epoch": 0.42795731249060576, + "flos": 12888082592640.0, + "grad_norm": 2.3682405005498266, + "language_loss": 0.7017101, + "learning_rate": 2.4491453451010883e-06, + "loss": 0.72269839, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4609375, + "step": 7118, + "time_per_iteration": 2.327096462249756 + }, + { + "auxiliary_loss_clip": 0.0106843, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.01695812, + "balance_loss_mlp": 1.02288401, + "epoch": 0.4280174357432737, + "flos": 33400044178560.0, + "grad_norm": 1.83462190230631, + "language_loss": 0.74850404, + "learning_rate": 2.4487771653471508e-06, + "loss": 0.7694962, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45507812, + "step": 7119, + "time_per_iteration": 2.4720468521118164 + }, + { + "auxiliary_loss_clip": 0.01068279, + "auxiliary_loss_mlp": 0.0102728, + "balance_loss_clip": 1.01429856, + "balance_loss_mlp": 1.02228689, + "epoch": 0.4280775589959417, + "flos": 18258143034240.0, + "grad_norm": 2.1120528200934743, + "language_loss": 0.74582911, + "learning_rate": 2.4484089695775104e-06, + "loss": 0.76678461, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4609375, + "step": 7120, + "time_per_iteration": 2.3488359451293945 + }, + { + "auxiliary_loss_clip": 0.01068696, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.01729667, + "balance_loss_mlp": 1.02272224, + "epoch": 0.42813768224860965, + "flos": 21470187294720.0, + "grad_norm": 1.454080446177414, + "language_loss": 0.77105248, + "learning_rate": 2.4480407578053073e-06, + "loss": 0.79204768, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45898438, + "step": 7121, + "time_per_iteration": 2.37868332862854 + }, + { + "auxiliary_loss_clip": 0.0106788, + "auxiliary_loss_mlp": 0.01026881, + "balance_loss_clip": 1.01475167, + "balance_loss_mlp": 1.02254725, + "epoch": 0.4281978055012776, + "flos": 15668355669120.0, + "grad_norm": 1.9199830852980027, + "language_loss": 0.85760057, + "learning_rate": 2.4476725300436823e-06, + "loss": 0.8785482, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.453125, + "step": 7122, + "time_per_iteration": 2.354254722595215 + }, + { + "auxiliary_loss_clip": 0.01066663, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.01545024, + "balance_loss_mlp": 1.02240968, + "epoch": 0.4282579287539456, + "flos": 17711054029440.0, + "grad_norm": 1.8184461344959382, + "language_loss": 0.81150472, + "learning_rate": 2.4473042863057763e-06, + "loss": 0.83245265, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44140625, + "step": 7123, + "time_per_iteration": 2.3623242378234863 + }, + { + "auxiliary_loss_clip": 0.01065988, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.01987696, + "balance_loss_mlp": 1.02191663, + "epoch": 0.42831805200661355, + "flos": 19280155530240.0, + "grad_norm": 1.5379825009438424, + "language_loss": 0.81354016, + "learning_rate": 2.4469360266047305e-06, + "loss": 0.83452511, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44140625, + "step": 7124, + "time_per_iteration": 2.373920440673828 + }, + { + "auxiliary_loss_clip": 0.0106748, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.01808429, + "balance_loss_mlp": 1.02222967, + "epoch": 0.4283781752592815, + "flos": 19791598170240.0, + "grad_norm": 1.6994997345817522, + "language_loss": 0.80339122, + "learning_rate": 2.4465677509536876e-06, + "loss": 0.82438171, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 7125, + "time_per_iteration": 2.3698484897613525 + }, + { + "auxiliary_loss_clip": 0.01067315, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.01536918, + "balance_loss_mlp": 1.02296364, + "epoch": 0.4284382985119495, + "flos": 16507144016640.0, + "grad_norm": 1.686875396968279, + "language_loss": 0.74980617, + "learning_rate": 2.446199459365791e-06, + "loss": 0.77076113, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44335938, + "step": 7126, + "time_per_iteration": 3.7848150730133057 + }, + { + "auxiliary_loss_clip": 0.0106653, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.01720262, + "balance_loss_mlp": 1.02181208, + "epoch": 0.42849842176461744, + "flos": 23329661506560.0, + "grad_norm": 1.539277274639682, + "language_loss": 0.81289059, + "learning_rate": 2.445831151854183e-06, + "loss": 0.83386302, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44726562, + "step": 7127, + "time_per_iteration": 2.3795886039733887 + }, + { + "auxiliary_loss_clip": 0.01066832, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01698089, + "balance_loss_mlp": 1.02139759, + "epoch": 0.4285585450172854, + "flos": 17273487559680.0, + "grad_norm": 1.606175256518143, + "language_loss": 0.76203394, + "learning_rate": 2.445462828432008e-06, + "loss": 0.78301358, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45507812, + "step": 7128, + "time_per_iteration": 2.352494955062866 + }, + { + "auxiliary_loss_clip": 0.01069389, + "auxiliary_loss_mlp": 0.01027495, + "balance_loss_clip": 1.01342237, + "balance_loss_mlp": 1.02181435, + "epoch": 0.42861866826995343, + "flos": 24278461148160.0, + "grad_norm": 1.9933319158081575, + "language_loss": 0.73833156, + "learning_rate": 2.4450944891124105e-06, + "loss": 0.75930035, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4765625, + "step": 7129, + "time_per_iteration": 3.7472379207611084 + }, + { + "auxiliary_loss_clip": 0.01068068, + "auxiliary_loss_mlp": 0.01024312, + "balance_loss_clip": 1.01099646, + "balance_loss_mlp": 1.02254653, + "epoch": 0.4286787915226214, + "flos": 24351953293440.0, + "grad_norm": 1.5535587480214115, + "language_loss": 0.77389634, + "learning_rate": 2.4447261339085355e-06, + "loss": 0.79482019, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45507812, + "step": 7130, + "time_per_iteration": 2.3985166549682617 + }, + { + "auxiliary_loss_clip": 0.0107228, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.01399255, + "balance_loss_mlp": 1.02457952, + "epoch": 0.42873891477528936, + "flos": 15449101130880.0, + "grad_norm": 10.448957249980126, + "language_loss": 0.73807192, + "learning_rate": 2.4443577628335297e-06, + "loss": 0.7590729, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4765625, + "step": 7131, + "time_per_iteration": 2.3527040481567383 + }, + { + "auxiliary_loss_clip": 0.01066981, + "auxiliary_loss_mlp": 0.01026848, + "balance_loss_clip": 1.01342511, + "balance_loss_mlp": 1.02199435, + "epoch": 0.4287990380279573, + "flos": 17638609224960.0, + "grad_norm": 2.0341123942363963, + "language_loss": 0.77073008, + "learning_rate": 2.4439893759005374e-06, + "loss": 0.79166842, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44921875, + "step": 7132, + "time_per_iteration": 2.3405683040618896 + }, + { + "auxiliary_loss_clip": 0.0106882, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.01646411, + "balance_loss_mlp": 1.02297997, + "epoch": 0.4288591612806253, + "flos": 27161099930880.0, + "grad_norm": 2.2053125078297606, + "language_loss": 0.78397292, + "learning_rate": 2.4436209731227066e-06, + "loss": 0.80496907, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45898438, + "step": 7133, + "time_per_iteration": 3.782078742980957 + }, + { + "auxiliary_loss_clip": 0.01070025, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.01759291, + "balance_loss_mlp": 1.02225304, + "epoch": 0.42891928453329325, + "flos": 17162289279360.0, + "grad_norm": 1.9131661507880087, + "language_loss": 0.75153029, + "learning_rate": 2.4432525545131842e-06, + "loss": 0.77254999, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4765625, + "step": 7134, + "time_per_iteration": 2.3236236572265625 + }, + { + "auxiliary_loss_clip": 0.01063436, + "auxiliary_loss_mlp": 0.01024882, + "balance_loss_clip": 1.01261556, + "balance_loss_mlp": 1.02003527, + "epoch": 0.4289794077859612, + "flos": 18186047343360.0, + "grad_norm": 1.7726182595072906, + "language_loss": 0.84361875, + "learning_rate": 2.4428841200851183e-06, + "loss": 0.86450183, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.43359375, + "step": 7135, + "time_per_iteration": 2.356985569000244 + }, + { + "auxiliary_loss_clip": 0.01064082, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.01332855, + "balance_loss_mlp": 1.02089286, + "epoch": 0.4290395310386292, + "flos": 28255627054080.0, + "grad_norm": 1.7159507857938692, + "language_loss": 0.75176835, + "learning_rate": 2.4425156698516576e-06, + "loss": 0.77266902, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43164062, + "step": 7136, + "time_per_iteration": 3.7613720893859863 + }, + { + "auxiliary_loss_clip": 0.01066469, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.01275909, + "balance_loss_mlp": 1.0203979, + "epoch": 0.42909965429129715, + "flos": 16215165383040.0, + "grad_norm": 2.2413119211126515, + "language_loss": 0.7943182, + "learning_rate": 2.4421472038259513e-06, + "loss": 0.81524253, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4609375, + "step": 7137, + "time_per_iteration": 2.3381054401397705 + }, + { + "auxiliary_loss_clip": 0.01070786, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.01136112, + "balance_loss_mlp": 1.02295363, + "epoch": 0.4291597775439651, + "flos": 23111733600000.0, + "grad_norm": 1.7528375957421014, + "language_loss": 0.75986314, + "learning_rate": 2.441778722021148e-06, + "loss": 0.78083247, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47851562, + "step": 7138, + "time_per_iteration": 2.3778340816497803 + }, + { + "auxiliary_loss_clip": 0.01012804, + "auxiliary_loss_mlp": 0.01000999, + "balance_loss_clip": 0.9997409, + "balance_loss_mlp": 1.00282693, + "epoch": 0.4292199007966331, + "flos": 67543004511360.0, + "grad_norm": 0.7725918172263971, + "language_loss": 0.56212133, + "learning_rate": 2.441410224450399e-06, + "loss": 0.5822593, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.09960938, + "step": 7139, + "time_per_iteration": 2.8831686973571777 + }, + { + "auxiliary_loss_clip": 0.0106557, + "auxiliary_loss_mlp": 0.01027467, + "balance_loss_clip": 1.01551056, + "balance_loss_mlp": 1.02213848, + "epoch": 0.42928002404930105, + "flos": 22998824663040.0, + "grad_norm": 2.158201759043669, + "language_loss": 0.8094784, + "learning_rate": 2.441041711126854e-06, + "loss": 0.83040881, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.43554688, + "step": 7140, + "time_per_iteration": 2.3815267086029053 + }, + { + "auxiliary_loss_clip": 0.01067373, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.01305354, + "balance_loss_mlp": 1.0220511, + "epoch": 0.429340147301969, + "flos": 11544170561280.0, + "grad_norm": 1.6585988910301397, + "language_loss": 0.8171581, + "learning_rate": 2.4406731820636652e-06, + "loss": 0.83809924, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.453125, + "step": 7141, + "time_per_iteration": 2.3834972381591797 + }, + { + "auxiliary_loss_clip": 0.01070671, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01756454, + "balance_loss_mlp": 1.02319884, + "epoch": 0.42940027055463703, + "flos": 25263814849920.0, + "grad_norm": 1.631641489450351, + "language_loss": 0.86372459, + "learning_rate": 2.4403046372739833e-06, + "loss": 0.8847506, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47460938, + "step": 7142, + "time_per_iteration": 2.4098007678985596 + }, + { + "auxiliary_loss_clip": 0.01066415, + "auxiliary_loss_mlp": 0.01025184, + "balance_loss_clip": 1.01235127, + "balance_loss_mlp": 1.02297461, + "epoch": 0.429460393807305, + "flos": 23803886770560.0, + "grad_norm": 1.7624391542643223, + "language_loss": 0.77954364, + "learning_rate": 2.4399360767709627e-06, + "loss": 0.80045962, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 7143, + "time_per_iteration": 2.4068210124969482 + }, + { + "auxiliary_loss_clip": 0.010671, + "auxiliary_loss_mlp": 0.01025739, + "balance_loss_clip": 1.01260781, + "balance_loss_mlp": 1.02191925, + "epoch": 0.42952051705997296, + "flos": 13917426474240.0, + "grad_norm": 1.8283802058491094, + "language_loss": 0.76526928, + "learning_rate": 2.4395675005677545e-06, + "loss": 0.78619772, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45117188, + "step": 7144, + "time_per_iteration": 2.3456244468688965 + }, + { + "auxiliary_loss_clip": 0.01068227, + "auxiliary_loss_mlp": 0.01025129, + "balance_loss_clip": 1.01191509, + "balance_loss_mlp": 1.02224135, + "epoch": 0.4295806403126409, + "flos": 26759179825920.0, + "grad_norm": 1.593315296032038, + "language_loss": 0.73326218, + "learning_rate": 2.439198908677513e-06, + "loss": 0.75419575, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.45898438, + "step": 7145, + "time_per_iteration": 2.422935724258423 + }, + { + "auxiliary_loss_clip": 0.01068529, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.0180583, + "balance_loss_mlp": 1.02173173, + "epoch": 0.4296407635653089, + "flos": 20951762382720.0, + "grad_norm": 1.81827969342242, + "language_loss": 0.79489625, + "learning_rate": 2.4388303011133927e-06, + "loss": 0.8159014, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46679688, + "step": 7146, + "time_per_iteration": 2.3946597576141357 + }, + { + "auxiliary_loss_clip": 0.01065752, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.0153631, + "balance_loss_mlp": 1.02165818, + "epoch": 0.42970088681797686, + "flos": 15851405260800.0, + "grad_norm": 1.9625689976094276, + "language_loss": 0.82837313, + "learning_rate": 2.438461677888547e-06, + "loss": 0.84931147, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44140625, + "step": 7147, + "time_per_iteration": 2.3668181896209717 + }, + { + "auxiliary_loss_clip": 0.01071942, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01542687, + "balance_loss_mlp": 1.02459884, + "epoch": 0.4297610100706448, + "flos": 22381525180800.0, + "grad_norm": 2.088812929756614, + "language_loss": 0.87398982, + "learning_rate": 2.4380930390161324e-06, + "loss": 0.89501357, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.47265625, + "step": 7148, + "time_per_iteration": 2.3849353790283203 + }, + { + "auxiliary_loss_clip": 0.010673, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.01492858, + "balance_loss_mlp": 1.021891, + "epoch": 0.4298211333233128, + "flos": 27924510919680.0, + "grad_norm": 1.5589301262901687, + "language_loss": 0.77819335, + "learning_rate": 2.437724384509304e-06, + "loss": 0.79915184, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.453125, + "step": 7149, + "time_per_iteration": 2.4137613773345947 + }, + { + "auxiliary_loss_clip": 0.01064051, + "auxiliary_loss_mlp": 0.01027087, + "balance_loss_clip": 1.01368213, + "balance_loss_mlp": 1.02084327, + "epoch": 0.42988125657598075, + "flos": 24424467920640.0, + "grad_norm": 1.6520054027072302, + "language_loss": 0.797683, + "learning_rate": 2.4373557143812184e-06, + "loss": 0.81859446, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43164062, + "step": 7150, + "time_per_iteration": 2.3909871578216553 + }, + { + "auxiliary_loss_clip": 0.01071086, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.01768923, + "balance_loss_mlp": 1.0234766, + "epoch": 0.4299413798286487, + "flos": 15849310579200.0, + "grad_norm": 1.9652114105724972, + "language_loss": 0.74517381, + "learning_rate": 2.4369870286450318e-06, + "loss": 0.76621354, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.4765625, + "step": 7151, + "time_per_iteration": 2.3653314113616943 + }, + { + "auxiliary_loss_clip": 0.01070829, + "auxiliary_loss_mlp": 0.01025454, + "balance_loss_clip": 1.01086235, + "balance_loss_mlp": 1.02318871, + "epoch": 0.4300015030813167, + "flos": 22308417060480.0, + "grad_norm": 1.8299634024562605, + "language_loss": 0.69942617, + "learning_rate": 2.436618327313902e-06, + "loss": 0.72038901, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4765625, + "step": 7152, + "time_per_iteration": 2.3781909942626953 + }, + { + "auxiliary_loss_clip": 0.01066351, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.01703835, + "balance_loss_mlp": 1.0202539, + "epoch": 0.43006162633398465, + "flos": 34896212115840.0, + "grad_norm": 1.6188455942074294, + "language_loss": 0.71547878, + "learning_rate": 2.4362496104009886e-06, + "loss": 0.73645115, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4609375, + "step": 7153, + "time_per_iteration": 2.4922547340393066 + }, + { + "auxiliary_loss_clip": 0.01069504, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.02131188, + "balance_loss_mlp": 1.02244496, + "epoch": 0.4301217495866526, + "flos": 15960648504960.0, + "grad_norm": 1.8393569260678118, + "language_loss": 0.77825266, + "learning_rate": 2.4358808779194477e-06, + "loss": 0.79930198, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47070312, + "step": 7154, + "time_per_iteration": 2.3450822830200195 + }, + { + "auxiliary_loss_clip": 0.01064018, + "auxiliary_loss_mlp": 0.01025443, + "balance_loss_clip": 1.01331949, + "balance_loss_mlp": 1.02089632, + "epoch": 0.43018187283932063, + "flos": 18769376206080.0, + "grad_norm": 1.8009848139324587, + "language_loss": 0.78922105, + "learning_rate": 2.43551212988244e-06, + "loss": 0.81011558, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.43164062, + "step": 7155, + "time_per_iteration": 2.37268328666687 + }, + { + "auxiliary_loss_clip": 0.01064699, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.00912046, + "balance_loss_mlp": 1.02030253, + "epoch": 0.4302419960919886, + "flos": 20150819815680.0, + "grad_norm": 1.6169748232987917, + "language_loss": 0.88548368, + "learning_rate": 2.435143366303124e-06, + "loss": 0.90634239, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.44335938, + "step": 7156, + "time_per_iteration": 2.363849401473999 + }, + { + "auxiliary_loss_clip": 0.01067932, + "auxiliary_loss_mlp": 0.01023919, + "balance_loss_clip": 1.00981641, + "balance_loss_mlp": 1.02130222, + "epoch": 0.43030211934465656, + "flos": 26431519916160.0, + "grad_norm": 2.044795382523769, + "language_loss": 0.8292979, + "learning_rate": 2.4347745871946607e-06, + "loss": 0.85021639, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46679688, + "step": 7157, + "time_per_iteration": 2.4083197116851807 + }, + { + "auxiliary_loss_clip": 0.01070988, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01722717, + "balance_loss_mlp": 1.02258325, + "epoch": 0.43036224259732453, + "flos": 24388088417280.0, + "grad_norm": 1.853992134726112, + "language_loss": 0.7207092, + "learning_rate": 2.4344057925702113e-06, + "loss": 0.74173921, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.484375, + "step": 7158, + "time_per_iteration": 2.411576271057129 + }, + { + "auxiliary_loss_clip": 0.01067894, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.01383996, + "balance_loss_mlp": 1.02208328, + "epoch": 0.4304223658499925, + "flos": 17200763464320.0, + "grad_norm": 1.8090044660134066, + "language_loss": 0.82882762, + "learning_rate": 2.4340369824429364e-06, + "loss": 0.84978235, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 7159, + "time_per_iteration": 2.347777843475342 + }, + { + "auxiliary_loss_clip": 0.01068337, + "auxiliary_loss_mlp": 0.01027269, + "balance_loss_clip": 1.01387596, + "balance_loss_mlp": 1.02252984, + "epoch": 0.43048248910266046, + "flos": 14902116860160.0, + "grad_norm": 1.9698254200623817, + "language_loss": 0.82745612, + "learning_rate": 2.433668156825998e-06, + "loss": 0.84841216, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45703125, + "step": 7160, + "time_per_iteration": 2.3276684284210205 + }, + { + "auxiliary_loss_clip": 0.01069161, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.01936674, + "balance_loss_mlp": 1.02206266, + "epoch": 0.4305426123553284, + "flos": 21578767223040.0, + "grad_norm": 1.9400906733980818, + "language_loss": 0.77171612, + "learning_rate": 2.4332993157325588e-06, + "loss": 0.7927593, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.47070312, + "step": 7161, + "time_per_iteration": 2.4091479778289795 + }, + { + "auxiliary_loss_clip": 0.01068051, + "auxiliary_loss_mlp": 0.01026015, + "balance_loss_clip": 1.0126816, + "balance_loss_mlp": 1.02164125, + "epoch": 0.4306027356079964, + "flos": 22600186225920.0, + "grad_norm": 1.939832067311781, + "language_loss": 0.72893465, + "learning_rate": 2.4329304591757815e-06, + "loss": 0.74987531, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46484375, + "step": 7162, + "time_per_iteration": 2.375570774078369 + }, + { + "auxiliary_loss_clip": 0.01011013, + "auxiliary_loss_mlp": 0.00999563, + "balance_loss_clip": 0.99842429, + "balance_loss_mlp": 1.00148439, + "epoch": 0.43066285886066435, + "flos": 70669128182400.0, + "grad_norm": 0.7852147114811114, + "language_loss": 0.59009963, + "learning_rate": 2.4325615871688297e-06, + "loss": 0.61020541, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.09521484, + "step": 7163, + "time_per_iteration": 2.89432430267334 + }, + { + "auxiliary_loss_clip": 0.01065507, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.01473522, + "balance_loss_mlp": 1.02025032, + "epoch": 0.4307229821133323, + "flos": 26719483743360.0, + "grad_norm": 2.598408629083429, + "language_loss": 0.79490215, + "learning_rate": 2.4321926997248676e-06, + "loss": 0.8158282, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.45117188, + "step": 7164, + "time_per_iteration": 2.431840658187866 + }, + { + "auxiliary_loss_clip": 0.01067835, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.01778746, + "balance_loss_mlp": 1.02064538, + "epoch": 0.4307831053660003, + "flos": 26175920785920.0, + "grad_norm": 1.986487626511535, + "language_loss": 0.67558801, + "learning_rate": 2.4318237968570594e-06, + "loss": 0.69658971, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.47265625, + "step": 7165, + "time_per_iteration": 2.4339656829833984 + }, + { + "auxiliary_loss_clip": 0.01068175, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.0173378, + "balance_loss_mlp": 1.02283847, + "epoch": 0.43084322861866825, + "flos": 18909517870080.0, + "grad_norm": 1.8987393255513458, + "language_loss": 0.74891061, + "learning_rate": 2.4314548785785713e-06, + "loss": 0.76989472, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.453125, + "step": 7166, + "time_per_iteration": 3.8439574241638184 + }, + { + "auxiliary_loss_clip": 0.0106676, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.01570082, + "balance_loss_mlp": 1.02245593, + "epoch": 0.4309033518713362, + "flos": 26431694472960.0, + "grad_norm": 1.6654344809608648, + "language_loss": 0.7171855, + "learning_rate": 2.4310859449025675e-06, + "loss": 0.73813611, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44140625, + "step": 7167, + "time_per_iteration": 2.4447569847106934 + }, + { + "auxiliary_loss_clip": 0.01063976, + "auxiliary_loss_mlp": 0.01024208, + "balance_loss_clip": 1.01166105, + "balance_loss_mlp": 1.02105856, + "epoch": 0.43096347512400424, + "flos": 21212284014720.0, + "grad_norm": 1.630533167623571, + "language_loss": 0.78587055, + "learning_rate": 2.430716995842216e-06, + "loss": 0.80675244, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4296875, + "step": 7168, + "time_per_iteration": 2.3639230728149414 + }, + { + "auxiliary_loss_clip": 0.01069259, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.02010655, + "balance_loss_mlp": 1.02216363, + "epoch": 0.4310235983766722, + "flos": 16539334156800.0, + "grad_norm": 1.978171428801282, + "language_loss": 0.82534385, + "learning_rate": 2.4303480314106823e-06, + "loss": 0.8463701, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.47070312, + "step": 7169, + "time_per_iteration": 3.777245283126831 + }, + { + "auxiliary_loss_clip": 0.01011764, + "auxiliary_loss_mlp": 0.01007436, + "balance_loss_clip": 1.00623786, + "balance_loss_mlp": 1.00195956, + "epoch": 0.43108372162934017, + "flos": 64772506615680.0, + "grad_norm": 0.6713635707149396, + "language_loss": 0.60663116, + "learning_rate": 2.429979051621135e-06, + "loss": 0.62682319, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.09765625, + "step": 7170, + "time_per_iteration": 3.1692774295806885 + }, + { + "auxiliary_loss_clip": 0.01068522, + "auxiliary_loss_mlp": 0.01029129, + "balance_loss_clip": 1.01622486, + "balance_loss_mlp": 1.02199471, + "epoch": 0.43114384488200813, + "flos": 13443236121600.0, + "grad_norm": 1.729054955810495, + "language_loss": 0.75172973, + "learning_rate": 2.429610056486741e-06, + "loss": 0.77270621, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.46484375, + "step": 7171, + "time_per_iteration": 2.348550319671631 + }, + { + "auxiliary_loss_clip": 0.01065854, + "auxiliary_loss_mlp": 0.01028201, + "balance_loss_clip": 1.0153085, + "balance_loss_mlp": 1.02090263, + "epoch": 0.4312039681346761, + "flos": 26285478232320.0, + "grad_norm": 2.002722011983886, + "language_loss": 0.77473116, + "learning_rate": 2.4292410460206693e-06, + "loss": 0.7956717, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44921875, + "step": 7172, + "time_per_iteration": 3.788330316543579 + }, + { + "auxiliary_loss_clip": 0.0106843, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.01754391, + "balance_loss_mlp": 1.02136302, + "epoch": 0.43126409138734406, + "flos": 20375625260160.0, + "grad_norm": 1.686971768230089, + "language_loss": 0.76498592, + "learning_rate": 2.4288720202360887e-06, + "loss": 0.78599524, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.47070312, + "step": 7173, + "time_per_iteration": 2.3515172004699707 + }, + { + "auxiliary_loss_clip": 0.01067688, + "auxiliary_loss_mlp": 0.01029142, + "balance_loss_clip": 1.01638055, + "balance_loss_mlp": 1.02291965, + "epoch": 0.431324214640012, + "flos": 22122120712320.0, + "grad_norm": 1.5757412849449384, + "language_loss": 0.76877582, + "learning_rate": 2.4285029791461687e-06, + "loss": 0.78974414, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44726562, + "step": 7174, + "time_per_iteration": 2.4084906578063965 + }, + { + "auxiliary_loss_clip": 0.01068797, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.01319146, + "balance_loss_mlp": 1.0220921, + "epoch": 0.43138433789268, + "flos": 15230125883520.0, + "grad_norm": 1.4426303176081317, + "language_loss": 0.82020307, + "learning_rate": 2.42813392276408e-06, + "loss": 0.84116715, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46679688, + "step": 7175, + "time_per_iteration": 3.760488986968994 + }, + { + "auxiliary_loss_clip": 0.01070887, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.0222671, + "balance_loss_mlp": 1.02277541, + "epoch": 0.43144446114534796, + "flos": 18149318726400.0, + "grad_norm": 2.0273715133330605, + "language_loss": 0.74447805, + "learning_rate": 2.4277648511029936e-06, + "loss": 0.76555431, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 7176, + "time_per_iteration": 2.342381238937378 + }, + { + "auxiliary_loss_clip": 0.01069034, + "auxiliary_loss_mlp": 0.01028702, + "balance_loss_clip": 1.01475441, + "balance_loss_mlp": 1.02231193, + "epoch": 0.4315045843980159, + "flos": 22928753831040.0, + "grad_norm": 1.9374636821881386, + "language_loss": 0.73297906, + "learning_rate": 2.4273957641760784e-06, + "loss": 0.75395644, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46679688, + "step": 7177, + "time_per_iteration": 2.4034736156463623 + }, + { + "auxiliary_loss_clip": 0.0107437, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.01546454, + "balance_loss_mlp": 1.02225637, + "epoch": 0.4315647076506839, + "flos": 22125786405120.0, + "grad_norm": 1.9405549599931402, + "language_loss": 0.81200826, + "learning_rate": 2.4270266619965087e-06, + "loss": 0.83306372, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.51953125, + "step": 7178, + "time_per_iteration": 2.3793046474456787 + }, + { + "auxiliary_loss_clip": 0.01066527, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.01556921, + "balance_loss_mlp": 1.02265668, + "epoch": 0.43162483090335185, + "flos": 26869889347200.0, + "grad_norm": 1.5108048072959384, + "language_loss": 0.8405472, + "learning_rate": 2.4266575445774555e-06, + "loss": 0.86149508, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43945312, + "step": 7179, + "time_per_iteration": 2.42510986328125 + }, + { + "auxiliary_loss_clip": 0.01067299, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.01571727, + "balance_loss_mlp": 1.02200627, + "epoch": 0.4316849541560198, + "flos": 13912399238400.0, + "grad_norm": 1.8781728879188744, + "language_loss": 0.7523371, + "learning_rate": 2.4262884119320924e-06, + "loss": 0.77329993, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.453125, + "step": 7180, + "time_per_iteration": 2.3455090522766113 + }, + { + "auxiliary_loss_clip": 0.01067626, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.01666999, + "balance_loss_mlp": 1.02075934, + "epoch": 0.4317450774086878, + "flos": 16434245364480.0, + "grad_norm": 2.070890465702492, + "language_loss": 0.82078153, + "learning_rate": 2.4259192640735923e-06, + "loss": 0.84176916, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46875, + "step": 7181, + "time_per_iteration": 2.3229105472564697 + }, + { + "auxiliary_loss_clip": 0.01069557, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.01546264, + "balance_loss_mlp": 1.02173841, + "epoch": 0.4318052006613558, + "flos": 20554031640960.0, + "grad_norm": 1.5910770090195134, + "language_loss": 0.88372689, + "learning_rate": 2.4255501010151287e-06, + "loss": 0.9047215, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4765625, + "step": 7182, + "time_per_iteration": 2.381136178970337 + }, + { + "auxiliary_loss_clip": 0.01067556, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.01896703, + "balance_loss_mlp": 1.02213168, + "epoch": 0.43186532391402377, + "flos": 22818952005120.0, + "grad_norm": 1.70357962587199, + "language_loss": 0.76046169, + "learning_rate": 2.4251809227698777e-06, + "loss": 0.78146148, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 7183, + "time_per_iteration": 2.37636137008667 + }, + { + "auxiliary_loss_clip": 0.01067658, + "auxiliary_loss_mlp": 0.01027683, + "balance_loss_clip": 1.01445079, + "balance_loss_mlp": 1.0208385, + "epoch": 0.43192544716669173, + "flos": 25555409458560.0, + "grad_norm": 1.5562642540118512, + "language_loss": 0.77398002, + "learning_rate": 2.4248117293510123e-06, + "loss": 0.79493344, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46679688, + "step": 7184, + "time_per_iteration": 2.4115989208221436 + }, + { + "auxiliary_loss_clip": 0.01063368, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.01627409, + "balance_loss_mlp": 1.02098155, + "epoch": 0.4319855704193597, + "flos": 30953400854400.0, + "grad_norm": 1.6780588925734008, + "language_loss": 0.73524404, + "learning_rate": 2.42444252077171e-06, + "loss": 0.75617111, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42382812, + "step": 7185, + "time_per_iteration": 2.4395370483398438 + }, + { + "auxiliary_loss_clip": 0.01069055, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.01551175, + "balance_loss_mlp": 1.02127647, + "epoch": 0.43204569367202766, + "flos": 24237717724800.0, + "grad_norm": 3.1808519526228367, + "language_loss": 0.75636703, + "learning_rate": 2.4240732970451445e-06, + "loss": 0.7773543, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47851562, + "step": 7186, + "time_per_iteration": 2.400520086288452 + }, + { + "auxiliary_loss_clip": 0.01067724, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.01751947, + "balance_loss_mlp": 1.02157485, + "epoch": 0.43210581692469563, + "flos": 18405930286080.0, + "grad_norm": 2.3115944466463363, + "language_loss": 0.76532423, + "learning_rate": 2.423704058184495e-06, + "loss": 0.78630942, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4609375, + "step": 7187, + "time_per_iteration": 2.4102330207824707 + }, + { + "auxiliary_loss_clip": 0.01010739, + "auxiliary_loss_mlp": 0.01000657, + "balance_loss_clip": 0.99939322, + "balance_loss_mlp": 1.00096655, + "epoch": 0.4321659401773636, + "flos": 49829155706880.0, + "grad_norm": 0.8566689335098601, + "language_loss": 0.68207145, + "learning_rate": 2.4233348042029374e-06, + "loss": 0.70218539, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.09765625, + "step": 7188, + "time_per_iteration": 3.033536434173584 + }, + { + "auxiliary_loss_clip": 0.01065032, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.01362431, + "balance_loss_mlp": 1.02141201, + "epoch": 0.43222606343003156, + "flos": 17127620432640.0, + "grad_norm": 2.2626382446686617, + "language_loss": 0.78255296, + "learning_rate": 2.4229655351136493e-06, + "loss": 0.80346727, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7189, + "time_per_iteration": 2.337010383605957 + }, + { + "auxiliary_loss_clip": 0.01069184, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.0179255, + "balance_loss_mlp": 1.02036023, + "epoch": 0.4322861866826995, + "flos": 22748776439040.0, + "grad_norm": 1.7345281132885007, + "language_loss": 0.70971668, + "learning_rate": 2.4225962509298097e-06, + "loss": 0.73073363, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.48828125, + "step": 7190, + "time_per_iteration": 2.401818037033081 + }, + { + "auxiliary_loss_clip": 0.01065337, + "auxiliary_loss_mlp": 0.01024134, + "balance_loss_clip": 1.01074672, + "balance_loss_mlp": 1.02051115, + "epoch": 0.4323463099353675, + "flos": 27890679945600.0, + "grad_norm": 1.4843526693011309, + "language_loss": 0.79869592, + "learning_rate": 2.422226951664597e-06, + "loss": 0.81959069, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44726562, + "step": 7191, + "time_per_iteration": 2.4184799194335938 + }, + { + "auxiliary_loss_clip": 0.01065429, + "auxiliary_loss_mlp": 0.01023488, + "balance_loss_clip": 1.01076221, + "balance_loss_mlp": 1.02091491, + "epoch": 0.43240643318803546, + "flos": 21613715360640.0, + "grad_norm": 1.7070633953433119, + "language_loss": 0.75070155, + "learning_rate": 2.42185763733119e-06, + "loss": 0.77159071, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4453125, + "step": 7192, + "time_per_iteration": 2.385045051574707 + }, + { + "auxiliary_loss_clip": 0.01067306, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.01535249, + "balance_loss_mlp": 1.02163458, + "epoch": 0.4324665564407034, + "flos": 17557646048640.0, + "grad_norm": 1.978450545626261, + "language_loss": 0.78924394, + "learning_rate": 2.4214883079427693e-06, + "loss": 0.81021416, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.45703125, + "step": 7193, + "time_per_iteration": 2.361384153366089 + }, + { + "auxiliary_loss_clip": 0.01066353, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.01693821, + "balance_loss_mlp": 1.02054572, + "epoch": 0.4325266796933714, + "flos": 18401531454720.0, + "grad_norm": 2.0122875922233727, + "language_loss": 0.75699848, + "learning_rate": 2.421118963512515e-06, + "loss": 0.77797836, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.45898438, + "step": 7194, + "time_per_iteration": 2.3461952209472656 + }, + { + "auxiliary_loss_clip": 0.01069253, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.01739168, + "balance_loss_mlp": 1.02340567, + "epoch": 0.4325868029460394, + "flos": 22563701988480.0, + "grad_norm": 1.7053175905300888, + "language_loss": 0.59754646, + "learning_rate": 2.4207496040536086e-06, + "loss": 0.61855292, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45898438, + "step": 7195, + "time_per_iteration": 2.3907155990600586 + }, + { + "auxiliary_loss_clip": 0.01070041, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01422358, + "balance_loss_mlp": 1.02213955, + "epoch": 0.43264692619870737, + "flos": 14604796788480.0, + "grad_norm": 2.07095628654936, + "language_loss": 0.65218318, + "learning_rate": 2.4203802295792303e-06, + "loss": 0.67317384, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48046875, + "step": 7196, + "time_per_iteration": 2.3674089908599854 + }, + { + "auxiliary_loss_clip": 0.01072623, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.01867414, + "balance_loss_mlp": 1.02359056, + "epoch": 0.43270704945137534, + "flos": 21500736600960.0, + "grad_norm": 2.129931372947105, + "language_loss": 0.72538817, + "learning_rate": 2.4200108401025635e-06, + "loss": 0.74645221, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.49023438, + "step": 7197, + "time_per_iteration": 2.3766610622406006 + }, + { + "auxiliary_loss_clip": 0.01066033, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.01325858, + "balance_loss_mlp": 1.02144766, + "epoch": 0.4327671727040433, + "flos": 25154711251200.0, + "grad_norm": 1.6036424480918698, + "language_loss": 0.72691977, + "learning_rate": 2.41964143563679e-06, + "loss": 0.74784881, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4453125, + "step": 7198, + "time_per_iteration": 2.460930347442627 + }, + { + "auxiliary_loss_clip": 0.01065163, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.0144825, + "balance_loss_mlp": 1.0203805, + "epoch": 0.43282729595671127, + "flos": 25445991657600.0, + "grad_norm": 1.4518525351511444, + "language_loss": 0.81346732, + "learning_rate": 2.419272016195093e-06, + "loss": 0.83439618, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44726562, + "step": 7199, + "time_per_iteration": 2.4216833114624023 + }, + { + "auxiliary_loss_clip": 0.0106898, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.0139184, + "balance_loss_mlp": 1.02316821, + "epoch": 0.43288741920937923, + "flos": 24125192812800.0, + "grad_norm": 4.287402620487779, + "language_loss": 0.68933165, + "learning_rate": 2.4189025817906567e-06, + "loss": 0.71029401, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45703125, + "step": 7200, + "time_per_iteration": 2.4448158740997314 + }, + { + "auxiliary_loss_clip": 0.01067971, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.0165062, + "balance_loss_mlp": 1.02128291, + "epoch": 0.4329475424620472, + "flos": 19204045032960.0, + "grad_norm": 1.7569357997674984, + "language_loss": 0.87039006, + "learning_rate": 2.4185331324366642e-06, + "loss": 0.89138108, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46679688, + "step": 7201, + "time_per_iteration": 2.4016921520233154 + }, + { + "auxiliary_loss_clip": 0.01072437, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.01553845, + "balance_loss_mlp": 1.02437532, + "epoch": 0.43300766571471516, + "flos": 22637263956480.0, + "grad_norm": 2.1695770802287924, + "language_loss": 0.64487505, + "learning_rate": 2.418163668146301e-06, + "loss": 0.66591495, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.48046875, + "step": 7202, + "time_per_iteration": 2.3896257877349854 + }, + { + "auxiliary_loss_clip": 0.01068531, + "auxiliary_loss_mlp": 0.01021867, + "balance_loss_clip": 1.00883198, + "balance_loss_mlp": 1.02257347, + "epoch": 0.4330677889673831, + "flos": 22120165676160.0, + "grad_norm": 1.67017684341916, + "language_loss": 0.81978804, + "learning_rate": 2.4177941889327523e-06, + "loss": 0.84069204, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.45898438, + "step": 7203, + "time_per_iteration": 2.3737378120422363 + }, + { + "auxiliary_loss_clip": 0.01070529, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.01684093, + "balance_loss_mlp": 1.02158761, + "epoch": 0.4331279122200511, + "flos": 23220418262400.0, + "grad_norm": 1.7246496965436953, + "language_loss": 0.81054711, + "learning_rate": 2.4174246948092035e-06, + "loss": 0.83156395, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.49023438, + "step": 7204, + "time_per_iteration": 2.409620523452759 + }, + { + "auxiliary_loss_clip": 0.01067883, + "auxiliary_loss_mlp": 0.01025753, + "balance_loss_clip": 1.01235962, + "balance_loss_mlp": 1.02149105, + "epoch": 0.43318803547271906, + "flos": 18258771438720.0, + "grad_norm": 1.8144532107508853, + "language_loss": 0.73653531, + "learning_rate": 2.4170551857888414e-06, + "loss": 0.75747168, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.46484375, + "step": 7205, + "time_per_iteration": 3.7572977542877197 + }, + { + "auxiliary_loss_clip": 0.01067376, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.01584375, + "balance_loss_mlp": 1.02246666, + "epoch": 0.433248158725387, + "flos": 27417152908800.0, + "grad_norm": 1.6313036413925803, + "language_loss": 0.74968463, + "learning_rate": 2.4166856618848526e-06, + "loss": 0.77064896, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44921875, + "step": 7206, + "time_per_iteration": 2.457095146179199 + }, + { + "auxiliary_loss_clip": 0.01066958, + "auxiliary_loss_mlp": 0.01027039, + "balance_loss_clip": 1.01266217, + "balance_loss_mlp": 1.02166963, + "epoch": 0.433308281978055, + "flos": 23216996949120.0, + "grad_norm": 2.0255504784374754, + "language_loss": 0.7862277, + "learning_rate": 2.416316123110424e-06, + "loss": 0.80716765, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.453125, + "step": 7207, + "time_per_iteration": 2.3825480937957764 + }, + { + "auxiliary_loss_clip": 0.01070774, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.01783037, + "balance_loss_mlp": 1.02327514, + "epoch": 0.433368405230723, + "flos": 15851475083520.0, + "grad_norm": 1.6862250308809414, + "language_loss": 0.80683559, + "learning_rate": 2.415946569478744e-06, + "loss": 0.82786572, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47460938, + "step": 7208, + "time_per_iteration": 2.383016586303711 + }, + { + "auxiliary_loss_clip": 0.01067731, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.01641583, + "balance_loss_mlp": 1.02226138, + "epoch": 0.433428528483391, + "flos": 19025080070400.0, + "grad_norm": 2.1097232431102078, + "language_loss": 0.70158577, + "learning_rate": 2.415577001003001e-06, + "loss": 0.72256231, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45507812, + "step": 7209, + "time_per_iteration": 3.8164384365081787 + }, + { + "auxiliary_loss_clip": 0.0106637, + "auxiliary_loss_mlp": 0.01024978, + "balance_loss_clip": 1.01142406, + "balance_loss_mlp": 1.0215466, + "epoch": 0.43348865173605894, + "flos": 24717074958720.0, + "grad_norm": 1.4806031014606427, + "language_loss": 0.8127315, + "learning_rate": 2.4152074176963838e-06, + "loss": 0.83364499, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44726562, + "step": 7210, + "time_per_iteration": 2.4360530376434326 + }, + { + "auxiliary_loss_clip": 0.01066881, + "auxiliary_loss_mlp": 0.01024518, + "balance_loss_clip": 1.01164985, + "balance_loss_mlp": 1.02180064, + "epoch": 0.4335487749887269, + "flos": 22089581458560.0, + "grad_norm": 1.8385712926658002, + "language_loss": 0.81926048, + "learning_rate": 2.4148378195720816e-06, + "loss": 0.8401745, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45117188, + "step": 7211, + "time_per_iteration": 3.7286908626556396 + }, + { + "auxiliary_loss_clip": 0.01065951, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.01423359, + "balance_loss_mlp": 1.02117741, + "epoch": 0.43360889824139487, + "flos": 22381839383040.0, + "grad_norm": 1.6105420646652227, + "language_loss": 0.74535894, + "learning_rate": 2.4144682066432847e-06, + "loss": 0.76628244, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.44726562, + "step": 7212, + "time_per_iteration": 2.3859431743621826 + }, + { + "auxiliary_loss_clip": 0.01066952, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.01296389, + "balance_loss_mlp": 1.02255726, + "epoch": 0.43366902149406283, + "flos": 17527376033280.0, + "grad_norm": 1.665101136749794, + "language_loss": 0.76810914, + "learning_rate": 2.4140985789231838e-06, + "loss": 0.78902721, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.44335938, + "step": 7213, + "time_per_iteration": 2.337003707885742 + }, + { + "auxiliary_loss_clip": 0.01066808, + "auxiliary_loss_mlp": 0.01025264, + "balance_loss_clip": 1.01243734, + "balance_loss_mlp": 1.02213335, + "epoch": 0.4337291447467308, + "flos": 19021763491200.0, + "grad_norm": 1.4093273510529538, + "language_loss": 0.64906472, + "learning_rate": 2.4137289364249698e-06, + "loss": 0.66998547, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4453125, + "step": 7214, + "time_per_iteration": 2.379922866821289 + }, + { + "auxiliary_loss_clip": 0.01063101, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.01849437, + "balance_loss_mlp": 1.02092421, + "epoch": 0.43378926799939876, + "flos": 27232846508160.0, + "grad_norm": 2.0370730574488025, + "language_loss": 0.71594834, + "learning_rate": 2.4133592791618348e-06, + "loss": 0.73688698, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.421875, + "step": 7215, + "time_per_iteration": 3.7907307147979736 + }, + { + "auxiliary_loss_clip": 0.01067201, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.0166539, + "balance_loss_mlp": 1.023651, + "epoch": 0.43384939125206673, + "flos": 15960194657280.0, + "grad_norm": 1.8046650693957909, + "language_loss": 0.7496686, + "learning_rate": 2.4129896071469697e-06, + "loss": 0.77063072, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.43554688, + "step": 7216, + "time_per_iteration": 2.3675992488861084 + }, + { + "auxiliary_loss_clip": 0.01072524, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.01604652, + "balance_loss_mlp": 1.02307296, + "epoch": 0.4339095145047347, + "flos": 21792296298240.0, + "grad_norm": 3.3367611147565506, + "language_loss": 0.75514704, + "learning_rate": 2.412619920393568e-06, + "loss": 0.77618247, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4921875, + "step": 7217, + "time_per_iteration": 2.3455653190612793 + }, + { + "auxiliary_loss_clip": 0.01067033, + "auxiliary_loss_mlp": 0.01026779, + "balance_loss_clip": 1.01379108, + "balance_loss_mlp": 1.02194881, + "epoch": 0.43396963775740266, + "flos": 14208986171520.0, + "grad_norm": 2.1728079836608196, + "language_loss": 0.73962414, + "learning_rate": 2.4122502189148225e-06, + "loss": 0.7605623, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.45117188, + "step": 7218, + "time_per_iteration": 2.3589930534362793 + }, + { + "auxiliary_loss_clip": 0.01068843, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.01409769, + "balance_loss_mlp": 1.02254796, + "epoch": 0.4340297610100706, + "flos": 19718036202240.0, + "grad_norm": 1.7273312832032315, + "language_loss": 0.68727696, + "learning_rate": 2.4118805027239277e-06, + "loss": 0.70824462, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 7219, + "time_per_iteration": 2.3717095851898193 + }, + { + "auxiliary_loss_clip": 0.01066231, + "auxiliary_loss_mlp": 0.0102744, + "balance_loss_clip": 1.01383185, + "balance_loss_mlp": 1.02229095, + "epoch": 0.4340898842627386, + "flos": 18952495620480.0, + "grad_norm": 1.6165086589294329, + "language_loss": 0.77458262, + "learning_rate": 2.411510771834077e-06, + "loss": 0.79551935, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43945312, + "step": 7220, + "time_per_iteration": 2.437492847442627 + }, + { + "auxiliary_loss_clip": 0.0106714, + "auxiliary_loss_mlp": 0.01028173, + "balance_loss_clip": 1.01505971, + "balance_loss_mlp": 1.0214293, + "epoch": 0.4341500075154066, + "flos": 22017206476800.0, + "grad_norm": 1.835313093205297, + "language_loss": 0.6935783, + "learning_rate": 2.411141026258466e-06, + "loss": 0.71453142, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45703125, + "step": 7221, + "time_per_iteration": 2.3767826557159424 + }, + { + "auxiliary_loss_clip": 0.01067534, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.01604891, + "balance_loss_mlp": 1.02137828, + "epoch": 0.4342101307680746, + "flos": 23581455298560.0, + "grad_norm": 1.7999042226063735, + "language_loss": 0.6499337, + "learning_rate": 2.4107712660102885e-06, + "loss": 0.67089623, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4609375, + "step": 7222, + "time_per_iteration": 2.401731014251709 + }, + { + "auxiliary_loss_clip": 0.01065443, + "auxiliary_loss_mlp": 0.0102654, + "balance_loss_clip": 1.01281321, + "balance_loss_mlp": 1.0205574, + "epoch": 0.43427025402074254, + "flos": 17967002273280.0, + "grad_norm": 2.3571826245453598, + "language_loss": 0.73136979, + "learning_rate": 2.410401491102741e-06, + "loss": 0.75228965, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44921875, + "step": 7223, + "time_per_iteration": 2.3421363830566406 + }, + { + "auxiliary_loss_clip": 0.01068029, + "auxiliary_loss_mlp": 0.01025262, + "balance_loss_clip": 1.01167226, + "balance_loss_mlp": 1.02068925, + "epoch": 0.4343303772734105, + "flos": 26285198941440.0, + "grad_norm": 3.218554987433896, + "language_loss": 0.94193447, + "learning_rate": 2.4100317015490204e-06, + "loss": 0.96286738, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.47265625, + "step": 7224, + "time_per_iteration": 2.435818672180176 + }, + { + "auxiliary_loss_clip": 0.01065529, + "auxiliary_loss_mlp": 0.01025807, + "balance_loss_clip": 1.01294422, + "balance_loss_mlp": 1.02103221, + "epoch": 0.43439050052607847, + "flos": 26832741793920.0, + "grad_norm": 1.4898742554562974, + "language_loss": 0.60910529, + "learning_rate": 2.4096618973623227e-06, + "loss": 0.63001865, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4453125, + "step": 7225, + "time_per_iteration": 2.4137604236602783 + }, + { + "auxiliary_loss_clip": 0.01066829, + "auxiliary_loss_mlp": 0.01029279, + "balance_loss_clip": 1.01539111, + "balance_loss_mlp": 1.02236569, + "epoch": 0.43445062377874644, + "flos": 21396590415360.0, + "grad_norm": 2.2077380105961137, + "language_loss": 0.77729309, + "learning_rate": 2.4092920785558465e-06, + "loss": 0.79825419, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4453125, + "step": 7226, + "time_per_iteration": 2.4013419151306152 + }, + { + "auxiliary_loss_clip": 0.01065899, + "auxiliary_loss_mlp": 0.01027073, + "balance_loss_clip": 1.01407909, + "balance_loss_mlp": 1.02222466, + "epoch": 0.4345107470314144, + "flos": 19900911237120.0, + "grad_norm": 1.6875669383790657, + "language_loss": 0.84537756, + "learning_rate": 2.408922245142788e-06, + "loss": 0.86630726, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4375, + "step": 7227, + "time_per_iteration": 2.3872430324554443 + }, + { + "auxiliary_loss_clip": 0.01064684, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.01484227, + "balance_loss_mlp": 1.02087688, + "epoch": 0.43457087028408237, + "flos": 26431415182080.0, + "grad_norm": 2.311497131242922, + "language_loss": 0.76651704, + "learning_rate": 2.408552397136347e-06, + "loss": 0.78744745, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 7228, + "time_per_iteration": 2.409581184387207 + }, + { + "auxiliary_loss_clip": 0.0106764, + "auxiliary_loss_mlp": 0.01030756, + "balance_loss_clip": 1.01730371, + "balance_loss_mlp": 1.02234232, + "epoch": 0.43463099353675033, + "flos": 31867461826560.0, + "grad_norm": 1.716902452340054, + "language_loss": 0.80639338, + "learning_rate": 2.408182534549722e-06, + "loss": 0.82737726, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 7229, + "time_per_iteration": 2.4758975505828857 + }, + { + "auxiliary_loss_clip": 0.01066723, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.01803744, + "balance_loss_mlp": 1.02078497, + "epoch": 0.4346911167894183, + "flos": 24570125579520.0, + "grad_norm": 1.7742450528187463, + "language_loss": 0.81553656, + "learning_rate": 2.4078126573961117e-06, + "loss": 0.83651519, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45898438, + "step": 7230, + "time_per_iteration": 2.394817352294922 + }, + { + "auxiliary_loss_clip": 0.01072645, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.01434636, + "balance_loss_mlp": 1.02411938, + "epoch": 0.43475124004208626, + "flos": 17089774652160.0, + "grad_norm": 2.0741533555835114, + "language_loss": 0.77293789, + "learning_rate": 2.407442765688717e-06, + "loss": 0.79395473, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.484375, + "step": 7231, + "time_per_iteration": 2.3363640308380127 + }, + { + "auxiliary_loss_clip": 0.01066622, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.01618195, + "balance_loss_mlp": 1.02233517, + "epoch": 0.4348113632947542, + "flos": 26103406158720.0, + "grad_norm": 1.4524629671452713, + "language_loss": 0.68616235, + "learning_rate": 2.407072859440738e-06, + "loss": 0.70711243, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.44335938, + "step": 7232, + "time_per_iteration": 2.40046763420105 + }, + { + "auxiliary_loss_clip": 0.01069001, + "auxiliary_loss_mlp": 0.01027558, + "balance_loss_clip": 1.01390266, + "balance_loss_mlp": 1.02195859, + "epoch": 0.4348714865474222, + "flos": 34199171354880.0, + "grad_norm": 1.7212980107050107, + "language_loss": 0.72085422, + "learning_rate": 2.4067029386653758e-06, + "loss": 0.74181986, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46875, + "step": 7233, + "time_per_iteration": 2.483920097351074 + }, + { + "auxiliary_loss_clip": 0.01069119, + "auxiliary_loss_mlp": 0.01025537, + "balance_loss_clip": 1.01164365, + "balance_loss_mlp": 1.0225594, + "epoch": 0.43493160980009016, + "flos": 31536206046720.0, + "grad_norm": 1.692759635413815, + "language_loss": 0.74482071, + "learning_rate": 2.4063330033758316e-06, + "loss": 0.76576734, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 7234, + "time_per_iteration": 2.4501137733459473 + }, + { + "auxiliary_loss_clip": 0.01068321, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.01409698, + "balance_loss_mlp": 1.02166271, + "epoch": 0.4349917330527582, + "flos": 24060184128000.0, + "grad_norm": 2.640462513920913, + "language_loss": 0.59121382, + "learning_rate": 2.4059630535853074e-06, + "loss": 0.61218101, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46679688, + "step": 7235, + "time_per_iteration": 2.4159250259399414 + }, + { + "auxiliary_loss_clip": 0.01066699, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.01542902, + "balance_loss_mlp": 1.02046919, + "epoch": 0.43505185630542614, + "flos": 30517998888960.0, + "grad_norm": 1.8341074372904738, + "language_loss": 0.77254486, + "learning_rate": 2.4055930893070076e-06, + "loss": 0.79350448, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46289062, + "step": 7236, + "time_per_iteration": 2.4337453842163086 + }, + { + "auxiliary_loss_clip": 0.01063424, + "auxiliary_loss_mlp": 0.01023137, + "balance_loss_clip": 1.01043582, + "balance_loss_mlp": 1.02073252, + "epoch": 0.4351119795580941, + "flos": 15734446606080.0, + "grad_norm": 1.586876044053892, + "language_loss": 0.81077021, + "learning_rate": 2.405223110554133e-06, + "loss": 0.83163583, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42578125, + "step": 7237, + "time_per_iteration": 2.3759078979492188 + }, + { + "auxiliary_loss_clip": 0.01066335, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.01750255, + "balance_loss_mlp": 1.02200258, + "epoch": 0.4351721028107621, + "flos": 18730832198400.0, + "grad_norm": 1.4556527620126574, + "language_loss": 0.63048345, + "learning_rate": 2.4048531173398873e-06, + "loss": 0.65144902, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 7238, + "time_per_iteration": 2.369387626647949 + }, + { + "auxiliary_loss_clip": 0.01063444, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.01622725, + "balance_loss_mlp": 1.02036548, + "epoch": 0.43523222606343004, + "flos": 25225759601280.0, + "grad_norm": 1.5674012052573256, + "language_loss": 0.85684443, + "learning_rate": 2.4044831096774756e-06, + "loss": 0.87776464, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4296875, + "step": 7239, + "time_per_iteration": 2.424705982208252 + }, + { + "auxiliary_loss_clip": 0.01066851, + "auxiliary_loss_mlp": 0.0102441, + "balance_loss_clip": 1.01197624, + "balance_loss_mlp": 1.02230656, + "epoch": 0.435292349316098, + "flos": 22708137749760.0, + "grad_norm": 1.5120053740302446, + "language_loss": 0.7213372, + "learning_rate": 2.4041130875801025e-06, + "loss": 0.74224985, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.4453125, + "step": 7240, + "time_per_iteration": 2.384749412536621 + }, + { + "auxiliary_loss_clip": 0.01069054, + "auxiliary_loss_mlp": 0.0102203, + "balance_loss_clip": 1.00832152, + "balance_loss_mlp": 1.02273726, + "epoch": 0.43535247256876597, + "flos": 25774698908160.0, + "grad_norm": 1.948383516136765, + "language_loss": 0.8161996, + "learning_rate": 2.4037430510609728e-06, + "loss": 0.8371104, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 7241, + "time_per_iteration": 2.406350612640381 + }, + { + "auxiliary_loss_clip": 0.01071709, + "auxiliary_loss_mlp": 0.01032462, + "balance_loss_clip": 1.01800191, + "balance_loss_mlp": 1.02321291, + "epoch": 0.43541259582143393, + "flos": 17527236387840.0, + "grad_norm": 2.530988614751346, + "language_loss": 0.79420865, + "learning_rate": 2.4033730001332917e-06, + "loss": 0.81525034, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.484375, + "step": 7242, + "time_per_iteration": 2.335125684738159 + }, + { + "auxiliary_loss_clip": 0.01067965, + "auxiliary_loss_mlp": 0.01026113, + "balance_loss_clip": 1.01245761, + "balance_loss_mlp": 1.02177918, + "epoch": 0.4354727190741019, + "flos": 15194304961920.0, + "grad_norm": 3.508565750056895, + "language_loss": 0.67764491, + "learning_rate": 2.4030029348102657e-06, + "loss": 0.69858569, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 7243, + "time_per_iteration": 2.352240800857544 + }, + { + "auxiliary_loss_clip": 0.01063685, + "auxiliary_loss_mlp": 0.01027885, + "balance_loss_clip": 1.01519513, + "balance_loss_mlp": 1.01992786, + "epoch": 0.43553284232676986, + "flos": 16648472666880.0, + "grad_norm": 1.716309238695424, + "language_loss": 0.81258696, + "learning_rate": 2.4026328551051023e-06, + "loss": 0.83350259, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 7244, + "time_per_iteration": 3.7402613162994385 + }, + { + "auxiliary_loss_clip": 0.01066754, + "auxiliary_loss_mlp": 0.01023464, + "balance_loss_clip": 1.01060712, + "balance_loss_mlp": 1.02121592, + "epoch": 0.43559296557943783, + "flos": 23399837072640.0, + "grad_norm": 2.0768188769163127, + "language_loss": 0.73504078, + "learning_rate": 2.4022627610310075e-06, + "loss": 0.75594294, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45507812, + "step": 7245, + "time_per_iteration": 2.4213075637817383 + }, + { + "auxiliary_loss_clip": 0.01067438, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.01416218, + "balance_loss_mlp": 1.02233648, + "epoch": 0.4356530888321058, + "flos": 22417974506880.0, + "grad_norm": 1.7004392135167103, + "language_loss": 0.7682991, + "learning_rate": 2.4018926526011895e-06, + "loss": 0.78924328, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.45117188, + "step": 7246, + "time_per_iteration": 2.4265403747558594 + }, + { + "auxiliary_loss_clip": 0.01068287, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.0155313, + "balance_loss_mlp": 1.02171612, + "epoch": 0.43571321208477376, + "flos": 21615076903680.0, + "grad_norm": 1.7577259522004658, + "language_loss": 0.85542452, + "learning_rate": 2.4015225298288566e-06, + "loss": 0.87639964, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46679688, + "step": 7247, + "time_per_iteration": 2.418332576751709 + }, + { + "auxiliary_loss_clip": 0.01065964, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.01679409, + "balance_loss_mlp": 1.02148688, + "epoch": 0.4357733353374418, + "flos": 23986238135040.0, + "grad_norm": 1.5221555870233587, + "language_loss": 0.72430992, + "learning_rate": 2.4011523927272177e-06, + "loss": 0.74526697, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4453125, + "step": 7248, + "time_per_iteration": 3.805873155593872 + }, + { + "auxiliary_loss_clip": 0.01067628, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.01519454, + "balance_loss_mlp": 1.02176893, + "epoch": 0.43583345859010975, + "flos": 25263570470400.0, + "grad_norm": 1.562287093104665, + "language_loss": 0.87234449, + "learning_rate": 2.4007822413094815e-06, + "loss": 0.89330584, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45898438, + "step": 7249, + "time_per_iteration": 2.4386487007141113 + }, + { + "auxiliary_loss_clip": 0.01067318, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.02101564, + "balance_loss_mlp": 1.02153075, + "epoch": 0.4358935818427777, + "flos": 23695167196800.0, + "grad_norm": 1.8886100699025787, + "language_loss": 0.82198668, + "learning_rate": 2.400412075588858e-06, + "loss": 0.84299904, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45703125, + "step": 7250, + "time_per_iteration": 2.400247812271118 + }, + { + "auxiliary_loss_clip": 0.01069933, + "auxiliary_loss_mlp": 0.01026436, + "balance_loss_clip": 1.01300716, + "balance_loss_mlp": 1.02358484, + "epoch": 0.4359537050954457, + "flos": 29531562935040.0, + "grad_norm": 1.6297746698644633, + "language_loss": 0.85008121, + "learning_rate": 2.400041895578558e-06, + "loss": 0.87104487, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.46484375, + "step": 7251, + "time_per_iteration": 3.883236885070801 + }, + { + "auxiliary_loss_clip": 0.0106394, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.01418293, + "balance_loss_mlp": 1.02065563, + "epoch": 0.43601382834811364, + "flos": 22710162608640.0, + "grad_norm": 1.5256227058869274, + "language_loss": 0.69114304, + "learning_rate": 2.3996717012917912e-06, + "loss": 0.71206343, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.43359375, + "step": 7252, + "time_per_iteration": 2.4278242588043213 + }, + { + "auxiliary_loss_clip": 0.01067222, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01351678, + "balance_loss_mlp": 1.02233744, + "epoch": 0.4360739516007816, + "flos": 19097734343040.0, + "grad_norm": 1.5462752143603624, + "language_loss": 0.83900148, + "learning_rate": 2.3993014927417704e-06, + "loss": 0.85993695, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44921875, + "step": 7253, + "time_per_iteration": 2.41941499710083 + }, + { + "auxiliary_loss_clip": 0.01065933, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.01707852, + "balance_loss_mlp": 1.02116418, + "epoch": 0.43613407485344957, + "flos": 23403293297280.0, + "grad_norm": 1.719700977239462, + "language_loss": 0.70403725, + "learning_rate": 2.3989312699417057e-06, + "loss": 0.72499728, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44726562, + "step": 7254, + "time_per_iteration": 2.4322075843811035 + }, + { + "auxiliary_loss_clip": 0.01066077, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01717997, + "balance_loss_mlp": 1.02237558, + "epoch": 0.43619419810611754, + "flos": 22636705374720.0, + "grad_norm": 1.8158000178681513, + "language_loss": 0.8139329, + "learning_rate": 2.39856103290481e-06, + "loss": 0.83489943, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4375, + "step": 7255, + "time_per_iteration": 3.7655422687530518 + }, + { + "auxiliary_loss_clip": 0.01066481, + "auxiliary_loss_mlp": 0.01028166, + "balance_loss_clip": 1.01477242, + "balance_loss_mlp": 1.02093506, + "epoch": 0.4362543213587855, + "flos": 20046918009600.0, + "grad_norm": 1.70914848458815, + "language_loss": 0.77693874, + "learning_rate": 2.398190781644296e-06, + "loss": 0.79788518, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45507812, + "step": 7256, + "time_per_iteration": 2.3921706676483154 + }, + { + "auxiliary_loss_clip": 0.01068793, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.01303148, + "balance_loss_mlp": 1.02406693, + "epoch": 0.43631444461145347, + "flos": 21360245823360.0, + "grad_norm": 1.4601173477520193, + "language_loss": 0.76388502, + "learning_rate": 2.397820516173378e-06, + "loss": 0.78482801, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44726562, + "step": 7257, + "time_per_iteration": 2.378453254699707 + }, + { + "auxiliary_loss_clip": 0.01067727, + "auxiliary_loss_mlp": 0.01025485, + "balance_loss_clip": 1.01131129, + "balance_loss_mlp": 1.02257025, + "epoch": 0.43637456786412143, + "flos": 22417450836480.0, + "grad_norm": 1.71669928993661, + "language_loss": 0.7288754, + "learning_rate": 2.3974502365052685e-06, + "loss": 0.74980748, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.453125, + "step": 7258, + "time_per_iteration": 2.398702383041382 + }, + { + "auxiliary_loss_clip": 0.0106663, + "auxiliary_loss_mlp": 0.01026898, + "balance_loss_clip": 1.01348126, + "balance_loss_mlp": 1.02090788, + "epoch": 0.4364346911167894, + "flos": 28547570776320.0, + "grad_norm": 2.908320665487618, + "language_loss": 0.76599526, + "learning_rate": 2.3970799426531833e-06, + "loss": 0.7869305, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45703125, + "step": 7259, + "time_per_iteration": 2.4137871265411377 + }, + { + "auxiliary_loss_clip": 0.01013584, + "auxiliary_loss_mlp": 0.0100393, + "balance_loss_clip": 1.00280333, + "balance_loss_mlp": 1.00361919, + "epoch": 0.43649481436945736, + "flos": 62656211376000.0, + "grad_norm": 0.741311114727298, + "language_loss": 0.56993949, + "learning_rate": 2.396709634630335e-06, + "loss": 0.59011459, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.09960938, + "step": 7260, + "time_per_iteration": 3.0136260986328125 + }, + { + "auxiliary_loss_clip": 0.01068076, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.02066159, + "balance_loss_mlp": 1.02157569, + "epoch": 0.4365549376221254, + "flos": 30590792807040.0, + "grad_norm": 1.9094855838602753, + "language_loss": 0.71135736, + "learning_rate": 2.3963393124499415e-06, + "loss": 0.73238719, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46484375, + "step": 7261, + "time_per_iteration": 2.459855079650879 + }, + { + "auxiliary_loss_clip": 0.01065904, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01741958, + "balance_loss_mlp": 1.02045643, + "epoch": 0.43661506087479335, + "flos": 17164907631360.0, + "grad_norm": 2.0772233946382577, + "language_loss": 0.69549984, + "learning_rate": 2.395968976125217e-06, + "loss": 0.71646821, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45507812, + "step": 7262, + "time_per_iteration": 2.372657299041748 + }, + { + "auxiliary_loss_clip": 0.01063193, + "auxiliary_loss_mlp": 0.01023262, + "balance_loss_clip": 1.01140046, + "balance_loss_mlp": 1.02156937, + "epoch": 0.4366751841274613, + "flos": 22046603708160.0, + "grad_norm": 1.5820614233490475, + "language_loss": 0.79698598, + "learning_rate": 2.3955986256693783e-06, + "loss": 0.81785047, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41601562, + "step": 7263, + "time_per_iteration": 2.3733668327331543 + }, + { + "auxiliary_loss_clip": 0.01067807, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.01410651, + "balance_loss_mlp": 1.02288651, + "epoch": 0.4367353073801293, + "flos": 15996399603840.0, + "grad_norm": 1.6593756708475351, + "language_loss": 0.74982655, + "learning_rate": 2.3952282610956426e-06, + "loss": 0.77078032, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44921875, + "step": 7264, + "time_per_iteration": 2.3615360260009766 + }, + { + "auxiliary_loss_clip": 0.01065437, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.01529741, + "balance_loss_mlp": 1.02137458, + "epoch": 0.43679543063279724, + "flos": 38215998432000.0, + "grad_norm": 1.6640107511813393, + "language_loss": 0.62019509, + "learning_rate": 2.3948578824172264e-06, + "loss": 0.6411314, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44140625, + "step": 7265, + "time_per_iteration": 2.531256675720215 + }, + { + "auxiliary_loss_clip": 0.01063585, + "auxiliary_loss_mlp": 0.01021828, + "balance_loss_clip": 1.00912654, + "balance_loss_mlp": 1.01991379, + "epoch": 0.4368555538854652, + "flos": 15192384837120.0, + "grad_norm": 1.8725232864647694, + "language_loss": 0.72590047, + "learning_rate": 2.394487489647349e-06, + "loss": 0.74675459, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 7266, + "time_per_iteration": 2.3529348373413086 + }, + { + "auxiliary_loss_clip": 0.01069693, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.0146997, + "balance_loss_mlp": 1.02332544, + "epoch": 0.4369156771381332, + "flos": 23068162356480.0, + "grad_norm": 1.9121691656230495, + "language_loss": 0.82106769, + "learning_rate": 2.3941170827992264e-06, + "loss": 0.84204298, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.46289062, + "step": 7267, + "time_per_iteration": 2.3762753009796143 + }, + { + "auxiliary_loss_clip": 0.01064736, + "auxiliary_loss_mlp": 0.01026106, + "balance_loss_clip": 1.01366615, + "balance_loss_mlp": 1.0208745, + "epoch": 0.43697580039080114, + "flos": 23439952091520.0, + "grad_norm": 1.548866705744468, + "language_loss": 0.74577576, + "learning_rate": 2.39374666188608e-06, + "loss": 0.76668417, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.43945312, + "step": 7268, + "time_per_iteration": 2.4048726558685303 + }, + { + "auxiliary_loss_clip": 0.01069087, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.01896572, + "balance_loss_mlp": 1.02167237, + "epoch": 0.4370359236434691, + "flos": 18513707253120.0, + "grad_norm": 2.5604380266893387, + "language_loss": 0.79823267, + "learning_rate": 2.3933762269211273e-06, + "loss": 0.81925154, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.47265625, + "step": 7269, + "time_per_iteration": 2.3532586097717285 + }, + { + "auxiliary_loss_clip": 0.01066819, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.01459837, + "balance_loss_mlp": 1.02200985, + "epoch": 0.43709604689613707, + "flos": 23221360869120.0, + "grad_norm": 1.8034146909245476, + "language_loss": 0.75270152, + "learning_rate": 2.3930057779175894e-06, + "loss": 0.7736432, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44921875, + "step": 7270, + "time_per_iteration": 2.400887966156006 + }, + { + "auxiliary_loss_clip": 0.01066885, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.0136826, + "balance_loss_mlp": 1.02224922, + "epoch": 0.43715617014880503, + "flos": 23802629961600.0, + "grad_norm": 1.9790305036823848, + "language_loss": 0.79380774, + "learning_rate": 2.3926353148886864e-06, + "loss": 0.81474346, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44726562, + "step": 7271, + "time_per_iteration": 2.396700143814087 + }, + { + "auxiliary_loss_clip": 0.010121, + "auxiliary_loss_mlp": 0.01008132, + "balance_loss_clip": 1.00681436, + "balance_loss_mlp": 1.00228024, + "epoch": 0.437216293401473, + "flos": 61937768085120.0, + "grad_norm": 0.7068477379989649, + "language_loss": 0.54944789, + "learning_rate": 2.3922648378476388e-06, + "loss": 0.56965017, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.09814453, + "step": 7272, + "time_per_iteration": 3.0839085578918457 + }, + { + "auxiliary_loss_clip": 0.01068876, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01747906, + "balance_loss_mlp": 1.02301478, + "epoch": 0.43727641665414096, + "flos": 21981141175680.0, + "grad_norm": 1.5291317700424218, + "language_loss": 0.69446576, + "learning_rate": 2.391894346807668e-06, + "loss": 0.71546644, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45898438, + "step": 7273, + "time_per_iteration": 2.3763790130615234 + }, + { + "auxiliary_loss_clip": 0.01068436, + "auxiliary_loss_mlp": 0.010251, + "balance_loss_clip": 1.01235056, + "balance_loss_mlp": 1.02206016, + "epoch": 0.437336539906809, + "flos": 39529291334400.0, + "grad_norm": 1.8966604072488724, + "language_loss": 0.75403506, + "learning_rate": 2.391523841781996e-06, + "loss": 0.77497041, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.46289062, + "step": 7274, + "time_per_iteration": 2.550288677215576 + }, + { + "auxiliary_loss_clip": 0.01065381, + "auxiliary_loss_mlp": 0.01021247, + "balance_loss_clip": 1.00955892, + "balance_loss_mlp": 1.0218612, + "epoch": 0.43739666315947695, + "flos": 17456188037760.0, + "grad_norm": 1.6979211679174235, + "language_loss": 0.80037141, + "learning_rate": 2.3911533227838455e-06, + "loss": 0.82123768, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.43554688, + "step": 7275, + "time_per_iteration": 2.3538830280303955 + }, + { + "auxiliary_loss_clip": 0.01068356, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.01901007, + "balance_loss_mlp": 1.0240047, + "epoch": 0.4374567864121449, + "flos": 16357925399040.0, + "grad_norm": 3.0201827488596904, + "language_loss": 0.81247318, + "learning_rate": 2.390782789826439e-06, + "loss": 0.83347607, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44335938, + "step": 7276, + "time_per_iteration": 2.3578240871429443 + }, + { + "auxiliary_loss_clip": 0.01069748, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.01578796, + "balance_loss_mlp": 1.02234817, + "epoch": 0.4375169096648129, + "flos": 30586324152960.0, + "grad_norm": 1.7631943531276082, + "language_loss": 0.73071802, + "learning_rate": 2.3904122429229997e-06, + "loss": 0.75171244, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.47460938, + "step": 7277, + "time_per_iteration": 2.451894521713257 + }, + { + "auxiliary_loss_clip": 0.01067183, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.01450527, + "balance_loss_mlp": 1.02208138, + "epoch": 0.43757703291748085, + "flos": 30554273658240.0, + "grad_norm": 1.840668816937261, + "language_loss": 0.72350156, + "learning_rate": 2.390041682086752e-06, + "loss": 0.74445087, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44921875, + "step": 7278, + "time_per_iteration": 2.4590959548950195 + }, + { + "auxiliary_loss_clip": 0.01066184, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.01972961, + "balance_loss_mlp": 1.02191591, + "epoch": 0.4376371561701488, + "flos": 21396311124480.0, + "grad_norm": 1.4499285395554027, + "language_loss": 0.65959859, + "learning_rate": 2.3896711073309193e-06, + "loss": 0.68057799, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.44335938, + "step": 7279, + "time_per_iteration": 2.3795182704925537 + }, + { + "auxiliary_loss_clip": 0.01072715, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.01435947, + "balance_loss_mlp": 1.02554154, + "epoch": 0.4376972794228168, + "flos": 23403258385920.0, + "grad_norm": 1.7207249990287266, + "language_loss": 0.81126302, + "learning_rate": 2.389300518668728e-06, + "loss": 0.83227348, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.47265625, + "step": 7280, + "time_per_iteration": 2.401724100112915 + }, + { + "auxiliary_loss_clip": 0.01065287, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.01657331, + "balance_loss_mlp": 1.0214529, + "epoch": 0.43775740267548474, + "flos": 22891850657280.0, + "grad_norm": 1.4863865544580297, + "language_loss": 0.76957524, + "learning_rate": 2.3889299161134027e-06, + "loss": 0.79052252, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 7281, + "time_per_iteration": 2.3928675651550293 + }, + { + "auxiliary_loss_clip": 0.01066469, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.02145576, + "balance_loss_mlp": 1.02202106, + "epoch": 0.4378175259281527, + "flos": 23293282003200.0, + "grad_norm": 2.64490473849056, + "language_loss": 0.75774133, + "learning_rate": 2.3885592996781686e-06, + "loss": 0.77874273, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.44335938, + "step": 7282, + "time_per_iteration": 2.433715343475342 + }, + { + "auxiliary_loss_clip": 0.01068215, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.0248065, + "balance_loss_mlp": 1.02303946, + "epoch": 0.43787764918082067, + "flos": 23875807904640.0, + "grad_norm": 1.7705778295454655, + "language_loss": 0.84759855, + "learning_rate": 2.388188669376253e-06, + "loss": 0.86865932, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.453125, + "step": 7283, + "time_per_iteration": 3.8448495864868164 + }, + { + "auxiliary_loss_clip": 0.01069652, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.0130105, + "balance_loss_mlp": 1.02181196, + "epoch": 0.43793777243348864, + "flos": 23987006184960.0, + "grad_norm": 1.8833942617746549, + "language_loss": 0.79978019, + "learning_rate": 2.3878180252208815e-06, + "loss": 0.82074058, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.47851562, + "step": 7284, + "time_per_iteration": 2.4049441814422607 + }, + { + "auxiliary_loss_clip": 0.01067651, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.01906419, + "balance_loss_mlp": 1.02273703, + "epoch": 0.4379978956861566, + "flos": 18623090142720.0, + "grad_norm": 1.5134597314324285, + "language_loss": 0.80824095, + "learning_rate": 2.3874473672252834e-06, + "loss": 0.82922959, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.44921875, + "step": 7285, + "time_per_iteration": 2.447779655456543 + }, + { + "auxiliary_loss_clip": 0.01066277, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01423705, + "balance_loss_mlp": 1.02208698, + "epoch": 0.43805801893882457, + "flos": 21980303303040.0, + "grad_norm": 2.006586485476521, + "language_loss": 0.74657476, + "learning_rate": 2.387076695402685e-06, + "loss": 0.76750684, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44140625, + "step": 7286, + "time_per_iteration": 2.3785505294799805 + }, + { + "auxiliary_loss_clip": 0.01067295, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.01264691, + "balance_loss_mlp": 1.02088594, + "epoch": 0.43811814219149253, + "flos": 26392207858560.0, + "grad_norm": 1.893074037821412, + "language_loss": 0.73432899, + "learning_rate": 2.386706009766314e-06, + "loss": 0.75525784, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.46484375, + "step": 7287, + "time_per_iteration": 2.45509672164917 + }, + { + "auxiliary_loss_clip": 0.01069086, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.01655054, + "balance_loss_mlp": 1.02303553, + "epoch": 0.43817826544416055, + "flos": 17492358072960.0, + "grad_norm": 1.8050893247627149, + "language_loss": 0.82183194, + "learning_rate": 2.3863353103294017e-06, + "loss": 0.84281957, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4609375, + "step": 7288, + "time_per_iteration": 3.7694687843322754 + }, + { + "auxiliary_loss_clip": 0.01066773, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.01803684, + "balance_loss_mlp": 1.01971078, + "epoch": 0.4382383886968285, + "flos": 21579919297920.0, + "grad_norm": 1.730857194898685, + "language_loss": 0.84224093, + "learning_rate": 2.385964597105175e-06, + "loss": 0.86322129, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.47070312, + "step": 7289, + "time_per_iteration": 2.391387462615967 + }, + { + "auxiliary_loss_clip": 0.01070169, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.01860344, + "balance_loss_mlp": 1.0215714, + "epoch": 0.4382985119494965, + "flos": 27922625706240.0, + "grad_norm": 2.037536990661709, + "language_loss": 0.78176498, + "learning_rate": 2.3855938701068647e-06, + "loss": 0.80278552, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.48632812, + "step": 7290, + "time_per_iteration": 2.429898738861084 + }, + { + "auxiliary_loss_clip": 0.01066865, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.01298213, + "balance_loss_mlp": 1.02158546, + "epoch": 0.43835863520216445, + "flos": 24935666181120.0, + "grad_norm": 4.001330616817745, + "language_loss": 0.78265262, + "learning_rate": 2.385223129347701e-06, + "loss": 0.80358124, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.453125, + "step": 7291, + "time_per_iteration": 3.8007733821868896 + }, + { + "auxiliary_loss_clip": 0.01067558, + "auxiliary_loss_mlp": 0.01025039, + "balance_loss_clip": 1.01205683, + "balance_loss_mlp": 1.02229881, + "epoch": 0.4384187584548324, + "flos": 33508903397760.0, + "grad_norm": 2.1565048049408015, + "language_loss": 0.64823782, + "learning_rate": 2.3848523748409153e-06, + "loss": 0.66916382, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.453125, + "step": 7292, + "time_per_iteration": 2.4734513759613037 + }, + { + "auxiliary_loss_clip": 0.01067758, + "auxiliary_loss_mlp": 0.01025701, + "balance_loss_clip": 1.01307106, + "balance_loss_mlp": 1.02165484, + "epoch": 0.4384788817075004, + "flos": 23949928454400.0, + "grad_norm": 1.477356440088029, + "language_loss": 0.73789692, + "learning_rate": 2.3844816065997385e-06, + "loss": 0.7588315, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4609375, + "step": 7293, + "time_per_iteration": 2.4099795818328857 + }, + { + "auxiliary_loss_clip": 0.01068537, + "auxiliary_loss_mlp": 0.0102831, + "balance_loss_clip": 1.01469612, + "balance_loss_mlp": 1.02268219, + "epoch": 0.43853900496016834, + "flos": 19097524874880.0, + "grad_norm": 1.9680259818948218, + "language_loss": 0.7862283, + "learning_rate": 2.3841108246374012e-06, + "loss": 0.80719674, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45898438, + "step": 7294, + "time_per_iteration": 3.7705461978912354 + }, + { + "auxiliary_loss_clip": 0.01069427, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.01608312, + "balance_loss_mlp": 1.02280998, + "epoch": 0.4385991282128363, + "flos": 13224505253760.0, + "grad_norm": 2.0854836471769436, + "language_loss": 0.72696018, + "learning_rate": 2.3837400289671376e-06, + "loss": 0.74794817, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46679688, + "step": 7295, + "time_per_iteration": 2.3849282264709473 + }, + { + "auxiliary_loss_clip": 0.01069663, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.01571035, + "balance_loss_mlp": 1.02167642, + "epoch": 0.4386592514655043, + "flos": 14318997465600.0, + "grad_norm": 1.9695145503115776, + "language_loss": 0.75327003, + "learning_rate": 2.3833692196021788e-06, + "loss": 0.77426589, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48046875, + "step": 7296, + "time_per_iteration": 2.348862886428833 + }, + { + "auxiliary_loss_clip": 0.01067137, + "auxiliary_loss_mlp": 0.01025632, + "balance_loss_clip": 1.01272178, + "balance_loss_mlp": 1.02155089, + "epoch": 0.43871937471817224, + "flos": 22783305640320.0, + "grad_norm": 4.222016097640164, + "language_loss": 0.69280243, + "learning_rate": 2.382998396555759e-06, + "loss": 0.7137301, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45507812, + "step": 7297, + "time_per_iteration": 2.4010088443756104 + }, + { + "auxiliary_loss_clip": 0.01064984, + "auxiliary_loss_mlp": 0.0102398, + "balance_loss_clip": 1.01124263, + "balance_loss_mlp": 1.0206964, + "epoch": 0.4387794979708402, + "flos": 28071111185280.0, + "grad_norm": 1.5067412474434823, + "language_loss": 0.69915098, + "learning_rate": 2.3826275598411113e-06, + "loss": 0.72004056, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 7298, + "time_per_iteration": 2.4245784282684326 + }, + { + "auxiliary_loss_clip": 0.01065368, + "auxiliary_loss_mlp": 0.01026462, + "balance_loss_clip": 1.0135994, + "balance_loss_mlp": 1.02186251, + "epoch": 0.43883962122350817, + "flos": 26248365590400.0, + "grad_norm": 1.481346529716383, + "language_loss": 0.72172219, + "learning_rate": 2.3822567094714704e-06, + "loss": 0.7426405, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43554688, + "step": 7299, + "time_per_iteration": 2.4347283840179443 + }, + { + "auxiliary_loss_clip": 0.01068846, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.01975369, + "balance_loss_mlp": 1.02169013, + "epoch": 0.43889974447617613, + "flos": 25882615520640.0, + "grad_norm": 1.7088059816409324, + "language_loss": 0.76721287, + "learning_rate": 2.3818858454600713e-06, + "loss": 0.78823972, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47070312, + "step": 7300, + "time_per_iteration": 2.424647569656372 + }, + { + "auxiliary_loss_clip": 0.0106569, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01420379, + "balance_loss_mlp": 1.02049375, + "epoch": 0.43895986772884416, + "flos": 25045433095680.0, + "grad_norm": 1.695008667764067, + "language_loss": 0.7051391, + "learning_rate": 2.3815149678201474e-06, + "loss": 0.72607094, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45117188, + "step": 7301, + "time_per_iteration": 2.4310576915740967 + }, + { + "auxiliary_loss_clip": 0.01071351, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.0173986, + "balance_loss_mlp": 1.02410221, + "epoch": 0.4390199909815121, + "flos": 25993394864640.0, + "grad_norm": 2.085339454256274, + "language_loss": 0.70724308, + "learning_rate": 2.381144076564937e-06, + "loss": 0.72827125, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47265625, + "step": 7302, + "time_per_iteration": 2.417558431625366 + }, + { + "auxiliary_loss_clip": 0.01067381, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.01478553, + "balance_loss_mlp": 1.02247787, + "epoch": 0.4390801142341801, + "flos": 29020993079040.0, + "grad_norm": 1.5293703708610515, + "language_loss": 0.8236506, + "learning_rate": 2.3807731717076748e-06, + "loss": 0.84460241, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44921875, + "step": 7303, + "time_per_iteration": 2.449758768081665 + }, + { + "auxiliary_loss_clip": 0.01071938, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.02094817, + "balance_loss_mlp": 1.02326453, + "epoch": 0.43914023748684805, + "flos": 33437121909120.0, + "grad_norm": 2.170439942002019, + "language_loss": 0.68327439, + "learning_rate": 2.3804022532615965e-06, + "loss": 0.70435327, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.48632812, + "step": 7304, + "time_per_iteration": 2.5009753704071045 + }, + { + "auxiliary_loss_clip": 0.01065079, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.0175128, + "balance_loss_mlp": 1.02139831, + "epoch": 0.439200360739516, + "flos": 34530427134720.0, + "grad_norm": 1.5092765054567732, + "language_loss": 0.77921844, + "learning_rate": 2.3800313212399412e-06, + "loss": 0.80016923, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4375, + "step": 7305, + "time_per_iteration": 2.4913430213928223 + }, + { + "auxiliary_loss_clip": 0.01066529, + "auxiliary_loss_mlp": 0.01024984, + "balance_loss_clip": 1.01121497, + "balance_loss_mlp": 1.02193689, + "epoch": 0.439260483992184, + "flos": 21906776246400.0, + "grad_norm": 1.670073197583112, + "language_loss": 0.68673897, + "learning_rate": 2.379660375655945e-06, + "loss": 0.70765406, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4453125, + "step": 7306, + "time_per_iteration": 2.382509231567383 + }, + { + "auxiliary_loss_clip": 0.01066753, + "auxiliary_loss_mlp": 0.0102754, + "balance_loss_clip": 1.01447475, + "balance_loss_mlp": 1.02201915, + "epoch": 0.43932060724485195, + "flos": 20995368537600.0, + "grad_norm": 2.1844379359446644, + "language_loss": 0.62688291, + "learning_rate": 2.379289416522847e-06, + "loss": 0.64782584, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.44726562, + "step": 7307, + "time_per_iteration": 2.395474433898926 + }, + { + "auxiliary_loss_clip": 0.01069624, + "auxiliary_loss_mlp": 0.01024663, + "balance_loss_clip": 1.01107299, + "balance_loss_mlp": 1.0222764, + "epoch": 0.4393807304975199, + "flos": 17746141812480.0, + "grad_norm": 4.06564361475154, + "language_loss": 0.71797889, + "learning_rate": 2.378918443853886e-06, + "loss": 0.73892176, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.47265625, + "step": 7308, + "time_per_iteration": 2.350015163421631 + }, + { + "auxiliary_loss_clip": 0.01069595, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.01408172, + "balance_loss_mlp": 1.02167344, + "epoch": 0.4394408537501879, + "flos": 22527427219200.0, + "grad_norm": 1.911058568767653, + "language_loss": 0.76760745, + "learning_rate": 2.378547457662299e-06, + "loss": 0.78858101, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47851562, + "step": 7309, + "time_per_iteration": 2.3941140174865723 + }, + { + "auxiliary_loss_clip": 0.01063287, + "auxiliary_loss_mlp": 0.01024091, + "balance_loss_clip": 1.01190138, + "balance_loss_mlp": 1.02069449, + "epoch": 0.43950097700285584, + "flos": 23439533155200.0, + "grad_norm": 1.5778124600762506, + "language_loss": 0.70569152, + "learning_rate": 2.378176457961328e-06, + "loss": 0.72656524, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.42578125, + "step": 7310, + "time_per_iteration": 2.4062163829803467 + }, + { + "auxiliary_loss_clip": 0.01068477, + "auxiliary_loss_mlp": 0.01026437, + "balance_loss_clip": 1.01307917, + "balance_loss_mlp": 1.02218342, + "epoch": 0.4395611002555238, + "flos": 23179709750400.0, + "grad_norm": 2.035250756513866, + "language_loss": 0.76069045, + "learning_rate": 2.3778054447642124e-06, + "loss": 0.78163958, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46289062, + "step": 7311, + "time_per_iteration": 2.4151101112365723 + }, + { + "auxiliary_loss_clip": 0.01069251, + "auxiliary_loss_mlp": 0.01024469, + "balance_loss_clip": 1.01126623, + "balance_loss_mlp": 1.02241564, + "epoch": 0.43962122350819177, + "flos": 22126275164160.0, + "grad_norm": 1.9890444070501072, + "language_loss": 0.81300652, + "learning_rate": 2.3774344180841917e-06, + "loss": 0.83394378, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.46875, + "step": 7312, + "time_per_iteration": 2.3781380653381348 + }, + { + "auxiliary_loss_clip": 0.010653, + "auxiliary_loss_mlp": 0.01025148, + "balance_loss_clip": 1.01245844, + "balance_loss_mlp": 1.01947701, + "epoch": 0.43968134676085974, + "flos": 17419599066240.0, + "grad_norm": 2.0392383302956234, + "language_loss": 0.84731162, + "learning_rate": 2.3770633779345074e-06, + "loss": 0.8682161, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.45898438, + "step": 7313, + "time_per_iteration": 2.3656349182128906 + }, + { + "auxiliary_loss_clip": 0.01067546, + "auxiliary_loss_mlp": 0.01025708, + "balance_loss_clip": 1.01223695, + "balance_loss_mlp": 1.02078867, + "epoch": 0.43974147001352776, + "flos": 18951657747840.0, + "grad_norm": 3.075219796038178, + "language_loss": 0.67568696, + "learning_rate": 2.376692324328401e-06, + "loss": 0.69661951, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46679688, + "step": 7314, + "time_per_iteration": 2.3782942295074463 + }, + { + "auxiliary_loss_clip": 0.01070464, + "auxiliary_loss_mlp": 0.0102613, + "balance_loss_clip": 1.01193869, + "balance_loss_mlp": 1.02350986, + "epoch": 0.4398015932661957, + "flos": 18952495620480.0, + "grad_norm": 1.7811284445773143, + "language_loss": 0.76920003, + "learning_rate": 2.376321257279115e-06, + "loss": 0.79016602, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47070312, + "step": 7315, + "time_per_iteration": 2.3914103507995605 + }, + { + "auxiliary_loss_clip": 0.01065438, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.01235962, + "balance_loss_mlp": 1.02112913, + "epoch": 0.4398617165188637, + "flos": 24198964248960.0, + "grad_norm": 2.373060023471036, + "language_loss": 0.74444026, + "learning_rate": 2.375950176799891e-06, + "loss": 0.76534426, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44335938, + "step": 7316, + "time_per_iteration": 2.3876848220825195 + }, + { + "auxiliary_loss_clip": 0.01068629, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.0196197, + "balance_loss_mlp": 1.02283001, + "epoch": 0.43992183977153165, + "flos": 22235588231040.0, + "grad_norm": 3.457084490827884, + "language_loss": 0.65232313, + "learning_rate": 2.375579082903972e-06, + "loss": 0.67334008, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45703125, + "step": 7317, + "time_per_iteration": 2.418240785598755 + }, + { + "auxiliary_loss_clip": 0.01070495, + "auxiliary_loss_mlp": 0.01025363, + "balance_loss_clip": 1.01244092, + "balance_loss_mlp": 1.02403331, + "epoch": 0.4399819630241996, + "flos": 18696477553920.0, + "grad_norm": 2.003920030367403, + "language_loss": 0.79870534, + "learning_rate": 2.3752079756046015e-06, + "loss": 0.81966394, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.46484375, + "step": 7318, + "time_per_iteration": 2.3626792430877686 + }, + { + "auxiliary_loss_clip": 0.01071628, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.01798034, + "balance_loss_mlp": 1.02358246, + "epoch": 0.4400420862768676, + "flos": 23878216788480.0, + "grad_norm": 1.748811073338383, + "language_loss": 0.7934953, + "learning_rate": 2.374836854915024e-06, + "loss": 0.81453216, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48046875, + "step": 7319, + "time_per_iteration": 2.4187049865722656 + }, + { + "auxiliary_loss_clip": 0.01067986, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.019238, + "balance_loss_mlp": 1.02134204, + "epoch": 0.44010220952953555, + "flos": 28036372515840.0, + "grad_norm": 2.122085803039445, + "language_loss": 0.7937516, + "learning_rate": 2.3744657208484835e-06, + "loss": 0.81476867, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46679688, + "step": 7320, + "time_per_iteration": 2.4267382621765137 + }, + { + "auxiliary_loss_clip": 0.01066093, + "auxiliary_loss_mlp": 0.01030059, + "balance_loss_clip": 1.01667786, + "balance_loss_mlp": 1.02105093, + "epoch": 0.4401623327822035, + "flos": 23767856380800.0, + "grad_norm": 4.357537938104312, + "language_loss": 0.69978678, + "learning_rate": 2.374094573418224e-06, + "loss": 0.72074831, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45117188, + "step": 7321, + "time_per_iteration": 2.4132466316223145 + }, + { + "auxiliary_loss_clip": 0.01012633, + "auxiliary_loss_mlp": 0.01012559, + "balance_loss_clip": 1.01115191, + "balance_loss_mlp": 1.0030477, + "epoch": 0.4402224560348715, + "flos": 70770793795200.0, + "grad_norm": 0.8791755273740665, + "language_loss": 0.56862605, + "learning_rate": 2.3737234126374923e-06, + "loss": 0.58887798, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.09570312, + "step": 7322, + "time_per_iteration": 4.473577260971069 + }, + { + "auxiliary_loss_clip": 0.01065771, + "auxiliary_loss_mlp": 0.01021559, + "balance_loss_clip": 1.01025796, + "balance_loss_mlp": 1.02133751, + "epoch": 0.44028257928753944, + "flos": 22890733493760.0, + "grad_norm": 1.4670766101792385, + "language_loss": 0.76228476, + "learning_rate": 2.3733522385195325e-06, + "loss": 0.783158, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.4453125, + "step": 7323, + "time_per_iteration": 2.466904878616333 + }, + { + "auxiliary_loss_clip": 0.01067827, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.01654935, + "balance_loss_mlp": 1.0214299, + "epoch": 0.4403427025402074, + "flos": 17894766936960.0, + "grad_norm": 1.919450424670931, + "language_loss": 0.80852675, + "learning_rate": 2.372981051077592e-06, + "loss": 0.82951456, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46289062, + "step": 7324, + "time_per_iteration": 2.390606164932251 + }, + { + "auxiliary_loss_clip": 0.01011392, + "auxiliary_loss_mlp": 0.0101078, + "balance_loss_clip": 1.00960612, + "balance_loss_mlp": 1.00158596, + "epoch": 0.4404028257928754, + "flos": 69558993815040.0, + "grad_norm": 0.6613429671155459, + "language_loss": 0.54513073, + "learning_rate": 2.3726098503249175e-06, + "loss": 0.56535244, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.09765625, + "step": 7325, + "time_per_iteration": 3.0721030235290527 + }, + { + "auxiliary_loss_clip": 0.01067211, + "auxiliary_loss_mlp": 0.01024091, + "balance_loss_clip": 1.01191938, + "balance_loss_mlp": 1.02337766, + "epoch": 0.44046294904554334, + "flos": 20922609530880.0, + "grad_norm": 1.643523630788585, + "language_loss": 0.69996369, + "learning_rate": 2.3722386362747558e-06, + "loss": 0.72087669, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4375, + "step": 7326, + "time_per_iteration": 2.474679708480835 + }, + { + "auxiliary_loss_clip": 0.01065679, + "auxiliary_loss_mlp": 0.01023559, + "balance_loss_clip": 1.01036835, + "balance_loss_mlp": 1.02110422, + "epoch": 0.44052307229821136, + "flos": 23622338367360.0, + "grad_norm": 1.638873996479361, + "language_loss": 0.76135713, + "learning_rate": 2.371867408940355e-06, + "loss": 0.78224951, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4453125, + "step": 7327, + "time_per_iteration": 3.89322566986084 + }, + { + "auxiliary_loss_clip": 0.01064627, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.01663184, + "balance_loss_mlp": 1.0203433, + "epoch": 0.4405831955508793, + "flos": 17596853372160.0, + "grad_norm": 1.8955033922309152, + "language_loss": 0.83783782, + "learning_rate": 2.371496168334962e-06, + "loss": 0.85877979, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44335938, + "step": 7328, + "time_per_iteration": 2.3692851066589355 + }, + { + "auxiliary_loss_clip": 0.01067058, + "auxiliary_loss_mlp": 0.0102387, + "balance_loss_clip": 1.01095939, + "balance_loss_mlp": 1.02202737, + "epoch": 0.4406433188035473, + "flos": 21462506795520.0, + "grad_norm": 1.9738752906605108, + "language_loss": 0.77272344, + "learning_rate": 2.371124914471827e-06, + "loss": 0.79363269, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44921875, + "step": 7329, + "time_per_iteration": 2.4042446613311768 + }, + { + "auxiliary_loss_clip": 0.01069152, + "auxiliary_loss_mlp": 0.0102772, + "balance_loss_clip": 1.01442862, + "balance_loss_mlp": 1.02213573, + "epoch": 0.44070344205621526, + "flos": 22672491384960.0, + "grad_norm": 1.5710490433115647, + "language_loss": 0.7330249, + "learning_rate": 2.3707536473641987e-06, + "loss": 0.75399363, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.47070312, + "step": 7330, + "time_per_iteration": 2.451658010482788 + }, + { + "auxiliary_loss_clip": 0.01069354, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.0183301, + "balance_loss_mlp": 1.02368641, + "epoch": 0.4407635653088832, + "flos": 23440056825600.0, + "grad_norm": 1.8886488760838134, + "language_loss": 0.74119925, + "learning_rate": 2.3703823670253257e-06, + "loss": 0.76220453, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.45703125, + "step": 7331, + "time_per_iteration": 3.9551870822906494 + }, + { + "auxiliary_loss_clip": 0.01066067, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.01489365, + "balance_loss_mlp": 1.02308202, + "epoch": 0.4408236885615512, + "flos": 24020243665920.0, + "grad_norm": 1.4065765224303692, + "language_loss": 0.76129133, + "learning_rate": 2.370011073468459e-06, + "loss": 0.78222501, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4296875, + "step": 7332, + "time_per_iteration": 2.434854745864868 + }, + { + "auxiliary_loss_clip": 0.01067044, + "auxiliary_loss_mlp": 0.01036068, + "balance_loss_clip": 1.02342629, + "balance_loss_mlp": 1.02253413, + "epoch": 0.44088381181421915, + "flos": 12676019794560.0, + "grad_norm": 1.8372815045016255, + "language_loss": 0.73397189, + "learning_rate": 2.3696397667068488e-06, + "loss": 0.75500304, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4453125, + "step": 7333, + "time_per_iteration": 2.3688178062438965 + }, + { + "auxiliary_loss_clip": 0.01064937, + "auxiliary_loss_mlp": 0.01025628, + "balance_loss_clip": 1.01380897, + "balance_loss_mlp": 1.02207541, + "epoch": 0.4409439350668871, + "flos": 24568764036480.0, + "grad_norm": 2.046421076304599, + "language_loss": 0.79642963, + "learning_rate": 2.3692684467537467e-06, + "loss": 0.81733531, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.4296875, + "step": 7334, + "time_per_iteration": 3.8238611221313477 + }, + { + "auxiliary_loss_clip": 0.01073805, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.01637459, + "balance_loss_mlp": 1.02283168, + "epoch": 0.4410040583195551, + "flos": 22667638705920.0, + "grad_norm": 1.9713774269250286, + "language_loss": 0.75139797, + "learning_rate": 2.3688971136224027e-06, + "loss": 0.77245808, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.5078125, + "step": 7335, + "time_per_iteration": 2.4025678634643555 + }, + { + "auxiliary_loss_clip": 0.01068775, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.01819849, + "balance_loss_mlp": 1.02330291, + "epoch": 0.44106418157222305, + "flos": 10851773011200.0, + "grad_norm": 2.1935650062395693, + "language_loss": 0.72708499, + "learning_rate": 2.3685257673260702e-06, + "loss": 0.748088, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45507812, + "step": 7336, + "time_per_iteration": 2.3567609786987305 + }, + { + "auxiliary_loss_clip": 0.01069444, + "auxiliary_loss_mlp": 0.01033551, + "balance_loss_clip": 1.01962709, + "balance_loss_mlp": 1.02181053, + "epoch": 0.441124304824891, + "flos": 21725611868160.0, + "grad_norm": 2.574831642009228, + "language_loss": 0.63041508, + "learning_rate": 2.3681544078780013e-06, + "loss": 0.65144503, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4765625, + "step": 7337, + "time_per_iteration": 2.407491445541382 + }, + { + "auxiliary_loss_clip": 0.01013509, + "auxiliary_loss_mlp": 0.00999973, + "balance_loss_clip": 0.99869716, + "balance_loss_mlp": 1.00424719, + "epoch": 0.441184428077559, + "flos": 63216950722560.0, + "grad_norm": 0.7450510072177793, + "language_loss": 0.52638292, + "learning_rate": 2.367783035291448e-06, + "loss": 0.54651779, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.09277344, + "step": 7338, + "time_per_iteration": 3.0763192176818848 + }, + { + "auxiliary_loss_clip": 0.01070866, + "auxiliary_loss_mlp": 0.01035564, + "balance_loss_clip": 1.01975107, + "balance_loss_mlp": 1.0218184, + "epoch": 0.44124455133022694, + "flos": 21176916940800.0, + "grad_norm": 2.2456901588813385, + "language_loss": 0.70730346, + "learning_rate": 2.3674116495796642e-06, + "loss": 0.72836769, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.49023438, + "step": 7339, + "time_per_iteration": 2.4551875591278076 + }, + { + "auxiliary_loss_clip": 0.01066493, + "auxiliary_loss_mlp": 0.01027272, + "balance_loss_clip": 1.01518428, + "balance_loss_mlp": 1.02353311, + "epoch": 0.4413046745828949, + "flos": 17456886264960.0, + "grad_norm": 1.4759038518482357, + "language_loss": 0.69624043, + "learning_rate": 2.367040250755904e-06, + "loss": 0.71717805, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4296875, + "step": 7340, + "time_per_iteration": 2.3842039108276367 + }, + { + "auxiliary_loss_clip": 0.01011135, + "auxiliary_loss_mlp": 0.01002345, + "balance_loss_clip": 1.00113547, + "balance_loss_mlp": 1.00178528, + "epoch": 0.4413647978355629, + "flos": 61583470807680.0, + "grad_norm": 0.8924772409067624, + "language_loss": 0.64010513, + "learning_rate": 2.3666688388334215e-06, + "loss": 0.66023993, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.09375, + "step": 7341, + "time_per_iteration": 2.8827569484710693 + }, + { + "auxiliary_loss_clip": 0.01065237, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.01707256, + "balance_loss_mlp": 1.02114201, + "epoch": 0.4414249210882309, + "flos": 27525767748480.0, + "grad_norm": 2.518033725801175, + "language_loss": 0.73447245, + "learning_rate": 2.366297413825472e-06, + "loss": 0.75543499, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44140625, + "step": 7342, + "time_per_iteration": 2.4284284114837646 + }, + { + "auxiliary_loss_clip": 0.01065147, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.01854956, + "balance_loss_mlp": 1.0200119, + "epoch": 0.44148504434089886, + "flos": 23512850743680.0, + "grad_norm": 2.119770737180489, + "language_loss": 0.79646921, + "learning_rate": 2.365925975745309e-06, + "loss": 0.81745166, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.45117188, + "step": 7343, + "time_per_iteration": 2.460235357284546 + }, + { + "auxiliary_loss_clip": 0.01065415, + "auxiliary_loss_mlp": 0.01026029, + "balance_loss_clip": 1.01269543, + "balance_loss_mlp": 1.02041328, + "epoch": 0.4415451675935668, + "flos": 21579500361600.0, + "grad_norm": 2.380150118568221, + "language_loss": 0.76581752, + "learning_rate": 2.3655545246061893e-06, + "loss": 0.78673196, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44921875, + "step": 7344, + "time_per_iteration": 2.4252257347106934 + }, + { + "auxiliary_loss_clip": 0.01012399, + "auxiliary_loss_mlp": 0.01000179, + "balance_loss_clip": 0.9990167, + "balance_loss_mlp": 1.00308847, + "epoch": 0.4416052908462348, + "flos": 59003458623360.0, + "grad_norm": 0.7965813212143824, + "language_loss": 0.63803375, + "learning_rate": 2.365183060421369e-06, + "loss": 0.65815949, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.09277344, + "step": 7345, + "time_per_iteration": 2.8555827140808105 + }, + { + "auxiliary_loss_clip": 0.01070525, + "auxiliary_loss_mlp": 0.01036088, + "balance_loss_clip": 1.02249861, + "balance_loss_mlp": 1.02391565, + "epoch": 0.44166541409890275, + "flos": 26356491671040.0, + "grad_norm": 1.7339393533339853, + "language_loss": 0.85957098, + "learning_rate": 2.364811583204105e-06, + "loss": 0.88063705, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46484375, + "step": 7346, + "time_per_iteration": 2.4572582244873047 + }, + { + "auxiliary_loss_clip": 0.01070894, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.01598668, + "balance_loss_mlp": 1.02438879, + "epoch": 0.4417255373515707, + "flos": 20191667973120.0, + "grad_norm": 2.3614820243651033, + "language_loss": 0.80577183, + "learning_rate": 2.364440092967654e-06, + "loss": 0.82678843, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.46484375, + "step": 7347, + "time_per_iteration": 2.3787384033203125 + }, + { + "auxiliary_loss_clip": 0.01067926, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.01441669, + "balance_loss_mlp": 1.02244854, + "epoch": 0.4417856606042387, + "flos": 17887121349120.0, + "grad_norm": 1.7165532417094271, + "language_loss": 0.83231586, + "learning_rate": 2.3640685897252726e-06, + "loss": 0.85328901, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.45507812, + "step": 7348, + "time_per_iteration": 2.376331090927124 + }, + { + "auxiliary_loss_clip": 0.01071259, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01598954, + "balance_loss_mlp": 1.02338207, + "epoch": 0.44184578385690665, + "flos": 27962810547840.0, + "grad_norm": 3.7985211395260152, + "language_loss": 0.70507181, + "learning_rate": 2.3636970734902205e-06, + "loss": 0.72608536, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.47851562, + "step": 7349, + "time_per_iteration": 2.480605363845825 + }, + { + "auxiliary_loss_clip": 0.01071431, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.01462269, + "balance_loss_mlp": 1.02442765, + "epoch": 0.4419059071095746, + "flos": 23366774148480.0, + "grad_norm": 1.74224493567371, + "language_loss": 0.74681908, + "learning_rate": 2.363325544275755e-06, + "loss": 0.76782697, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47070312, + "step": 7350, + "time_per_iteration": 2.4460537433624268 + }, + { + "auxiliary_loss_clip": 0.01068668, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.01556957, + "balance_loss_mlp": 1.02223802, + "epoch": 0.4419660303622426, + "flos": 15011290281600.0, + "grad_norm": 2.283531659940804, + "language_loss": 0.81093752, + "learning_rate": 2.362954002095136e-06, + "loss": 0.83191311, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46484375, + "step": 7351, + "time_per_iteration": 2.379521131515503 + }, + { + "auxiliary_loss_clip": 0.01065902, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.01404643, + "balance_loss_mlp": 1.02211881, + "epoch": 0.44202615361491054, + "flos": 25370649210240.0, + "grad_norm": 1.9919271523728592, + "language_loss": 0.73270607, + "learning_rate": 2.3625824469616222e-06, + "loss": 0.75363028, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.4375, + "step": 7352, + "time_per_iteration": 2.43753719329834 + }, + { + "auxiliary_loss_clip": 0.01067542, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01372433, + "balance_loss_mlp": 1.02303159, + "epoch": 0.4420862768675785, + "flos": 24679962316800.0, + "grad_norm": 1.8996601603970498, + "language_loss": 0.87182808, + "learning_rate": 2.362210878888473e-06, + "loss": 0.89277762, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4453125, + "step": 7353, + "time_per_iteration": 2.4737606048583984 + }, + { + "auxiliary_loss_clip": 0.01070133, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.01506317, + "balance_loss_mlp": 1.02399015, + "epoch": 0.44214640012024653, + "flos": 19527655224960.0, + "grad_norm": 1.9157185722891001, + "language_loss": 0.76094747, + "learning_rate": 2.3618392978889498e-06, + "loss": 0.78192657, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4609375, + "step": 7354, + "time_per_iteration": 2.4064974784851074 + }, + { + "auxiliary_loss_clip": 0.0106601, + "auxiliary_loss_mlp": 0.01031635, + "balance_loss_clip": 1.01917791, + "balance_loss_mlp": 1.02208328, + "epoch": 0.4422065233729145, + "flos": 47555649014400.0, + "grad_norm": 2.0401354268514953, + "language_loss": 0.64482296, + "learning_rate": 2.3614677039763122e-06, + "loss": 0.66579938, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.43945312, + "step": 7355, + "time_per_iteration": 2.629253625869751 + }, + { + "auxiliary_loss_clip": 0.01071131, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.01748061, + "balance_loss_mlp": 1.02305174, + "epoch": 0.44226664662558246, + "flos": 19280050796160.0, + "grad_norm": 1.7254199532359256, + "language_loss": 0.77605134, + "learning_rate": 2.3610960971638224e-06, + "loss": 0.79709482, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.48046875, + "step": 7356, + "time_per_iteration": 2.413994789123535 + }, + { + "auxiliary_loss_clip": 0.01070667, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.01763272, + "balance_loss_mlp": 1.02317965, + "epoch": 0.4423267698782504, + "flos": 17820855855360.0, + "grad_norm": 1.5461619394206094, + "language_loss": 0.77268058, + "learning_rate": 2.3607244774647423e-06, + "loss": 0.79369766, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4765625, + "step": 7357, + "time_per_iteration": 2.4078359603881836 + }, + { + "auxiliary_loss_clip": 0.01068258, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.01939237, + "balance_loss_mlp": 1.02252007, + "epoch": 0.4423868931309184, + "flos": 29203169886720.0, + "grad_norm": 1.44112206606686, + "language_loss": 0.73464096, + "learning_rate": 2.360352844892333e-06, + "loss": 0.75565189, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45703125, + "step": 7358, + "time_per_iteration": 2.484079360961914 + }, + { + "auxiliary_loss_clip": 0.01069875, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.01401067, + "balance_loss_mlp": 1.0240736, + "epoch": 0.44244701638358636, + "flos": 29711924352000.0, + "grad_norm": 1.8937250761368456, + "language_loss": 0.75743824, + "learning_rate": 2.359981199459858e-06, + "loss": 0.77840376, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.45898438, + "step": 7359, + "time_per_iteration": 2.498840808868408 + }, + { + "auxiliary_loss_clip": 0.01070787, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.01387715, + "balance_loss_mlp": 1.02433181, + "epoch": 0.4425071396362543, + "flos": 22928928387840.0, + "grad_norm": 1.9255982489290764, + "language_loss": 0.69819403, + "learning_rate": 2.3596095411805794e-06, + "loss": 0.7191813, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46484375, + "step": 7360, + "time_per_iteration": 2.420524835586548 + }, + { + "auxiliary_loss_clip": 0.01067971, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.01560974, + "balance_loss_mlp": 1.02234328, + "epoch": 0.4425672628889223, + "flos": 19791318879360.0, + "grad_norm": 2.191593359193914, + "language_loss": 0.69357896, + "learning_rate": 2.359237870067761e-06, + "loss": 0.71454799, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45507812, + "step": 7361, + "time_per_iteration": 3.834197998046875 + }, + { + "auxiliary_loss_clip": 0.01069018, + "auxiliary_loss_mlp": 0.01027559, + "balance_loss_clip": 1.01421309, + "balance_loss_mlp": 1.02243459, + "epoch": 0.44262738614159025, + "flos": 13661373496320.0, + "grad_norm": 2.357801978380074, + "language_loss": 0.82654482, + "learning_rate": 2.3588661861346676e-06, + "loss": 0.84751058, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46679688, + "step": 7362, + "time_per_iteration": 2.3890833854675293 + }, + { + "auxiliary_loss_clip": 0.01071997, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.01958632, + "balance_loss_mlp": 1.02310729, + "epoch": 0.4426875093942582, + "flos": 14209335285120.0, + "grad_norm": 1.6942760620523534, + "language_loss": 0.74600351, + "learning_rate": 2.3584944893945634e-06, + "loss": 0.76706636, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48828125, + "step": 7363, + "time_per_iteration": 2.3934593200683594 + }, + { + "auxiliary_loss_clip": 0.01010999, + "auxiliary_loss_mlp": 0.01003406, + "balance_loss_clip": 1.00213623, + "balance_loss_mlp": 1.00142968, + "epoch": 0.4427476326469262, + "flos": 70113763319040.0, + "grad_norm": 0.674881632138052, + "language_loss": 0.55666471, + "learning_rate": 2.3581227798607126e-06, + "loss": 0.57680881, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.09570312, + "step": 7364, + "time_per_iteration": 3.1195318698883057 + }, + { + "auxiliary_loss_clip": 0.01066382, + "auxiliary_loss_mlp": 0.01026319, + "balance_loss_clip": 1.01349235, + "balance_loss_mlp": 1.02050376, + "epoch": 0.44280775589959415, + "flos": 25443966798720.0, + "grad_norm": 2.0281664712014593, + "language_loss": 0.74107242, + "learning_rate": 2.3577510575463806e-06, + "loss": 0.76199937, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45898438, + "step": 7365, + "time_per_iteration": 2.4245405197143555 + }, + { + "auxiliary_loss_clip": 0.01067171, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.0201149, + "balance_loss_mlp": 1.02120459, + "epoch": 0.4428678791522621, + "flos": 22856099558400.0, + "grad_norm": 1.4924402055970007, + "language_loss": 0.74191099, + "learning_rate": 2.357379322464834e-06, + "loss": 0.76291752, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4609375, + "step": 7366, + "time_per_iteration": 2.488027811050415 + }, + { + "auxiliary_loss_clip": 0.0106987, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.01648784, + "balance_loss_mlp": 1.02322495, + "epoch": 0.44292800240493013, + "flos": 25811252968320.0, + "grad_norm": 2.2143037851294176, + "language_loss": 0.7324782, + "learning_rate": 2.357007574629339e-06, + "loss": 0.7534737, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.46679688, + "step": 7367, + "time_per_iteration": 3.938261032104492 + }, + { + "auxiliary_loss_clip": 0.01070406, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.02041602, + "balance_loss_mlp": 1.02419925, + "epoch": 0.4429881256575981, + "flos": 32415493438080.0, + "grad_norm": 1.4821012043062287, + "language_loss": 0.73671699, + "learning_rate": 2.356635814053162e-06, + "loss": 0.75776148, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 7368, + "time_per_iteration": 2.4816949367523193 + }, + { + "auxiliary_loss_clip": 0.01064568, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.01515448, + "balance_loss_mlp": 1.02113795, + "epoch": 0.44304824891026606, + "flos": 22162619756160.0, + "grad_norm": 1.6530928950738526, + "language_loss": 0.80023211, + "learning_rate": 2.3562640407495697e-06, + "loss": 0.82115144, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.43554688, + "step": 7369, + "time_per_iteration": 2.3917317390441895 + }, + { + "auxiliary_loss_clip": 0.01064346, + "auxiliary_loss_mlp": 0.01025859, + "balance_loss_clip": 1.01359296, + "balance_loss_mlp": 1.02183962, + "epoch": 0.443108372162934, + "flos": 25337376817920.0, + "grad_norm": 1.7489029854601246, + "language_loss": 0.71714687, + "learning_rate": 2.3558922547318304e-06, + "loss": 0.73804891, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.42578125, + "step": 7370, + "time_per_iteration": 2.4389548301696777 + }, + { + "auxiliary_loss_clip": 0.01067288, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.01711476, + "balance_loss_mlp": 1.02147388, + "epoch": 0.443168495415602, + "flos": 23329836063360.0, + "grad_norm": 1.7823479033135885, + "language_loss": 0.7050432, + "learning_rate": 2.3555204560132123e-06, + "loss": 0.72602361, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 7371, + "time_per_iteration": 3.8304615020751953 + }, + { + "auxiliary_loss_clip": 0.01065127, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01493931, + "balance_loss_mlp": 1.02131641, + "epoch": 0.44322861866826996, + "flos": 21870431654400.0, + "grad_norm": 2.157057988070128, + "language_loss": 0.68297076, + "learning_rate": 2.3551486446069834e-06, + "loss": 0.70389307, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4375, + "step": 7372, + "time_per_iteration": 2.405256509780884 + }, + { + "auxiliary_loss_clip": 0.0106744, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.01425576, + "balance_loss_mlp": 1.02182412, + "epoch": 0.4432887419209379, + "flos": 20083367335680.0, + "grad_norm": 1.7417411189418868, + "language_loss": 0.83409214, + "learning_rate": 2.3547768205264133e-06, + "loss": 0.85504282, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45507812, + "step": 7373, + "time_per_iteration": 3.8197414875030518 + }, + { + "auxiliary_loss_clip": 0.01012032, + "auxiliary_loss_mlp": 0.01007516, + "balance_loss_clip": 1.00616884, + "balance_loss_mlp": 1.00249815, + "epoch": 0.4433488651736059, + "flos": 70032032092800.0, + "grad_norm": 0.7627624683214604, + "language_loss": 0.55197746, + "learning_rate": 2.3544049837847708e-06, + "loss": 0.57217294, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.09521484, + "step": 7374, + "time_per_iteration": 3.0982959270477295 + }, + { + "auxiliary_loss_clip": 0.01068165, + "auxiliary_loss_mlp": 0.01029236, + "balance_loss_clip": 1.01571822, + "balance_loss_mlp": 1.02279496, + "epoch": 0.44340898842627385, + "flos": 16281745079040.0, + "grad_norm": 2.497983909450368, + "language_loss": 0.82322532, + "learning_rate": 2.354033134395325e-06, + "loss": 0.8441993, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 7375, + "time_per_iteration": 2.4058990478515625 + }, + { + "auxiliary_loss_clip": 0.01068717, + "auxiliary_loss_mlp": 0.01024418, + "balance_loss_clip": 1.01107883, + "balance_loss_mlp": 1.02277708, + "epoch": 0.4434691116789418, + "flos": 16611220379520.0, + "grad_norm": 1.8499241314826882, + "language_loss": 0.83558899, + "learning_rate": 2.3536612723713487e-06, + "loss": 0.85652035, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4609375, + "step": 7376, + "time_per_iteration": 2.347107410430908 + }, + { + "auxiliary_loss_clip": 0.01065915, + "auxiliary_loss_mlp": 0.01023815, + "balance_loss_clip": 1.01061296, + "balance_loss_mlp": 1.02213812, + "epoch": 0.4435292349316098, + "flos": 19062227623680.0, + "grad_norm": 1.7111886982081392, + "language_loss": 0.77528077, + "learning_rate": 2.353289397726111e-06, + "loss": 0.7961781, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4375, + "step": 7377, + "time_per_iteration": 2.4020791053771973 + }, + { + "auxiliary_loss_clip": 0.01063962, + "auxiliary_loss_mlp": 0.01023839, + "balance_loss_clip": 1.01183474, + "balance_loss_mlp": 1.02101302, + "epoch": 0.44358935818427775, + "flos": 21250269440640.0, + "grad_norm": 2.0099321764188662, + "language_loss": 0.75567853, + "learning_rate": 2.352917510472883e-06, + "loss": 0.77655661, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4296875, + "step": 7378, + "time_per_iteration": 2.3827764987945557 + }, + { + "auxiliary_loss_clip": 0.01065955, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.01716614, + "balance_loss_mlp": 1.02076399, + "epoch": 0.4436494814369457, + "flos": 12494471391360.0, + "grad_norm": 1.897390644277825, + "language_loss": 0.69179583, + "learning_rate": 2.3525456106249367e-06, + "loss": 0.71275747, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.453125, + "step": 7379, + "time_per_iteration": 2.4198546409606934 + }, + { + "auxiliary_loss_clip": 0.01067712, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.01617908, + "balance_loss_mlp": 1.02232003, + "epoch": 0.44370960468961373, + "flos": 23658717870720.0, + "grad_norm": 1.7349064043675366, + "language_loss": 0.73117042, + "learning_rate": 2.3521736981955454e-06, + "loss": 0.75213641, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.453125, + "step": 7380, + "time_per_iteration": 2.4257311820983887 + }, + { + "auxiliary_loss_clip": 0.01065351, + "auxiliary_loss_mlp": 0.01024015, + "balance_loss_clip": 1.01108682, + "balance_loss_mlp": 1.02125549, + "epoch": 0.4437697279422817, + "flos": 32415458526720.0, + "grad_norm": 1.4695416003358772, + "language_loss": 0.64793813, + "learning_rate": 2.35180177319798e-06, + "loss": 0.66883183, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44140625, + "step": 7381, + "time_per_iteration": 2.532421588897705 + }, + { + "auxiliary_loss_clip": 0.01067768, + "auxiliary_loss_mlp": 0.01027472, + "balance_loss_clip": 1.01463318, + "balance_loss_mlp": 1.02302909, + "epoch": 0.44382985119494966, + "flos": 18111926793600.0, + "grad_norm": 2.7929330144595346, + "language_loss": 0.80243814, + "learning_rate": 2.3514298356455145e-06, + "loss": 0.82339048, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44726562, + "step": 7382, + "time_per_iteration": 2.4057977199554443 + }, + { + "auxiliary_loss_clip": 0.01069739, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.02467597, + "balance_loss_mlp": 1.02402771, + "epoch": 0.44388997444761763, + "flos": 30772829969280.0, + "grad_norm": 2.5423022903758437, + "language_loss": 0.64859343, + "learning_rate": 2.351057885551422e-06, + "loss": 0.66967463, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 7383, + "time_per_iteration": 2.4870193004608154 + }, + { + "auxiliary_loss_clip": 0.01069633, + "auxiliary_loss_mlp": 0.01028187, + "balance_loss_clip": 1.01368511, + "balance_loss_mlp": 1.02273726, + "epoch": 0.4439500977002856, + "flos": 20338128593280.0, + "grad_norm": 2.706661740477055, + "language_loss": 0.76240945, + "learning_rate": 2.3506859229289768e-06, + "loss": 0.78338766, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46875, + "step": 7384, + "time_per_iteration": 2.3810484409332275 + }, + { + "auxiliary_loss_clip": 0.01066476, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.01868653, + "balance_loss_mlp": 1.02144694, + "epoch": 0.44401022095295356, + "flos": 20370318733440.0, + "grad_norm": 1.643307569788969, + "language_loss": 0.80701005, + "learning_rate": 2.3503139477914532e-06, + "loss": 0.82799655, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45117188, + "step": 7385, + "time_per_iteration": 2.3949904441833496 + }, + { + "auxiliary_loss_clip": 0.01067461, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.01547527, + "balance_loss_mlp": 1.02168655, + "epoch": 0.4440703442056215, + "flos": 20229583576320.0, + "grad_norm": 2.232673781752431, + "language_loss": 0.81511003, + "learning_rate": 2.349941960152126e-06, + "loss": 0.83607972, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45703125, + "step": 7386, + "time_per_iteration": 2.3828611373901367 + }, + { + "auxiliary_loss_clip": 0.01069426, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.01335835, + "balance_loss_mlp": 1.02240634, + "epoch": 0.4441304674582895, + "flos": 39493121299200.0, + "grad_norm": 2.2395838762557325, + "language_loss": 0.6999594, + "learning_rate": 2.34956996002427e-06, + "loss": 0.72093427, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46875, + "step": 7387, + "time_per_iteration": 2.631871461868286 + }, + { + "auxiliary_loss_clip": 0.01066259, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.01671219, + "balance_loss_mlp": 1.02056897, + "epoch": 0.44419059071095746, + "flos": 14828799271680.0, + "grad_norm": 1.9017915389270548, + "language_loss": 0.71551067, + "learning_rate": 2.3491979474211615e-06, + "loss": 0.7364732, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45703125, + "step": 7388, + "time_per_iteration": 2.371278762817383 + }, + { + "auxiliary_loss_clip": 0.01067849, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.01236844, + "balance_loss_mlp": 1.02155209, + "epoch": 0.4442507139636254, + "flos": 22636740286080.0, + "grad_norm": 1.6684838464196956, + "language_loss": 0.69193459, + "learning_rate": 2.3488259223560766e-06, + "loss": 0.71287334, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 7389, + "time_per_iteration": 2.416297674179077 + }, + { + "auxiliary_loss_clip": 0.01066841, + "auxiliary_loss_mlp": 0.01024416, + "balance_loss_clip": 1.0122447, + "balance_loss_mlp": 1.02164888, + "epoch": 0.4443108372162934, + "flos": 38289176375040.0, + "grad_norm": 1.77843332687156, + "language_loss": 0.74671531, + "learning_rate": 2.3484538848422913e-06, + "loss": 0.76762784, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.453125, + "step": 7390, + "time_per_iteration": 2.549236536026001 + }, + { + "auxiliary_loss_clip": 0.01065524, + "auxiliary_loss_mlp": 0.01025834, + "balance_loss_clip": 1.01294112, + "balance_loss_mlp": 1.0216887, + "epoch": 0.44437096046896135, + "flos": 17748027025920.0, + "grad_norm": 1.7330774251573322, + "language_loss": 0.81499702, + "learning_rate": 2.348081834893084e-06, + "loss": 0.83591056, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 7391, + "time_per_iteration": 2.409886598587036 + }, + { + "auxiliary_loss_clip": 0.0106706, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01538599, + "balance_loss_mlp": 1.0224967, + "epoch": 0.4444310837216293, + "flos": 13731583973760.0, + "grad_norm": 1.6627784752465362, + "language_loss": 0.73857999, + "learning_rate": 2.3477097725217306e-06, + "loss": 0.75953817, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4453125, + "step": 7392, + "time_per_iteration": 2.3926239013671875 + }, + { + "auxiliary_loss_clip": 0.01065231, + "auxiliary_loss_mlp": 0.01026704, + "balance_loss_clip": 1.01443744, + "balance_loss_mlp": 1.02063131, + "epoch": 0.44449120697429734, + "flos": 25009053592320.0, + "grad_norm": 1.7036048273060405, + "language_loss": 0.79416776, + "learning_rate": 2.3473376977415102e-06, + "loss": 0.81508708, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.4453125, + "step": 7393, + "time_per_iteration": 2.444464683532715 + }, + { + "auxiliary_loss_clip": 0.01069702, + "auxiliary_loss_mlp": 0.01025799, + "balance_loss_clip": 1.01157784, + "balance_loss_mlp": 1.02237284, + "epoch": 0.4445513302269653, + "flos": 32670324518400.0, + "grad_norm": 1.7486568112213898, + "language_loss": 0.80806756, + "learning_rate": 2.3469656105657004e-06, + "loss": 0.82902265, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47265625, + "step": 7394, + "time_per_iteration": 2.4748241901397705 + }, + { + "auxiliary_loss_clip": 0.01064256, + "auxiliary_loss_mlp": 0.01024445, + "balance_loss_clip": 1.01247668, + "balance_loss_mlp": 1.02054441, + "epoch": 0.44461145347963327, + "flos": 11655019728000.0, + "grad_norm": 1.9046138741452754, + "language_loss": 0.79471123, + "learning_rate": 2.346593511007581e-06, + "loss": 0.81559831, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.4375, + "step": 7395, + "time_per_iteration": 2.3722984790802 + }, + { + "auxiliary_loss_clip": 0.01066188, + "auxiliary_loss_mlp": 0.01024534, + "balance_loss_clip": 1.01156366, + "balance_loss_mlp": 1.02157795, + "epoch": 0.44467157673230123, + "flos": 20885706357120.0, + "grad_norm": 1.719555655816771, + "language_loss": 0.6867553, + "learning_rate": 2.3462213990804307e-06, + "loss": 0.70766246, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4453125, + "step": 7396, + "time_per_iteration": 2.43796706199646 + }, + { + "auxiliary_loss_clip": 0.01067494, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.01833808, + "balance_loss_mlp": 1.02148461, + "epoch": 0.4447316999849692, + "flos": 18545303900160.0, + "grad_norm": 1.6586918790268184, + "language_loss": 0.8086316, + "learning_rate": 2.345849274797529e-06, + "loss": 0.82962632, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 7397, + "time_per_iteration": 2.374906539916992 + }, + { + "auxiliary_loss_clip": 0.01066214, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01465297, + "balance_loss_mlp": 1.02229631, + "epoch": 0.44479182323763716, + "flos": 23767926203520.0, + "grad_norm": 1.984016563347239, + "language_loss": 0.6743629, + "learning_rate": 2.3454771381721566e-06, + "loss": 0.69529963, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43945312, + "step": 7398, + "time_per_iteration": 2.416959762573242 + }, + { + "auxiliary_loss_clip": 0.01066537, + "auxiliary_loss_mlp": 0.01027932, + "balance_loss_clip": 1.01592755, + "balance_loss_mlp": 1.02192974, + "epoch": 0.44485194649030513, + "flos": 16542930026880.0, + "grad_norm": 1.7895882434258052, + "language_loss": 0.69728124, + "learning_rate": 2.3451049892175934e-06, + "loss": 0.71822596, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4453125, + "step": 7399, + "time_per_iteration": 2.3654441833496094 + }, + { + "auxiliary_loss_clip": 0.0106558, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.01503444, + "balance_loss_mlp": 1.02224946, + "epoch": 0.4449120697429731, + "flos": 22599872023680.0, + "grad_norm": 1.7801823362201141, + "language_loss": 0.72440344, + "learning_rate": 2.3447328279471213e-06, + "loss": 0.74533725, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43359375, + "step": 7400, + "time_per_iteration": 3.788595676422119 + }, + { + "auxiliary_loss_clip": 0.01066785, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.01872981, + "balance_loss_mlp": 1.02224302, + "epoch": 0.44497219299564106, + "flos": 20004010081920.0, + "grad_norm": 1.7733866561682734, + "language_loss": 0.84876609, + "learning_rate": 2.3443606543740207e-06, + "loss": 0.86975533, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4453125, + "step": 7401, + "time_per_iteration": 2.3973746299743652 + }, + { + "auxiliary_loss_clip": 0.01064049, + "auxiliary_loss_mlp": 0.0102594, + "balance_loss_clip": 1.0141505, + "balance_loss_mlp": 1.02111936, + "epoch": 0.445032316248309, + "flos": 25593045770880.0, + "grad_norm": 1.7643134422458155, + "language_loss": 0.83420205, + "learning_rate": 2.3439884685115753e-06, + "loss": 0.85510194, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.4296875, + "step": 7402, + "time_per_iteration": 2.4332966804504395 + }, + { + "auxiliary_loss_clip": 0.01067897, + "auxiliary_loss_mlp": 0.01026938, + "balance_loss_clip": 1.01327705, + "balance_loss_mlp": 1.02248573, + "epoch": 0.445092439500977, + "flos": 21249396656640.0, + "grad_norm": 2.0944059522060297, + "language_loss": 0.77006471, + "learning_rate": 2.343616270373066e-06, + "loss": 0.791013, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45507812, + "step": 7403, + "time_per_iteration": 2.377959728240967 + }, + { + "auxiliary_loss_clip": 0.01010954, + "auxiliary_loss_mlp": 0.01001975, + "balance_loss_clip": 1.0006398, + "balance_loss_mlp": 1.00162435, + "epoch": 0.44515256275364495, + "flos": 57762051943680.0, + "grad_norm": 0.7808522633864254, + "language_loss": 0.60084581, + "learning_rate": 2.3432440599717748e-06, + "loss": 0.62097514, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.09375, + "step": 7404, + "time_per_iteration": 3.025211811065674 + }, + { + "auxiliary_loss_clip": 0.01068158, + "auxiliary_loss_mlp": 0.0102793, + "balance_loss_clip": 1.01388693, + "balance_loss_mlp": 1.02188015, + "epoch": 0.4452126860063129, + "flos": 15595107903360.0, + "grad_norm": 1.587767098765343, + "language_loss": 0.82278627, + "learning_rate": 2.3428718373209872e-06, + "loss": 0.84374714, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46289062, + "step": 7405, + "time_per_iteration": 2.3828985691070557 + }, + { + "auxiliary_loss_clip": 0.01064518, + "auxiliary_loss_mlp": 0.01022429, + "balance_loss_clip": 1.01026964, + "balance_loss_mlp": 1.0201174, + "epoch": 0.4452728092589809, + "flos": 21616298801280.0, + "grad_norm": 1.6152911726726356, + "language_loss": 0.75058639, + "learning_rate": 2.342499602433985e-06, + "loss": 0.77145588, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.44335938, + "step": 7406, + "time_per_iteration": 3.8140556812286377 + }, + { + "auxiliary_loss_clip": 0.01062131, + "auxiliary_loss_mlp": 0.01027113, + "balance_loss_clip": 1.01518583, + "balance_loss_mlp": 1.01932526, + "epoch": 0.4453329325116489, + "flos": 29496195861120.0, + "grad_norm": 1.5664786723582516, + "language_loss": 0.65247297, + "learning_rate": 2.3421273553240534e-06, + "loss": 0.67336547, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.42773438, + "step": 7407, + "time_per_iteration": 2.4533848762512207 + }, + { + "auxiliary_loss_clip": 0.01069183, + "auxiliary_loss_mlp": 0.01028167, + "balance_loss_clip": 1.0159359, + "balance_loss_mlp": 1.02374566, + "epoch": 0.44539305576431687, + "flos": 21360071266560.0, + "grad_norm": 1.9088487378642989, + "language_loss": 0.6798743, + "learning_rate": 2.3417550960044765e-06, + "loss": 0.7008478, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.453125, + "step": 7408, + "time_per_iteration": 2.4008548259735107 + }, + { + "auxiliary_loss_clip": 0.01064409, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.0156486, + "balance_loss_mlp": 1.01956904, + "epoch": 0.44545317901698483, + "flos": 41426017833600.0, + "grad_norm": 1.3768992506963276, + "language_loss": 0.71725965, + "learning_rate": 2.3413828244885386e-06, + "loss": 0.73819965, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44921875, + "step": 7409, + "time_per_iteration": 2.560231924057007 + }, + { + "auxiliary_loss_clip": 0.01066075, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.01539779, + "balance_loss_mlp": 1.02026987, + "epoch": 0.4455133022696528, + "flos": 22053900182400.0, + "grad_norm": 1.8965101848216948, + "language_loss": 0.6494292, + "learning_rate": 2.341010540789527e-06, + "loss": 0.67038208, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.45703125, + "step": 7410, + "time_per_iteration": 3.8014862537384033 + }, + { + "auxiliary_loss_clip": 0.01070748, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.02211428, + "epoch": 0.44557342552232077, + "flos": 23475842835840.0, + "grad_norm": 1.9355288418081675, + "language_loss": 0.74456894, + "learning_rate": 2.340638244920725e-06, + "loss": 0.76556164, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48632812, + "step": 7411, + "time_per_iteration": 2.4792771339416504 + }, + { + "auxiliary_loss_clip": 0.01064957, + "auxiliary_loss_mlp": 0.0102534, + "balance_loss_clip": 1.01320446, + "balance_loss_mlp": 1.02196229, + "epoch": 0.44563354877498873, + "flos": 19133694910080.0, + "grad_norm": 1.6907075950324055, + "language_loss": 0.78878963, + "learning_rate": 2.3402659368954214e-06, + "loss": 0.80969262, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4296875, + "step": 7412, + "time_per_iteration": 2.372171401977539 + }, + { + "auxiliary_loss_clip": 0.01066312, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.01447189, + "balance_loss_mlp": 1.02009785, + "epoch": 0.4456936720276567, + "flos": 13620699895680.0, + "grad_norm": 3.0612372222432565, + "language_loss": 0.79669106, + "learning_rate": 2.3398936167269016e-06, + "loss": 0.81762993, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.46289062, + "step": 7413, + "time_per_iteration": 3.757678508758545 + }, + { + "auxiliary_loss_clip": 0.01064721, + "auxiliary_loss_mlp": 0.01023003, + "balance_loss_clip": 1.01063502, + "balance_loss_mlp": 1.02129245, + "epoch": 0.44575379528032466, + "flos": 14713027603200.0, + "grad_norm": 2.705667316194214, + "language_loss": 0.76647013, + "learning_rate": 2.3395212844284525e-06, + "loss": 0.78734732, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.43359375, + "step": 7414, + "time_per_iteration": 2.340883731842041 + }, + { + "auxiliary_loss_clip": 0.01063966, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01867759, + "balance_loss_mlp": 1.02043581, + "epoch": 0.4458139185329926, + "flos": 24169532106240.0, + "grad_norm": 1.509815405464693, + "language_loss": 0.83319509, + "learning_rate": 2.339148940013362e-06, + "loss": 0.8541531, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43554688, + "step": 7415, + "time_per_iteration": 2.408250570297241 + }, + { + "auxiliary_loss_clip": 0.01065559, + "auxiliary_loss_mlp": 0.01025299, + "balance_loss_clip": 1.01208472, + "balance_loss_mlp": 1.02033997, + "epoch": 0.4458740417856606, + "flos": 21761153498880.0, + "grad_norm": 1.7225422618319908, + "language_loss": 0.84473431, + "learning_rate": 2.338776583494919e-06, + "loss": 0.86564291, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.453125, + "step": 7416, + "time_per_iteration": 2.394453525543213 + }, + { + "auxiliary_loss_clip": 0.01067058, + "auxiliary_loss_mlp": 0.01026415, + "balance_loss_clip": 1.01280737, + "balance_loss_mlp": 1.02036214, + "epoch": 0.44593416503832856, + "flos": 21067743519360.0, + "grad_norm": 1.5258376369739084, + "language_loss": 0.81026852, + "learning_rate": 2.3384042148864113e-06, + "loss": 0.83120322, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46679688, + "step": 7417, + "time_per_iteration": 2.4205923080444336 + }, + { + "auxiliary_loss_clip": 0.01067348, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.016729, + "balance_loss_mlp": 1.02130532, + "epoch": 0.4459942882909965, + "flos": 22599418176000.0, + "grad_norm": 1.9174297293525147, + "language_loss": 0.80711055, + "learning_rate": 2.338031834201127e-06, + "loss": 0.8280915, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4609375, + "step": 7418, + "time_per_iteration": 2.3873653411865234 + }, + { + "auxiliary_loss_clip": 0.01064476, + "auxiliary_loss_mlp": 0.0102518, + "balance_loss_clip": 1.01194787, + "balance_loss_mlp": 1.01996863, + "epoch": 0.4460544115436645, + "flos": 26504278922880.0, + "grad_norm": 2.121011864323281, + "language_loss": 0.77679944, + "learning_rate": 2.3376594414523565e-06, + "loss": 0.79769599, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4453125, + "step": 7419, + "time_per_iteration": 2.4086825847625732 + }, + { + "auxiliary_loss_clip": 0.01065818, + "auxiliary_loss_mlp": 0.01028427, + "balance_loss_clip": 1.01612461, + "balance_loss_mlp": 1.02141595, + "epoch": 0.4461145347963325, + "flos": 17603032682880.0, + "grad_norm": 1.5139665881215985, + "language_loss": 0.72160923, + "learning_rate": 2.3372870366533885e-06, + "loss": 0.74255168, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.44335938, + "step": 7420, + "time_per_iteration": 2.3763427734375 + }, + { + "auxiliary_loss_clip": 0.01064604, + "auxiliary_loss_mlp": 0.01024031, + "balance_loss_clip": 1.01148391, + "balance_loss_mlp": 1.02101302, + "epoch": 0.44617465804900047, + "flos": 27267061507200.0, + "grad_norm": 1.9268965603480086, + "language_loss": 0.75081676, + "learning_rate": 2.3369146198175136e-06, + "loss": 0.77170312, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43554688, + "step": 7421, + "time_per_iteration": 2.4258062839508057 + }, + { + "auxiliary_loss_clip": 0.0106235, + "auxiliary_loss_mlp": 0.01025295, + "balance_loss_clip": 1.01306391, + "balance_loss_mlp": 1.0207026, + "epoch": 0.44623478130166844, + "flos": 17785418958720.0, + "grad_norm": 1.798956783247094, + "language_loss": 0.76234698, + "learning_rate": 2.3365421909580234e-06, + "loss": 0.78322351, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41796875, + "step": 7422, + "time_per_iteration": 2.374551296234131 + }, + { + "auxiliary_loss_clip": 0.01064091, + "auxiliary_loss_mlp": 0.01028764, + "balance_loss_clip": 1.01616919, + "balance_loss_mlp": 1.02018595, + "epoch": 0.4462949045543364, + "flos": 23001896862720.0, + "grad_norm": 1.5399632569304773, + "language_loss": 0.64942569, + "learning_rate": 2.3361697500882074e-06, + "loss": 0.67035425, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4375, + "step": 7423, + "time_per_iteration": 2.415875196456909 + }, + { + "auxiliary_loss_clip": 0.01064397, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.01467907, + "balance_loss_mlp": 1.02104306, + "epoch": 0.44635502780700437, + "flos": 17819180110080.0, + "grad_norm": 1.489889305420742, + "language_loss": 0.79097605, + "learning_rate": 2.3357972972213585e-06, + "loss": 0.81189007, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.43359375, + "step": 7424, + "time_per_iteration": 2.3737576007843018 + }, + { + "auxiliary_loss_clip": 0.01062978, + "auxiliary_loss_mlp": 0.0102392, + "balance_loss_clip": 1.01273203, + "balance_loss_mlp": 1.02053189, + "epoch": 0.44641515105967233, + "flos": 26686804844160.0, + "grad_norm": 1.402091257410848, + "language_loss": 0.80391347, + "learning_rate": 2.3354248323707675e-06, + "loss": 0.82478249, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.42382812, + "step": 7425, + "time_per_iteration": 2.419194221496582 + }, + { + "auxiliary_loss_clip": 0.01064299, + "auxiliary_loss_mlp": 0.01025049, + "balance_loss_clip": 1.01267552, + "balance_loss_mlp": 1.02122712, + "epoch": 0.4464752743123403, + "flos": 18912415512960.0, + "grad_norm": 1.6722228217311743, + "language_loss": 0.77035224, + "learning_rate": 2.3350523555497265e-06, + "loss": 0.7912457, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.43164062, + "step": 7426, + "time_per_iteration": 2.381653070449829 + }, + { + "auxiliary_loss_clip": 0.01065774, + "auxiliary_loss_mlp": 0.01022622, + "balance_loss_clip": 1.01043248, + "balance_loss_mlp": 1.02124047, + "epoch": 0.44653539756500826, + "flos": 29569024690560.0, + "grad_norm": 1.5545826856573506, + "language_loss": 0.64741284, + "learning_rate": 2.3346798667715296e-06, + "loss": 0.66829681, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4453125, + "step": 7427, + "time_per_iteration": 2.427748203277588 + }, + { + "auxiliary_loss_clip": 0.01067156, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.01378012, + "balance_loss_mlp": 1.02257323, + "epoch": 0.44659552081767623, + "flos": 21467952967680.0, + "grad_norm": 1.5881611336115122, + "language_loss": 0.75383961, + "learning_rate": 2.3343073660494685e-06, + "loss": 0.77477962, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.44726562, + "step": 7428, + "time_per_iteration": 2.3805835247039795 + }, + { + "auxiliary_loss_clip": 0.01065233, + "auxiliary_loss_mlp": 0.01022385, + "balance_loss_clip": 1.01026773, + "balance_loss_mlp": 1.02082944, + "epoch": 0.4466556440703442, + "flos": 17930902060800.0, + "grad_norm": 1.7876084188504164, + "language_loss": 0.77410007, + "learning_rate": 2.333934853396838e-06, + "loss": 0.79497629, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.44335938, + "step": 7429, + "time_per_iteration": 2.3603830337524414 + }, + { + "auxiliary_loss_clip": 0.01068329, + "auxiliary_loss_mlp": 0.01025569, + "balance_loss_clip": 1.01187766, + "balance_loss_mlp": 1.02247047, + "epoch": 0.44671576732301216, + "flos": 21106322438400.0, + "grad_norm": 4.3803270446480145, + "language_loss": 0.90983748, + "learning_rate": 2.3335623288269313e-06, + "loss": 0.93077648, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 7430, + "time_per_iteration": 2.4150888919830322 + }, + { + "auxiliary_loss_clip": 0.01068247, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.01676309, + "balance_loss_mlp": 1.02132058, + "epoch": 0.4467758905756801, + "flos": 23507928241920.0, + "grad_norm": 2.630449862987803, + "language_loss": 0.79729092, + "learning_rate": 2.333189792353043e-06, + "loss": 0.81828403, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46875, + "step": 7431, + "time_per_iteration": 2.3800759315490723 + }, + { + "auxiliary_loss_clip": 0.0106836, + "auxiliary_loss_mlp": 0.01027352, + "balance_loss_clip": 1.01363659, + "balance_loss_mlp": 1.02208495, + "epoch": 0.4468360138283481, + "flos": 18733031614080.0, + "grad_norm": 3.096421369659388, + "language_loss": 0.79878294, + "learning_rate": 2.3328172439884687e-06, + "loss": 0.81974006, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 7432, + "time_per_iteration": 2.3540797233581543 + }, + { + "auxiliary_loss_clip": 0.01066986, + "auxiliary_loss_mlp": 0.01027028, + "balance_loss_clip": 1.01473117, + "balance_loss_mlp": 1.02110314, + "epoch": 0.4468961370810161, + "flos": 23476017392640.0, + "grad_norm": 1.9016111396824646, + "language_loss": 0.76408058, + "learning_rate": 2.3324446837465023e-06, + "loss": 0.78502071, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.45898438, + "step": 7433, + "time_per_iteration": 2.382256269454956 + }, + { + "auxiliary_loss_clip": 0.01060957, + "auxiliary_loss_mlp": 0.01025819, + "balance_loss_clip": 1.01500702, + "balance_loss_mlp": 1.01996207, + "epoch": 0.4469562603336841, + "flos": 30073903994880.0, + "grad_norm": 1.7488173185394869, + "language_loss": 0.77025628, + "learning_rate": 2.332072111640441e-06, + "loss": 0.79112405, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.41015625, + "step": 7434, + "time_per_iteration": 2.4539794921875 + }, + { + "auxiliary_loss_clip": 0.01068904, + "auxiliary_loss_mlp": 0.01024966, + "balance_loss_clip": 1.0126276, + "balance_loss_mlp": 1.02337468, + "epoch": 0.44701638358635204, + "flos": 22455296616960.0, + "grad_norm": 2.0952394646014207, + "language_loss": 0.64245927, + "learning_rate": 2.33169952768358e-06, + "loss": 0.66339803, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.45507812, + "step": 7435, + "time_per_iteration": 2.3765156269073486 + }, + { + "auxiliary_loss_clip": 0.01067227, + "auxiliary_loss_mlp": 0.0102286, + "balance_loss_clip": 1.00947285, + "balance_loss_mlp": 1.0218544, + "epoch": 0.44707650683902, + "flos": 24056797726080.0, + "grad_norm": 1.6093968550592719, + "language_loss": 0.83689892, + "learning_rate": 2.331326931889215e-06, + "loss": 0.85779977, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.453125, + "step": 7436, + "time_per_iteration": 2.436046600341797 + }, + { + "auxiliary_loss_clip": 0.01069914, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.01381493, + "balance_loss_mlp": 1.02316988, + "epoch": 0.44713663009168797, + "flos": 23765866433280.0, + "grad_norm": 1.6116800558032678, + "language_loss": 0.7963655, + "learning_rate": 2.3309543242706454e-06, + "loss": 0.81734133, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46875, + "step": 7437, + "time_per_iteration": 2.385275363922119 + }, + { + "auxiliary_loss_clip": 0.01068321, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.01459622, + "balance_loss_mlp": 1.0214026, + "epoch": 0.44719675334435594, + "flos": 24498099711360.0, + "grad_norm": 1.7423875500702743, + "language_loss": 0.71767777, + "learning_rate": 2.3305817048411667e-06, + "loss": 0.73863947, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46875, + "step": 7438, + "time_per_iteration": 2.404627799987793 + }, + { + "auxiliary_loss_clip": 0.01066334, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.01963711, + "balance_loss_mlp": 1.02071524, + "epoch": 0.4472568765970239, + "flos": 29780668552320.0, + "grad_norm": 1.6094533652295375, + "language_loss": 0.67031348, + "learning_rate": 2.3302090736140772e-06, + "loss": 0.69130707, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45703125, + "step": 7439, + "time_per_iteration": 3.842332124710083 + }, + { + "auxiliary_loss_clip": 0.01071059, + "auxiliary_loss_mlp": 0.01028025, + "balance_loss_clip": 1.01423812, + "balance_loss_mlp": 1.02433014, + "epoch": 0.44731699984969187, + "flos": 24642011802240.0, + "grad_norm": 1.6342755324778229, + "language_loss": 0.73899746, + "learning_rate": 2.3298364306026757e-06, + "loss": 0.75998825, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46679688, + "step": 7440, + "time_per_iteration": 2.4054641723632812 + }, + { + "auxiliary_loss_clip": 0.01067256, + "auxiliary_loss_mlp": 0.01025647, + "balance_loss_clip": 1.01291502, + "balance_loss_mlp": 1.02175641, + "epoch": 0.44737712310235983, + "flos": 29454544742400.0, + "grad_norm": 1.7088143293566926, + "language_loss": 0.74198139, + "learning_rate": 2.32946377582026e-06, + "loss": 0.76291043, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.45507812, + "step": 7441, + "time_per_iteration": 2.4351255893707275 + }, + { + "auxiliary_loss_clip": 0.01068725, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.01524472, + "balance_loss_mlp": 1.02140486, + "epoch": 0.4474372463550278, + "flos": 24895760630400.0, + "grad_norm": 1.7136011474904076, + "language_loss": 0.74562764, + "learning_rate": 2.32909110928013e-06, + "loss": 0.76659954, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.47265625, + "step": 7442, + "time_per_iteration": 2.3995230197906494 + }, + { + "auxiliary_loss_clip": 0.01068504, + "auxiliary_loss_mlp": 0.01026883, + "balance_loss_clip": 1.01371646, + "balance_loss_mlp": 1.02088499, + "epoch": 0.44749736960769576, + "flos": 33180231058560.0, + "grad_norm": 1.9275795515448118, + "language_loss": 0.74149525, + "learning_rate": 2.3287184309955847e-06, + "loss": 0.76244903, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4765625, + "step": 7443, + "time_per_iteration": 2.454193592071533 + }, + { + "auxiliary_loss_clip": 0.01065588, + "auxiliary_loss_mlp": 0.01024823, + "balance_loss_clip": 1.01114941, + "balance_loss_mlp": 1.0199976, + "epoch": 0.4475574928603637, + "flos": 21070676073600.0, + "grad_norm": 1.6762979730623493, + "language_loss": 0.7771107, + "learning_rate": 2.328345740979924e-06, + "loss": 0.79801482, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 7444, + "time_per_iteration": 2.3947010040283203 + }, + { + "auxiliary_loss_clip": 0.01063814, + "auxiliary_loss_mlp": 0.01024426, + "balance_loss_clip": 1.01099706, + "balance_loss_mlp": 1.0200634, + "epoch": 0.4476176161130317, + "flos": 21861703814400.0, + "grad_norm": 4.350286050030473, + "language_loss": 0.84955084, + "learning_rate": 2.3279730392464486e-06, + "loss": 0.87043333, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4375, + "step": 7445, + "time_per_iteration": 2.340634822845459 + }, + { + "auxiliary_loss_clip": 0.01066636, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.01167941, + "balance_loss_mlp": 1.02168107, + "epoch": 0.4476777393656997, + "flos": 22527566864640.0, + "grad_norm": 2.254514953158344, + "language_loss": 0.64012766, + "learning_rate": 2.3276003258084593e-06, + "loss": 0.66104615, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44921875, + "step": 7446, + "time_per_iteration": 3.7676007747650146 + }, + { + "auxiliary_loss_clip": 0.01010919, + "auxiliary_loss_mlp": 0.0100213, + "balance_loss_clip": 1.00075936, + "balance_loss_mlp": 1.00185442, + "epoch": 0.4477378626183677, + "flos": 49014283507200.0, + "grad_norm": 0.7386722227288544, + "language_loss": 0.50245178, + "learning_rate": 2.327227600679257e-06, + "loss": 0.52258229, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.09082031, + "step": 7447, + "time_per_iteration": 2.8881587982177734 + }, + { + "auxiliary_loss_clip": 0.0101115, + "auxiliary_loss_mlp": 0.01001483, + "balance_loss_clip": 0.99998122, + "balance_loss_mlp": 1.00195587, + "epoch": 0.44779798587103564, + "flos": 56538868993920.0, + "grad_norm": 0.7780585317626533, + "language_loss": 0.59287417, + "learning_rate": 2.326854863872143e-06, + "loss": 0.61300051, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.09179688, + "step": 7448, + "time_per_iteration": 3.0978567600250244 + }, + { + "auxiliary_loss_clip": 0.01064557, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.01748776, + "balance_loss_mlp": 1.02080536, + "epoch": 0.4478581091237036, + "flos": 46496803167360.0, + "grad_norm": 1.686666315578348, + "language_loss": 0.58651805, + "learning_rate": 2.32648211540042e-06, + "loss": 0.60745943, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4375, + "step": 7449, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.01067334, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.01524591, + "balance_loss_mlp": 1.02147269, + "epoch": 0.4479182323763716, + "flos": 20813296464000.0, + "grad_norm": 1.7339093198745914, + "language_loss": 0.8031528, + "learning_rate": 2.32610935527739e-06, + "loss": 0.824103, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.45898438, + "step": 7450, + "time_per_iteration": 3.7455461025238037 + }, + { + "auxiliary_loss_clip": 0.01065696, + "auxiliary_loss_mlp": 0.01024103, + "balance_loss_clip": 1.01210451, + "balance_loss_mlp": 1.02210307, + "epoch": 0.44797835562903954, + "flos": 14245121295360.0, + "grad_norm": 2.099369991707041, + "language_loss": 0.79522491, + "learning_rate": 2.3257365835163562e-06, + "loss": 0.81612289, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4375, + "step": 7451, + "time_per_iteration": 2.3527753353118896 + }, + { + "auxiliary_loss_clip": 0.01010667, + "auxiliary_loss_mlp": 0.01003402, + "balance_loss_clip": 1.00219834, + "balance_loss_mlp": 1.00160146, + "epoch": 0.4480384788817075, + "flos": 63531414138240.0, + "grad_norm": 0.8624207842169236, + "language_loss": 0.62737525, + "learning_rate": 2.325363800130621e-06, + "loss": 0.64751601, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.09082031, + "step": 7452, + "time_per_iteration": 4.475640296936035 + }, + { + "auxiliary_loss_clip": 0.01066754, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.01624274, + "balance_loss_mlp": 1.0217042, + "epoch": 0.44809860213437547, + "flos": 21651561141120.0, + "grad_norm": 1.750565727181812, + "language_loss": 0.62883985, + "learning_rate": 2.324991005133489e-06, + "loss": 0.64980567, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 7453, + "time_per_iteration": 2.3689467906951904 + }, + { + "auxiliary_loss_clip": 0.01010715, + "auxiliary_loss_mlp": 0.01004378, + "balance_loss_clip": 1.00320947, + "balance_loss_mlp": 1.00147069, + "epoch": 0.44815872538704343, + "flos": 69187308814080.0, + "grad_norm": 0.774442871168118, + "language_loss": 0.57074136, + "learning_rate": 2.324618198538264e-06, + "loss": 0.59089231, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.09228516, + "step": 7454, + "time_per_iteration": 2.92692232131958 + }, + { + "auxiliary_loss_clip": 0.01066223, + "auxiliary_loss_mlp": 0.01027522, + "balance_loss_clip": 1.01473689, + "balance_loss_mlp": 1.02113938, + "epoch": 0.4482188486397114, + "flos": 12597640058880.0, + "grad_norm": 2.5130959256238503, + "language_loss": 0.74808848, + "learning_rate": 2.3242453803582505e-06, + "loss": 0.76902586, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.45117188, + "step": 7455, + "time_per_iteration": 2.314444065093994 + }, + { + "auxiliary_loss_clip": 0.01066441, + "auxiliary_loss_mlp": 0.01026971, + "balance_loss_clip": 1.01454329, + "balance_loss_mlp": 1.02203739, + "epoch": 0.44827897189237936, + "flos": 34056760452480.0, + "grad_norm": 1.6331632395112727, + "language_loss": 0.75710458, + "learning_rate": 2.3238725506067535e-06, + "loss": 0.77803868, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4453125, + "step": 7456, + "time_per_iteration": 2.484254837036133 + }, + { + "auxiliary_loss_clip": 0.01067896, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.01542592, + "balance_loss_mlp": 1.02352726, + "epoch": 0.44833909514504733, + "flos": 25146472170240.0, + "grad_norm": 1.9509177238559519, + "language_loss": 0.76273775, + "learning_rate": 2.3234997092970786e-06, + "loss": 0.78369844, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4453125, + "step": 7457, + "time_per_iteration": 2.374163866043091 + }, + { + "auxiliary_loss_clip": 0.01066781, + "auxiliary_loss_mlp": 0.01027208, + "balance_loss_clip": 1.01312327, + "balance_loss_mlp": 1.02172685, + "epoch": 0.4483992183977153, + "flos": 16179065170560.0, + "grad_norm": 1.8381312936301486, + "language_loss": 0.68006819, + "learning_rate": 2.3231268564425305e-06, + "loss": 0.70100808, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45117188, + "step": 7458, + "time_per_iteration": 2.392111301422119 + }, + { + "auxiliary_loss_clip": 0.01068788, + "auxiliary_loss_mlp": 0.01025297, + "balance_loss_clip": 1.01080132, + "balance_loss_mlp": 1.0218935, + "epoch": 0.44845934165038326, + "flos": 17745164294400.0, + "grad_norm": 1.6164400473684062, + "language_loss": 0.71735036, + "learning_rate": 2.322753992056417e-06, + "loss": 0.7382912, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46875, + "step": 7459, + "time_per_iteration": 2.3928184509277344 + }, + { + "auxiliary_loss_clip": 0.01065044, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.01681972, + "balance_loss_mlp": 1.02028227, + "epoch": 0.4485194649030513, + "flos": 21834820200960.0, + "grad_norm": 1.9564366870527274, + "language_loss": 0.69434905, + "learning_rate": 2.3223811161520425e-06, + "loss": 0.7152977, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44726562, + "step": 7460, + "time_per_iteration": 2.3897488117218018 + }, + { + "auxiliary_loss_clip": 0.01065535, + "auxiliary_loss_mlp": 0.01026429, + "balance_loss_clip": 1.01330972, + "balance_loss_mlp": 1.02092052, + "epoch": 0.44857958815571924, + "flos": 20083472069760.0, + "grad_norm": 2.100753063850413, + "language_loss": 0.77053946, + "learning_rate": 2.3220082287427163e-06, + "loss": 0.79145908, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4453125, + "step": 7461, + "time_per_iteration": 2.3846757411956787 + }, + { + "auxiliary_loss_clip": 0.0106779, + "auxiliary_loss_mlp": 0.01026317, + "balance_loss_clip": 1.01345396, + "balance_loss_mlp": 1.02138054, + "epoch": 0.4486397114083872, + "flos": 27052275623040.0, + "grad_norm": 1.6468315496895474, + "language_loss": 0.67633581, + "learning_rate": 2.321635329841745e-06, + "loss": 0.69727683, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.46484375, + "step": 7462, + "time_per_iteration": 2.5074973106384277 + }, + { + "auxiliary_loss_clip": 0.01010721, + "auxiliary_loss_mlp": 0.01005655, + "balance_loss_clip": 1.00461209, + "balance_loss_mlp": 1.00177395, + "epoch": 0.4486998346610552, + "flos": 67318164155520.0, + "grad_norm": 0.744286141808646, + "language_loss": 0.54432946, + "learning_rate": 2.3212624194624354e-06, + "loss": 0.56449324, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.08935547, + "step": 7463, + "time_per_iteration": 3.0197391510009766 + }, + { + "auxiliary_loss_clip": 0.01066146, + "auxiliary_loss_mlp": 0.01028861, + "balance_loss_clip": 1.01614702, + "balance_loss_mlp": 1.02180541, + "epoch": 0.44875995791372314, + "flos": 27635569574400.0, + "grad_norm": 1.7652195384974754, + "language_loss": 0.75486851, + "learning_rate": 2.3208894976180965e-06, + "loss": 0.77581859, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 7464, + "time_per_iteration": 2.4302866458892822 + }, + { + "auxiliary_loss_clip": 0.01065326, + "auxiliary_loss_mlp": 0.01026321, + "balance_loss_clip": 1.01403689, + "balance_loss_mlp": 1.02212453, + "epoch": 0.4488200811663911, + "flos": 13005111070080.0, + "grad_norm": 2.163235677362432, + "language_loss": 0.74059021, + "learning_rate": 2.3205165643220364e-06, + "loss": 0.76150668, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.43164062, + "step": 7465, + "time_per_iteration": 2.347649335861206 + }, + { + "auxiliary_loss_clip": 0.01070182, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.01668823, + "balance_loss_mlp": 1.02163076, + "epoch": 0.44888020441905907, + "flos": 27488759840640.0, + "grad_norm": 2.5082058069679887, + "language_loss": 0.80114162, + "learning_rate": 2.3201436195875655e-06, + "loss": 0.82214916, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48632812, + "step": 7466, + "time_per_iteration": 2.415229558944702 + }, + { + "auxiliary_loss_clip": 0.01065415, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.0195179, + "balance_loss_mlp": 1.02104831, + "epoch": 0.44894032767172704, + "flos": 18258701616000.0, + "grad_norm": 2.4709024788699723, + "language_loss": 0.80755162, + "learning_rate": 2.3197706634279916e-06, + "loss": 0.82852799, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 7467, + "time_per_iteration": 2.3412058353424072 + }, + { + "auxiliary_loss_clip": 0.01063728, + "auxiliary_loss_mlp": 0.01024886, + "balance_loss_clip": 1.01319814, + "balance_loss_mlp": 1.02165627, + "epoch": 0.449000450924395, + "flos": 21578767223040.0, + "grad_norm": 2.056395923673777, + "language_loss": 0.74676496, + "learning_rate": 2.3193976958566256e-06, + "loss": 0.76765108, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.421875, + "step": 7468, + "time_per_iteration": 2.382798671722412 + }, + { + "auxiliary_loss_clip": 0.01064337, + "auxiliary_loss_mlp": 0.01023119, + "balance_loss_clip": 1.01155543, + "balance_loss_mlp": 1.02111578, + "epoch": 0.44906057417706297, + "flos": 17966932450560.0, + "grad_norm": 14.243577795934891, + "language_loss": 0.7059238, + "learning_rate": 2.3190247168867775e-06, + "loss": 0.72679842, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.43164062, + "step": 7469, + "time_per_iteration": 2.3477261066436768 + }, + { + "auxiliary_loss_clip": 0.01066295, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.01462841, + "balance_loss_mlp": 1.02142823, + "epoch": 0.44912069742973093, + "flos": 20046324516480.0, + "grad_norm": 1.7283279246377228, + "language_loss": 0.72130698, + "learning_rate": 2.3186517265317575e-06, + "loss": 0.74225003, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44921875, + "step": 7470, + "time_per_iteration": 2.3661904335021973 + }, + { + "auxiliary_loss_clip": 0.01064226, + "auxiliary_loss_mlp": 0.01025219, + "balance_loss_clip": 1.01151013, + "balance_loss_mlp": 1.01899791, + "epoch": 0.4491808206823989, + "flos": 21032446268160.0, + "grad_norm": 1.914018943855317, + "language_loss": 0.77004313, + "learning_rate": 2.3182787248048776e-06, + "loss": 0.79093754, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.453125, + "step": 7471, + "time_per_iteration": 2.371919631958008 + }, + { + "auxiliary_loss_clip": 0.01065047, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01495028, + "balance_loss_mlp": 1.02060139, + "epoch": 0.44924094393506686, + "flos": 22966006118400.0, + "grad_norm": 2.2783964101594223, + "language_loss": 0.70204192, + "learning_rate": 2.317905711719448e-06, + "loss": 0.72297132, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44335938, + "step": 7472, + "time_per_iteration": 2.4122962951660156 + }, + { + "auxiliary_loss_clip": 0.01010136, + "auxiliary_loss_mlp": 0.01001844, + "balance_loss_clip": 1.00082481, + "balance_loss_mlp": 1.00137258, + "epoch": 0.4493010671877349, + "flos": 59230323838080.0, + "grad_norm": 0.7416948376921159, + "language_loss": 0.61677486, + "learning_rate": 2.3175326872887823e-06, + "loss": 0.6368947, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.08789062, + "step": 7473, + "time_per_iteration": 2.9795444011688232 + }, + { + "auxiliary_loss_clip": 0.01065644, + "auxiliary_loss_mlp": 0.01026986, + "balance_loss_clip": 1.01421821, + "balance_loss_mlp": 1.02120459, + "epoch": 0.44936119044040285, + "flos": 18003905447040.0, + "grad_norm": 1.815709442564319, + "language_loss": 0.77714097, + "learning_rate": 2.3171596515261907e-06, + "loss": 0.79806733, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4453125, + "step": 7474, + "time_per_iteration": 2.3624000549316406 + }, + { + "auxiliary_loss_clip": 0.01064146, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.02106535, + "epoch": 0.4494213136930708, + "flos": 21250758199680.0, + "grad_norm": 1.683365226145844, + "language_loss": 0.72058392, + "learning_rate": 2.3167866044449876e-06, + "loss": 0.7415036, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43164062, + "step": 7475, + "time_per_iteration": 2.3743896484375 + }, + { + "auxiliary_loss_clip": 0.01065393, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.01677525, + "balance_loss_mlp": 1.02032197, + "epoch": 0.4494814369457388, + "flos": 27417432199680.0, + "grad_norm": 1.8192616935533845, + "language_loss": 0.76565444, + "learning_rate": 2.3164135460584853e-06, + "loss": 0.78660113, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44921875, + "step": 7476, + "time_per_iteration": 2.4352736473083496 + }, + { + "auxiliary_loss_clip": 0.01067181, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.01806152, + "balance_loss_mlp": 1.02054858, + "epoch": 0.44954156019840674, + "flos": 22853027358720.0, + "grad_norm": 1.9943568998788965, + "language_loss": 0.7174269, + "learning_rate": 2.316040476379998e-06, + "loss": 0.73842227, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46679688, + "step": 7477, + "time_per_iteration": 2.3991780281066895 + }, + { + "auxiliary_loss_clip": 0.01070568, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.01322079, + "balance_loss_mlp": 1.02364254, + "epoch": 0.4496016834510747, + "flos": 17200623818880.0, + "grad_norm": 2.011699921105981, + "language_loss": 0.67688632, + "learning_rate": 2.3156673954228385e-06, + "loss": 0.69785941, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46875, + "step": 7478, + "time_per_iteration": 3.7616255283355713 + }, + { + "auxiliary_loss_clip": 0.01062275, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.01576805, + "balance_loss_mlp": 1.02012968, + "epoch": 0.4496618067037427, + "flos": 18915627358080.0, + "grad_norm": 1.771304044095358, + "language_loss": 0.89294225, + "learning_rate": 2.315294303200322e-06, + "loss": 0.91384768, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.421875, + "step": 7479, + "time_per_iteration": 2.3589911460876465 + }, + { + "auxiliary_loss_clip": 0.01069401, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.01939428, + "balance_loss_mlp": 1.0237124, + "epoch": 0.44972192995641064, + "flos": 21030630877440.0, + "grad_norm": 1.6737844193709546, + "language_loss": 0.75201118, + "learning_rate": 2.314921199725762e-06, + "loss": 0.77303052, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.45703125, + "step": 7480, + "time_per_iteration": 2.4035439491271973 + }, + { + "auxiliary_loss_clip": 0.01067232, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.01446986, + "balance_loss_mlp": 1.02066231, + "epoch": 0.4497820532090786, + "flos": 20776044176640.0, + "grad_norm": 2.1698106702408944, + "language_loss": 0.79991335, + "learning_rate": 2.3145480850124754e-06, + "loss": 0.82086802, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46484375, + "step": 7481, + "time_per_iteration": 2.3548061847686768 + }, + { + "auxiliary_loss_clip": 0.01063835, + "auxiliary_loss_mlp": 0.0102402, + "balance_loss_clip": 1.01098442, + "balance_loss_mlp": 1.02095616, + "epoch": 0.44984217646174657, + "flos": 33801196233600.0, + "grad_norm": 1.8561109894354673, + "language_loss": 0.68817341, + "learning_rate": 2.3141749590737763e-06, + "loss": 0.70905197, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 7482, + "time_per_iteration": 2.482841968536377 + }, + { + "auxiliary_loss_clip": 0.01066876, + "auxiliary_loss_mlp": 0.01029218, + "balance_loss_clip": 1.01623058, + "balance_loss_mlp": 1.02154469, + "epoch": 0.44990229971441453, + "flos": 15517600951680.0, + "grad_norm": 2.2580716057004517, + "language_loss": 0.84532452, + "learning_rate": 2.313801821922981e-06, + "loss": 0.86628544, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.453125, + "step": 7483, + "time_per_iteration": 2.3468780517578125 + }, + { + "auxiliary_loss_clip": 0.01071836, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.01770902, + "balance_loss_mlp": 1.02358079, + "epoch": 0.4499624229670825, + "flos": 29860619299200.0, + "grad_norm": 1.7461590477016598, + "language_loss": 0.80613953, + "learning_rate": 2.3134286735734065e-06, + "loss": 0.8271724, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.48242188, + "step": 7484, + "time_per_iteration": 2.489485025405884 + }, + { + "auxiliary_loss_clip": 0.01069398, + "auxiliary_loss_mlp": 0.01023347, + "balance_loss_clip": 1.00824308, + "balance_loss_mlp": 1.02108693, + "epoch": 0.45002254621975046, + "flos": 18512729735040.0, + "grad_norm": 6.531895219045823, + "language_loss": 0.75371158, + "learning_rate": 2.3130555140383678e-06, + "loss": 0.77463901, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.484375, + "step": 7485, + "time_per_iteration": 2.3340041637420654 + }, + { + "auxiliary_loss_clip": 0.01010289, + "auxiliary_loss_mlp": 0.01003721, + "balance_loss_clip": 1.00260603, + "balance_loss_mlp": 1.00123227, + "epoch": 0.4500826694724185, + "flos": 70417334390400.0, + "grad_norm": 0.7872508399012341, + "language_loss": 0.58650196, + "learning_rate": 2.312682343331184e-06, + "loss": 0.60664207, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.09082031, + "step": 7486, + "time_per_iteration": 4.374190807342529 + }, + { + "auxiliary_loss_clip": 0.01066922, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.01298308, + "balance_loss_mlp": 1.02156258, + "epoch": 0.45014279272508645, + "flos": 15777982938240.0, + "grad_norm": 4.306257996183545, + "language_loss": 0.82734627, + "learning_rate": 2.312309161465171e-06, + "loss": 0.84828079, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.453125, + "step": 7487, + "time_per_iteration": 2.334184408187866 + }, + { + "auxiliary_loss_clip": 0.01067649, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.0127852, + "balance_loss_mlp": 1.02278543, + "epoch": 0.4502029159777544, + "flos": 21798475608960.0, + "grad_norm": 1.9006218095706755, + "language_loss": 0.77128959, + "learning_rate": 2.311935968453648e-06, + "loss": 0.79222584, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44726562, + "step": 7488, + "time_per_iteration": 2.408372163772583 + }, + { + "auxiliary_loss_clip": 0.01071401, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.0170306, + "balance_loss_mlp": 1.0232842, + "epoch": 0.4502630392304224, + "flos": 28766685669120.0, + "grad_norm": 2.1559619159335726, + "language_loss": 0.7206015, + "learning_rate": 2.311562764309932e-06, + "loss": 0.74162763, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48046875, + "step": 7489, + "time_per_iteration": 2.431546926498413 + }, + { + "auxiliary_loss_clip": 0.01069995, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.01670027, + "balance_loss_mlp": 1.02313375, + "epoch": 0.45032316248309034, + "flos": 15843480382080.0, + "grad_norm": 1.9588992176651805, + "language_loss": 0.690956, + "learning_rate": 2.311189549047343e-06, + "loss": 0.71197951, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.46875, + "step": 7490, + "time_per_iteration": 3.705183982849121 + }, + { + "auxiliary_loss_clip": 0.0100959, + "auxiliary_loss_mlp": 0.01000664, + "balance_loss_clip": 0.99959701, + "balance_loss_mlp": 1.00078154, + "epoch": 0.4503832857357583, + "flos": 57850311594240.0, + "grad_norm": 0.7459974533740769, + "language_loss": 0.56665355, + "learning_rate": 2.3108163226791994e-06, + "loss": 0.58675611, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.08789062, + "step": 7491, + "time_per_iteration": 3.0012550354003906 + }, + { + "auxiliary_loss_clip": 0.01064464, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.0156188, + "balance_loss_mlp": 1.02114153, + "epoch": 0.4504434089884263, + "flos": 23876959979520.0, + "grad_norm": 1.6423918982209609, + "language_loss": 0.79396516, + "learning_rate": 2.3104430852188206e-06, + "loss": 0.81488687, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.43359375, + "step": 7492, + "time_per_iteration": 3.7549972534179688 + }, + { + "auxiliary_loss_clip": 0.01069229, + "auxiliary_loss_mlp": 0.01028295, + "balance_loss_clip": 1.01440108, + "balance_loss_mlp": 1.02143908, + "epoch": 0.45050353224109424, + "flos": 17784127238400.0, + "grad_norm": 2.5083463513447843, + "language_loss": 0.92497772, + "learning_rate": 2.3100698366795266e-06, + "loss": 0.94595295, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.47851562, + "step": 7493, + "time_per_iteration": 2.3359549045562744 + }, + { + "auxiliary_loss_clip": 0.01009824, + "auxiliary_loss_mlp": 0.01000949, + "balance_loss_clip": 0.99972743, + "balance_loss_mlp": 1.00108242, + "epoch": 0.4505636554937622, + "flos": 65060330797440.0, + "grad_norm": 0.8041004873201066, + "language_loss": 0.62871397, + "learning_rate": 2.309696577074638e-06, + "loss": 0.64882171, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.08740234, + "step": 7494, + "time_per_iteration": 3.0702004432678223 + }, + { + "auxiliary_loss_clip": 0.01065935, + "auxiliary_loss_mlp": 0.0102733, + "balance_loss_clip": 1.01455724, + "balance_loss_mlp": 1.02222002, + "epoch": 0.45062377874643017, + "flos": 22198999259520.0, + "grad_norm": 1.435324189091235, + "language_loss": 0.81909114, + "learning_rate": 2.3093233064174747e-06, + "loss": 0.84002382, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7495, + "time_per_iteration": 2.40206241607666 + }, + { + "auxiliary_loss_clip": 0.01067884, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.01301682, + "balance_loss_mlp": 1.02142012, + "epoch": 0.45068390199909814, + "flos": 37668769781760.0, + "grad_norm": 1.836464648819498, + "language_loss": 0.85520303, + "learning_rate": 2.308950024721359e-06, + "loss": 0.8761543, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46484375, + "step": 7496, + "time_per_iteration": 2.545952558517456 + }, + { + "auxiliary_loss_clip": 0.01067935, + "auxiliary_loss_mlp": 0.01027093, + "balance_loss_clip": 1.01348507, + "balance_loss_mlp": 1.02181816, + "epoch": 0.4507440252517661, + "flos": 22301609345280.0, + "grad_norm": 2.1717148537557542, + "language_loss": 0.76492083, + "learning_rate": 2.308576731999611e-06, + "loss": 0.78587109, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 7497, + "time_per_iteration": 2.369039535522461 + }, + { + "auxiliary_loss_clip": 0.01066667, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.01642275, + "balance_loss_mlp": 1.02090812, + "epoch": 0.45080414850443407, + "flos": 13187532257280.0, + "grad_norm": 2.1301799685171092, + "language_loss": 0.74849474, + "learning_rate": 2.3082034282655532e-06, + "loss": 0.76945609, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45703125, + "step": 7498, + "time_per_iteration": 2.378347635269165 + }, + { + "auxiliary_loss_clip": 0.01067769, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.01441884, + "balance_loss_mlp": 1.02208507, + "epoch": 0.4508642717571021, + "flos": 21943853976960.0, + "grad_norm": 2.228103284280172, + "language_loss": 0.70168984, + "learning_rate": 2.3078301135325076e-06, + "loss": 0.722651, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45703125, + "step": 7499, + "time_per_iteration": 2.3801255226135254 + }, + { + "auxiliary_loss_clip": 0.01069386, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.01716602, + "balance_loss_mlp": 1.02250957, + "epoch": 0.45092439500977005, + "flos": 23366355212160.0, + "grad_norm": 1.7870269448366705, + "language_loss": 0.76922917, + "learning_rate": 2.307456787813798e-06, + "loss": 0.79022765, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46875, + "step": 7500, + "time_per_iteration": 2.3838627338409424 + }, + { + "auxiliary_loss_clip": 0.01067825, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01396179, + "balance_loss_mlp": 1.02118945, + "epoch": 0.450984518262438, + "flos": 20772029370240.0, + "grad_norm": 2.6964545612165374, + "language_loss": 0.6260193, + "learning_rate": 2.307083451122746e-06, + "loss": 0.64697742, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46679688, + "step": 7501, + "time_per_iteration": 2.359476327896118 + }, + { + "auxiliary_loss_clip": 0.01067401, + "auxiliary_loss_mlp": 0.01030016, + "balance_loss_clip": 1.01608658, + "balance_loss_mlp": 1.02071071, + "epoch": 0.451044641515106, + "flos": 17706550464000.0, + "grad_norm": 1.8950597239093878, + "language_loss": 0.84176677, + "learning_rate": 2.3067101034726755e-06, + "loss": 0.86274099, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46679688, + "step": 7502, + "time_per_iteration": 2.359642744064331 + }, + { + "auxiliary_loss_clip": 0.01064252, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.01464832, + "balance_loss_mlp": 1.02189064, + "epoch": 0.45110476476777395, + "flos": 20593657900800.0, + "grad_norm": 1.3732070107498473, + "language_loss": 0.79388529, + "learning_rate": 2.30633674487691e-06, + "loss": 0.81480289, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42382812, + "step": 7503, + "time_per_iteration": 2.376873731613159 + }, + { + "auxiliary_loss_clip": 0.01064922, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.01329231, + "balance_loss_mlp": 1.02191663, + "epoch": 0.4511648880204419, + "flos": 16033128220800.0, + "grad_norm": 2.0624354344910993, + "language_loss": 0.80636406, + "learning_rate": 2.3059633753487745e-06, + "loss": 0.82727575, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4296875, + "step": 7504, + "time_per_iteration": 2.3520076274871826 + }, + { + "auxiliary_loss_clip": 0.01066454, + "auxiliary_loss_mlp": 0.01026737, + "balance_loss_clip": 1.01391053, + "balance_loss_mlp": 1.02213311, + "epoch": 0.4512250112731099, + "flos": 23977929231360.0, + "grad_norm": 1.8009923450036573, + "language_loss": 0.63474894, + "learning_rate": 2.3055899949015932e-06, + "loss": 0.65568089, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44335938, + "step": 7505, + "time_per_iteration": 2.3758647441864014 + }, + { + "auxiliary_loss_clip": 0.01009405, + "auxiliary_loss_mlp": 0.01003165, + "balance_loss_clip": 1.00205004, + "balance_loss_mlp": 1.00088978, + "epoch": 0.45128513452577784, + "flos": 71458652868480.0, + "grad_norm": 0.8324974362094337, + "language_loss": 0.5886063, + "learning_rate": 2.3052166035486916e-06, + "loss": 0.60873199, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.08496094, + "step": 7506, + "time_per_iteration": 3.060225486755371 + }, + { + "auxiliary_loss_clip": 0.01066411, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.01350546, + "balance_loss_mlp": 1.02220869, + "epoch": 0.4513452577784458, + "flos": 22089756015360.0, + "grad_norm": 1.9936089721304333, + "language_loss": 0.79077637, + "learning_rate": 2.304843201303394e-06, + "loss": 0.81170797, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44140625, + "step": 7507, + "time_per_iteration": 2.3829562664031982 + }, + { + "auxiliary_loss_clip": 0.01066581, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.0150131, + "balance_loss_mlp": 1.02090847, + "epoch": 0.4514053810311138, + "flos": 24275354037120.0, + "grad_norm": 1.8642874520044932, + "language_loss": 0.74095219, + "learning_rate": 2.3044697881790266e-06, + "loss": 0.76191461, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.45703125, + "step": 7508, + "time_per_iteration": 2.39839243888855 + }, + { + "auxiliary_loss_clip": 0.01065486, + "auxiliary_loss_mlp": 0.01027182, + "balance_loss_clip": 1.0143795, + "balance_loss_mlp": 1.02260208, + "epoch": 0.45146550428378174, + "flos": 17886039096960.0, + "grad_norm": 1.8340453736690003, + "language_loss": 0.79201818, + "learning_rate": 2.3040963641889155e-06, + "loss": 0.81294489, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42773438, + "step": 7509, + "time_per_iteration": 2.3493340015411377 + }, + { + "auxiliary_loss_clip": 0.0106635, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.01490116, + "balance_loss_mlp": 1.02313519, + "epoch": 0.4515256275364497, + "flos": 24242291112960.0, + "grad_norm": 1.6563056598041785, + "language_loss": 0.72776949, + "learning_rate": 2.303722929346388e-06, + "loss": 0.74870658, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43164062, + "step": 7510, + "time_per_iteration": 2.4089953899383545 + }, + { + "auxiliary_loss_clip": 0.01066059, + "auxiliary_loss_mlp": 0.01026698, + "balance_loss_clip": 1.01337016, + "balance_loss_mlp": 1.02119994, + "epoch": 0.45158575078911767, + "flos": 20630002492800.0, + "grad_norm": 2.0280374158145795, + "language_loss": 0.77039695, + "learning_rate": 2.3033494836647693e-06, + "loss": 0.7913245, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44921875, + "step": 7511, + "time_per_iteration": 2.373490333557129 + }, + { + "auxiliary_loss_clip": 0.01065969, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.01149583, + "balance_loss_mlp": 1.02074575, + "epoch": 0.45164587404178563, + "flos": 23326728952320.0, + "grad_norm": 1.652233672682726, + "language_loss": 0.78699112, + "learning_rate": 2.3029760271573887e-06, + "loss": 0.80789685, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.453125, + "step": 7512, + "time_per_iteration": 2.387868642807007 + }, + { + "auxiliary_loss_clip": 0.010689, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.01486468, + "balance_loss_mlp": 1.02128804, + "epoch": 0.45170599729445365, + "flos": 23804829377280.0, + "grad_norm": 2.524624387462047, + "language_loss": 0.87581944, + "learning_rate": 2.3026025598375727e-06, + "loss": 0.89680099, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4765625, + "step": 7513, + "time_per_iteration": 2.4097237586975098 + }, + { + "auxiliary_loss_clip": 0.01061889, + "auxiliary_loss_mlp": 0.0102469, + "balance_loss_clip": 1.01301908, + "balance_loss_mlp": 1.02158773, + "epoch": 0.4517661205471216, + "flos": 23511838314240.0, + "grad_norm": 1.6074659560887008, + "language_loss": 0.67272383, + "learning_rate": 2.30222908171865e-06, + "loss": 0.69358957, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40234375, + "step": 7514, + "time_per_iteration": 2.3884377479553223 + }, + { + "auxiliary_loss_clip": 0.01066361, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.01633906, + "balance_loss_mlp": 1.02190566, + "epoch": 0.4518262437997896, + "flos": 23512815832320.0, + "grad_norm": 2.215496957640234, + "language_loss": 0.69455564, + "learning_rate": 2.301855592813949e-06, + "loss": 0.71552742, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4453125, + "step": 7515, + "time_per_iteration": 2.3990111351013184 + }, + { + "auxiliary_loss_clip": 0.01068848, + "auxiliary_loss_mlp": 0.01033315, + "balance_loss_clip": 1.01898611, + "balance_loss_mlp": 1.02208304, + "epoch": 0.45188636705245755, + "flos": 14567369944320.0, + "grad_norm": 2.187252978055036, + "language_loss": 0.81408483, + "learning_rate": 2.3014820931367976e-06, + "loss": 0.83510643, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46679688, + "step": 7516, + "time_per_iteration": 2.3343043327331543 + }, + { + "auxiliary_loss_clip": 0.01063692, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.01603556, + "balance_loss_mlp": 1.02125263, + "epoch": 0.4519464903051255, + "flos": 19900527212160.0, + "grad_norm": 1.7395920244136267, + "language_loss": 0.65423846, + "learning_rate": 2.301108582700526e-06, + "loss": 0.67516315, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42578125, + "step": 7517, + "time_per_iteration": 2.3758602142333984 + }, + { + "auxiliary_loss_clip": 0.01064041, + "auxiliary_loss_mlp": 0.01020359, + "balance_loss_clip": 1.00915372, + "balance_loss_mlp": 1.02260804, + "epoch": 0.4520066135577935, + "flos": 18843357110400.0, + "grad_norm": 1.7063792249648089, + "language_loss": 0.63118595, + "learning_rate": 2.3007350615184645e-06, + "loss": 0.65202993, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.4140625, + "step": 7518, + "time_per_iteration": 3.8661282062530518 + }, + { + "auxiliary_loss_clip": 0.01064606, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.01609516, + "balance_loss_mlp": 1.02091777, + "epoch": 0.45206673681046144, + "flos": 48212609667840.0, + "grad_norm": 1.467886282961376, + "language_loss": 0.59280515, + "learning_rate": 2.300361529603941e-06, + "loss": 0.61373758, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4375, + "step": 7519, + "time_per_iteration": 2.6109936237335205 + }, + { + "auxiliary_loss_clip": 0.01068969, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.01698792, + "balance_loss_mlp": 1.02388501, + "epoch": 0.4521268600631294, + "flos": 23841034323840.0, + "grad_norm": 1.243032982105272, + "language_loss": 0.72325897, + "learning_rate": 2.2999879869702884e-06, + "loss": 0.74424338, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.45117188, + "step": 7520, + "time_per_iteration": 2.438121795654297 + }, + { + "auxiliary_loss_clip": 0.01066761, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.01502085, + "balance_loss_mlp": 1.02185178, + "epoch": 0.4521869833157974, + "flos": 18842623971840.0, + "grad_norm": 1.8743912374408522, + "language_loss": 0.75740075, + "learning_rate": 2.299614433630835e-06, + "loss": 0.77834392, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44921875, + "step": 7521, + "time_per_iteration": 2.3477320671081543 + }, + { + "auxiliary_loss_clip": 0.01068768, + "auxiliary_loss_mlp": 0.01029338, + "balance_loss_clip": 1.01566482, + "balance_loss_mlp": 1.02254939, + "epoch": 0.45224710656846534, + "flos": 19787164427520.0, + "grad_norm": 2.4125613985503542, + "language_loss": 0.6786927, + "learning_rate": 2.2992408695989144e-06, + "loss": 0.69967383, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 7522, + "time_per_iteration": 2.3950700759887695 + }, + { + "auxiliary_loss_clip": 0.01063962, + "auxiliary_loss_mlp": 0.01029441, + "balance_loss_clip": 1.01650095, + "balance_loss_mlp": 1.01985526, + "epoch": 0.4523072298211333, + "flos": 28254893915520.0, + "grad_norm": 1.605731092970976, + "language_loss": 0.60878778, + "learning_rate": 2.2988672948878564e-06, + "loss": 0.62972176, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44140625, + "step": 7523, + "time_per_iteration": 2.423436403274536 + }, + { + "auxiliary_loss_clip": 0.01068774, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.01637053, + "balance_loss_mlp": 1.02098584, + "epoch": 0.45236735307380127, + "flos": 11180270793600.0, + "grad_norm": 2.523636682323464, + "language_loss": 0.78660262, + "learning_rate": 2.2984937095109926e-06, + "loss": 0.80759901, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4765625, + "step": 7524, + "time_per_iteration": 2.360898971557617 + }, + { + "auxiliary_loss_clip": 0.01065257, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.0171324, + "balance_loss_mlp": 1.02117014, + "epoch": 0.45242747632646924, + "flos": 22600290960000.0, + "grad_norm": 1.6235381036617678, + "language_loss": 0.63698751, + "learning_rate": 2.2981201134816573e-06, + "loss": 0.65794194, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.44140625, + "step": 7525, + "time_per_iteration": 3.745279550552368 + }, + { + "auxiliary_loss_clip": 0.01064579, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.01143479, + "balance_loss_mlp": 1.02138042, + "epoch": 0.45248759957913726, + "flos": 18255385036800.0, + "grad_norm": 1.806465620279236, + "language_loss": 0.78966773, + "learning_rate": 2.2977465068131812e-06, + "loss": 0.81055331, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43164062, + "step": 7526, + "time_per_iteration": 2.347201347351074 + }, + { + "auxiliary_loss_clip": 0.01062597, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.01149702, + "balance_loss_mlp": 1.02062678, + "epoch": 0.4525477228318052, + "flos": 22449152217600.0, + "grad_norm": 1.5733798652334308, + "language_loss": 0.8087393, + "learning_rate": 2.2973728895188983e-06, + "loss": 0.82960671, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41992188, + "step": 7527, + "time_per_iteration": 2.3623788356781006 + }, + { + "auxiliary_loss_clip": 0.01064755, + "auxiliary_loss_mlp": 0.01023999, + "balance_loss_clip": 1.01148176, + "balance_loss_mlp": 1.02146554, + "epoch": 0.4526078460844732, + "flos": 29643529265280.0, + "grad_norm": 1.5459448347499876, + "language_loss": 0.65539885, + "learning_rate": 2.296999261612142e-06, + "loss": 0.67628634, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43359375, + "step": 7528, + "time_per_iteration": 2.4642136096954346 + }, + { + "auxiliary_loss_clip": 0.01062495, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.01454353, + "balance_loss_mlp": 1.02049637, + "epoch": 0.45266796933714115, + "flos": 23038625479680.0, + "grad_norm": 1.6903701525398327, + "language_loss": 0.75293696, + "learning_rate": 2.2966256231062464e-06, + "loss": 0.77382725, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41992188, + "step": 7529, + "time_per_iteration": 3.7312045097351074 + }, + { + "auxiliary_loss_clip": 0.01063508, + "auxiliary_loss_mlp": 0.01026723, + "balance_loss_clip": 1.01412868, + "balance_loss_mlp": 1.0201875, + "epoch": 0.4527280925898091, + "flos": 14427542482560.0, + "grad_norm": 2.2518581554977204, + "language_loss": 0.7271843, + "learning_rate": 2.296251974014545e-06, + "loss": 0.74808657, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43359375, + "step": 7530, + "time_per_iteration": 2.3872430324554443 + }, + { + "auxiliary_loss_clip": 0.0106626, + "auxiliary_loss_mlp": 0.01024363, + "balance_loss_clip": 1.01179242, + "balance_loss_mlp": 1.02172208, + "epoch": 0.4527882158424771, + "flos": 22924529556480.0, + "grad_norm": 1.637827167568268, + "language_loss": 0.76048076, + "learning_rate": 2.2958783143503724e-06, + "loss": 0.78138697, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4453125, + "step": 7531, + "time_per_iteration": 3.7704241275787354 + }, + { + "auxiliary_loss_clip": 0.01061451, + "auxiliary_loss_mlp": 0.01023702, + "balance_loss_clip": 1.01103616, + "balance_loss_mlp": 1.01942837, + "epoch": 0.45284833909514505, + "flos": 25554187560960.0, + "grad_norm": 1.5209151884350196, + "language_loss": 0.6660043, + "learning_rate": 2.295504644127064e-06, + "loss": 0.68685585, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41992188, + "step": 7532, + "time_per_iteration": 2.4000461101531982 + }, + { + "auxiliary_loss_clip": 0.01065086, + "auxiliary_loss_mlp": 0.01025374, + "balance_loss_clip": 1.01331615, + "balance_loss_mlp": 1.02247095, + "epoch": 0.452908462347813, + "flos": 18149039435520.0, + "grad_norm": 4.391115701470664, + "language_loss": 0.78493857, + "learning_rate": 2.295130963357955e-06, + "loss": 0.80584317, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.42578125, + "step": 7533, + "time_per_iteration": 2.351008176803589 + }, + { + "auxiliary_loss_clip": 0.01068209, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.01694274, + "balance_loss_mlp": 1.02029133, + "epoch": 0.452968585600481, + "flos": 19061738864640.0, + "grad_norm": 1.8325631223197276, + "language_loss": 0.82924712, + "learning_rate": 2.2947572720563815e-06, + "loss": 0.85024148, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.48046875, + "step": 7534, + "time_per_iteration": 2.3450465202331543 + }, + { + "auxiliary_loss_clip": 0.01066691, + "auxiliary_loss_mlp": 0.01029363, + "balance_loss_clip": 1.01588082, + "balance_loss_mlp": 1.02107501, + "epoch": 0.45302870885314894, + "flos": 22050723248640.0, + "grad_norm": 1.7522184369146205, + "language_loss": 0.76282066, + "learning_rate": 2.2943835702356788e-06, + "loss": 0.78378117, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45703125, + "step": 7535, + "time_per_iteration": 2.3821914196014404 + }, + { + "auxiliary_loss_clip": 0.01062384, + "auxiliary_loss_mlp": 0.0102364, + "balance_loss_clip": 1.01171327, + "balance_loss_mlp": 1.02034926, + "epoch": 0.4530888321058169, + "flos": 20375171412480.0, + "grad_norm": 1.559551974415307, + "language_loss": 0.8068161, + "learning_rate": 2.2940098579091836e-06, + "loss": 0.82767636, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.41992188, + "step": 7536, + "time_per_iteration": 2.374249219894409 + }, + { + "auxiliary_loss_clip": 0.01065854, + "auxiliary_loss_mlp": 0.01024795, + "balance_loss_clip": 1.0120995, + "balance_loss_mlp": 1.02040982, + "epoch": 0.4531489553584849, + "flos": 14829532410240.0, + "grad_norm": 1.700642536206327, + "language_loss": 0.76253498, + "learning_rate": 2.2936361350902334e-06, + "loss": 0.78344148, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.453125, + "step": 7537, + "time_per_iteration": 2.355215549468994 + }, + { + "auxiliary_loss_clip": 0.01067576, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.01504803, + "balance_loss_mlp": 1.02265632, + "epoch": 0.45320907861115284, + "flos": 21943888888320.0, + "grad_norm": 2.039347724705475, + "language_loss": 0.75700998, + "learning_rate": 2.2932624017921643e-06, + "loss": 0.77795911, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.44921875, + "step": 7538, + "time_per_iteration": 2.382274866104126 + }, + { + "auxiliary_loss_clip": 0.01010016, + "auxiliary_loss_mlp": 0.00999658, + "balance_loss_clip": 0.99849528, + "balance_loss_mlp": 1.0011431, + "epoch": 0.45326920186382086, + "flos": 66247760782080.0, + "grad_norm": 0.8136728147832124, + "language_loss": 0.62407583, + "learning_rate": 2.292888658028315e-06, + "loss": 0.64417261, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.08886719, + "step": 7539, + "time_per_iteration": 3.0612194538116455 + }, + { + "auxiliary_loss_clip": 0.01064903, + "auxiliary_loss_mlp": 0.01026525, + "balance_loss_clip": 1.01399565, + "balance_loss_mlp": 1.02175713, + "epoch": 0.4533293251164888, + "flos": 14683351080960.0, + "grad_norm": 1.810935567026907, + "language_loss": 0.65433913, + "learning_rate": 2.2925149038120226e-06, + "loss": 0.67525339, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4296875, + "step": 7540, + "time_per_iteration": 2.350393295288086 + }, + { + "auxiliary_loss_clip": 0.01068635, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.016505, + "balance_loss_mlp": 1.02157629, + "epoch": 0.4533894483691568, + "flos": 22600116403200.0, + "grad_norm": 2.174137004797944, + "language_loss": 0.84658825, + "learning_rate": 2.292141139156625e-06, + "loss": 0.86759317, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.47070312, + "step": 7541, + "time_per_iteration": 2.4044814109802246 + }, + { + "auxiliary_loss_clip": 0.01066128, + "auxiliary_loss_mlp": 0.01025476, + "balance_loss_clip": 1.01298261, + "balance_loss_mlp": 1.02136135, + "epoch": 0.45344957162182475, + "flos": 34750170432000.0, + "grad_norm": 1.7685241068361206, + "language_loss": 0.76224756, + "learning_rate": 2.2917673640754626e-06, + "loss": 0.78316361, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44921875, + "step": 7542, + "time_per_iteration": 2.497783660888672 + }, + { + "auxiliary_loss_clip": 0.0106496, + "auxiliary_loss_mlp": 0.01024342, + "balance_loss_clip": 1.01077569, + "balance_loss_mlp": 1.02003849, + "epoch": 0.4535096948744927, + "flos": 23549090601600.0, + "grad_norm": 1.581991532465093, + "language_loss": 0.80853593, + "learning_rate": 2.291393578581873e-06, + "loss": 0.82942891, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 7543, + "time_per_iteration": 2.3962621688842773 + }, + { + "auxiliary_loss_clip": 0.01067752, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.02143896, + "balance_loss_mlp": 1.02238107, + "epoch": 0.4535698181271607, + "flos": 25556352065280.0, + "grad_norm": 8.35994304592433, + "language_loss": 0.7607373, + "learning_rate": 2.2910197826891966e-06, + "loss": 0.78175724, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.453125, + "step": 7544, + "time_per_iteration": 2.406228542327881 + }, + { + "auxiliary_loss_clip": 0.01067062, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.01668763, + "balance_loss_mlp": 1.02229714, + "epoch": 0.45362994137982865, + "flos": 24862942085760.0, + "grad_norm": 1.7949337668069838, + "language_loss": 0.75177962, + "learning_rate": 2.2906459764107725e-06, + "loss": 0.77274472, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44921875, + "step": 7545, + "time_per_iteration": 2.4176297187805176 + }, + { + "auxiliary_loss_clip": 0.01066249, + "auxiliary_loss_mlp": 0.01033117, + "balance_loss_clip": 1.01953304, + "balance_loss_mlp": 1.02166998, + "epoch": 0.4536900646324966, + "flos": 30805578691200.0, + "grad_norm": 1.627821375558981, + "language_loss": 0.72373348, + "learning_rate": 2.290272159759941e-06, + "loss": 0.74472713, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4453125, + "step": 7546, + "time_per_iteration": 2.4358227252960205 + }, + { + "auxiliary_loss_clip": 0.01067089, + "auxiliary_loss_mlp": 0.01027725, + "balance_loss_clip": 1.01360488, + "balance_loss_mlp": 1.02062178, + "epoch": 0.4537501878851646, + "flos": 23403188563200.0, + "grad_norm": 1.5199788766377549, + "language_loss": 0.75565016, + "learning_rate": 2.2898983327500428e-06, + "loss": 0.77659833, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46484375, + "step": 7547, + "time_per_iteration": 2.413212537765503 + }, + { + "auxiliary_loss_clip": 0.01068266, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.01522017, + "balance_loss_mlp": 1.02225518, + "epoch": 0.45381031113783254, + "flos": 18148341208320.0, + "grad_norm": 1.9000708589278001, + "language_loss": 0.69258487, + "learning_rate": 2.2895244953944186e-06, + "loss": 0.71355855, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4609375, + "step": 7548, + "time_per_iteration": 2.3683152198791504 + }, + { + "auxiliary_loss_clip": 0.01063828, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01789165, + "balance_loss_mlp": 1.02035725, + "epoch": 0.4538704343905005, + "flos": 25335526515840.0, + "grad_norm": 2.0179679828976975, + "language_loss": 0.7271719, + "learning_rate": 2.2891506477064105e-06, + "loss": 0.74812317, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43359375, + "step": 7549, + "time_per_iteration": 2.397914171218872 + }, + { + "auxiliary_loss_clip": 0.01063471, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.01437461, + "balance_loss_mlp": 1.02012587, + "epoch": 0.4539305576431685, + "flos": 28730166520320.0, + "grad_norm": 1.4796880017154637, + "language_loss": 0.65166223, + "learning_rate": 2.2887767896993595e-06, + "loss": 0.67256171, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.43359375, + "step": 7550, + "time_per_iteration": 2.4568722248077393 + }, + { + "auxiliary_loss_clip": 0.01064707, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.0154016, + "balance_loss_mlp": 1.02143145, + "epoch": 0.45399068089583644, + "flos": 22491292095360.0, + "grad_norm": 2.697626052639958, + "language_loss": 0.68671989, + "learning_rate": 2.2884029213866073e-06, + "loss": 0.70764351, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.43359375, + "step": 7551, + "time_per_iteration": 2.386312961578369 + }, + { + "auxiliary_loss_clip": 0.01064039, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.0134418, + "balance_loss_mlp": 1.01983809, + "epoch": 0.45405080414850446, + "flos": 12892655980800.0, + "grad_norm": 2.3632572098258384, + "language_loss": 0.79282761, + "learning_rate": 2.2880290427814972e-06, + "loss": 0.81372857, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44140625, + "step": 7552, + "time_per_iteration": 2.339657783508301 + }, + { + "auxiliary_loss_clip": 0.01009587, + "auxiliary_loss_mlp": 0.01003227, + "balance_loss_clip": 1.00220823, + "balance_loss_mlp": 1.000916, + "epoch": 0.4541109274011724, + "flos": 59764146526080.0, + "grad_norm": 0.8197704609579471, + "language_loss": 0.57828516, + "learning_rate": 2.2876551538973712e-06, + "loss": 0.59841335, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.08691406, + "step": 7553, + "time_per_iteration": 2.9942145347595215 + }, + { + "auxiliary_loss_clip": 0.01062583, + "auxiliary_loss_mlp": 0.01028594, + "balance_loss_clip": 1.0165714, + "balance_loss_mlp": 1.01986516, + "epoch": 0.4541710506538404, + "flos": 28510493045760.0, + "grad_norm": 1.3571502916655516, + "language_loss": 0.76597977, + "learning_rate": 2.2872812547475723e-06, + "loss": 0.78689152, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.42773438, + "step": 7554, + "time_per_iteration": 2.453375816345215 + }, + { + "auxiliary_loss_clip": 0.01067878, + "auxiliary_loss_mlp": 0.010254, + "balance_loss_clip": 1.01124954, + "balance_loss_mlp": 1.0214113, + "epoch": 0.45423117390650836, + "flos": 17674639614720.0, + "grad_norm": 2.5376545059950675, + "language_loss": 0.76572078, + "learning_rate": 2.286907345345445e-06, + "loss": 0.78665352, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46484375, + "step": 7555, + "time_per_iteration": 2.3522145748138428 + }, + { + "auxiliary_loss_clip": 0.01063563, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.01552677, + "balance_loss_mlp": 1.02003157, + "epoch": 0.4542912971591763, + "flos": 28071355564800.0, + "grad_norm": 1.3207684171895824, + "language_loss": 0.72482848, + "learning_rate": 2.286533425704332e-06, + "loss": 0.74574518, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43554688, + "step": 7556, + "time_per_iteration": 2.473217010498047 + }, + { + "auxiliary_loss_clip": 0.0106785, + "auxiliary_loss_mlp": 0.01024935, + "balance_loss_clip": 1.01113653, + "balance_loss_mlp": 1.02214503, + "epoch": 0.4543514204118443, + "flos": 22670745816960.0, + "grad_norm": 4.298828972852635, + "language_loss": 0.63671994, + "learning_rate": 2.2861594958375783e-06, + "loss": 0.65764779, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45703125, + "step": 7557, + "time_per_iteration": 3.779031753540039 + }, + { + "auxiliary_loss_clip": 0.01009927, + "auxiliary_loss_mlp": 0.01001463, + "balance_loss_clip": 1.00050974, + "balance_loss_mlp": 1.00120211, + "epoch": 0.45441154366451225, + "flos": 58213303666560.0, + "grad_norm": 0.6799503676652496, + "language_loss": 0.57606173, + "learning_rate": 2.285785555758528e-06, + "loss": 0.59617567, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.08691406, + "step": 7558, + "time_per_iteration": 2.9737865924835205 + }, + { + "auxiliary_loss_clip": 0.01067322, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.01677775, + "balance_loss_mlp": 1.02155316, + "epoch": 0.4544716669171802, + "flos": 16763336640000.0, + "grad_norm": 2.7897996622562666, + "language_loss": 0.56411529, + "learning_rate": 2.285411605480527e-06, + "loss": 0.5851, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.45898438, + "step": 7559, + "time_per_iteration": 2.3520994186401367 + }, + { + "auxiliary_loss_clip": 0.0106604, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.01939201, + "balance_loss_mlp": 1.02142644, + "epoch": 0.4545317901698482, + "flos": 15924303912960.0, + "grad_norm": 1.920548267845553, + "language_loss": 0.71818447, + "learning_rate": 2.2850376450169197e-06, + "loss": 0.73917472, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4453125, + "step": 7560, + "time_per_iteration": 2.3832788467407227 + }, + { + "auxiliary_loss_clip": 0.01067736, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.01422405, + "balance_loss_mlp": 1.02209973, + "epoch": 0.45459191342251615, + "flos": 17638783781760.0, + "grad_norm": 1.9128081739474379, + "language_loss": 0.69143784, + "learning_rate": 2.284663674381052e-06, + "loss": 0.71239781, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45703125, + "step": 7561, + "time_per_iteration": 2.3721296787261963 + }, + { + "auxiliary_loss_clip": 0.01061595, + "auxiliary_loss_mlp": 0.0102768, + "balance_loss_clip": 1.01559865, + "balance_loss_mlp": 1.02068591, + "epoch": 0.4546520366751841, + "flos": 16175783502720.0, + "grad_norm": 1.9254102739586016, + "language_loss": 0.85162824, + "learning_rate": 2.28428969358627e-06, + "loss": 0.87252098, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 7562, + "time_per_iteration": 2.4027886390686035 + }, + { + "auxiliary_loss_clip": 0.01062134, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.01565337, + "balance_loss_mlp": 1.02050233, + "epoch": 0.4547121599278521, + "flos": 19750540544640.0, + "grad_norm": 1.824438195448442, + "language_loss": 0.89647728, + "learning_rate": 2.28391570264592e-06, + "loss": 0.91738057, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41601562, + "step": 7563, + "time_per_iteration": 2.384768009185791 + }, + { + "auxiliary_loss_clip": 0.01063834, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.0151453, + "balance_loss_mlp": 1.02036917, + "epoch": 0.45477228318052004, + "flos": 19936452867840.0, + "grad_norm": 1.967038895416706, + "language_loss": 0.71156561, + "learning_rate": 2.283541701573349e-06, + "loss": 0.73248363, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43554688, + "step": 7564, + "time_per_iteration": 2.4008710384368896 + }, + { + "auxiliary_loss_clip": 0.01066251, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.01390123, + "balance_loss_mlp": 1.02064157, + "epoch": 0.454832406433188, + "flos": 21287242437120.0, + "grad_norm": 2.272743758489971, + "language_loss": 0.68537104, + "learning_rate": 2.283167690381904e-06, + "loss": 0.70631015, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.45703125, + "step": 7565, + "time_per_iteration": 3.797539710998535 + }, + { + "auxiliary_loss_clip": 0.01062602, + "auxiliary_loss_mlp": 0.01023303, + "balance_loss_clip": 1.01036882, + "balance_loss_mlp": 1.02027619, + "epoch": 0.45489252968585603, + "flos": 24497576040960.0, + "grad_norm": 1.6666135981089103, + "language_loss": 0.80053306, + "learning_rate": 2.2827936690849326e-06, + "loss": 0.82139218, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42382812, + "step": 7566, + "time_per_iteration": 2.431304693222046 + }, + { + "auxiliary_loss_clip": 0.01065267, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.01542974, + "balance_loss_mlp": 1.02018678, + "epoch": 0.454952652938524, + "flos": 17091520220160.0, + "grad_norm": 2.501253782504087, + "language_loss": 0.7388519, + "learning_rate": 2.2824196376957833e-06, + "loss": 0.75979698, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.45117188, + "step": 7567, + "time_per_iteration": 2.3591208457946777 + }, + { + "auxiliary_loss_clip": 0.01067331, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.01596785, + "balance_loss_mlp": 1.02169919, + "epoch": 0.45501277619119196, + "flos": 27629320440960.0, + "grad_norm": 1.5667623930086676, + "language_loss": 0.80922556, + "learning_rate": 2.282045596227803e-06, + "loss": 0.83019078, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45507812, + "step": 7568, + "time_per_iteration": 2.4282302856445312 + }, + { + "auxiliary_loss_clip": 0.01068585, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.02067435, + "balance_loss_mlp": 1.02157056, + "epoch": 0.4550728994438599, + "flos": 19973635332480.0, + "grad_norm": 2.5970465158146943, + "language_loss": 0.79952586, + "learning_rate": 2.2816715446943402e-06, + "loss": 0.8205601, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.47070312, + "step": 7569, + "time_per_iteration": 3.7558767795562744 + }, + { + "auxiliary_loss_clip": 0.01067993, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.01532888, + "balance_loss_mlp": 1.0214473, + "epoch": 0.4551330226965279, + "flos": 26065700023680.0, + "grad_norm": 1.2867870743114809, + "language_loss": 0.74742424, + "learning_rate": 2.281297483108745e-06, + "loss": 0.76839638, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 7570, + "time_per_iteration": 3.7874484062194824 + }, + { + "auxiliary_loss_clip": 0.01065804, + "auxiliary_loss_mlp": 0.01027281, + "balance_loss_clip": 1.01367331, + "balance_loss_mlp": 1.02230287, + "epoch": 0.45519314594919585, + "flos": 32779707408000.0, + "grad_norm": 1.7627774022864096, + "language_loss": 0.74975979, + "learning_rate": 2.2809234114843664e-06, + "loss": 0.77069068, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43554688, + "step": 7571, + "time_per_iteration": 2.464336395263672 + }, + { + "auxiliary_loss_clip": 0.01061059, + "auxiliary_loss_mlp": 0.01025332, + "balance_loss_clip": 1.01324391, + "balance_loss_mlp": 1.02039444, + "epoch": 0.4552532692018638, + "flos": 19171645424640.0, + "grad_norm": 1.5951749479907738, + "language_loss": 0.80822539, + "learning_rate": 2.2805493298345537e-06, + "loss": 0.8290894, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40625, + "step": 7572, + "time_per_iteration": 2.3671486377716064 + }, + { + "auxiliary_loss_clip": 0.01068509, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.01734161, + "balance_loss_mlp": 1.0215416, + "epoch": 0.4553133924545318, + "flos": 26026073763840.0, + "grad_norm": 1.7458145791747681, + "language_loss": 0.72657931, + "learning_rate": 2.280175238172657e-06, + "loss": 0.74758744, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.46875, + "step": 7573, + "time_per_iteration": 2.403475522994995 + }, + { + "auxiliary_loss_clip": 0.01068865, + "auxiliary_loss_mlp": 0.01024902, + "balance_loss_clip": 1.00983357, + "balance_loss_mlp": 1.02209127, + "epoch": 0.45537351570719975, + "flos": 30660305057280.0, + "grad_norm": 2.291599522878136, + "language_loss": 0.74428326, + "learning_rate": 2.279801136512027e-06, + "loss": 0.76522094, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.46875, + "step": 7574, + "time_per_iteration": 2.4735829830169678 + }, + { + "auxiliary_loss_clip": 0.01067514, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.01508403, + "balance_loss_mlp": 1.02131116, + "epoch": 0.4554336389598677, + "flos": 24352232584320.0, + "grad_norm": 1.6256376072547702, + "language_loss": 0.71358198, + "learning_rate": 2.2794270248660136e-06, + "loss": 0.73454338, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46289062, + "step": 7575, + "time_per_iteration": 2.4060401916503906 + }, + { + "auxiliary_loss_clip": 0.01065969, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.01328981, + "balance_loss_mlp": 1.02163613, + "epoch": 0.4554937622125357, + "flos": 20556894372480.0, + "grad_norm": 2.23472255492855, + "language_loss": 0.73269236, + "learning_rate": 2.279052903247969e-06, + "loss": 0.75361919, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 7576, + "time_per_iteration": 2.3669514656066895 + }, + { + "auxiliary_loss_clip": 0.01064888, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.01569283, + "balance_loss_mlp": 1.02136016, + "epoch": 0.45555388546520365, + "flos": 22819650232320.0, + "grad_norm": 1.697588309042872, + "language_loss": 0.6665262, + "learning_rate": 2.278678771671244e-06, + "loss": 0.6874674, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43554688, + "step": 7577, + "time_per_iteration": 2.428788661956787 + }, + { + "auxiliary_loss_clip": 0.01067629, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.01635826, + "balance_loss_mlp": 1.02276635, + "epoch": 0.4556140087178716, + "flos": 21724913640960.0, + "grad_norm": 2.52861786484618, + "language_loss": 0.74021453, + "learning_rate": 2.2783046301491904e-06, + "loss": 0.76120847, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.44921875, + "step": 7578, + "time_per_iteration": 2.4156970977783203 + }, + { + "auxiliary_loss_clip": 0.01066206, + "auxiliary_loss_mlp": 0.01026891, + "balance_loss_clip": 1.01321173, + "balance_loss_mlp": 1.02232552, + "epoch": 0.45567413197053963, + "flos": 25993325041920.0, + "grad_norm": 2.241681005985688, + "language_loss": 0.70633674, + "learning_rate": 2.277930478695161e-06, + "loss": 0.72726774, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43945312, + "step": 7579, + "time_per_iteration": 2.4085164070129395 + }, + { + "auxiliary_loss_clip": 0.0106346, + "auxiliary_loss_mlp": 0.01028648, + "balance_loss_clip": 1.01571417, + "balance_loss_mlp": 1.02086902, + "epoch": 0.4557342552232076, + "flos": 21536697168000.0, + "grad_norm": 1.8925720952297425, + "language_loss": 0.78210813, + "learning_rate": 2.2775563173225064e-06, + "loss": 0.80302918, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 7580, + "time_per_iteration": 2.408362627029419 + }, + { + "auxiliary_loss_clip": 0.01066306, + "auxiliary_loss_mlp": 0.01024314, + "balance_loss_clip": 1.01056886, + "balance_loss_mlp": 1.02233052, + "epoch": 0.45579437847587556, + "flos": 40000479310080.0, + "grad_norm": 1.811592307075502, + "language_loss": 0.68850464, + "learning_rate": 2.277182146044582e-06, + "loss": 0.70941085, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43945312, + "step": 7581, + "time_per_iteration": 2.533961534500122 + }, + { + "auxiliary_loss_clip": 0.01063347, + "auxiliary_loss_mlp": 0.01024403, + "balance_loss_clip": 1.01189172, + "balance_loss_mlp": 1.02037954, + "epoch": 0.4558545017285435, + "flos": 31137183584640.0, + "grad_norm": 1.5628369840244813, + "language_loss": 0.72840208, + "learning_rate": 2.2768079648747394e-06, + "loss": 0.74927962, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4296875, + "step": 7582, + "time_per_iteration": 2.457530975341797 + }, + { + "auxiliary_loss_clip": 0.01069936, + "auxiliary_loss_mlp": 0.01037073, + "balance_loss_clip": 1.02180219, + "balance_loss_mlp": 1.02259326, + "epoch": 0.4559146249812115, + "flos": 21724704172800.0, + "grad_norm": 1.6588576120266465, + "language_loss": 0.72190607, + "learning_rate": 2.276433773826333e-06, + "loss": 0.74297607, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.47460938, + "step": 7583, + "time_per_iteration": 2.37620210647583 + }, + { + "auxiliary_loss_clip": 0.01070162, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.01728964, + "balance_loss_mlp": 1.02121282, + "epoch": 0.45597474823387946, + "flos": 23804829377280.0, + "grad_norm": 1.900539323994985, + "language_loss": 0.72714221, + "learning_rate": 2.2760595729127157e-06, + "loss": 0.74817026, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.49023438, + "step": 7584, + "time_per_iteration": 2.417562961578369 + }, + { + "auxiliary_loss_clip": 0.01067523, + "auxiliary_loss_mlp": 0.01029644, + "balance_loss_clip": 1.01638794, + "balance_loss_mlp": 1.0226016, + "epoch": 0.4560348714865474, + "flos": 31904295177600.0, + "grad_norm": 2.9072485202783698, + "language_loss": 0.67975616, + "learning_rate": 2.2756853621472424e-06, + "loss": 0.70072782, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44921875, + "step": 7585, + "time_per_iteration": 2.4621307849884033 + }, + { + "auxiliary_loss_clip": 0.01065272, + "auxiliary_loss_mlp": 0.01022026, + "balance_loss_clip": 1.0088414, + "balance_loss_mlp": 1.02200294, + "epoch": 0.4560949947392154, + "flos": 22047895428480.0, + "grad_norm": 1.4405984950155442, + "language_loss": 0.76422566, + "learning_rate": 2.275311141543268e-06, + "loss": 0.78509867, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43359375, + "step": 7586, + "time_per_iteration": 2.3923864364624023 + }, + { + "auxiliary_loss_clip": 0.0106538, + "auxiliary_loss_mlp": 0.01024036, + "balance_loss_clip": 1.01163805, + "balance_loss_mlp": 1.02179682, + "epoch": 0.45615511799188335, + "flos": 24570649249920.0, + "grad_norm": 1.6963859331897848, + "language_loss": 0.77957523, + "learning_rate": 2.2749369111141464e-06, + "loss": 0.8004694, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.43554688, + "step": 7587, + "time_per_iteration": 2.4002041816711426 + }, + { + "auxiliary_loss_clip": 0.01068487, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.02117324, + "balance_loss_mlp": 1.02041399, + "epoch": 0.4562152412445513, + "flos": 18329784877440.0, + "grad_norm": 1.5743543109800142, + "language_loss": 0.68719155, + "learning_rate": 2.2745626708732348e-06, + "loss": 0.70823324, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48046875, + "step": 7588, + "time_per_iteration": 2.368277072906494 + }, + { + "auxiliary_loss_clip": 0.01065423, + "auxiliary_loss_mlp": 0.01024989, + "balance_loss_clip": 1.01200747, + "balance_loss_mlp": 1.02150583, + "epoch": 0.4562753644972193, + "flos": 13515680926080.0, + "grad_norm": 1.9918140561434377, + "language_loss": 0.79307824, + "learning_rate": 2.274188420833887e-06, + "loss": 0.81398237, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43945312, + "step": 7589, + "time_per_iteration": 2.335357189178467 + }, + { + "auxiliary_loss_clip": 0.01066849, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01464891, + "balance_loss_mlp": 1.02128756, + "epoch": 0.45633548774988725, + "flos": 29638502029440.0, + "grad_norm": 3.2812419150264915, + "language_loss": 0.62182152, + "learning_rate": 2.27381416100946e-06, + "loss": 0.64276963, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45703125, + "step": 7590, + "time_per_iteration": 2.4395785331726074 + }, + { + "auxiliary_loss_clip": 0.01066048, + "auxiliary_loss_mlp": 0.01028982, + "balance_loss_clip": 1.01584506, + "balance_loss_mlp": 1.02104878, + "epoch": 0.4563956110025552, + "flos": 22232411297280.0, + "grad_norm": 1.7363728551581372, + "language_loss": 0.72812349, + "learning_rate": 2.27343989141331e-06, + "loss": 0.74907374, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44921875, + "step": 7591, + "time_per_iteration": 2.359543561935425 + }, + { + "auxiliary_loss_clip": 0.01069314, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.01725554, + "balance_loss_mlp": 1.02330971, + "epoch": 0.45645573425522323, + "flos": 17091101283840.0, + "grad_norm": 1.8366237152175282, + "language_loss": 0.76705104, + "learning_rate": 2.2730656120587926e-06, + "loss": 0.78806543, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4609375, + "step": 7592, + "time_per_iteration": 2.367180585861206 + }, + { + "auxiliary_loss_clip": 0.01065145, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.01518846, + "balance_loss_mlp": 1.02084684, + "epoch": 0.4565158575078912, + "flos": 20331495434880.0, + "grad_norm": 4.1967415977653815, + "language_loss": 0.69729877, + "learning_rate": 2.2726913229592673e-06, + "loss": 0.71823955, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44335938, + "step": 7593, + "time_per_iteration": 2.3657524585723877 + }, + { + "auxiliary_loss_clip": 0.01061789, + "auxiliary_loss_mlp": 0.01024721, + "balance_loss_clip": 1.01314569, + "balance_loss_mlp": 1.02101707, + "epoch": 0.45657598076055916, + "flos": 23982013860480.0, + "grad_norm": 2.470424733464871, + "language_loss": 0.83721459, + "learning_rate": 2.2723170241280898e-06, + "loss": 0.85807973, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.40820312, + "step": 7594, + "time_per_iteration": 2.3911571502685547 + }, + { + "auxiliary_loss_clip": 0.01010859, + "auxiliary_loss_mlp": 0.00999514, + "balance_loss_clip": 0.99827462, + "balance_loss_mlp": 1.00203371, + "epoch": 0.45663610401322713, + "flos": 69361211629440.0, + "grad_norm": 0.8100696697369784, + "language_loss": 0.52716064, + "learning_rate": 2.271942715578618e-06, + "loss": 0.54726434, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.08789062, + "step": 7595, + "time_per_iteration": 3.0713658332824707 + }, + { + "auxiliary_loss_clip": 0.01067851, + "auxiliary_loss_mlp": 0.01024478, + "balance_loss_clip": 1.01071501, + "balance_loss_mlp": 1.02117527, + "epoch": 0.4566962272658951, + "flos": 15148464480000.0, + "grad_norm": 2.237661114535625, + "language_loss": 0.8747772, + "learning_rate": 2.27156839732421e-06, + "loss": 0.89570045, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46679688, + "step": 7596, + "time_per_iteration": 2.3343851566314697 + }, + { + "auxiliary_loss_clip": 0.01067827, + "auxiliary_loss_mlp": 0.01030412, + "balance_loss_clip": 1.01719713, + "balance_loss_mlp": 1.02237701, + "epoch": 0.45675635051856306, + "flos": 18696477553920.0, + "grad_norm": 1.7265504658982582, + "language_loss": 0.77141404, + "learning_rate": 2.2711940693782247e-06, + "loss": 0.79239643, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.45507812, + "step": 7597, + "time_per_iteration": 3.772029399871826 + }, + { + "auxiliary_loss_clip": 0.01066079, + "auxiliary_loss_mlp": 0.01027253, + "balance_loss_clip": 1.01287031, + "balance_loss_mlp": 1.02085376, + "epoch": 0.456816473771231, + "flos": 19097315406720.0, + "grad_norm": 1.6620465257580561, + "language_loss": 0.78462499, + "learning_rate": 2.270819731754021e-06, + "loss": 0.80555832, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.453125, + "step": 7598, + "time_per_iteration": 2.404742956161499 + }, + { + "auxiliary_loss_clip": 0.01065759, + "auxiliary_loss_mlp": 0.01025395, + "balance_loss_clip": 1.012205, + "balance_loss_mlp": 1.02163649, + "epoch": 0.456876597023899, + "flos": 28948792654080.0, + "grad_norm": 2.3670583522246744, + "language_loss": 0.71356076, + "learning_rate": 2.2704453844649573e-06, + "loss": 0.73447227, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44140625, + "step": 7599, + "time_per_iteration": 2.4278604984283447 + }, + { + "auxiliary_loss_clip": 0.01063543, + "auxiliary_loss_mlp": 0.01024777, + "balance_loss_clip": 1.01195598, + "balance_loss_mlp": 1.01979136, + "epoch": 0.45693672027656695, + "flos": 23288499146880.0, + "grad_norm": 1.9689152904530511, + "language_loss": 0.69450909, + "learning_rate": 2.2700710275243936e-06, + "loss": 0.71539223, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7600, + "time_per_iteration": 2.403019905090332 + }, + { + "auxiliary_loss_clip": 0.01066315, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.01655865, + "balance_loss_mlp": 1.02197099, + "epoch": 0.4569968435292349, + "flos": 20557173663360.0, + "grad_norm": 1.8838938512081942, + "language_loss": 0.78787374, + "learning_rate": 2.2696966609456896e-06, + "loss": 0.80882895, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 7601, + "time_per_iteration": 2.366189479827881 + }, + { + "auxiliary_loss_clip": 0.01067385, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.01592243, + "balance_loss_mlp": 1.02213335, + "epoch": 0.4570569667819029, + "flos": 41280988579200.0, + "grad_norm": 1.8689863815845755, + "language_loss": 0.71496165, + "learning_rate": 2.269322284742205e-06, + "loss": 0.73592234, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.45117188, + "step": 7602, + "time_per_iteration": 2.6175425052642822 + }, + { + "auxiliary_loss_clip": 0.01066132, + "auxiliary_loss_mlp": 0.01027898, + "balance_loss_clip": 1.0133667, + "balance_loss_mlp": 1.02211499, + "epoch": 0.45711709003457085, + "flos": 26030367861120.0, + "grad_norm": 1.569395010781226, + "language_loss": 0.73137712, + "learning_rate": 2.2689478989273015e-06, + "loss": 0.75231743, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.44140625, + "step": 7603, + "time_per_iteration": 2.4084105491638184 + }, + { + "auxiliary_loss_clip": 0.01066342, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.01877236, + "balance_loss_mlp": 1.02151465, + "epoch": 0.4571772132872388, + "flos": 22157138672640.0, + "grad_norm": 2.246841786454731, + "language_loss": 0.82738191, + "learning_rate": 2.268573503514339e-06, + "loss": 0.84837675, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.44921875, + "step": 7604, + "time_per_iteration": 2.365032196044922 + }, + { + "auxiliary_loss_clip": 0.0107187, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.01729727, + "balance_loss_mlp": 1.02348757, + "epoch": 0.45723733653990684, + "flos": 23877728029440.0, + "grad_norm": 2.6439772236025436, + "language_loss": 0.85744572, + "learning_rate": 2.268199098516679e-06, + "loss": 0.87848973, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.484375, + "step": 7605, + "time_per_iteration": 3.8514997959136963 + }, + { + "auxiliary_loss_clip": 0.01066433, + "auxiliary_loss_mlp": 0.01025428, + "balance_loss_clip": 1.01142657, + "balance_loss_mlp": 1.02090836, + "epoch": 0.4572974597925748, + "flos": 16870904138880.0, + "grad_norm": 1.8533798650530404, + "language_loss": 0.73488021, + "learning_rate": 2.2678246839476837e-06, + "loss": 0.75579882, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45507812, + "step": 7606, + "time_per_iteration": 2.361514091491699 + }, + { + "auxiliary_loss_clip": 0.01066606, + "auxiliary_loss_mlp": 0.01028452, + "balance_loss_clip": 1.01458216, + "balance_loss_mlp": 1.02144325, + "epoch": 0.45735758304524277, + "flos": 13770651651840.0, + "grad_norm": 1.9361165831804188, + "language_loss": 0.79858172, + "learning_rate": 2.2674502598207135e-06, + "loss": 0.81953233, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45117188, + "step": 7607, + "time_per_iteration": 2.3461568355560303 + }, + { + "auxiliary_loss_clip": 0.01066171, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.01528728, + "balance_loss_mlp": 1.02240801, + "epoch": 0.45741770629791073, + "flos": 21099828925440.0, + "grad_norm": 1.731674653265185, + "language_loss": 0.80706882, + "learning_rate": 2.2670758261491316e-06, + "loss": 0.82801926, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4375, + "step": 7608, + "time_per_iteration": 3.7504115104675293 + }, + { + "auxiliary_loss_clip": 0.01065852, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.01202655, + "balance_loss_mlp": 1.02021718, + "epoch": 0.4574778295505787, + "flos": 23111768511360.0, + "grad_norm": 1.6191495465432273, + "language_loss": 0.68400586, + "learning_rate": 2.2667013829463005e-06, + "loss": 0.70492417, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.45703125, + "step": 7609, + "time_per_iteration": 2.385618209838867 + }, + { + "auxiliary_loss_clip": 0.01064274, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.01951718, + "balance_loss_mlp": 1.02085495, + "epoch": 0.45753795280324666, + "flos": 24351778736640.0, + "grad_norm": 2.5443024246226256, + "language_loss": 0.81005085, + "learning_rate": 2.266326930225584e-06, + "loss": 0.83102798, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.43554688, + "step": 7610, + "time_per_iteration": 3.7516679763793945 + }, + { + "auxiliary_loss_clip": 0.01066832, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.01336217, + "balance_loss_mlp": 1.02093303, + "epoch": 0.4575980760559146, + "flos": 16652871498240.0, + "grad_norm": 2.1477229059460043, + "language_loss": 0.82125479, + "learning_rate": 2.265952468000344e-06, + "loss": 0.84219408, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.45898438, + "step": 7611, + "time_per_iteration": 2.359844923019409 + }, + { + "auxiliary_loss_clip": 0.01069005, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.01177883, + "balance_loss_mlp": 1.0235002, + "epoch": 0.4576581993085826, + "flos": 35910160087680.0, + "grad_norm": 1.8615345226611633, + "language_loss": 0.74008453, + "learning_rate": 2.2655779962839443e-06, + "loss": 0.76102829, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45507812, + "step": 7612, + "time_per_iteration": 2.4806225299835205 + }, + { + "auxiliary_loss_clip": 0.01065049, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_clip": 1.01511168, + "balance_loss_mlp": 1.02043509, + "epoch": 0.45771832256125056, + "flos": 20079492174720.0, + "grad_norm": 1.5899806575703348, + "language_loss": 0.84802544, + "learning_rate": 2.265203515089749e-06, + "loss": 0.86895806, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4453125, + "step": 7613, + "time_per_iteration": 2.390503406524658 + }, + { + "auxiliary_loss_clip": 0.01067428, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.01382339, + "balance_loss_mlp": 1.02283883, + "epoch": 0.4577784458139185, + "flos": 10743542196480.0, + "grad_norm": 2.4829348749738287, + "language_loss": 0.76056731, + "learning_rate": 2.264829024431122e-06, + "loss": 0.78151536, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4453125, + "step": 7614, + "time_per_iteration": 2.3612396717071533 + }, + { + "auxiliary_loss_clip": 0.01063477, + "auxiliary_loss_mlp": 0.01024864, + "balance_loss_clip": 1.01187634, + "balance_loss_mlp": 1.02070332, + "epoch": 0.4578385690665865, + "flos": 21906217664640.0, + "grad_norm": 1.5275206915412347, + "language_loss": 0.746382, + "learning_rate": 2.264454524321429e-06, + "loss": 0.76726544, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42773438, + "step": 7615, + "time_per_iteration": 2.3855385780334473 + }, + { + "auxiliary_loss_clip": 0.01011131, + "auxiliary_loss_mlp": 0.01002137, + "balance_loss_clip": 1.0010525, + "balance_loss_mlp": 1.00245106, + "epoch": 0.45789869231925445, + "flos": 64755574606080.0, + "grad_norm": 0.8027000213982431, + "language_loss": 0.57666826, + "learning_rate": 2.264080014774034e-06, + "loss": 0.59680092, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.08691406, + "step": 7616, + "time_per_iteration": 2.9842305183410645 + }, + { + "auxiliary_loss_clip": 0.01064384, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_clip": 1.01602745, + "balance_loss_mlp": 1.02063119, + "epoch": 0.4579588155719224, + "flos": 16143069692160.0, + "grad_norm": 1.5751211374174563, + "language_loss": 0.71063316, + "learning_rate": 2.2637054958023026e-06, + "loss": 0.73157549, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4375, + "step": 7617, + "time_per_iteration": 2.369464874267578 + }, + { + "auxiliary_loss_clip": 0.01066325, + "auxiliary_loss_mlp": 0.01024902, + "balance_loss_clip": 1.01233184, + "balance_loss_mlp": 1.02176476, + "epoch": 0.45801893882459044, + "flos": 21394530645120.0, + "grad_norm": 1.8373753362897653, + "language_loss": 0.76658118, + "learning_rate": 2.2633309674196004e-06, + "loss": 0.78749347, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4453125, + "step": 7618, + "time_per_iteration": 2.3583836555480957 + }, + { + "auxiliary_loss_clip": 0.01069166, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.02111888, + "balance_loss_mlp": 1.02284694, + "epoch": 0.4580790620772584, + "flos": 19535545192320.0, + "grad_norm": 2.484864746752173, + "language_loss": 0.77160591, + "learning_rate": 2.2629564296392935e-06, + "loss": 0.79264516, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.46289062, + "step": 7619, + "time_per_iteration": 2.368227481842041 + }, + { + "auxiliary_loss_clip": 0.0106229, + "auxiliary_loss_mlp": 0.01025848, + "balance_loss_clip": 1.01384974, + "balance_loss_mlp": 1.0214926, + "epoch": 0.45813918532992637, + "flos": 16580147402880.0, + "grad_norm": 1.8127737800665285, + "language_loss": 0.71594846, + "learning_rate": 2.262581882474748e-06, + "loss": 0.73682988, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40820312, + "step": 7620, + "time_per_iteration": 2.353285312652588 + }, + { + "auxiliary_loss_clip": 0.01062405, + "auxiliary_loss_mlp": 0.01022554, + "balance_loss_clip": 1.01134801, + "balance_loss_mlp": 1.02076316, + "epoch": 0.45819930858259433, + "flos": 42228671057280.0, + "grad_norm": 1.8010737165467483, + "language_loss": 0.78143406, + "learning_rate": 2.2622073259393302e-06, + "loss": 0.80228364, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.41601562, + "step": 7621, + "time_per_iteration": 2.5751729011535645 + }, + { + "auxiliary_loss_clip": 0.0101037, + "auxiliary_loss_mlp": 0.01002037, + "balance_loss_clip": 1.00100577, + "balance_loss_mlp": 1.00179482, + "epoch": 0.4582594318352623, + "flos": 63711705732480.0, + "grad_norm": 0.7752045555220521, + "language_loss": 0.64972019, + "learning_rate": 2.2618327600464076e-06, + "loss": 0.66984427, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.0859375, + "step": 7622, + "time_per_iteration": 3.0180346965789795 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.01022512, + "balance_loss_clip": 1.00925565, + "balance_loss_mlp": 1.02142644, + "epoch": 0.45831955508793026, + "flos": 26868772183680.0, + "grad_norm": 2.572525036700186, + "language_loss": 0.79550326, + "learning_rate": 2.2614581848093474e-06, + "loss": 0.81638861, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4453125, + "step": 7623, + "time_per_iteration": 2.4242162704467773 + }, + { + "auxiliary_loss_clip": 0.01066492, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.01532912, + "balance_loss_mlp": 1.02143645, + "epoch": 0.45837967834059823, + "flos": 18732961791360.0, + "grad_norm": 1.9191635326603662, + "language_loss": 0.70631635, + "learning_rate": 2.261083600241517e-06, + "loss": 0.72726035, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.45117188, + "step": 7624, + "time_per_iteration": 2.3486111164093018 + }, + { + "auxiliary_loss_clip": 0.01064788, + "auxiliary_loss_mlp": 0.01028284, + "balance_loss_clip": 1.01485467, + "balance_loss_mlp": 1.01989603, + "epoch": 0.4584398015932662, + "flos": 21177056586240.0, + "grad_norm": 2.0516848994536216, + "language_loss": 0.73076916, + "learning_rate": 2.2607090063562846e-06, + "loss": 0.75169981, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44921875, + "step": 7625, + "time_per_iteration": 2.402421712875366 + }, + { + "auxiliary_loss_clip": 0.0106315, + "auxiliary_loss_mlp": 0.01025345, + "balance_loss_clip": 1.01342964, + "balance_loss_mlp": 1.02102447, + "epoch": 0.45849992484593416, + "flos": 19789084552320.0, + "grad_norm": 2.7360433726304905, + "language_loss": 0.78619289, + "learning_rate": 2.260334403167018e-06, + "loss": 0.80707788, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.421875, + "step": 7626, + "time_per_iteration": 2.3745758533477783 + }, + { + "auxiliary_loss_clip": 0.01066489, + "auxiliary_loss_mlp": 0.01028657, + "balance_loss_clip": 1.01563358, + "balance_loss_mlp": 1.02235007, + "epoch": 0.4585600480986021, + "flos": 18222287201280.0, + "grad_norm": 1.680811642373653, + "language_loss": 0.84668899, + "learning_rate": 2.2599597906870873e-06, + "loss": 0.86764038, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44140625, + "step": 7627, + "time_per_iteration": 2.356332302093506 + }, + { + "auxiliary_loss_clip": 0.01068899, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.01326585, + "balance_loss_mlp": 1.02195764, + "epoch": 0.4586201713512701, + "flos": 29020958167680.0, + "grad_norm": 1.5783510780114414, + "language_loss": 0.69230402, + "learning_rate": 2.25958516892986e-06, + "loss": 0.71327108, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46875, + "step": 7628, + "time_per_iteration": 2.435359001159668 + }, + { + "auxiliary_loss_clip": 0.01062562, + "auxiliary_loss_mlp": 0.01022484, + "balance_loss_clip": 1.00966883, + "balance_loss_mlp": 1.01966476, + "epoch": 0.45868029460393805, + "flos": 23403467854080.0, + "grad_norm": 1.684645142066587, + "language_loss": 0.78221273, + "learning_rate": 2.2592105379087053e-06, + "loss": 0.80306315, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 7629, + "time_per_iteration": 2.4003407955169678 + }, + { + "auxiliary_loss_clip": 0.01067504, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.01683474, + "balance_loss_mlp": 1.02011085, + "epoch": 0.458740417856606, + "flos": 18221030392320.0, + "grad_norm": 2.1931470888324127, + "language_loss": 0.78886074, + "learning_rate": 2.2588358976369933e-06, + "loss": 0.8098464, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47460938, + "step": 7630, + "time_per_iteration": 2.3439815044403076 + }, + { + "auxiliary_loss_clip": 0.01067391, + "auxiliary_loss_mlp": 0.01029011, + "balance_loss_clip": 1.01501548, + "balance_loss_mlp": 1.02041054, + "epoch": 0.458800541109274, + "flos": 34567330308480.0, + "grad_norm": 2.5571150391894464, + "language_loss": 0.72030056, + "learning_rate": 2.258461248128094e-06, + "loss": 0.74126458, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46875, + "step": 7631, + "time_per_iteration": 2.4929888248443604 + }, + { + "auxiliary_loss_clip": 0.01065766, + "auxiliary_loss_mlp": 0.01028443, + "balance_loss_clip": 1.01454937, + "balance_loss_mlp": 1.02149892, + "epoch": 0.458860664361942, + "flos": 17711158763520.0, + "grad_norm": 2.8735226577976003, + "language_loss": 0.71157527, + "learning_rate": 2.2580865893953776e-06, + "loss": 0.73251736, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44335938, + "step": 7632, + "time_per_iteration": 2.3461430072784424 + }, + { + "auxiliary_loss_clip": 0.01067076, + "auxiliary_loss_mlp": 0.01026408, + "balance_loss_clip": 1.01193571, + "balance_loss_mlp": 1.02089155, + "epoch": 0.45892078761460997, + "flos": 18440913335040.0, + "grad_norm": 2.209699936063333, + "language_loss": 0.69546378, + "learning_rate": 2.2577119214522147e-06, + "loss": 0.71639866, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4609375, + "step": 7633, + "time_per_iteration": 2.363081693649292 + }, + { + "auxiliary_loss_clip": 0.01066125, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.01781249, + "balance_loss_mlp": 1.0218792, + "epoch": 0.45898091086727794, + "flos": 22671897891840.0, + "grad_norm": 1.595850622290223, + "language_loss": 0.80758893, + "learning_rate": 2.257337244311976e-06, + "loss": 0.8285625, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44140625, + "step": 7634, + "time_per_iteration": 2.3881325721740723 + }, + { + "auxiliary_loss_clip": 0.0106705, + "auxiliary_loss_mlp": 0.01027453, + "balance_loss_clip": 1.01300454, + "balance_loss_mlp": 1.0202167, + "epoch": 0.4590410341199459, + "flos": 21651875343360.0, + "grad_norm": 1.8312613188881306, + "language_loss": 0.81556052, + "learning_rate": 2.2569625579880336e-06, + "loss": 0.83650553, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46875, + "step": 7635, + "time_per_iteration": 2.394411325454712 + }, + { + "auxiliary_loss_clip": 0.01064369, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.02010822, + "balance_loss_mlp": 1.02182388, + "epoch": 0.45910115737261387, + "flos": 36533987994240.0, + "grad_norm": 1.4976446089237052, + "language_loss": 0.64480048, + "learning_rate": 2.256587862493758e-06, + "loss": 0.66576821, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42578125, + "step": 7636, + "time_per_iteration": 2.5092782974243164 + }, + { + "auxiliary_loss_clip": 0.01065752, + "auxiliary_loss_mlp": 0.01025276, + "balance_loss_clip": 1.01257443, + "balance_loss_mlp": 1.02255464, + "epoch": 0.45916128062528183, + "flos": 24418882103040.0, + "grad_norm": 1.4623290885883815, + "language_loss": 0.80456823, + "learning_rate": 2.256213157842522e-06, + "loss": 0.82547855, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43164062, + "step": 7637, + "time_per_iteration": 3.850968599319458 + }, + { + "auxiliary_loss_clip": 0.01069204, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.01564777, + "balance_loss_mlp": 1.022771, + "epoch": 0.4592214038779498, + "flos": 23220837198720.0, + "grad_norm": 1.5049350219288467, + "language_loss": 0.75289261, + "learning_rate": 2.255838444047697e-06, + "loss": 0.77387929, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46484375, + "step": 7638, + "time_per_iteration": 2.391613245010376 + }, + { + "auxiliary_loss_clip": 0.01062791, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.0170722, + "balance_loss_mlp": 1.02106714, + "epoch": 0.45928152713061776, + "flos": 19171121754240.0, + "grad_norm": 1.7153064985538395, + "language_loss": 0.7398234, + "learning_rate": 2.2554637211226557e-06, + "loss": 0.7607522, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 7639, + "time_per_iteration": 2.3587255477905273 + }, + { + "auxiliary_loss_clip": 0.01065485, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.01571155, + "balance_loss_mlp": 1.02126241, + "epoch": 0.4593416503832857, + "flos": 22413715320960.0, + "grad_norm": 1.6456987057105243, + "language_loss": 0.68179262, + "learning_rate": 2.2550889890807726e-06, + "loss": 0.70273811, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44140625, + "step": 7640, + "time_per_iteration": 2.389676094055176 + }, + { + "auxiliary_loss_clip": 0.01066724, + "auxiliary_loss_mlp": 0.01026078, + "balance_loss_clip": 1.01306593, + "balance_loss_mlp": 1.02148461, + "epoch": 0.4594017736359537, + "flos": 18879212943360.0, + "grad_norm": 1.7288042328891808, + "language_loss": 0.75299287, + "learning_rate": 2.2547142479354186e-06, + "loss": 0.77392089, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.453125, + "step": 7641, + "time_per_iteration": 2.389803647994995 + }, + { + "auxiliary_loss_clip": 0.01067502, + "auxiliary_loss_mlp": 0.01026669, + "balance_loss_clip": 1.01297212, + "balance_loss_mlp": 1.02157497, + "epoch": 0.45946189688862166, + "flos": 20517617226240.0, + "grad_norm": 1.798118931066865, + "language_loss": 0.7485292, + "learning_rate": 2.2543394976999687e-06, + "loss": 0.76947093, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45898438, + "step": 7642, + "time_per_iteration": 2.362088918685913 + }, + { + "auxiliary_loss_clip": 0.0100965, + "auxiliary_loss_mlp": 0.01003822, + "balance_loss_clip": 1.00279045, + "balance_loss_mlp": 1.00114965, + "epoch": 0.4595220201412896, + "flos": 61403249036160.0, + "grad_norm": 0.8346664788300928, + "language_loss": 0.58130497, + "learning_rate": 2.2539647383877964e-06, + "loss": 0.60143971, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.08496094, + "step": 7643, + "time_per_iteration": 2.773585796356201 + }, + { + "auxiliary_loss_clip": 0.01069407, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.01735878, + "balance_loss_mlp": 1.02059448, + "epoch": 0.4595821433939576, + "flos": 23329836063360.0, + "grad_norm": 1.7741651709592123, + "language_loss": 0.93190742, + "learning_rate": 2.2535899700122758e-06, + "loss": 0.95291448, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.48828125, + "step": 7644, + "time_per_iteration": 3.7791848182678223 + }, + { + "auxiliary_loss_clip": 0.01067912, + "auxiliary_loss_mlp": 0.01027436, + "balance_loss_clip": 1.01358414, + "balance_loss_mlp": 1.02107632, + "epoch": 0.4596422666466256, + "flos": 14281500798720.0, + "grad_norm": 2.2550866832057865, + "language_loss": 0.82038283, + "learning_rate": 2.2532151925867816e-06, + "loss": 0.84133631, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46679688, + "step": 7645, + "time_per_iteration": 2.351698637008667 + }, + { + "auxiliary_loss_clip": 0.01009924, + "auxiliary_loss_mlp": 0.01000179, + "balance_loss_clip": 0.99912989, + "balance_loss_mlp": 1.00133896, + "epoch": 0.4597023898992936, + "flos": 65724029343360.0, + "grad_norm": 0.756904260905072, + "language_loss": 0.59869456, + "learning_rate": 2.252840406124688e-06, + "loss": 0.61879557, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.0859375, + "step": 7646, + "time_per_iteration": 2.959778308868408 + }, + { + "auxiliary_loss_clip": 0.01067295, + "auxiliary_loss_mlp": 0.01026662, + "balance_loss_clip": 1.01364982, + "balance_loss_mlp": 1.02117276, + "epoch": 0.45976251315196154, + "flos": 26905849914240.0, + "grad_norm": 1.7618026391695163, + "language_loss": 0.72548151, + "learning_rate": 2.2524656106393714e-06, + "loss": 0.7464211, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4609375, + "step": 7647, + "time_per_iteration": 2.4340407848358154 + }, + { + "auxiliary_loss_clip": 0.01067819, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.01490235, + "balance_loss_mlp": 1.02155924, + "epoch": 0.4598226364046295, + "flos": 26616768923520.0, + "grad_norm": 2.0470359029405256, + "language_loss": 0.66621971, + "learning_rate": 2.252090806144206e-06, + "loss": 0.68718255, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46289062, + "step": 7648, + "time_per_iteration": 3.811668634414673 + }, + { + "auxiliary_loss_clip": 0.01067174, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.01455569, + "balance_loss_mlp": 1.02222788, + "epoch": 0.45988275965729747, + "flos": 24386657051520.0, + "grad_norm": 2.132781344476235, + "language_loss": 0.7861284, + "learning_rate": 2.2517159926525685e-06, + "loss": 0.80708218, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44921875, + "step": 7649, + "time_per_iteration": 2.410172939300537 + }, + { + "auxiliary_loss_clip": 0.01066555, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.0169822, + "balance_loss_mlp": 1.02235675, + "epoch": 0.45994288290996543, + "flos": 24534653771520.0, + "grad_norm": 4.045138878342894, + "language_loss": 0.68797582, + "learning_rate": 2.2513411701778346e-06, + "loss": 0.70894021, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44140625, + "step": 7650, + "time_per_iteration": 3.801043748855591 + }, + { + "auxiliary_loss_clip": 0.01070374, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.01633573, + "balance_loss_mlp": 1.02234912, + "epoch": 0.4600030061626334, + "flos": 14829357853440.0, + "grad_norm": 2.1460907024521436, + "language_loss": 0.84356934, + "learning_rate": 2.2509663387333804e-06, + "loss": 0.86457825, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.48046875, + "step": 7651, + "time_per_iteration": 2.3633787631988525 + }, + { + "auxiliary_loss_clip": 0.01066211, + "auxiliary_loss_mlp": 0.01025138, + "balance_loss_clip": 1.01226902, + "balance_loss_mlp": 1.02203953, + "epoch": 0.46006312941530136, + "flos": 18112869400320.0, + "grad_norm": 1.8714527898980904, + "language_loss": 0.7521044, + "learning_rate": 2.250591498332584e-06, + "loss": 0.77301788, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44140625, + "step": 7652, + "time_per_iteration": 2.353058099746704 + }, + { + "auxiliary_loss_clip": 0.01066616, + "auxiliary_loss_mlp": 0.01026478, + "balance_loss_clip": 1.01328158, + "balance_loss_mlp": 1.02075434, + "epoch": 0.46012325266796933, + "flos": 21975520446720.0, + "grad_norm": 1.9480283276595012, + "language_loss": 0.75916469, + "learning_rate": 2.2502166489888207e-06, + "loss": 0.7800957, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.45703125, + "step": 7653, + "time_per_iteration": 2.4238173961639404 + }, + { + "auxiliary_loss_clip": 0.01072161, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.01427174, + "balance_loss_mlp": 1.02308452, + "epoch": 0.4601833759206373, + "flos": 15267168702720.0, + "grad_norm": 2.5410167224644353, + "language_loss": 0.65726364, + "learning_rate": 2.2498417907154695e-06, + "loss": 0.67827833, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4921875, + "step": 7654, + "time_per_iteration": 2.330723285675049 + }, + { + "auxiliary_loss_clip": 0.01065873, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.01988971, + "balance_loss_mlp": 1.02048695, + "epoch": 0.46024349917330526, + "flos": 27087782342400.0, + "grad_norm": 1.8845519586513353, + "language_loss": 0.78885448, + "learning_rate": 2.2494669235259077e-06, + "loss": 0.80983531, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.453125, + "step": 7655, + "time_per_iteration": 2.4423887729644775 + }, + { + "auxiliary_loss_clip": 0.01063665, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.0135771, + "balance_loss_mlp": 1.02008212, + "epoch": 0.4603036224259732, + "flos": 24461755119360.0, + "grad_norm": 1.5874879210218587, + "language_loss": 0.67860812, + "learning_rate": 2.249092047433512e-06, + "loss": 0.69951439, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43554688, + "step": 7656, + "time_per_iteration": 2.4121053218841553 + }, + { + "auxiliary_loss_clip": 0.01066771, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.01452613, + "balance_loss_mlp": 1.02141309, + "epoch": 0.4603637456786412, + "flos": 28108084181760.0, + "grad_norm": 1.5824905974389682, + "language_loss": 0.8098408, + "learning_rate": 2.248717162451663e-06, + "loss": 0.83078265, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.453125, + "step": 7657, + "time_per_iteration": 2.4467904567718506 + }, + { + "auxiliary_loss_clip": 0.01066293, + "auxiliary_loss_mlp": 0.01025791, + "balance_loss_clip": 1.0115993, + "balance_loss_mlp": 1.02110136, + "epoch": 0.4604238689313092, + "flos": 24347903575680.0, + "grad_norm": 2.103072799680064, + "language_loss": 0.70667684, + "learning_rate": 2.248342268593738e-06, + "loss": 0.72759771, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45117188, + "step": 7658, + "time_per_iteration": 2.4230480194091797 + }, + { + "auxiliary_loss_clip": 0.01010109, + "auxiliary_loss_mlp": 0.01003155, + "balance_loss_clip": 1.00214159, + "balance_loss_mlp": 1.00157094, + "epoch": 0.4604839921839772, + "flos": 53603477280000.0, + "grad_norm": 0.9514586715793119, + "language_loss": 0.62160367, + "learning_rate": 2.247967365873116e-06, + "loss": 0.64173627, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.08496094, + "step": 7659, + "time_per_iteration": 3.111355781555176 + }, + { + "auxiliary_loss_clip": 0.0106711, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.01526964, + "balance_loss_mlp": 1.02282548, + "epoch": 0.46054411543664514, + "flos": 31247090144640.0, + "grad_norm": 1.6959983203474038, + "language_loss": 0.68287581, + "learning_rate": 2.2475924543031766e-06, + "loss": 0.7038337, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 7660, + "time_per_iteration": 2.4484801292419434 + }, + { + "auxiliary_loss_clip": 0.01064988, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.017941, + "balance_loss_mlp": 1.02002668, + "epoch": 0.4606042386893131, + "flos": 24091850597760.0, + "grad_norm": 6.832544134428633, + "language_loss": 0.77064729, + "learning_rate": 2.2472175338972995e-06, + "loss": 0.79161894, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.44921875, + "step": 7661, + "time_per_iteration": 2.4147894382476807 + }, + { + "auxiliary_loss_clip": 0.01066526, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.01971734, + "balance_loss_mlp": 1.02327168, + "epoch": 0.46066436194198107, + "flos": 26577247397760.0, + "grad_norm": 1.9074619226597214, + "language_loss": 0.74512184, + "learning_rate": 2.246842604668865e-06, + "loss": 0.76611835, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 7662, + "time_per_iteration": 2.4110236167907715 + }, + { + "auxiliary_loss_clip": 0.01070254, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.01727378, + "balance_loss_mlp": 1.02178001, + "epoch": 0.46072448519464904, + "flos": 17774910639360.0, + "grad_norm": 6.982220929362037, + "language_loss": 0.79584891, + "learning_rate": 2.246467666631252e-06, + "loss": 0.81686854, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48632812, + "step": 7663, + "time_per_iteration": 2.329333782196045 + }, + { + "auxiliary_loss_clip": 0.01065027, + "auxiliary_loss_mlp": 0.01027164, + "balance_loss_clip": 1.01440835, + "balance_loss_mlp": 1.02106071, + "epoch": 0.460784608447317, + "flos": 15085201363200.0, + "grad_norm": 2.637435156129718, + "language_loss": 0.73803794, + "learning_rate": 2.2460927197978423e-06, + "loss": 0.75895989, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43945312, + "step": 7664, + "time_per_iteration": 2.3520610332489014 + }, + { + "auxiliary_loss_clip": 0.01067626, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.01867342, + "balance_loss_mlp": 1.02238679, + "epoch": 0.46084473169998497, + "flos": 18587269221120.0, + "grad_norm": 1.9553552193106287, + "language_loss": 0.7234093, + "learning_rate": 2.2457177641820164e-06, + "loss": 0.7444061, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.453125, + "step": 7665, + "time_per_iteration": 2.3966569900512695 + }, + { + "auxiliary_loss_clip": 0.01065228, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.01365125, + "balance_loss_mlp": 1.02072501, + "epoch": 0.46090485495265293, + "flos": 19493928984960.0, + "grad_norm": 1.7824270305051046, + "language_loss": 0.7803334, + "learning_rate": 2.2453427997971553e-06, + "loss": 0.80125713, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 7666, + "time_per_iteration": 2.3715999126434326 + }, + { + "auxiliary_loss_clip": 0.0106565, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.01126981, + "balance_loss_mlp": 1.02070045, + "epoch": 0.4609649782053209, + "flos": 33363525029760.0, + "grad_norm": 1.2991318473104096, + "language_loss": 0.73907548, + "learning_rate": 2.2449678266566416e-06, + "loss": 0.75999159, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.44921875, + "step": 7667, + "time_per_iteration": 2.4740638732910156 + }, + { + "auxiliary_loss_clip": 0.01064154, + "auxiliary_loss_mlp": 0.01021206, + "balance_loss_clip": 1.00870657, + "balance_loss_mlp": 1.01979828, + "epoch": 0.46102510145798886, + "flos": 23768030937600.0, + "grad_norm": 1.9401246265226992, + "language_loss": 0.76469636, + "learning_rate": 2.2445928447738556e-06, + "loss": 0.78555, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44335938, + "step": 7668, + "time_per_iteration": 2.3954522609710693 + }, + { + "auxiliary_loss_clip": 0.0106463, + "auxiliary_loss_mlp": 0.01029366, + "balance_loss_clip": 1.01651502, + "balance_loss_mlp": 1.02161622, + "epoch": 0.4610852247106568, + "flos": 23293700939520.0, + "grad_norm": 21.951643909282357, + "language_loss": 0.77491009, + "learning_rate": 2.2442178541621804e-06, + "loss": 0.79585004, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4296875, + "step": 7669, + "time_per_iteration": 2.3817970752716064 + }, + { + "auxiliary_loss_clip": 0.01066905, + "auxiliary_loss_mlp": 0.010244, + "balance_loss_clip": 1.01111972, + "balance_loss_mlp": 1.02175128, + "epoch": 0.4611453479633248, + "flos": 25446270948480.0, + "grad_norm": 2.3105477709978506, + "language_loss": 0.82325298, + "learning_rate": 2.2438428548349977e-06, + "loss": 0.84416604, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45117188, + "step": 7670, + "time_per_iteration": 2.4148147106170654 + }, + { + "auxiliary_loss_clip": 0.01067233, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.01348436, + "balance_loss_mlp": 1.02175927, + "epoch": 0.4612054712159928, + "flos": 21138617312640.0, + "grad_norm": 2.2552998577228673, + "language_loss": 0.63237184, + "learning_rate": 2.2434678468056916e-06, + "loss": 0.65331483, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45507812, + "step": 7671, + "time_per_iteration": 2.380840539932251 + }, + { + "auxiliary_loss_clip": 0.01009731, + "auxiliary_loss_mlp": 0.01001362, + "balance_loss_clip": 1.00029504, + "balance_loss_mlp": 1.00156355, + "epoch": 0.4612655944686608, + "flos": 69955851772800.0, + "grad_norm": 0.6981278275148353, + "language_loss": 0.55872285, + "learning_rate": 2.2430928300876436e-06, + "loss": 0.57883376, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.08203125, + "step": 7672, + "time_per_iteration": 3.1148881912231445 + }, + { + "auxiliary_loss_clip": 0.01067518, + "auxiliary_loss_mlp": 0.01030076, + "balance_loss_clip": 1.01628351, + "balance_loss_mlp": 1.02221608, + "epoch": 0.46132571772132874, + "flos": 16836200380800.0, + "grad_norm": 1.9520663071891888, + "language_loss": 0.70538223, + "learning_rate": 2.2427178046942387e-06, + "loss": 0.72635818, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.453125, + "step": 7673, + "time_per_iteration": 2.3593480587005615 + }, + { + "auxiliary_loss_clip": 0.01065456, + "auxiliary_loss_mlp": 0.01020014, + "balance_loss_clip": 1.00676966, + "balance_loss_mlp": 1.02206326, + "epoch": 0.4613858409739967, + "flos": 35807480179200.0, + "grad_norm": 1.9522188040745003, + "language_loss": 0.70667946, + "learning_rate": 2.242342770638859e-06, + "loss": 0.72753406, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 7674, + "time_per_iteration": 2.5126214027404785 + }, + { + "auxiliary_loss_clip": 0.0106329, + "auxiliary_loss_mlp": 0.01023165, + "balance_loss_clip": 1.00949168, + "balance_loss_mlp": 1.01892257, + "epoch": 0.4614459642266647, + "flos": 35265174030720.0, + "grad_norm": 1.4123438879321113, + "language_loss": 0.65416652, + "learning_rate": 2.241967727934889e-06, + "loss": 0.67503107, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44335938, + "step": 7675, + "time_per_iteration": 2.5189685821533203 + }, + { + "auxiliary_loss_clip": 0.01009047, + "auxiliary_loss_mlp": 0.01001265, + "balance_loss_clip": 1.00017416, + "balance_loss_mlp": 1.00073326, + "epoch": 0.46150608747933264, + "flos": 66701493077760.0, + "grad_norm": 0.8735378452131773, + "language_loss": 0.58663487, + "learning_rate": 2.241592676595714e-06, + "loss": 0.60673797, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.08300781, + "step": 7676, + "time_per_iteration": 4.442834138870239 + }, + { + "auxiliary_loss_clip": 0.01009019, + "auxiliary_loss_mlp": 0.01000668, + "balance_loss_clip": 0.99966705, + "balance_loss_mlp": 1.00073087, + "epoch": 0.4615662107320006, + "flos": 55827409841280.0, + "grad_norm": 1.0483353923008125, + "language_loss": 0.62694532, + "learning_rate": 2.241217616634717e-06, + "loss": 0.64704216, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.08300781, + "step": 7677, + "time_per_iteration": 2.8997371196746826 + }, + { + "auxiliary_loss_clip": 0.01064577, + "auxiliary_loss_mlp": 0.01024232, + "balance_loss_clip": 1.01186991, + "balance_loss_mlp": 1.0209552, + "epoch": 0.46162633398466857, + "flos": 15482443345920.0, + "grad_norm": 2.1053896035001483, + "language_loss": 0.76396, + "learning_rate": 2.2408425480652838e-06, + "loss": 0.78484803, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.43554688, + "step": 7678, + "time_per_iteration": 2.3418757915496826 + }, + { + "auxiliary_loss_clip": 0.0106795, + "auxiliary_loss_mlp": 0.01024268, + "balance_loss_clip": 1.01051772, + "balance_loss_mlp": 1.02328324, + "epoch": 0.46168645723733653, + "flos": 20010398860800.0, + "grad_norm": 4.328002644817613, + "language_loss": 0.66644013, + "learning_rate": 2.2404674709008004e-06, + "loss": 0.68736231, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44726562, + "step": 7679, + "time_per_iteration": 2.410994529724121 + }, + { + "auxiliary_loss_clip": 0.01064761, + "auxiliary_loss_mlp": 0.01028589, + "balance_loss_clip": 1.015625, + "balance_loss_mlp": 1.02108431, + "epoch": 0.4617465804900045, + "flos": 20297629549440.0, + "grad_norm": 2.518853341426678, + "language_loss": 0.69448042, + "learning_rate": 2.2400923851546506e-06, + "loss": 0.71541387, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4375, + "step": 7680, + "time_per_iteration": 2.3679637908935547 + }, + { + "auxiliary_loss_clip": 0.01071741, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.0191927, + "balance_loss_mlp": 1.02438378, + "epoch": 0.46180670374267246, + "flos": 22345215500160.0, + "grad_norm": 1.6101980478072833, + "language_loss": 0.63652432, + "learning_rate": 2.2397172908402217e-06, + "loss": 0.65757, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47265625, + "step": 7681, + "time_per_iteration": 2.393895149230957 + }, + { + "auxiliary_loss_clip": 0.01063125, + "auxiliary_loss_mlp": 0.01024131, + "balance_loss_clip": 1.01154804, + "balance_loss_mlp": 1.02062845, + "epoch": 0.46186682699534043, + "flos": 19894836660480.0, + "grad_norm": 1.3732408980399424, + "language_loss": 0.66965395, + "learning_rate": 2.2393421879708994e-06, + "loss": 0.69052649, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42578125, + "step": 7682, + "time_per_iteration": 2.3681139945983887 + }, + { + "auxiliary_loss_clip": 0.0106533, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.01689339, + "balance_loss_mlp": 1.0210197, + "epoch": 0.4619269502480084, + "flos": 31502235427200.0, + "grad_norm": 2.0801411283840103, + "language_loss": 0.73680544, + "learning_rate": 2.2389670765600693e-06, + "loss": 0.75776172, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 7683, + "time_per_iteration": 2.4395124912261963 + }, + { + "auxiliary_loss_clip": 0.01066004, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.01375222, + "balance_loss_mlp": 1.02182293, + "epoch": 0.46198707350067636, + "flos": 25008320453760.0, + "grad_norm": 5.025323148827002, + "language_loss": 0.76368928, + "learning_rate": 2.2385919566211196e-06, + "loss": 0.78461885, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44140625, + "step": 7684, + "time_per_iteration": 3.8574986457824707 + }, + { + "auxiliary_loss_clip": 0.01071019, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.01518452, + "balance_loss_mlp": 1.02272868, + "epoch": 0.4620471967533444, + "flos": 18291485249280.0, + "grad_norm": 2.5225415120217636, + "language_loss": 0.79710138, + "learning_rate": 2.2382168281674365e-06, + "loss": 0.81810284, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.48242188, + "step": 7685, + "time_per_iteration": 2.388214349746704 + }, + { + "auxiliary_loss_clip": 0.01066571, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.01593852, + "balance_loss_mlp": 1.02297771, + "epoch": 0.46210732000601235, + "flos": 33983687243520.0, + "grad_norm": 1.7433432758600402, + "language_loss": 0.66884458, + "learning_rate": 2.2378416912124076e-06, + "loss": 0.68979871, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43554688, + "step": 7686, + "time_per_iteration": 2.495123863220215 + }, + { + "auxiliary_loss_clip": 0.01066882, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01249409, + "balance_loss_mlp": 1.02089369, + "epoch": 0.4621674432586803, + "flos": 25008250631040.0, + "grad_norm": 2.157918371841039, + "language_loss": 0.74043989, + "learning_rate": 2.23746654576942e-06, + "loss": 0.76136881, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4609375, + "step": 7687, + "time_per_iteration": 2.3981354236602783 + }, + { + "auxiliary_loss_clip": 0.0106771, + "auxiliary_loss_mlp": 0.01026615, + "balance_loss_clip": 1.01310849, + "balance_loss_mlp": 1.02301443, + "epoch": 0.4622275665113483, + "flos": 22013052024960.0, + "grad_norm": 3.5471672742454436, + "language_loss": 0.73955536, + "learning_rate": 2.2370913918518635e-06, + "loss": 0.76049864, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44726562, + "step": 7688, + "time_per_iteration": 3.758408784866333 + }, + { + "auxiliary_loss_clip": 0.01063344, + "auxiliary_loss_mlp": 0.01024291, + "balance_loss_clip": 1.01090384, + "balance_loss_mlp": 1.01961446, + "epoch": 0.46228768976401624, + "flos": 24057740332800.0, + "grad_norm": 2.4364420026899554, + "language_loss": 0.78504336, + "learning_rate": 2.2367162294731247e-06, + "loss": 0.80591971, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4375, + "step": 7689, + "time_per_iteration": 3.804839611053467 + }, + { + "auxiliary_loss_clip": 0.01067686, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.0144819, + "balance_loss_mlp": 1.02192295, + "epoch": 0.4623478130166842, + "flos": 26650180961280.0, + "grad_norm": 2.0496844965322247, + "language_loss": 0.56321883, + "learning_rate": 2.236341058646592e-06, + "loss": 0.58418298, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45703125, + "step": 7690, + "time_per_iteration": 2.4147167205810547 + }, + { + "auxiliary_loss_clip": 0.01065321, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.0150044, + "balance_loss_mlp": 1.02068996, + "epoch": 0.46240793626935217, + "flos": 20557383131520.0, + "grad_norm": 2.1744885458614562, + "language_loss": 0.83076662, + "learning_rate": 2.2359658793856556e-06, + "loss": 0.85170567, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4453125, + "step": 7691, + "time_per_iteration": 2.3781399726867676 + }, + { + "auxiliary_loss_clip": 0.01066729, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.0161823, + "balance_loss_mlp": 1.02238321, + "epoch": 0.46246805952202014, + "flos": 22454947503360.0, + "grad_norm": 1.3867050416775746, + "language_loss": 0.75270033, + "learning_rate": 2.2355906917037027e-06, + "loss": 0.7736553, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44335938, + "step": 7692, + "time_per_iteration": 2.4264349937438965 + }, + { + "auxiliary_loss_clip": 0.01068801, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.01722872, + "balance_loss_mlp": 1.0219903, + "epoch": 0.4625281827746881, + "flos": 35039914738560.0, + "grad_norm": 2.5778667655800316, + "language_loss": 0.74160755, + "learning_rate": 2.2352154956141253e-06, + "loss": 0.76261371, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46875, + "step": 7693, + "time_per_iteration": 2.504532814025879 + }, + { + "auxiliary_loss_clip": 0.01066357, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01561832, + "balance_loss_mlp": 1.02244878, + "epoch": 0.46258830602735607, + "flos": 21067603873920.0, + "grad_norm": 1.524689995153138, + "language_loss": 0.85453486, + "learning_rate": 2.2348402911303113e-06, + "loss": 0.87548614, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43945312, + "step": 7694, + "time_per_iteration": 2.3941807746887207 + }, + { + "auxiliary_loss_clip": 0.01067166, + "auxiliary_loss_mlp": 0.01027227, + "balance_loss_clip": 1.01416135, + "balance_loss_mlp": 1.02167034, + "epoch": 0.46264842928002403, + "flos": 26176025520000.0, + "grad_norm": 1.9418597809017273, + "language_loss": 0.79137158, + "learning_rate": 2.2344650782656512e-06, + "loss": 0.81231552, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45507812, + "step": 7695, + "time_per_iteration": 2.4056222438812256 + }, + { + "auxiliary_loss_clip": 0.01064149, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.01950407, + "balance_loss_mlp": 1.02065682, + "epoch": 0.462708552532692, + "flos": 16763266817280.0, + "grad_norm": 1.8450040999672972, + "language_loss": 0.72306514, + "learning_rate": 2.234089857033536e-06, + "loss": 0.74402273, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.43554688, + "step": 7696, + "time_per_iteration": 2.3952455520629883 + }, + { + "auxiliary_loss_clip": 0.01066769, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.01769948, + "balance_loss_mlp": 1.02172184, + "epoch": 0.46276867578535996, + "flos": 15559531361280.0, + "grad_norm": 1.631902607078984, + "language_loss": 0.69227636, + "learning_rate": 2.233714627447356e-06, + "loss": 0.7132628, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.44921875, + "step": 7697, + "time_per_iteration": 2.3501386642456055 + }, + { + "auxiliary_loss_clip": 0.01066239, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.0130012, + "balance_loss_mlp": 1.02262115, + "epoch": 0.462828799038028, + "flos": 22414413548160.0, + "grad_norm": 1.9185669706162547, + "language_loss": 0.84495747, + "learning_rate": 2.233339389520502e-06, + "loss": 0.86587781, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7698, + "time_per_iteration": 2.3857288360595703 + }, + { + "auxiliary_loss_clip": 0.01066485, + "auxiliary_loss_mlp": 0.01025077, + "balance_loss_clip": 1.01221466, + "balance_loss_mlp": 1.02213097, + "epoch": 0.46288892229069595, + "flos": 21068511569280.0, + "grad_norm": 2.286075221877439, + "language_loss": 0.71265572, + "learning_rate": 2.2329641432663653e-06, + "loss": 0.73357135, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44335938, + "step": 7699, + "time_per_iteration": 2.3969204425811768 + }, + { + "auxiliary_loss_clip": 0.01067133, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.01202798, + "balance_loss_mlp": 1.02083218, + "epoch": 0.4629490455433639, + "flos": 23184562429440.0, + "grad_norm": 1.7873700944615993, + "language_loss": 0.71438152, + "learning_rate": 2.232588888698337e-06, + "loss": 0.73531592, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46289062, + "step": 7700, + "time_per_iteration": 2.399184465408325 + }, + { + "auxiliary_loss_clip": 0.01065028, + "auxiliary_loss_mlp": 0.01026003, + "balance_loss_clip": 1.01301479, + "balance_loss_mlp": 1.02002263, + "epoch": 0.4630091687960319, + "flos": 18834768915840.0, + "grad_norm": 2.3049212966744945, + "language_loss": 0.72118694, + "learning_rate": 2.232213625829811e-06, + "loss": 0.74209726, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.45117188, + "step": 7701, + "time_per_iteration": 2.3478362560272217 + }, + { + "auxiliary_loss_clip": 0.01069614, + "auxiliary_loss_mlp": 0.010279, + "balance_loss_clip": 1.01350498, + "balance_loss_mlp": 1.02218771, + "epoch": 0.46306929204869984, + "flos": 38905568161920.0, + "grad_norm": 3.5940294881564676, + "language_loss": 0.64678752, + "learning_rate": 2.2318383546741768e-06, + "loss": 0.66776264, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47460938, + "step": 7702, + "time_per_iteration": 2.4980087280273438 + }, + { + "auxiliary_loss_clip": 0.01064735, + "auxiliary_loss_mlp": 0.01020875, + "balance_loss_clip": 1.00770235, + "balance_loss_mlp": 1.02171826, + "epoch": 0.4631294153013678, + "flos": 19643217425280.0, + "grad_norm": 1.9520945296197627, + "language_loss": 0.73089433, + "learning_rate": 2.231463075244829e-06, + "loss": 0.75175047, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4296875, + "step": 7703, + "time_per_iteration": 2.3490116596221924 + }, + { + "auxiliary_loss_clip": 0.01068619, + "auxiliary_loss_mlp": 0.0102772, + "balance_loss_clip": 1.01325393, + "balance_loss_mlp": 1.02237916, + "epoch": 0.4631895385540358, + "flos": 24607098576000.0, + "grad_norm": 1.9814660146978083, + "language_loss": 0.68582273, + "learning_rate": 2.231087787555159e-06, + "loss": 0.70678604, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46289062, + "step": 7704, + "time_per_iteration": 2.4178662300109863 + }, + { + "auxiliary_loss_clip": 0.01068231, + "auxiliary_loss_mlp": 0.01024735, + "balance_loss_clip": 1.01081133, + "balance_loss_mlp": 1.02182746, + "epoch": 0.46324966180670374, + "flos": 26318995004160.0, + "grad_norm": 2.825560502392505, + "language_loss": 0.81424409, + "learning_rate": 2.23071249161856e-06, + "loss": 0.83517373, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46484375, + "step": 7705, + "time_per_iteration": 2.4089674949645996 + }, + { + "auxiliary_loss_clip": 0.01066459, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01661837, + "balance_loss_mlp": 1.02044845, + "epoch": 0.4633097850593717, + "flos": 19239621575040.0, + "grad_norm": 1.6115609033867886, + "language_loss": 0.7715323, + "learning_rate": 2.230337187448426e-06, + "loss": 0.79250121, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4609375, + "step": 7706, + "time_per_iteration": 2.3699300289154053 + }, + { + "auxiliary_loss_clip": 0.01063352, + "auxiliary_loss_mlp": 0.01024666, + "balance_loss_clip": 1.01136184, + "balance_loss_mlp": 1.0214355, + "epoch": 0.46336990831203967, + "flos": 22782083742720.0, + "grad_norm": 1.9285582299947877, + "language_loss": 0.70314741, + "learning_rate": 2.2299618750581498e-06, + "loss": 0.72402757, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41992188, + "step": 7707, + "time_per_iteration": 2.37332820892334 + }, + { + "auxiliary_loss_clip": 0.01067928, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.0165832, + "balance_loss_mlp": 1.02040386, + "epoch": 0.46343003156470763, + "flos": 38209260539520.0, + "grad_norm": 2.037978260870784, + "language_loss": 0.60918796, + "learning_rate": 2.2295865544611264e-06, + "loss": 0.63018483, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.4765625, + "step": 7708, + "time_per_iteration": 2.529301643371582 + }, + { + "auxiliary_loss_clip": 0.01066719, + "auxiliary_loss_mlp": 0.01025895, + "balance_loss_clip": 1.01238871, + "balance_loss_mlp": 1.02206373, + "epoch": 0.4634901548173756, + "flos": 31937288279040.0, + "grad_norm": 1.8610382995495445, + "language_loss": 0.77727807, + "learning_rate": 2.229211225670749e-06, + "loss": 0.79820418, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 7709, + "time_per_iteration": 2.4581143856048584 + }, + { + "auxiliary_loss_clip": 0.01069002, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01569748, + "balance_loss_mlp": 1.02280211, + "epoch": 0.46355027807004356, + "flos": 20081551944960.0, + "grad_norm": 1.539072185075853, + "language_loss": 0.76871908, + "learning_rate": 2.2288358887004127e-06, + "loss": 0.78970599, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4609375, + "step": 7710, + "time_per_iteration": 2.3774118423461914 + }, + { + "auxiliary_loss_clip": 0.01068044, + "auxiliary_loss_mlp": 0.01024577, + "balance_loss_clip": 1.0091995, + "balance_loss_mlp": 1.02167952, + "epoch": 0.4636104013227116, + "flos": 24060219039360.0, + "grad_norm": 2.320242062377287, + "language_loss": 0.73242176, + "learning_rate": 2.2284605435635124e-06, + "loss": 0.75334787, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.46484375, + "step": 7711, + "time_per_iteration": 2.3956570625305176 + }, + { + "auxiliary_loss_clip": 0.0106768, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.01858318, + "balance_loss_mlp": 1.02373576, + "epoch": 0.46367052457537955, + "flos": 23913514039680.0, + "grad_norm": 1.734667402765526, + "language_loss": 0.71424818, + "learning_rate": 2.2280851902734427e-06, + "loss": 0.73525226, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.43945312, + "step": 7712, + "time_per_iteration": 2.404571533203125 + }, + { + "auxiliary_loss_clip": 0.01071362, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.01827979, + "balance_loss_mlp": 1.02313924, + "epoch": 0.4637306478280475, + "flos": 26395314969600.0, + "grad_norm": 1.6278838224724683, + "language_loss": 0.72897279, + "learning_rate": 2.2277098288435994e-06, + "loss": 0.75002778, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.48242188, + "step": 7713, + "time_per_iteration": 2.413494110107422 + }, + { + "auxiliary_loss_clip": 0.01068465, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.0154593, + "balance_loss_mlp": 1.02203023, + "epoch": 0.4637907710807155, + "flos": 21979639987200.0, + "grad_norm": 1.6856280385027704, + "language_loss": 0.58576727, + "learning_rate": 2.2273344592873775e-06, + "loss": 0.60675466, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.46484375, + "step": 7714, + "time_per_iteration": 2.40552020072937 + }, + { + "auxiliary_loss_clip": 0.01063423, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.01381779, + "balance_loss_mlp": 1.02018869, + "epoch": 0.46385089433338345, + "flos": 12421468005120.0, + "grad_norm": 1.9514164816546398, + "language_loss": 0.70146608, + "learning_rate": 2.226959081618174e-06, + "loss": 0.72237688, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.43164062, + "step": 7715, + "time_per_iteration": 2.353139877319336 + }, + { + "auxiliary_loss_clip": 0.01070581, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.01364112, + "balance_loss_mlp": 1.02219355, + "epoch": 0.4639110175860514, + "flos": 23914596291840.0, + "grad_norm": 2.404493998873162, + "language_loss": 0.69158256, + "learning_rate": 2.2265836958493854e-06, + "loss": 0.71256995, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.484375, + "step": 7716, + "time_per_iteration": 3.807614326477051 + }, + { + "auxiliary_loss_clip": 0.01068018, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.01234674, + "balance_loss_mlp": 1.02193832, + "epoch": 0.4639711408387194, + "flos": 25299251746560.0, + "grad_norm": 1.5811116034125423, + "language_loss": 0.80862391, + "learning_rate": 2.2262083019944064e-06, + "loss": 0.82956827, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4609375, + "step": 7717, + "time_per_iteration": 2.408647298812866 + }, + { + "auxiliary_loss_clip": 0.01066565, + "auxiliary_loss_mlp": 0.0102685, + "balance_loss_clip": 1.01364183, + "balance_loss_mlp": 1.02073348, + "epoch": 0.46403126409138734, + "flos": 21210852648960.0, + "grad_norm": 1.6487709361848497, + "language_loss": 0.73361868, + "learning_rate": 2.225832900066636e-06, + "loss": 0.75455284, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45898438, + "step": 7718, + "time_per_iteration": 2.3821303844451904 + }, + { + "auxiliary_loss_clip": 0.01064414, + "auxiliary_loss_mlp": 0.01025727, + "balance_loss_clip": 1.01225674, + "balance_loss_mlp": 1.02012539, + "epoch": 0.4640913873440553, + "flos": 35844104062080.0, + "grad_norm": 1.7369161329491867, + "language_loss": 0.70416945, + "learning_rate": 2.2254574900794693e-06, + "loss": 0.72507083, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44335938, + "step": 7719, + "time_per_iteration": 2.5035245418548584 + }, + { + "auxiliary_loss_clip": 0.01069734, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.01434827, + "balance_loss_mlp": 1.02229679, + "epoch": 0.46415151059672327, + "flos": 19165361379840.0, + "grad_norm": 1.7280022753195905, + "language_loss": 0.86670429, + "learning_rate": 2.2250820720463055e-06, + "loss": 0.88769042, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.47265625, + "step": 7720, + "time_per_iteration": 2.390904664993286 + }, + { + "auxiliary_loss_clip": 0.01010937, + "auxiliary_loss_mlp": 0.01007791, + "balance_loss_clip": 1.00668275, + "balance_loss_mlp": 1.0023334, + "epoch": 0.46421163384939124, + "flos": 58909401820800.0, + "grad_norm": 0.7430881520471856, + "language_loss": 0.55019766, + "learning_rate": 2.2247066459805414e-06, + "loss": 0.57038498, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.0859375, + "step": 7721, + "time_per_iteration": 3.0670084953308105 + }, + { + "auxiliary_loss_clip": 0.01067978, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.01962352, + "balance_loss_mlp": 1.0224297, + "epoch": 0.4642717571020592, + "flos": 20156300899200.0, + "grad_norm": 1.7947203274252757, + "language_loss": 0.79910564, + "learning_rate": 2.2243312118955746e-06, + "loss": 0.82012439, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45507812, + "step": 7722, + "time_per_iteration": 2.3853704929351807 + }, + { + "auxiliary_loss_clip": 0.01068504, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.01285958, + "balance_loss_mlp": 1.02151787, + "epoch": 0.46433188035472717, + "flos": 25045014159360.0, + "grad_norm": 1.5334753909363301, + "language_loss": 0.77651954, + "learning_rate": 2.2239557698048043e-06, + "loss": 0.79747319, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46875, + "step": 7723, + "time_per_iteration": 3.780885934829712 + }, + { + "auxiliary_loss_clip": 0.01063537, + "auxiliary_loss_mlp": 0.01028127, + "balance_loss_clip": 1.01470399, + "balance_loss_mlp": 1.01992142, + "epoch": 0.4643920036073952, + "flos": 28074357941760.0, + "grad_norm": 1.4880764369730433, + "language_loss": 0.68022025, + "learning_rate": 2.2235803197216285e-06, + "loss": 0.70113689, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43554688, + "step": 7724, + "time_per_iteration": 2.425663948059082 + }, + { + "auxiliary_loss_clip": 0.01065102, + "auxiliary_loss_mlp": 0.01028795, + "balance_loss_clip": 1.01526475, + "balance_loss_mlp": 1.02026081, + "epoch": 0.46445212686006315, + "flos": 18368363796480.0, + "grad_norm": 3.099528075562372, + "language_loss": 0.6712867, + "learning_rate": 2.2232048616594464e-06, + "loss": 0.69222569, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 7725, + "time_per_iteration": 2.3602402210235596 + }, + { + "auxiliary_loss_clip": 0.01062341, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.01500177, + "balance_loss_mlp": 1.02074099, + "epoch": 0.4645122501127311, + "flos": 31720302979200.0, + "grad_norm": 1.9374771644634576, + "language_loss": 0.64883041, + "learning_rate": 2.2228293956316563e-06, + "loss": 0.66972816, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41601562, + "step": 7726, + "time_per_iteration": 2.453810453414917 + }, + { + "auxiliary_loss_clip": 0.01068541, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.01592326, + "balance_loss_mlp": 1.0235256, + "epoch": 0.4645723733653991, + "flos": 23767681824000.0, + "grad_norm": 1.7286790823350173, + "language_loss": 0.7493462, + "learning_rate": 2.2224539216516592e-06, + "loss": 0.77032471, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44921875, + "step": 7727, + "time_per_iteration": 2.418638229370117 + }, + { + "auxiliary_loss_clip": 0.01066719, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01431012, + "balance_loss_mlp": 1.02147293, + "epoch": 0.46463249661806705, + "flos": 33144130846080.0, + "grad_norm": 2.3056956905010337, + "language_loss": 0.7843228, + "learning_rate": 2.2220784397328534e-06, + "loss": 0.80527592, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.453125, + "step": 7728, + "time_per_iteration": 3.892300605773926 + }, + { + "auxiliary_loss_clip": 0.01068095, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.01288629, + "balance_loss_mlp": 1.02192879, + "epoch": 0.464692619870735, + "flos": 18295046208000.0, + "grad_norm": 1.9044745378616401, + "language_loss": 0.75553155, + "learning_rate": 2.2217029498886386e-06, + "loss": 0.77648866, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.4609375, + "step": 7729, + "time_per_iteration": 3.8075358867645264 + }, + { + "auxiliary_loss_clip": 0.0106678, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01555502, + "balance_loss_mlp": 1.02169013, + "epoch": 0.464752743123403, + "flos": 22636949754240.0, + "grad_norm": 2.4893470892000535, + "language_loss": 0.80070174, + "learning_rate": 2.2213274521324174e-06, + "loss": 0.82165766, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45117188, + "step": 7730, + "time_per_iteration": 2.4091007709503174 + }, + { + "auxiliary_loss_clip": 0.0106601, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.01776719, + "balance_loss_mlp": 1.02188206, + "epoch": 0.46481286637607094, + "flos": 20411097068160.0, + "grad_norm": 1.5668078032830803, + "language_loss": 0.76512903, + "learning_rate": 2.220951946477587e-06, + "loss": 0.78610253, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44140625, + "step": 7731, + "time_per_iteration": 2.405230760574341 + }, + { + "auxiliary_loss_clip": 0.01064536, + "auxiliary_loss_mlp": 0.0102606, + "balance_loss_clip": 1.01328087, + "balance_loss_mlp": 1.02075195, + "epoch": 0.4648729896287389, + "flos": 34274025043200.0, + "grad_norm": 3.814558385650242, + "language_loss": 0.66072118, + "learning_rate": 2.2205764329375516e-06, + "loss": 0.68162715, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7732, + "time_per_iteration": 2.5199601650238037 + }, + { + "auxiliary_loss_clip": 0.0106937, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.01703656, + "balance_loss_mlp": 1.02156115, + "epoch": 0.4649331128814069, + "flos": 21320794120320.0, + "grad_norm": 3.1771110331955206, + "language_loss": 0.7262404, + "learning_rate": 2.2202009115257105e-06, + "loss": 0.74724889, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47851562, + "step": 7733, + "time_per_iteration": 2.4219212532043457 + }, + { + "auxiliary_loss_clip": 0.01009502, + "auxiliary_loss_mlp": 0.01012709, + "balance_loss_clip": 1.01146901, + "balance_loss_mlp": 1.00089943, + "epoch": 0.46499323613407484, + "flos": 58305754546560.0, + "grad_norm": 1.2264746021800712, + "language_loss": 0.5193119, + "learning_rate": 2.219825382255464e-06, + "loss": 0.53953403, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.0859375, + "step": 7734, + "time_per_iteration": 2.982264757156372 + }, + { + "auxiliary_loss_clip": 0.01066777, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.01145005, + "balance_loss_mlp": 1.02184105, + "epoch": 0.4650533593867428, + "flos": 10888885653120.0, + "grad_norm": 1.9397602013310795, + "language_loss": 0.75498128, + "learning_rate": 2.2194498451402163e-06, + "loss": 0.77590007, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44921875, + "step": 7735, + "time_per_iteration": 2.366530418395996 + }, + { + "auxiliary_loss_clip": 0.01064845, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.01764262, + "balance_loss_mlp": 1.02173746, + "epoch": 0.46511348263941077, + "flos": 19973565509760.0, + "grad_norm": 1.7205645776565208, + "language_loss": 0.69812584, + "learning_rate": 2.2190743001933675e-06, + "loss": 0.71908396, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43164062, + "step": 7736, + "time_per_iteration": 2.386244058609009 + }, + { + "auxiliary_loss_clip": 0.01064345, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.01577115, + "balance_loss_mlp": 1.0214628, + "epoch": 0.46517360589207873, + "flos": 19677502247040.0, + "grad_norm": 2.8701810096713722, + "language_loss": 0.72114456, + "learning_rate": 2.2186987474283207e-06, + "loss": 0.74207181, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4296875, + "step": 7737, + "time_per_iteration": 2.4253880977630615 + }, + { + "auxiliary_loss_clip": 0.01069225, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.01522446, + "balance_loss_mlp": 1.02406263, + "epoch": 0.46523372914474675, + "flos": 16871742011520.0, + "grad_norm": 1.746460547513704, + "language_loss": 0.83720684, + "learning_rate": 2.218323186858478e-06, + "loss": 0.85820365, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.45117188, + "step": 7738, + "time_per_iteration": 2.399306058883667 + }, + { + "auxiliary_loss_clip": 0.01009977, + "auxiliary_loss_mlp": 0.01004468, + "balance_loss_clip": 1.00337172, + "balance_loss_mlp": 1.00151062, + "epoch": 0.4652938523974147, + "flos": 53435963243520.0, + "grad_norm": 0.7628263890249222, + "language_loss": 0.57821149, + "learning_rate": 2.2179476184972428e-06, + "loss": 0.59835595, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.08496094, + "step": 7739, + "time_per_iteration": 3.108593225479126 + }, + { + "auxiliary_loss_clip": 0.01066309, + "auxiliary_loss_mlp": 0.01031104, + "balance_loss_clip": 1.01789522, + "balance_loss_mlp": 1.02195549, + "epoch": 0.4653539756500827, + "flos": 15230405174400.0, + "grad_norm": 1.6493992358648013, + "language_loss": 0.84969616, + "learning_rate": 2.2175720423580173e-06, + "loss": 0.87067032, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4453125, + "step": 7740, + "time_per_iteration": 2.402707815170288 + }, + { + "auxiliary_loss_clip": 0.0106757, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.01314163, + "balance_loss_mlp": 1.02343917, + "epoch": 0.46541409890275065, + "flos": 23731127763840.0, + "grad_norm": 1.6495506673108782, + "language_loss": 0.75838095, + "learning_rate": 2.217196458454205e-06, + "loss": 0.7793321, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.44140625, + "step": 7741, + "time_per_iteration": 2.430569887161255 + }, + { + "auxiliary_loss_clip": 0.0106942, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.01929212, + "balance_loss_mlp": 1.02150846, + "epoch": 0.4654742221554186, + "flos": 20846359388160.0, + "grad_norm": 1.879226821212009, + "language_loss": 0.80814719, + "learning_rate": 2.2168208667992105e-06, + "loss": 0.82917428, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.48046875, + "step": 7742, + "time_per_iteration": 2.436495780944824 + }, + { + "auxiliary_loss_clip": 0.01068337, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.01765108, + "balance_loss_mlp": 1.02176464, + "epoch": 0.4655343454080866, + "flos": 20703773928960.0, + "grad_norm": 1.699030568511101, + "language_loss": 0.72454715, + "learning_rate": 2.2164452674064365e-06, + "loss": 0.74555761, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.46484375, + "step": 7743, + "time_per_iteration": 2.4128406047821045 + }, + { + "auxiliary_loss_clip": 0.01065523, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.01654375, + "balance_loss_mlp": 1.02058852, + "epoch": 0.46559446866075455, + "flos": 18988840212480.0, + "grad_norm": 1.7688187979397727, + "language_loss": 0.71584105, + "learning_rate": 2.2160696602892875e-06, + "loss": 0.73680294, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.44921875, + "step": 7744, + "time_per_iteration": 2.41943621635437 + }, + { + "auxiliary_loss_clip": 0.01064863, + "auxiliary_loss_mlp": 0.01026913, + "balance_loss_clip": 1.01463473, + "balance_loss_mlp": 1.02137661, + "epoch": 0.4656545919134225, + "flos": 34494920415360.0, + "grad_norm": 1.632773033144232, + "language_loss": 0.85235816, + "learning_rate": 2.2156940454611685e-06, + "loss": 0.87327588, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.43359375, + "step": 7745, + "time_per_iteration": 2.492661237716675 + }, + { + "auxiliary_loss_clip": 0.01065519, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.01672721, + "balance_loss_mlp": 1.02232242, + "epoch": 0.4657147151660905, + "flos": 24309569036160.0, + "grad_norm": 2.8561654741058544, + "language_loss": 0.73064858, + "learning_rate": 2.215318422935484e-06, + "loss": 0.7516073, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43164062, + "step": 7746, + "time_per_iteration": 2.419539213180542 + }, + { + "auxiliary_loss_clip": 0.01065987, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.01676023, + "balance_loss_mlp": 1.02262926, + "epoch": 0.46577483841875844, + "flos": 58793038525440.0, + "grad_norm": 1.5323616865372867, + "language_loss": 0.69640684, + "learning_rate": 2.2149427927256387e-06, + "loss": 0.71736932, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 7747, + "time_per_iteration": 2.708829402923584 + }, + { + "auxiliary_loss_clip": 0.01064301, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.01700377, + "balance_loss_mlp": 1.01976466, + "epoch": 0.4658349616714264, + "flos": 31320617201280.0, + "grad_norm": 1.6524302927437575, + "language_loss": 0.67173475, + "learning_rate": 2.2145671548450378e-06, + "loss": 0.69268465, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4453125, + "step": 7748, + "time_per_iteration": 2.481064796447754 + }, + { + "auxiliary_loss_clip": 0.01068463, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.01537108, + "balance_loss_mlp": 1.02121937, + "epoch": 0.46589508492409437, + "flos": 14859627868800.0, + "grad_norm": 3.3260202886568018, + "language_loss": 0.63611847, + "learning_rate": 2.2141915093070875e-06, + "loss": 0.65710783, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.47265625, + "step": 7749, + "time_per_iteration": 2.3603060245513916 + }, + { + "auxiliary_loss_clip": 0.01067817, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01706529, + "balance_loss_mlp": 1.02211499, + "epoch": 0.46595520817676234, + "flos": 12895169598720.0, + "grad_norm": 1.9607370557312682, + "language_loss": 0.74399722, + "learning_rate": 2.213815856125193e-06, + "loss": 0.76498461, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45703125, + "step": 7750, + "time_per_iteration": 2.374199867248535 + }, + { + "auxiliary_loss_clip": 0.0106808, + "auxiliary_loss_mlp": 0.01025095, + "balance_loss_clip": 1.01132011, + "balance_loss_mlp": 1.02202833, + "epoch": 0.46601533142943036, + "flos": 32852780616960.0, + "grad_norm": 1.816348230578008, + "language_loss": 0.74230325, + "learning_rate": 2.21344019531276e-06, + "loss": 0.76323497, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4609375, + "step": 7751, + "time_per_iteration": 2.4902355670928955 + }, + { + "auxiliary_loss_clip": 0.01069042, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.01427472, + "balance_loss_mlp": 1.0221386, + "epoch": 0.4660754546820983, + "flos": 19966687971840.0, + "grad_norm": 1.9674559366158486, + "language_loss": 0.73524445, + "learning_rate": 2.2130645268831965e-06, + "loss": 0.75622964, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.46875, + "step": 7752, + "time_per_iteration": 2.3746883869171143 + }, + { + "auxiliary_loss_clip": 0.01068508, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.01677334, + "balance_loss_mlp": 1.02199459, + "epoch": 0.4661355779347663, + "flos": 26686944489600.0, + "grad_norm": 5.107758458198386, + "language_loss": 0.6949445, + "learning_rate": 2.2126888508499074e-06, + "loss": 0.71594095, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46484375, + "step": 7753, + "time_per_iteration": 2.4422481060028076 + }, + { + "auxiliary_loss_clip": 0.01063842, + "auxiliary_loss_mlp": 0.01024506, + "balance_loss_clip": 1.0112623, + "balance_loss_mlp": 1.02135754, + "epoch": 0.46619570118743425, + "flos": 20958395541120.0, + "grad_norm": 3.520382422956918, + "language_loss": 0.72983831, + "learning_rate": 2.2123131672263005e-06, + "loss": 0.75072181, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42578125, + "step": 7754, + "time_per_iteration": 3.798454523086548 + }, + { + "auxiliary_loss_clip": 0.01067913, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.01674485, + "balance_loss_mlp": 1.02087379, + "epoch": 0.4662558244401022, + "flos": 24424921768320.0, + "grad_norm": 1.5785851423324844, + "language_loss": 0.79017925, + "learning_rate": 2.2119374760257828e-06, + "loss": 0.81116849, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.46875, + "step": 7755, + "time_per_iteration": 2.4098377227783203 + }, + { + "auxiliary_loss_clip": 0.01065054, + "auxiliary_loss_mlp": 0.01022816, + "balance_loss_clip": 1.01007831, + "balance_loss_mlp": 1.02145398, + "epoch": 0.4663159476927702, + "flos": 20594391039360.0, + "grad_norm": 2.9673875742231184, + "language_loss": 0.71733618, + "learning_rate": 2.2115617772617614e-06, + "loss": 0.73821485, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 7756, + "time_per_iteration": 2.401563882827759 + }, + { + "auxiliary_loss_clip": 0.01064794, + "auxiliary_loss_mlp": 0.01028698, + "balance_loss_clip": 1.0154897, + "balance_loss_mlp": 1.02089238, + "epoch": 0.46637607094543815, + "flos": 25660812453120.0, + "grad_norm": 1.6667450920967537, + "language_loss": 0.78091884, + "learning_rate": 2.211186070947645e-06, + "loss": 0.80185378, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43945312, + "step": 7757, + "time_per_iteration": 2.4417884349823 + }, + { + "auxiliary_loss_clip": 0.0106395, + "auxiliary_loss_mlp": 0.01023968, + "balance_loss_clip": 1.01093841, + "balance_loss_mlp": 1.02057898, + "epoch": 0.4664361941981061, + "flos": 24272875330560.0, + "grad_norm": 1.7863351528759497, + "language_loss": 0.66179377, + "learning_rate": 2.2108103570968403e-06, + "loss": 0.68267292, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43359375, + "step": 7758, + "time_per_iteration": 2.638392686843872 + }, + { + "auxiliary_loss_clip": 0.01066858, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.01272416, + "balance_loss_mlp": 1.02189791, + "epoch": 0.4664963174507741, + "flos": 18404882945280.0, + "grad_norm": 1.5862221572431663, + "language_loss": 0.74587375, + "learning_rate": 2.210434635722757e-06, + "loss": 0.76680171, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44921875, + "step": 7759, + "time_per_iteration": 2.411034345626831 + }, + { + "auxiliary_loss_clip": 0.01065757, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.0168649, + "balance_loss_mlp": 1.02156973, + "epoch": 0.46655644070344204, + "flos": 22454039808000.0, + "grad_norm": 1.5187986529225563, + "language_loss": 0.77517295, + "learning_rate": 2.2100589068388028e-06, + "loss": 0.79613143, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44140625, + "step": 7760, + "time_per_iteration": 2.47879958152771 + }, + { + "auxiliary_loss_clip": 0.01063485, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.01356435, + "balance_loss_mlp": 1.02076602, + "epoch": 0.46661656395611, + "flos": 13807554825600.0, + "grad_norm": 1.801638824331959, + "language_loss": 0.73890877, + "learning_rate": 2.2096831704583858e-06, + "loss": 0.75981045, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42773438, + "step": 7761, + "time_per_iteration": 2.410750150680542 + }, + { + "auxiliary_loss_clip": 0.01067628, + "auxiliary_loss_mlp": 0.01036524, + "balance_loss_clip": 1.0214026, + "balance_loss_mlp": 1.02097845, + "epoch": 0.466676687208778, + "flos": 21651107293440.0, + "grad_norm": 1.6292229542887844, + "language_loss": 0.78569746, + "learning_rate": 2.2093074265949164e-06, + "loss": 0.80673903, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.46679688, + "step": 7762, + "time_per_iteration": 2.4111809730529785 + }, + { + "auxiliary_loss_clip": 0.01009525, + "auxiliary_loss_mlp": 0.01001889, + "balance_loss_clip": 1.00080967, + "balance_loss_mlp": 1.00084686, + "epoch": 0.46673681046144594, + "flos": 68530941653760.0, + "grad_norm": 0.7946830813156058, + "language_loss": 0.59758484, + "learning_rate": 2.2089316752618034e-06, + "loss": 0.61769891, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.08691406, + "step": 7763, + "time_per_iteration": 4.5356221199035645 + }, + { + "auxiliary_loss_clip": 0.01067498, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.01807082, + "balance_loss_mlp": 1.02042103, + "epoch": 0.46679693371411396, + "flos": 15813559480320.0, + "grad_norm": 2.633455462621257, + "language_loss": 0.79015028, + "learning_rate": 2.208555916472456e-06, + "loss": 0.81115109, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.47070312, + "step": 7764, + "time_per_iteration": 2.372079610824585 + }, + { + "auxiliary_loss_clip": 0.01064358, + "auxiliary_loss_mlp": 0.01027766, + "balance_loss_clip": 1.01531482, + "balance_loss_mlp": 1.02091122, + "epoch": 0.4668570569667819, + "flos": 18513602519040.0, + "grad_norm": 2.354894001162374, + "language_loss": 0.71852684, + "learning_rate": 2.208180150240285e-06, + "loss": 0.73944807, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43359375, + "step": 7765, + "time_per_iteration": 2.38215970993042 + }, + { + "auxiliary_loss_clip": 0.01066748, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.02071702, + "epoch": 0.4669171802194499, + "flos": 19205685866880.0, + "grad_norm": 2.1210621133639034, + "language_loss": 0.77836835, + "learning_rate": 2.2078043765786993e-06, + "loss": 0.79933333, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4609375, + "step": 7766, + "time_per_iteration": 2.36773943901062 + }, + { + "auxiliary_loss_clip": 0.01066536, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.01770592, + "balance_loss_mlp": 1.02152729, + "epoch": 0.46697730347211786, + "flos": 12275321587200.0, + "grad_norm": 2.65909672976516, + "language_loss": 0.6488325, + "learning_rate": 2.2074285955011097e-06, + "loss": 0.66981095, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 7767, + "time_per_iteration": 3.8005311489105225 + }, + { + "auxiliary_loss_clip": 0.01068674, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.01283288, + "balance_loss_mlp": 1.02280664, + "epoch": 0.4670374267247858, + "flos": 23585609750400.0, + "grad_norm": 1.9249148181116778, + "language_loss": 0.74231493, + "learning_rate": 2.2070528070209272e-06, + "loss": 0.76327044, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45898438, + "step": 7768, + "time_per_iteration": 2.406558036804199 + }, + { + "auxiliary_loss_clip": 0.01066219, + "auxiliary_loss_mlp": 0.01025647, + "balance_loss_clip": 1.01306462, + "balance_loss_mlp": 1.02187943, + "epoch": 0.4670975499774538, + "flos": 15990359938560.0, + "grad_norm": 2.0603062680850255, + "language_loss": 0.71336675, + "learning_rate": 2.2066770111515635e-06, + "loss": 0.73428535, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44335938, + "step": 7769, + "time_per_iteration": 3.802736282348633 + }, + { + "auxiliary_loss_clip": 0.0106577, + "auxiliary_loss_mlp": 0.01027374, + "balance_loss_clip": 1.0144279, + "balance_loss_mlp": 1.02114713, + "epoch": 0.46715767323012175, + "flos": 15376691237760.0, + "grad_norm": 2.9029188756209447, + "language_loss": 0.81096727, + "learning_rate": 2.206301207906428e-06, + "loss": 0.83189875, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4453125, + "step": 7770, + "time_per_iteration": 2.392916679382324 + }, + { + "auxiliary_loss_clip": 0.01010706, + "auxiliary_loss_mlp": 0.01001717, + "balance_loss_clip": 1.00061452, + "balance_loss_mlp": 1.00223422, + "epoch": 0.4672177964827897, + "flos": 60249124488960.0, + "grad_norm": 0.8384429192345793, + "language_loss": 0.5569185, + "learning_rate": 2.2059253972989332e-06, + "loss": 0.5770427, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.08496094, + "step": 7771, + "time_per_iteration": 3.0725886821746826 + }, + { + "auxiliary_loss_clip": 0.01063498, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.02041721, + "epoch": 0.4672779197354577, + "flos": 27634906258560.0, + "grad_norm": 1.8258742320873578, + "language_loss": 0.65891206, + "learning_rate": 2.2055495793424913e-06, + "loss": 0.67984509, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 7772, + "time_per_iteration": 2.4973719120025635 + }, + { + "auxiliary_loss_clip": 0.01064485, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.01784527, + "balance_loss_mlp": 1.02210045, + "epoch": 0.46733804298812565, + "flos": 31392922360320.0, + "grad_norm": 1.8878539247102273, + "language_loss": 0.63784468, + "learning_rate": 2.2051737540505128e-06, + "loss": 0.65878963, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.42382812, + "step": 7773, + "time_per_iteration": 2.475296974182129 + }, + { + "auxiliary_loss_clip": 0.01064448, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.01742423, + "balance_loss_mlp": 1.01988935, + "epoch": 0.4673981662407936, + "flos": 19499584625280.0, + "grad_norm": 2.253546744838553, + "language_loss": 0.79989576, + "learning_rate": 2.2047979214364117e-06, + "loss": 0.82085168, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4453125, + "step": 7774, + "time_per_iteration": 2.412198066711426 + }, + { + "auxiliary_loss_clip": 0.0106608, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01340723, + "balance_loss_mlp": 1.02192259, + "epoch": 0.4674582894934616, + "flos": 20520794160000.0, + "grad_norm": 1.5792632443190542, + "language_loss": 0.77548873, + "learning_rate": 2.2044220815135984e-06, + "loss": 0.79641807, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44140625, + "step": 7775, + "time_per_iteration": 2.402048110961914 + }, + { + "auxiliary_loss_clip": 0.01062679, + "auxiliary_loss_mlp": 0.0102417, + "balance_loss_clip": 1.01087189, + "balance_loss_mlp": 1.02016973, + "epoch": 0.46751841274612954, + "flos": 22089860749440.0, + "grad_norm": 1.9581698146010882, + "language_loss": 0.69592309, + "learning_rate": 2.2040462342954876e-06, + "loss": 0.71679157, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42578125, + "step": 7776, + "time_per_iteration": 2.417745351791382 + }, + { + "auxiliary_loss_clip": 0.01065733, + "auxiliary_loss_mlp": 0.01026369, + "balance_loss_clip": 1.01374459, + "balance_loss_mlp": 1.02141237, + "epoch": 0.46757853599879756, + "flos": 26978853300480.0, + "grad_norm": 1.514393767508111, + "language_loss": 0.73791897, + "learning_rate": 2.2036703797954922e-06, + "loss": 0.75884002, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44335938, + "step": 7777, + "time_per_iteration": 2.4385809898376465 + }, + { + "auxiliary_loss_clip": 0.01063931, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.01676965, + "balance_loss_mlp": 1.02015615, + "epoch": 0.4676386592514655, + "flos": 24132908223360.0, + "grad_norm": 4.905385519454798, + "language_loss": 0.69856334, + "learning_rate": 2.203294518027024e-06, + "loss": 0.7194984, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7778, + "time_per_iteration": 2.4327757358551025 + }, + { + "auxiliary_loss_clip": 0.01065213, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01712084, + "balance_loss_mlp": 1.02130008, + "epoch": 0.4676987825041335, + "flos": 25482545717760.0, + "grad_norm": 1.8976758826168214, + "language_loss": 0.73890477, + "learning_rate": 2.2029186490034977e-06, + "loss": 0.75986481, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43945312, + "step": 7779, + "time_per_iteration": 2.4403913021087646 + }, + { + "auxiliary_loss_clip": 0.01065648, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.0153805, + "balance_loss_mlp": 1.02295446, + "epoch": 0.46775890575680146, + "flos": 21944203090560.0, + "grad_norm": 1.45475438649687, + "language_loss": 0.72658241, + "learning_rate": 2.2025427727383262e-06, + "loss": 0.74751246, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.42578125, + "step": 7780, + "time_per_iteration": 2.4195916652679443 + }, + { + "auxiliary_loss_clip": 0.01067162, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.01526988, + "balance_loss_mlp": 1.02106142, + "epoch": 0.4678190290094694, + "flos": 25227225878400.0, + "grad_norm": 1.7459212196914042, + "language_loss": 0.73945779, + "learning_rate": 2.2021668892449246e-06, + "loss": 0.76041871, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 7781, + "time_per_iteration": 2.4165194034576416 + }, + { + "auxiliary_loss_clip": 0.01065562, + "auxiliary_loss_mlp": 0.01027257, + "balance_loss_clip": 1.01318467, + "balance_loss_mlp": 1.02017701, + "epoch": 0.4678791522621374, + "flos": 32267042870400.0, + "grad_norm": 2.9168964597140263, + "language_loss": 0.63235813, + "learning_rate": 2.201790998536707e-06, + "loss": 0.65328628, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.453125, + "step": 7782, + "time_per_iteration": 2.516464948654175 + }, + { + "auxiliary_loss_clip": 0.01068017, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.0159862, + "balance_loss_mlp": 1.02163947, + "epoch": 0.46793927551480535, + "flos": 27045432996480.0, + "grad_norm": 2.086058619016641, + "language_loss": 0.63283563, + "learning_rate": 2.2014151006270872e-06, + "loss": 0.65381384, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46289062, + "step": 7783, + "time_per_iteration": 2.4414262771606445 + }, + { + "auxiliary_loss_clip": 0.01068921, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02113104, + "balance_loss_mlp": 1.02234411, + "epoch": 0.4679993987674733, + "flos": 17456432417280.0, + "grad_norm": 2.363978942218185, + "language_loss": 0.674981, + "learning_rate": 2.2010391955294813e-06, + "loss": 0.6960206, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46484375, + "step": 7784, + "time_per_iteration": 2.3719165325164795 + }, + { + "auxiliary_loss_clip": 0.01064593, + "auxiliary_loss_mlp": 0.01022782, + "balance_loss_clip": 1.01094472, + "balance_loss_mlp": 1.02113402, + "epoch": 0.4680595220201413, + "flos": 17164174492800.0, + "grad_norm": 1.6855272255608846, + "language_loss": 0.84719014, + "learning_rate": 2.200663283257303e-06, + "loss": 0.86806393, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.43359375, + "step": 7785, + "time_per_iteration": 2.3892011642456055 + }, + { + "auxiliary_loss_clip": 0.01066563, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.0130254, + "balance_loss_mlp": 1.02174091, + "epoch": 0.46811964527280925, + "flos": 11326801236480.0, + "grad_norm": 1.850116771804742, + "language_loss": 0.77254879, + "learning_rate": 2.2002873638239686e-06, + "loss": 0.7934798, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44921875, + "step": 7786, + "time_per_iteration": 2.3797447681427 + }, + { + "auxiliary_loss_clip": 0.01060227, + "auxiliary_loss_mlp": 0.01026429, + "balance_loss_clip": 1.01484156, + "balance_loss_mlp": 1.01885796, + "epoch": 0.4681797685254772, + "flos": 24277693098240.0, + "grad_norm": 2.1492273052012343, + "language_loss": 0.74254119, + "learning_rate": 2.1999114372428932e-06, + "loss": 0.76340777, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.4140625, + "step": 7787, + "time_per_iteration": 2.4073169231414795 + }, + { + "auxiliary_loss_clip": 0.01065268, + "auxiliary_loss_mlp": 0.01026019, + "balance_loss_clip": 1.01313198, + "balance_loss_mlp": 1.02248394, + "epoch": 0.4682398917781452, + "flos": 31649010249600.0, + "grad_norm": 1.8402407392130442, + "language_loss": 0.65640885, + "learning_rate": 2.1995355035274923e-06, + "loss": 0.67732173, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42773438, + "step": 7788, + "time_per_iteration": 2.472522020339966 + }, + { + "auxiliary_loss_clip": 0.01063066, + "auxiliary_loss_mlp": 0.0102464, + "balance_loss_clip": 1.01269555, + "balance_loss_mlp": 1.02095008, + "epoch": 0.46830001503081314, + "flos": 28109515547520.0, + "grad_norm": 1.5660897737062194, + "language_loss": 0.63986659, + "learning_rate": 2.1991595626911837e-06, + "loss": 0.66074371, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.421875, + "step": 7789, + "time_per_iteration": 2.4562673568725586 + }, + { + "auxiliary_loss_clip": 0.01011389, + "auxiliary_loss_mlp": 0.01000881, + "balance_loss_clip": 0.99981385, + "balance_loss_mlp": 1.00273466, + "epoch": 0.4683601382834811, + "flos": 57878661484800.0, + "grad_norm": 0.6937391163789155, + "language_loss": 0.61891657, + "learning_rate": 2.1987836147473813e-06, + "loss": 0.63903928, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.08691406, + "step": 7790, + "time_per_iteration": 3.141482353210449 + }, + { + "auxiliary_loss_clip": 0.01063512, + "auxiliary_loss_mlp": 0.01023421, + "balance_loss_clip": 1.01214337, + "balance_loss_mlp": 1.02207303, + "epoch": 0.46842026153614913, + "flos": 21870850590720.0, + "grad_norm": 1.5933865284398117, + "language_loss": 0.79297507, + "learning_rate": 2.1984076597095044e-06, + "loss": 0.81384438, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.4140625, + "step": 7791, + "time_per_iteration": 2.4012553691864014 + }, + { + "auxiliary_loss_clip": 0.01064707, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.01888752, + "balance_loss_mlp": 1.02123404, + "epoch": 0.4684803847888171, + "flos": 24899635791360.0, + "grad_norm": 1.492361320943866, + "language_loss": 0.74894148, + "learning_rate": 2.1980316975909673e-06, + "loss": 0.76990688, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 7792, + "time_per_iteration": 2.4522340297698975 + }, + { + "auxiliary_loss_clip": 0.01061891, + "auxiliary_loss_mlp": 0.01023609, + "balance_loss_clip": 1.01223016, + "balance_loss_mlp": 1.0208776, + "epoch": 0.46854050804148506, + "flos": 26250425360640.0, + "grad_norm": 1.9650128810504401, + "language_loss": 0.67996895, + "learning_rate": 2.1976557284051897e-06, + "loss": 0.70082396, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.41015625, + "step": 7793, + "time_per_iteration": 2.4376144409179688 + }, + { + "auxiliary_loss_clip": 0.01063811, + "auxiliary_loss_mlp": 0.01023248, + "balance_loss_clip": 1.01181006, + "balance_loss_mlp": 1.02141476, + "epoch": 0.468600631294153, + "flos": 21578732311680.0, + "grad_norm": 1.7839193143869831, + "language_loss": 0.73993731, + "learning_rate": 2.1972797521655864e-06, + "loss": 0.76080787, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.42382812, + "step": 7794, + "time_per_iteration": 3.834986448287964 + }, + { + "auxiliary_loss_clip": 0.01066758, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.01643503, + "balance_loss_mlp": 1.02303207, + "epoch": 0.468660754546821, + "flos": 25884430911360.0, + "grad_norm": 1.6001481651136051, + "language_loss": 0.63093185, + "learning_rate": 2.1969037688855765e-06, + "loss": 0.65189123, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7795, + "time_per_iteration": 2.441859006881714 + }, + { + "auxiliary_loss_clip": 0.01061142, + "auxiliary_loss_mlp": 0.01022023, + "balance_loss_clip": 1.00948262, + "balance_loss_mlp": 1.01923764, + "epoch": 0.46872087779948896, + "flos": 35473710781440.0, + "grad_norm": 1.6045733427278561, + "language_loss": 0.68068552, + "learning_rate": 2.196527778578578e-06, + "loss": 0.70151722, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41796875, + "step": 7796, + "time_per_iteration": 2.530568838119507 + }, + { + "auxiliary_loss_clip": 0.0106291, + "auxiliary_loss_mlp": 0.01024227, + "balance_loss_clip": 1.01214504, + "balance_loss_mlp": 1.02046418, + "epoch": 0.4687810010521569, + "flos": 26395210235520.0, + "grad_norm": 1.7576230449613672, + "language_loss": 0.70185626, + "learning_rate": 2.196151781258008e-06, + "loss": 0.72272754, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.42578125, + "step": 7797, + "time_per_iteration": 2.4373116493225098 + }, + { + "auxiliary_loss_clip": 0.01065246, + "auxiliary_loss_mlp": 0.01028682, + "balance_loss_clip": 1.01518142, + "balance_loss_mlp": 1.02132726, + "epoch": 0.4688411243048249, + "flos": 19971785030400.0, + "grad_norm": 2.3454960624278374, + "language_loss": 0.67396736, + "learning_rate": 2.1957757769372856e-06, + "loss": 0.69490671, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43945312, + "step": 7798, + "time_per_iteration": 2.395909070968628 + }, + { + "auxiliary_loss_clip": 0.01010213, + "auxiliary_loss_mlp": 0.0100313, + "balance_loss_clip": 1.00203931, + "balance_loss_mlp": 1.00165594, + "epoch": 0.46890124755749285, + "flos": 63973728552960.0, + "grad_norm": 0.9401330570150451, + "language_loss": 0.64533162, + "learning_rate": 2.1953997656298296e-06, + "loss": 0.665465, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.0859375, + "step": 7799, + "time_per_iteration": 3.197450876235962 + }, + { + "auxiliary_loss_clip": 0.01062129, + "auxiliary_loss_mlp": 0.01021452, + "balance_loss_clip": 1.00954354, + "balance_loss_mlp": 1.01997566, + "epoch": 0.4689613708101608, + "flos": 23767856380800.0, + "grad_norm": 1.3756550912447618, + "language_loss": 0.7236793, + "learning_rate": 2.1950237473490585e-06, + "loss": 0.74451512, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.421875, + "step": 7800, + "time_per_iteration": 2.442486524581909 + }, + { + "auxiliary_loss_clip": 0.01060872, + "auxiliary_loss_mlp": 0.01024408, + "balance_loss_clip": 1.01302314, + "balance_loss_mlp": 1.02056348, + "epoch": 0.4690214940628288, + "flos": 24787599638400.0, + "grad_norm": 2.932115213541457, + "language_loss": 0.72514915, + "learning_rate": 2.1946477221083917e-06, + "loss": 0.74600196, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40234375, + "step": 7801, + "time_per_iteration": 2.4174792766571045 + }, + { + "auxiliary_loss_clip": 0.01067599, + "auxiliary_loss_mlp": 0.01024058, + "balance_loss_clip": 1.01006293, + "balance_loss_mlp": 1.02228785, + "epoch": 0.46908161731549675, + "flos": 18076350251520.0, + "grad_norm": 2.224633532677108, + "language_loss": 0.62238443, + "learning_rate": 2.194271689921248e-06, + "loss": 0.64330101, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.453125, + "step": 7802, + "time_per_iteration": 2.384854555130005 + }, + { + "auxiliary_loss_clip": 0.01064786, + "auxiliary_loss_mlp": 0.01024521, + "balance_loss_clip": 1.01226068, + "balance_loss_mlp": 1.02074885, + "epoch": 0.4691417405681647, + "flos": 25702149369600.0, + "grad_norm": 1.7773738163596415, + "language_loss": 0.73211157, + "learning_rate": 2.1938956508010475e-06, + "loss": 0.75300467, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.43945312, + "step": 7803, + "time_per_iteration": 3.892730236053467 + }, + { + "auxiliary_loss_clip": 0.01060915, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.0158515, + "balance_loss_mlp": 1.01947856, + "epoch": 0.46920186382083273, + "flos": 17456083303680.0, + "grad_norm": 1.6786178406996937, + "language_loss": 0.77390242, + "learning_rate": 2.19351960476121e-06, + "loss": 0.79478419, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.4140625, + "step": 7804, + "time_per_iteration": 2.363569974899292 + }, + { + "auxiliary_loss_clip": 0.01062361, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.01226211, + "balance_loss_mlp": 1.02002013, + "epoch": 0.4692619870735007, + "flos": 20338407884160.0, + "grad_norm": 1.7192196652740332, + "language_loss": 0.77400482, + "learning_rate": 2.193143551815155e-06, + "loss": 0.79487205, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.421875, + "step": 7805, + "time_per_iteration": 2.406327486038208 + }, + { + "auxiliary_loss_clip": 0.01068586, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.01613569, + "balance_loss_mlp": 1.02237952, + "epoch": 0.46932211032616866, + "flos": 29495288165760.0, + "grad_norm": 2.5681655551905074, + "language_loss": 0.82698405, + "learning_rate": 2.192767491976305e-06, + "loss": 0.84796405, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.46289062, + "step": 7806, + "time_per_iteration": 2.4635119438171387 + }, + { + "auxiliary_loss_clip": 0.01063164, + "auxiliary_loss_mlp": 0.01025403, + "balance_loss_clip": 1.01255834, + "balance_loss_mlp": 1.02058768, + "epoch": 0.4693822335788366, + "flos": 36209749397760.0, + "grad_norm": 1.782115978511784, + "language_loss": 0.76941544, + "learning_rate": 2.192391425258078e-06, + "loss": 0.79030108, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 7807, + "time_per_iteration": 3.991924285888672 + }, + { + "auxiliary_loss_clip": 0.01063329, + "auxiliary_loss_mlp": 0.01022637, + "balance_loss_clip": 1.0101912, + "balance_loss_mlp": 1.02063048, + "epoch": 0.4694423568315046, + "flos": 20337954036480.0, + "grad_norm": 1.8170575731443623, + "language_loss": 0.51289874, + "learning_rate": 2.1920153516738967e-06, + "loss": 0.5337584, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42773438, + "step": 7808, + "time_per_iteration": 3.819007158279419 + }, + { + "auxiliary_loss_clip": 0.01012381, + "auxiliary_loss_mlp": 0.0100188, + "balance_loss_clip": 1.00074112, + "balance_loss_mlp": 1.0038836, + "epoch": 0.46950248008417256, + "flos": 64323489219840.0, + "grad_norm": 0.7847796982861788, + "language_loss": 0.57751703, + "learning_rate": 2.1916392712371804e-06, + "loss": 0.59765959, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.08496094, + "step": 7809, + "time_per_iteration": 2.9037928581237793 + }, + { + "auxiliary_loss_clip": 0.01066132, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.02046907, + "balance_loss_mlp": 1.02252579, + "epoch": 0.4695626033368405, + "flos": 19199331999360.0, + "grad_norm": 2.6939685595714207, + "language_loss": 0.71587026, + "learning_rate": 2.191263183961352e-06, + "loss": 0.73686194, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4375, + "step": 7810, + "time_per_iteration": 2.3926444053649902 + }, + { + "auxiliary_loss_clip": 0.01065605, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.01635468, + "balance_loss_mlp": 1.02209127, + "epoch": 0.4696227265895085, + "flos": 23001338280960.0, + "grad_norm": 2.3637466172602477, + "language_loss": 0.80828327, + "learning_rate": 2.1908870898598326e-06, + "loss": 0.82922852, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43554688, + "step": 7811, + "time_per_iteration": 2.4045393466949463 + }, + { + "auxiliary_loss_clip": 0.01064134, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.01655281, + "balance_loss_mlp": 1.02071166, + "epoch": 0.46968284984217645, + "flos": 21869803249920.0, + "grad_norm": 1.5696228381724824, + "language_loss": 0.80396307, + "learning_rate": 2.1905109889460436e-06, + "loss": 0.82489544, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43554688, + "step": 7812, + "time_per_iteration": 2.437558174133301 + }, + { + "auxiliary_loss_clip": 0.01062143, + "auxiliary_loss_mlp": 0.01023954, + "balance_loss_clip": 1.01296258, + "balance_loss_mlp": 1.0209105, + "epoch": 0.4697429730948444, + "flos": 19973949534720.0, + "grad_norm": 1.7375280493544765, + "language_loss": 0.74153292, + "learning_rate": 2.1901348812334073e-06, + "loss": 0.76239395, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.4140625, + "step": 7813, + "time_per_iteration": 2.4063801765441895 + }, + { + "auxiliary_loss_clip": 0.01063203, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.01226091, + "balance_loss_mlp": 1.02055454, + "epoch": 0.4698030963475124, + "flos": 15155376929280.0, + "grad_norm": 2.0281614003724164, + "language_loss": 0.84453011, + "learning_rate": 2.1897587667353465e-06, + "loss": 0.86540705, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.42578125, + "step": 7814, + "time_per_iteration": 2.388887405395508 + }, + { + "auxiliary_loss_clip": 0.01060882, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.01607966, + "balance_loss_mlp": 1.0200634, + "epoch": 0.46986321960018035, + "flos": 15150489338880.0, + "grad_norm": 1.845329456576051, + "language_loss": 0.81621611, + "learning_rate": 2.189382645465284e-06, + "loss": 0.83709848, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.40820312, + "step": 7815, + "time_per_iteration": 2.361161470413208 + }, + { + "auxiliary_loss_clip": 0.01063972, + "auxiliary_loss_mlp": 0.01029778, + "balance_loss_clip": 1.01676011, + "balance_loss_mlp": 1.02058589, + "epoch": 0.4699233428528483, + "flos": 23107893350400.0, + "grad_norm": 1.8588913275105015, + "language_loss": 0.77823341, + "learning_rate": 2.1890065174366416e-06, + "loss": 0.79917085, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43359375, + "step": 7816, + "time_per_iteration": 2.4239373207092285 + }, + { + "auxiliary_loss_clip": 0.01010289, + "auxiliary_loss_mlp": 0.01005903, + "balance_loss_clip": 1.00488961, + "balance_loss_mlp": 1.00203252, + "epoch": 0.46998346610551633, + "flos": 68101998289920.0, + "grad_norm": 0.8626555665648618, + "language_loss": 0.59010649, + "learning_rate": 2.1886303826628422e-06, + "loss": 0.61026835, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.08251953, + "step": 7817, + "time_per_iteration": 3.024299144744873 + }, + { + "auxiliary_loss_clip": 0.01062302, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.01817214, + "balance_loss_mlp": 1.02163899, + "epoch": 0.4700435893581843, + "flos": 24128439569280.0, + "grad_norm": 1.849658306624684, + "language_loss": 0.79293048, + "learning_rate": 2.1882542411573103e-06, + "loss": 0.81386143, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40625, + "step": 7818, + "time_per_iteration": 2.435706377029419 + }, + { + "auxiliary_loss_clip": 0.01061871, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.01492465, + "balance_loss_mlp": 1.0198276, + "epoch": 0.47010371261085226, + "flos": 20149667740800.0, + "grad_norm": 3.9285706826136435, + "language_loss": 0.83938223, + "learning_rate": 2.1878780929334684e-06, + "loss": 0.86027122, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41992188, + "step": 7819, + "time_per_iteration": 2.411080837249756 + }, + { + "auxiliary_loss_clip": 0.01066007, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.016415, + "balance_loss_mlp": 1.02079272, + "epoch": 0.47016383586352023, + "flos": 15121301575680.0, + "grad_norm": 1.8680993809132647, + "language_loss": 0.76393104, + "learning_rate": 2.187501938004741e-06, + "loss": 0.78488803, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45117188, + "step": 7820, + "time_per_iteration": 2.389704704284668 + }, + { + "auxiliary_loss_clip": 0.01063743, + "auxiliary_loss_mlp": 0.01026551, + "balance_loss_clip": 1.01442122, + "balance_loss_mlp": 1.02292204, + "epoch": 0.4702239591161882, + "flos": 13552130252160.0, + "grad_norm": 2.035108216425056, + "language_loss": 0.74233097, + "learning_rate": 2.187125776384552e-06, + "loss": 0.7632339, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40820312, + "step": 7821, + "time_per_iteration": 2.440427541732788 + }, + { + "auxiliary_loss_clip": 0.01062429, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.01459599, + "balance_loss_mlp": 1.02095807, + "epoch": 0.47028408236885616, + "flos": 24275458771200.0, + "grad_norm": 2.2493933029299926, + "language_loss": 0.89144427, + "learning_rate": 2.1867496080863246e-06, + "loss": 0.91232824, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.41601562, + "step": 7822, + "time_per_iteration": 2.4266767501831055 + }, + { + "auxiliary_loss_clip": 0.01061369, + "auxiliary_loss_mlp": 0.01026517, + "balance_loss_clip": 1.01466131, + "balance_loss_mlp": 1.02034295, + "epoch": 0.4703442056215241, + "flos": 22855820267520.0, + "grad_norm": 1.6619484738740946, + "language_loss": 0.79744184, + "learning_rate": 2.186373433123485e-06, + "loss": 0.81832075, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41015625, + "step": 7823, + "time_per_iteration": 2.401735305786133 + }, + { + "auxiliary_loss_clip": 0.01010094, + "auxiliary_loss_mlp": 0.01011196, + "balance_loss_clip": 1.01005793, + "balance_loss_mlp": 1.0018189, + "epoch": 0.4704043288741921, + "flos": 69236535697920.0, + "grad_norm": 0.9147822940645458, + "language_loss": 0.56644809, + "learning_rate": 2.1859972515094562e-06, + "loss": 0.58666098, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.08300781, + "step": 7824, + "time_per_iteration": 3.1045608520507812 + }, + { + "auxiliary_loss_clip": 0.01065388, + "auxiliary_loss_mlp": 0.0103137, + "balance_loss_clip": 1.01769078, + "balance_loss_mlp": 1.02216017, + "epoch": 0.47046445212686006, + "flos": 18040110393600.0, + "grad_norm": 1.6767740526010473, + "language_loss": 0.81369621, + "learning_rate": 2.185621063257664e-06, + "loss": 0.83466375, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43359375, + "step": 7825, + "time_per_iteration": 2.3771207332611084 + }, + { + "auxiliary_loss_clip": 0.0106507, + "auxiliary_loss_mlp": 0.01026113, + "balance_loss_clip": 1.01360822, + "balance_loss_mlp": 1.02252805, + "epoch": 0.470524575379528, + "flos": 23950312479360.0, + "grad_norm": 2.1915993545766503, + "language_loss": 0.66404748, + "learning_rate": 2.185244868381534e-06, + "loss": 0.68495929, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 7826, + "time_per_iteration": 2.431581497192383 + }, + { + "auxiliary_loss_clip": 0.01064809, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.01751184, + "balance_loss_mlp": 1.0219605, + "epoch": 0.470584698632196, + "flos": 18112590109440.0, + "grad_norm": 1.7653582554289389, + "language_loss": 0.84069765, + "learning_rate": 2.184868666894491e-06, + "loss": 0.86165154, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 7827, + "time_per_iteration": 2.406481981277466 + }, + { + "auxiliary_loss_clip": 0.01010436, + "auxiliary_loss_mlp": 0.01005841, + "balance_loss_clip": 1.00472045, + "balance_loss_mlp": 1.00217831, + "epoch": 0.47064482188486395, + "flos": 57249143026560.0, + "grad_norm": 0.7935180367648155, + "language_loss": 0.55634338, + "learning_rate": 2.184492458809961e-06, + "loss": 0.57650614, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.08251953, + "step": 7828, + "time_per_iteration": 3.053349494934082 + }, + { + "auxiliary_loss_clip": 0.01062447, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.01694143, + "balance_loss_mlp": 1.02140188, + "epoch": 0.4707049451375319, + "flos": 17894103621120.0, + "grad_norm": 1.7486897940259771, + "language_loss": 0.80905116, + "learning_rate": 2.1841162441413686e-06, + "loss": 0.82996809, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41015625, + "step": 7829, + "time_per_iteration": 2.3907763957977295 + }, + { + "auxiliary_loss_clip": 0.01062746, + "auxiliary_loss_mlp": 0.01026387, + "balance_loss_clip": 1.01460934, + "balance_loss_mlp": 1.0211879, + "epoch": 0.47076506839019994, + "flos": 25231380330240.0, + "grad_norm": 1.297763139368727, + "language_loss": 0.76052088, + "learning_rate": 2.1837400229021423e-06, + "loss": 0.78141224, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.41601562, + "step": 7830, + "time_per_iteration": 2.4615678787231445 + }, + { + "auxiliary_loss_clip": 0.01064961, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.01504374, + "balance_loss_mlp": 1.02297068, + "epoch": 0.4708251916428679, + "flos": 13478847575040.0, + "grad_norm": 1.9765805633470213, + "language_loss": 0.79007149, + "learning_rate": 2.183363795105707e-06, + "loss": 0.81099963, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41992188, + "step": 7831, + "time_per_iteration": 2.401184320449829 + }, + { + "auxiliary_loss_clip": 0.01067036, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.02088714, + "balance_loss_mlp": 1.02155578, + "epoch": 0.47088531489553587, + "flos": 30146697912960.0, + "grad_norm": 1.6460026580297042, + "language_loss": 0.7595731, + "learning_rate": 2.182987560765489e-06, + "loss": 0.78058082, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45507812, + "step": 7832, + "time_per_iteration": 2.481900453567505 + }, + { + "auxiliary_loss_clip": 0.0106386, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.02195907, + "balance_loss_mlp": 1.02211535, + "epoch": 0.47094543814820383, + "flos": 21797218800000.0, + "grad_norm": 1.343635329448584, + "language_loss": 0.74173665, + "learning_rate": 2.182611319894916e-06, + "loss": 0.76271403, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.41796875, + "step": 7833, + "time_per_iteration": 2.406428098678589 + }, + { + "auxiliary_loss_clip": 0.01064115, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01922977, + "balance_loss_mlp": 1.02218103, + "epoch": 0.4710055614008718, + "flos": 23001896862720.0, + "grad_norm": 1.7344768513593352, + "language_loss": 0.75449461, + "learning_rate": 2.1822350725074145e-06, + "loss": 0.77545226, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 7834, + "time_per_iteration": 3.82663893699646 + }, + { + "auxiliary_loss_clip": 0.01063226, + "auxiliary_loss_mlp": 0.0103507, + "balance_loss_clip": 1.02236247, + "balance_loss_mlp": 1.02111673, + "epoch": 0.47106568465353976, + "flos": 42739694760960.0, + "grad_norm": 1.29525890641393, + "language_loss": 0.66257739, + "learning_rate": 2.181858818616412e-06, + "loss": 0.68356037, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 7835, + "time_per_iteration": 2.5794596672058105 + }, + { + "auxiliary_loss_clip": 0.01010282, + "auxiliary_loss_mlp": 0.01005652, + "balance_loss_clip": 1.00474036, + "balance_loss_mlp": 1.00206053, + "epoch": 0.4711258079062077, + "flos": 68551157197440.0, + "grad_norm": 0.8675597225722999, + "language_loss": 0.62079978, + "learning_rate": 2.181482558235336e-06, + "loss": 0.64095908, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.08203125, + "step": 7836, + "time_per_iteration": 3.0243518352508545 + }, + { + "auxiliary_loss_clip": 0.0106578, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.01887608, + "balance_loss_mlp": 1.02050114, + "epoch": 0.4711859311588757, + "flos": 25445433075840.0, + "grad_norm": 1.541126427270552, + "language_loss": 0.69780046, + "learning_rate": 2.181106291377615e-06, + "loss": 0.71878374, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.453125, + "step": 7837, + "time_per_iteration": 2.4420111179351807 + }, + { + "auxiliary_loss_clip": 0.01065308, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.0159595, + "balance_loss_mlp": 1.02223361, + "epoch": 0.47124605441154366, + "flos": 21980792062080.0, + "grad_norm": 1.7063142980388324, + "language_loss": 0.66223466, + "learning_rate": 2.1807300180566766e-06, + "loss": 0.68318403, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4296875, + "step": 7838, + "time_per_iteration": 2.3938138484954834 + }, + { + "auxiliary_loss_clip": 0.01065014, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.0170598, + "balance_loss_mlp": 1.02094388, + "epoch": 0.4713061776642116, + "flos": 25411462456320.0, + "grad_norm": 1.7639706023445425, + "language_loss": 0.77360988, + "learning_rate": 2.1803537382859478e-06, + "loss": 0.794554, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.44140625, + "step": 7839, + "time_per_iteration": 2.4667775630950928 + }, + { + "auxiliary_loss_clip": 0.01064051, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.01767349, + "balance_loss_mlp": 1.02281141, + "epoch": 0.4713663009168796, + "flos": 26541042451200.0, + "grad_norm": 2.043708472157207, + "language_loss": 0.72709513, + "learning_rate": 2.179977452078858e-06, + "loss": 0.74802995, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.41210938, + "step": 7840, + "time_per_iteration": 2.4308972358703613 + }, + { + "auxiliary_loss_clip": 0.01063004, + "auxiliary_loss_mlp": 0.01024692, + "balance_loss_clip": 1.01226425, + "balance_loss_mlp": 1.02065074, + "epoch": 0.47142642416954755, + "flos": 23622443101440.0, + "grad_norm": 1.676849451296552, + "language_loss": 0.75046402, + "learning_rate": 2.1796011594488363e-06, + "loss": 0.77134097, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.421875, + "step": 7841, + "time_per_iteration": 2.427741289138794 + }, + { + "auxiliary_loss_clip": 0.0106437, + "auxiliary_loss_mlp": 0.0102661, + "balance_loss_clip": 1.01454616, + "balance_loss_mlp": 1.02205718, + "epoch": 0.4714865474222155, + "flos": 22309045464960.0, + "grad_norm": 1.542079760422129, + "language_loss": 0.70299435, + "learning_rate": 2.1792248604093107e-06, + "loss": 0.72390413, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.42382812, + "step": 7842, + "time_per_iteration": 2.395627498626709 + }, + { + "auxiliary_loss_clip": 0.01066628, + "auxiliary_loss_mlp": 0.01025332, + "balance_loss_clip": 1.01295841, + "balance_loss_mlp": 1.02452803, + "epoch": 0.4715466706748835, + "flos": 17821449348480.0, + "grad_norm": 1.8981358265445103, + "language_loss": 0.78859842, + "learning_rate": 2.1788485549737118e-06, + "loss": 0.80951804, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.421875, + "step": 7843, + "time_per_iteration": 3.830595016479492 + }, + { + "auxiliary_loss_clip": 0.01064613, + "auxiliary_loss_mlp": 0.01024103, + "balance_loss_clip": 1.01233721, + "balance_loss_mlp": 1.02221441, + "epoch": 0.4716067939275515, + "flos": 23658403668480.0, + "grad_norm": 1.5483673696480653, + "language_loss": 0.74450284, + "learning_rate": 2.178472243155467e-06, + "loss": 0.76539004, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.42382812, + "step": 7844, + "time_per_iteration": 2.416064739227295 + }, + { + "auxiliary_loss_clip": 0.01066217, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.01658249, + "balance_loss_mlp": 1.02373064, + "epoch": 0.47166691718021947, + "flos": 17929226315520.0, + "grad_norm": 1.6364891229626821, + "language_loss": 0.7832725, + "learning_rate": 2.178095924968008e-06, + "loss": 0.8042264, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42578125, + "step": 7845, + "time_per_iteration": 2.396378755569458 + }, + { + "auxiliary_loss_clip": 0.01063105, + "auxiliary_loss_mlp": 0.01026813, + "balance_loss_clip": 1.01490378, + "balance_loss_mlp": 1.02270031, + "epoch": 0.47172704043288743, + "flos": 26613382521600.0, + "grad_norm": 1.3254674465035479, + "language_loss": 0.73021138, + "learning_rate": 2.1777196004247623e-06, + "loss": 0.75111055, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40429688, + "step": 7846, + "time_per_iteration": 2.44018292427063 + }, + { + "auxiliary_loss_clip": 0.01061999, + "auxiliary_loss_mlp": 0.01024317, + "balance_loss_clip": 1.01204491, + "balance_loss_mlp": 1.02139294, + "epoch": 0.4717871636855554, + "flos": 27921613276800.0, + "grad_norm": 1.3010853449536688, + "language_loss": 0.74301028, + "learning_rate": 2.177343269539162e-06, + "loss": 0.76387352, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40625, + "step": 7847, + "time_per_iteration": 3.9245951175689697 + }, + { + "auxiliary_loss_clip": 0.01065486, + "auxiliary_loss_mlp": 0.01026822, + "balance_loss_clip": 1.01400089, + "balance_loss_mlp": 1.02299726, + "epoch": 0.47184728693822336, + "flos": 14501348830080.0, + "grad_norm": 2.2558407001262935, + "language_loss": 0.81007254, + "learning_rate": 2.176966932324637e-06, + "loss": 0.83099562, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 7848, + "time_per_iteration": 3.8483238220214844 + }, + { + "auxiliary_loss_clip": 0.01068034, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.01610887, + "balance_loss_mlp": 1.02476263, + "epoch": 0.47190741019089133, + "flos": 17855629436160.0, + "grad_norm": 2.021026517440526, + "language_loss": 0.70644951, + "learning_rate": 2.1765905887946162e-06, + "loss": 0.72741997, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 7849, + "time_per_iteration": 2.377346992492676 + }, + { + "auxiliary_loss_clip": 0.01067955, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.01723778, + "balance_loss_mlp": 1.02233195, + "epoch": 0.4719675334435593, + "flos": 17894487646080.0, + "grad_norm": 2.6586418677234724, + "language_loss": 0.69720471, + "learning_rate": 2.176214238962532e-06, + "loss": 0.71819723, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45703125, + "step": 7850, + "time_per_iteration": 2.3977580070495605 + }, + { + "auxiliary_loss_clip": 0.01066489, + "auxiliary_loss_mlp": 0.01027252, + "balance_loss_clip": 1.01475286, + "balance_loss_mlp": 1.02212942, + "epoch": 0.47202765669622726, + "flos": 20703320081280.0, + "grad_norm": 1.8654708957161485, + "language_loss": 0.75412619, + "learning_rate": 2.175837882841815e-06, + "loss": 0.77506357, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44335938, + "step": 7851, + "time_per_iteration": 2.3820197582244873 + }, + { + "auxiliary_loss_clip": 0.01066935, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.01549363, + "balance_loss_mlp": 1.02259171, + "epoch": 0.4720877799488952, + "flos": 16359391676160.0, + "grad_norm": 1.6869896005169929, + "language_loss": 0.7639668, + "learning_rate": 2.1754615204458963e-06, + "loss": 0.78492677, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44335938, + "step": 7852, + "time_per_iteration": 2.3881587982177734 + }, + { + "auxiliary_loss_clip": 0.01064799, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.01597285, + "balance_loss_mlp": 1.02189744, + "epoch": 0.4721479032015632, + "flos": 20667115134720.0, + "grad_norm": 1.5191521693904124, + "language_loss": 0.69241577, + "learning_rate": 2.175085151788208e-06, + "loss": 0.71335971, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4296875, + "step": 7853, + "time_per_iteration": 2.3905465602874756 + }, + { + "auxiliary_loss_clip": 0.01011954, + "auxiliary_loss_mlp": 0.01004053, + "balance_loss_clip": 1.00293291, + "balance_loss_mlp": 1.00332117, + "epoch": 0.47220802645423116, + "flos": 67746616894080.0, + "grad_norm": 0.7243453218786182, + "language_loss": 0.50227457, + "learning_rate": 2.17470877688218e-06, + "loss": 0.52243465, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.0859375, + "step": 7854, + "time_per_iteration": 3.1587862968444824 + }, + { + "auxiliary_loss_clip": 0.01067104, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.02091229, + "balance_loss_mlp": 1.022084, + "epoch": 0.4722681497068991, + "flos": 20920445026560.0, + "grad_norm": 2.059847622156215, + "language_loss": 0.6389997, + "learning_rate": 2.1743323957412457e-06, + "loss": 0.66002297, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.44921875, + "step": 7855, + "time_per_iteration": 2.386221408843994 + }, + { + "auxiliary_loss_clip": 0.01064826, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.01932001, + "balance_loss_mlp": 1.02118492, + "epoch": 0.4723282729595671, + "flos": 28291832000640.0, + "grad_norm": 1.9311826626014665, + "language_loss": 0.75889456, + "learning_rate": 2.1739560083788363e-06, + "loss": 0.77986956, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4375, + "step": 7856, + "time_per_iteration": 2.4561686515808105 + }, + { + "auxiliary_loss_clip": 0.0106301, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.0174495, + "balance_loss_mlp": 1.02069557, + "epoch": 0.4723883962122351, + "flos": 27123847643520.0, + "grad_norm": 2.937941369688011, + "language_loss": 0.75349951, + "learning_rate": 2.1735796148083843e-06, + "loss": 0.7744301, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.421875, + "step": 7857, + "time_per_iteration": 2.4345805644989014 + }, + { + "auxiliary_loss_clip": 0.01063808, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.02330494, + "balance_loss_mlp": 1.02142, + "epoch": 0.47244851946490307, + "flos": 31535996578560.0, + "grad_norm": 1.4725117448696725, + "language_loss": 0.64843494, + "learning_rate": 2.1732032150433225e-06, + "loss": 0.66942561, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.42382812, + "step": 7858, + "time_per_iteration": 2.4831478595733643 + }, + { + "auxiliary_loss_clip": 0.01063056, + "auxiliary_loss_mlp": 0.01025395, + "balance_loss_clip": 1.01210284, + "balance_loss_mlp": 1.02041197, + "epoch": 0.47250864271757104, + "flos": 20885496888960.0, + "grad_norm": 1.533110901752159, + "language_loss": 0.70227879, + "learning_rate": 2.1728268090970834e-06, + "loss": 0.72316337, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42578125, + "step": 7859, + "time_per_iteration": 2.3995707035064697 + }, + { + "auxiliary_loss_clip": 0.01067804, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.01774836, + "balance_loss_mlp": 1.02209496, + "epoch": 0.472568765970239, + "flos": 20521038539520.0, + "grad_norm": 1.7201348499895517, + "language_loss": 0.61804533, + "learning_rate": 2.1724503969831003e-06, + "loss": 0.63903338, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.45703125, + "step": 7860, + "time_per_iteration": 2.401761293411255 + }, + { + "auxiliary_loss_clip": 0.01067966, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.01698947, + "balance_loss_mlp": 1.02321768, + "epoch": 0.47262888922290697, + "flos": 35803849397760.0, + "grad_norm": 2.1365552072085583, + "language_loss": 0.71769601, + "learning_rate": 2.172073978714806e-06, + "loss": 0.73868352, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44726562, + "step": 7861, + "time_per_iteration": 2.5268442630767822 + }, + { + "auxiliary_loss_clip": 0.01066746, + "auxiliary_loss_mlp": 0.01024731, + "balance_loss_clip": 1.01154089, + "balance_loss_mlp": 1.02292728, + "epoch": 0.47268901247557493, + "flos": 20666696198400.0, + "grad_norm": 1.7743233215506022, + "language_loss": 0.84789056, + "learning_rate": 2.171697554305634e-06, + "loss": 0.86880535, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4375, + "step": 7862, + "time_per_iteration": 2.409329652786255 + }, + { + "auxiliary_loss_clip": 0.01066254, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.01680672, + "balance_loss_mlp": 1.02104282, + "epoch": 0.4727491357282429, + "flos": 19572273809280.0, + "grad_norm": 3.027806993275693, + "language_loss": 0.84470963, + "learning_rate": 2.1713211237690178e-06, + "loss": 0.86566877, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.45117188, + "step": 7863, + "time_per_iteration": 2.3807599544525146 + }, + { + "auxiliary_loss_clip": 0.01066651, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.01614416, + "balance_loss_mlp": 1.02252781, + "epoch": 0.47280925898091086, + "flos": 18216422092800.0, + "grad_norm": 2.0092710731492542, + "language_loss": 0.65968412, + "learning_rate": 2.1709446871183917e-06, + "loss": 0.68063939, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44140625, + "step": 7864, + "time_per_iteration": 2.3932902812957764 + }, + { + "auxiliary_loss_clip": 0.01065218, + "auxiliary_loss_mlp": 0.01024853, + "balance_loss_clip": 1.01098299, + "balance_loss_mlp": 1.02173054, + "epoch": 0.4728693822335788, + "flos": 17820855855360.0, + "grad_norm": 1.8604189809591176, + "language_loss": 0.65601248, + "learning_rate": 2.1705682443671897e-06, + "loss": 0.67691326, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.43359375, + "step": 7865, + "time_per_iteration": 2.425555944442749 + }, + { + "auxiliary_loss_clip": 0.0106301, + "auxiliary_loss_mlp": 0.01027599, + "balance_loss_clip": 1.01568985, + "balance_loss_mlp": 1.02104616, + "epoch": 0.4729295054862468, + "flos": 20594007014400.0, + "grad_norm": 1.8359554871535808, + "language_loss": 0.74078536, + "learning_rate": 2.1701917955288454e-06, + "loss": 0.76169145, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.41992188, + "step": 7866, + "time_per_iteration": 2.4031128883361816 + }, + { + "auxiliary_loss_clip": 0.0106667, + "auxiliary_loss_mlp": 0.01023138, + "balance_loss_clip": 1.0101862, + "balance_loss_mlp": 1.02161837, + "epoch": 0.47298962873891476, + "flos": 23366948705280.0, + "grad_norm": 1.7699759912117907, + "language_loss": 0.82790732, + "learning_rate": 2.1698153406167934e-06, + "loss": 0.84880531, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.45117188, + "step": 7867, + "time_per_iteration": 2.4017879962921143 + }, + { + "auxiliary_loss_clip": 0.01064396, + "auxiliary_loss_mlp": 0.01024684, + "balance_loss_clip": 1.01209593, + "balance_loss_mlp": 1.02168798, + "epoch": 0.4730497519915827, + "flos": 22051212007680.0, + "grad_norm": 8.239429188258885, + "language_loss": 0.79514956, + "learning_rate": 2.1694388796444697e-06, + "loss": 0.8160404, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42773438, + "step": 7868, + "time_per_iteration": 2.4181675910949707 + }, + { + "auxiliary_loss_clip": 0.01068372, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.01353121, + "balance_loss_mlp": 1.022596, + "epoch": 0.4731098752442507, + "flos": 21068651214720.0, + "grad_norm": 2.16567175727034, + "language_loss": 0.74960476, + "learning_rate": 2.1690624126253074e-06, + "loss": 0.77056646, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45703125, + "step": 7869, + "time_per_iteration": 2.389354705810547 + }, + { + "auxiliary_loss_clip": 0.01068035, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.01840007, + "balance_loss_mlp": 1.02336192, + "epoch": 0.4731699984969187, + "flos": 22527671598720.0, + "grad_norm": 1.6461868802491713, + "language_loss": 0.73490572, + "learning_rate": 2.168685939572743e-06, + "loss": 0.75590515, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44726562, + "step": 7870, + "time_per_iteration": 2.4213216304779053 + }, + { + "auxiliary_loss_clip": 0.01068044, + "auxiliary_loss_mlp": 0.01027133, + "balance_loss_clip": 1.01208842, + "balance_loss_mlp": 1.02194476, + "epoch": 0.4732301217495867, + "flos": 24897017439360.0, + "grad_norm": 1.9388495097179765, + "language_loss": 0.8087011, + "learning_rate": 2.1683094605002107e-06, + "loss": 0.82965285, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4609375, + "step": 7871, + "time_per_iteration": 2.408006429672241 + }, + { + "auxiliary_loss_clip": 0.010665, + "auxiliary_loss_mlp": 0.01029205, + "balance_loss_clip": 1.01615167, + "balance_loss_mlp": 1.02283263, + "epoch": 0.47329024500225464, + "flos": 22783305640320.0, + "grad_norm": 1.5836070474087145, + "language_loss": 0.76176357, + "learning_rate": 2.1679329754211472e-06, + "loss": 0.78272063, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4375, + "step": 7872, + "time_per_iteration": 2.42921781539917 + }, + { + "auxiliary_loss_clip": 0.01064782, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.01597202, + "balance_loss_mlp": 1.02038085, + "epoch": 0.4733503682549226, + "flos": 38694238502400.0, + "grad_norm": 1.6883096781691982, + "language_loss": 0.68057317, + "learning_rate": 2.1675564843489872e-06, + "loss": 0.70151716, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44335938, + "step": 7873, + "time_per_iteration": 3.989781618118286 + }, + { + "auxiliary_loss_clip": 0.01065017, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.01265097, + "balance_loss_mlp": 1.02233481, + "epoch": 0.47341049150759057, + "flos": 22965726827520.0, + "grad_norm": 2.8653569229614155, + "language_loss": 0.72446769, + "learning_rate": 2.167179987297168e-06, + "loss": 0.74537849, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42578125, + "step": 7874, + "time_per_iteration": 2.409348487854004 + }, + { + "auxiliary_loss_clip": 0.01067542, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.01748323, + "balance_loss_mlp": 1.02265453, + "epoch": 0.47347061476025853, + "flos": 14537588688000.0, + "grad_norm": 2.3798482307528253, + "language_loss": 0.6504761, + "learning_rate": 2.1668034842791246e-06, + "loss": 0.67146242, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 7875, + "time_per_iteration": 2.373035192489624 + }, + { + "auxiliary_loss_clip": 0.01066935, + "auxiliary_loss_mlp": 0.01027742, + "balance_loss_clip": 1.01373529, + "balance_loss_mlp": 1.02198911, + "epoch": 0.4735307380129265, + "flos": 30261945911040.0, + "grad_norm": 5.132281944308627, + "language_loss": 0.81153631, + "learning_rate": 2.166426975308294e-06, + "loss": 0.83248305, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44921875, + "step": 7876, + "time_per_iteration": 2.464959144592285 + }, + { + "auxiliary_loss_clip": 0.0106393, + "auxiliary_loss_mlp": 0.01026185, + "balance_loss_clip": 1.01296484, + "balance_loss_mlp": 1.01942849, + "epoch": 0.47359086126559446, + "flos": 19390027178880.0, + "grad_norm": 1.8065425505200834, + "language_loss": 0.79017437, + "learning_rate": 2.166050460398113e-06, + "loss": 0.81107545, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4453125, + "step": 7877, + "time_per_iteration": 2.3724844455718994 + }, + { + "auxiliary_loss_clip": 0.01064681, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.01528883, + "balance_loss_mlp": 1.02193856, + "epoch": 0.47365098451826243, + "flos": 21938477627520.0, + "grad_norm": 2.3615374234084126, + "language_loss": 0.70697212, + "learning_rate": 2.1656739395620173e-06, + "loss": 0.72790229, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42773438, + "step": 7878, + "time_per_iteration": 2.393564462661743 + }, + { + "auxiliary_loss_clip": 0.01063048, + "auxiliary_loss_mlp": 0.01024797, + "balance_loss_clip": 1.01256037, + "balance_loss_mlp": 1.02156329, + "epoch": 0.4737111077709304, + "flos": 25843966778880.0, + "grad_norm": 1.6749420091895515, + "language_loss": 0.753537, + "learning_rate": 2.1652974128134457e-06, + "loss": 0.77441537, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41601562, + "step": 7879, + "time_per_iteration": 2.4442224502563477 + }, + { + "auxiliary_loss_clip": 0.0101264, + "auxiliary_loss_mlp": 0.01000628, + "balance_loss_clip": 0.99960256, + "balance_loss_mlp": 1.00400996, + "epoch": 0.47377123102359836, + "flos": 67757790395520.0, + "grad_norm": 0.7258484774614344, + "language_loss": 0.61231053, + "learning_rate": 2.1649208801658344e-06, + "loss": 0.63244319, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.0859375, + "step": 7880, + "time_per_iteration": 3.0590505599975586 + }, + { + "auxiliary_loss_clip": 0.01063711, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.01883745, + "balance_loss_mlp": 1.02179193, + "epoch": 0.4738313542762663, + "flos": 24314840651520.0, + "grad_norm": 1.4355997900396278, + "language_loss": 0.68138194, + "learning_rate": 2.1645443416326214e-06, + "loss": 0.70235169, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.41992188, + "step": 7881, + "time_per_iteration": 2.4225499629974365 + }, + { + "auxiliary_loss_clip": 0.01065535, + "auxiliary_loss_mlp": 0.01025446, + "balance_loss_clip": 1.01294661, + "balance_loss_mlp": 1.02220297, + "epoch": 0.4738914775289343, + "flos": 20704262688000.0, + "grad_norm": 1.633595563448119, + "language_loss": 0.77115476, + "learning_rate": 2.164167797227244e-06, + "loss": 0.79206461, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43359375, + "step": 7882, + "time_per_iteration": 3.84409761428833 + }, + { + "auxiliary_loss_clip": 0.01065824, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.01216936, + "balance_loss_mlp": 1.02215886, + "epoch": 0.4739516007816023, + "flos": 25445188696320.0, + "grad_norm": 1.4007084483940662, + "language_loss": 0.75994754, + "learning_rate": 2.16379124696314e-06, + "loss": 0.78086561, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4375, + "step": 7883, + "time_per_iteration": 2.4378130435943604 + }, + { + "auxiliary_loss_clip": 0.01065813, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.02130187, + "balance_loss_mlp": 1.02210236, + "epoch": 0.4740117240342703, + "flos": 19973321130240.0, + "grad_norm": 2.7504792139946517, + "language_loss": 0.71925062, + "learning_rate": 2.1634146908537483e-06, + "loss": 0.74024844, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 7884, + "time_per_iteration": 2.394927978515625 + }, + { + "auxiliary_loss_clip": 0.01068537, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.01370835, + "balance_loss_mlp": 1.02228951, + "epoch": 0.47407184728693824, + "flos": 15660465701760.0, + "grad_norm": 1.9408303131750526, + "language_loss": 0.82202697, + "learning_rate": 2.163038128912506e-06, + "loss": 0.84299082, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46289062, + "step": 7885, + "time_per_iteration": 2.3500571250915527 + }, + { + "auxiliary_loss_clip": 0.010671, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.02010953, + "balance_loss_mlp": 1.02334046, + "epoch": 0.4741319705396062, + "flos": 18587792891520.0, + "grad_norm": 3.303839022281658, + "language_loss": 0.73544675, + "learning_rate": 2.1626615611528525e-06, + "loss": 0.75644779, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 7886, + "time_per_iteration": 2.3819262981414795 + }, + { + "auxiliary_loss_clip": 0.01067883, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.01654148, + "balance_loss_mlp": 1.02157426, + "epoch": 0.47419209379227417, + "flos": 13260256352640.0, + "grad_norm": 2.0698796282885947, + "language_loss": 0.73592085, + "learning_rate": 2.1622849875882266e-06, + "loss": 0.75691873, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.46289062, + "step": 7887, + "time_per_iteration": 5.000602722167969 + }, + { + "auxiliary_loss_clip": 0.01061933, + "auxiliary_loss_mlp": 0.0102188, + "balance_loss_clip": 1.01004219, + "balance_loss_mlp": 1.02040672, + "epoch": 0.47425221704494214, + "flos": 20043112671360.0, + "grad_norm": 1.7487302650607348, + "language_loss": 0.76985222, + "learning_rate": 2.1619084082320663e-06, + "loss": 0.79069036, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41601562, + "step": 7888, + "time_per_iteration": 2.3722057342529297 + }, + { + "auxiliary_loss_clip": 0.01066368, + "auxiliary_loss_mlp": 0.01027224, + "balance_loss_clip": 1.01384282, + "balance_loss_mlp": 1.02204633, + "epoch": 0.4743123402976101, + "flos": 27270657377280.0, + "grad_norm": 2.4861380225307657, + "language_loss": 0.72761977, + "learning_rate": 2.161531823097812e-06, + "loss": 0.74855578, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 7889, + "time_per_iteration": 2.445380449295044 + }, + { + "auxiliary_loss_clip": 0.010121, + "auxiliary_loss_mlp": 0.01003631, + "balance_loss_clip": 1.00258791, + "balance_loss_mlp": 1.00311267, + "epoch": 0.47437246355027807, + "flos": 55391170003200.0, + "grad_norm": 0.7145390347130444, + "language_loss": 0.56026196, + "learning_rate": 2.1611552321989015e-06, + "loss": 0.58041918, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.08984375, + "step": 7890, + "time_per_iteration": 3.099888801574707 + }, + { + "auxiliary_loss_clip": 0.01067951, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.01282442, + "balance_loss_mlp": 1.02292538, + "epoch": 0.47443258680294603, + "flos": 23877344004480.0, + "grad_norm": 2.0016580350175994, + "language_loss": 0.65360618, + "learning_rate": 2.1607786355487764e-06, + "loss": 0.67455113, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44921875, + "step": 7891, + "time_per_iteration": 2.4142086505889893 + }, + { + "auxiliary_loss_clip": 0.0106856, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.01676059, + "balance_loss_mlp": 1.02186656, + "epoch": 0.474492710055614, + "flos": 21976777255680.0, + "grad_norm": 2.3106268133925463, + "language_loss": 0.69907355, + "learning_rate": 2.1604020331608746e-06, + "loss": 0.72007394, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46679688, + "step": 7892, + "time_per_iteration": 2.4022436141967773 + }, + { + "auxiliary_loss_clip": 0.01010938, + "auxiliary_loss_mlp": 0.01000466, + "balance_loss_clip": 0.99939281, + "balance_loss_mlp": 1.00214517, + "epoch": 0.47455283330828196, + "flos": 62553845669760.0, + "grad_norm": 0.8181411269710498, + "language_loss": 0.58552754, + "learning_rate": 2.1600254250486373e-06, + "loss": 0.6056416, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.08789062, + "step": 7893, + "time_per_iteration": 3.1402206420898438 + }, + { + "auxiliary_loss_clip": 0.01064624, + "auxiliary_loss_mlp": 0.01026504, + "balance_loss_clip": 1.01410055, + "balance_loss_mlp": 1.02146745, + "epoch": 0.47461295656094993, + "flos": 12092830577280.0, + "grad_norm": 3.1286380166390835, + "language_loss": 0.77378476, + "learning_rate": 2.1596488112255036e-06, + "loss": 0.79469603, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.43164062, + "step": 7894, + "time_per_iteration": 2.3926703929901123 + }, + { + "auxiliary_loss_clip": 0.01064686, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.01640141, + "balance_loss_mlp": 1.02092528, + "epoch": 0.4746730798136179, + "flos": 20883576764160.0, + "grad_norm": 1.7471960526550305, + "language_loss": 0.74537539, + "learning_rate": 2.159272191704915e-06, + "loss": 0.7663123, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4375, + "step": 7895, + "time_per_iteration": 2.39827561378479 + }, + { + "auxiliary_loss_clip": 0.01063617, + "auxiliary_loss_mlp": 0.01024449, + "balance_loss_clip": 1.01159811, + "balance_loss_mlp": 1.02133107, + "epoch": 0.4747332030662859, + "flos": 19973774977920.0, + "grad_norm": 2.2419850344676155, + "language_loss": 0.82390517, + "learning_rate": 2.158895566500312e-06, + "loss": 0.84478581, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.421875, + "step": 7896, + "time_per_iteration": 2.4102399349212646 + }, + { + "auxiliary_loss_clip": 0.01064745, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.011917, + "balance_loss_mlp": 1.02142596, + "epoch": 0.4747933263189539, + "flos": 16033267866240.0, + "grad_norm": 1.8928725337412038, + "language_loss": 0.75321198, + "learning_rate": 2.158518935625134e-06, + "loss": 0.77411652, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43359375, + "step": 7897, + "time_per_iteration": 2.38686203956604 + }, + { + "auxiliary_loss_clip": 0.01069206, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.0190357, + "balance_loss_mlp": 1.02226114, + "epoch": 0.47485344957162184, + "flos": 13954224913920.0, + "grad_norm": 7.080752441497833, + "language_loss": 0.63918447, + "learning_rate": 2.1581422990928233e-06, + "loss": 0.66020232, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.46875, + "step": 7898, + "time_per_iteration": 2.3751022815704346 + }, + { + "auxiliary_loss_clip": 0.01065847, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.01700211, + "balance_loss_mlp": 1.02138925, + "epoch": 0.4749135728242898, + "flos": 20448035153280.0, + "grad_norm": 2.188090695666534, + "language_loss": 0.70079327, + "learning_rate": 2.1577656569168215e-06, + "loss": 0.72176147, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4453125, + "step": 7899, + "time_per_iteration": 2.424525260925293 + }, + { + "auxiliary_loss_clip": 0.01066779, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.01988053, + "balance_loss_mlp": 1.02213907, + "epoch": 0.4749736960769578, + "flos": 28948687920000.0, + "grad_norm": 1.8412828014078775, + "language_loss": 0.68432325, + "learning_rate": 2.1573890091105684e-06, + "loss": 0.70532775, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4453125, + "step": 7900, + "time_per_iteration": 2.53025484085083 + }, + { + "auxiliary_loss_clip": 0.010647, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.01496148, + "balance_loss_mlp": 1.02141595, + "epoch": 0.47503381932962574, + "flos": 31937497747200.0, + "grad_norm": 1.9885375650873454, + "language_loss": 0.70712733, + "learning_rate": 2.157012355687507e-06, + "loss": 0.72805274, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 7901, + "time_per_iteration": 2.4989264011383057 + }, + { + "auxiliary_loss_clip": 0.01067596, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.02408612, + "balance_loss_mlp": 1.02316093, + "epoch": 0.4750939425822937, + "flos": 22126170430080.0, + "grad_norm": 1.5652433670441186, + "language_loss": 0.73500258, + "learning_rate": 2.1566356966610776e-06, + "loss": 0.7560519, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44335938, + "step": 7902, + "time_per_iteration": 2.4325289726257324 + }, + { + "auxiliary_loss_clip": 0.01061282, + "auxiliary_loss_mlp": 0.01034361, + "balance_loss_clip": 1.02218938, + "balance_loss_mlp": 1.01994693, + "epoch": 0.47515406583496167, + "flos": 20849047562880.0, + "grad_norm": 1.729090001371206, + "language_loss": 0.68488467, + "learning_rate": 2.1562590320447234e-06, + "loss": 0.70584112, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4140625, + "step": 7903, + "time_per_iteration": 2.3975868225097656 + }, + { + "auxiliary_loss_clip": 0.01061705, + "auxiliary_loss_mlp": 0.01024988, + "balance_loss_clip": 1.01358569, + "balance_loss_mlp": 1.02038145, + "epoch": 0.47521418908762963, + "flos": 17523989631360.0, + "grad_norm": 1.6138453765346445, + "language_loss": 0.7244786, + "learning_rate": 2.155882361851887e-06, + "loss": 0.74534553, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.4140625, + "step": 7904, + "time_per_iteration": 2.3773956298828125 + }, + { + "auxiliary_loss_clip": 0.0106157, + "auxiliary_loss_mlp": 0.01025255, + "balance_loss_clip": 1.01366806, + "balance_loss_mlp": 1.0208689, + "epoch": 0.4752743123402976, + "flos": 20558360649600.0, + "grad_norm": 1.9155966426023745, + "language_loss": 0.85872608, + "learning_rate": 2.1555056860960095e-06, + "loss": 0.87959433, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40820312, + "step": 7905, + "time_per_iteration": 2.408397912979126 + }, + { + "auxiliary_loss_clip": 0.01060621, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01598036, + "balance_loss_mlp": 1.02002764, + "epoch": 0.47533443559296557, + "flos": 26359389313920.0, + "grad_norm": 1.7531622477660802, + "language_loss": 0.81907821, + "learning_rate": 2.1551290047905343e-06, + "loss": 0.83995736, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.40625, + "step": 7906, + "time_per_iteration": 2.4339654445648193 + }, + { + "auxiliary_loss_clip": 0.01012087, + "auxiliary_loss_mlp": 0.01005505, + "balance_loss_clip": 1.00434852, + "balance_loss_mlp": 1.00314784, + "epoch": 0.47539455884563353, + "flos": 65946251347200.0, + "grad_norm": 0.6631879933743545, + "language_loss": 0.5901705, + "learning_rate": 2.1547523179489033e-06, + "loss": 0.61034638, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.08984375, + "step": 7907, + "time_per_iteration": 3.152611017227173 + }, + { + "auxiliary_loss_clip": 0.01065457, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.01356649, + "balance_loss_mlp": 1.02280343, + "epoch": 0.4754546820983015, + "flos": 17237177879040.0, + "grad_norm": 1.7539595273460309, + "language_loss": 0.81033838, + "learning_rate": 2.154375625584561e-06, + "loss": 0.83124918, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.42578125, + "step": 7908, + "time_per_iteration": 2.3810126781463623 + }, + { + "auxiliary_loss_clip": 0.01065166, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.01733923, + "balance_loss_mlp": 1.02214408, + "epoch": 0.47551480535096946, + "flos": 19824940385280.0, + "grad_norm": 1.564768765806314, + "language_loss": 0.73785806, + "learning_rate": 2.1539989277109496e-06, + "loss": 0.75880986, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4296875, + "step": 7909, + "time_per_iteration": 2.4131598472595215 + }, + { + "auxiliary_loss_clip": 0.01063529, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.01492739, + "balance_loss_mlp": 1.02138305, + "epoch": 0.4755749286036375, + "flos": 22162864135680.0, + "grad_norm": 1.600718066337282, + "language_loss": 0.74917877, + "learning_rate": 2.153622224341512e-06, + "loss": 0.770091, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 7910, + "time_per_iteration": 2.4194037914276123 + }, + { + "auxiliary_loss_clip": 0.01060779, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.01343012, + "balance_loss_mlp": 1.01982784, + "epoch": 0.47563505185630545, + "flos": 21647336866560.0, + "grad_norm": 1.8382407459253056, + "language_loss": 0.78858411, + "learning_rate": 2.1532455154896926e-06, + "loss": 0.809443, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.41015625, + "step": 7911, + "time_per_iteration": 2.3936684131622314 + }, + { + "auxiliary_loss_clip": 0.01065284, + "auxiliary_loss_mlp": 0.01027568, + "balance_loss_clip": 1.01381111, + "balance_loss_mlp": 1.02044499, + "epoch": 0.4756951751089734, + "flos": 20627803077120.0, + "grad_norm": 1.685520170628848, + "language_loss": 0.75469702, + "learning_rate": 2.1528688011689348e-06, + "loss": 0.77562559, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44921875, + "step": 7912, + "time_per_iteration": 2.386514663696289 + }, + { + "auxiliary_loss_clip": 0.01061081, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.01718879, + "balance_loss_mlp": 1.01871467, + "epoch": 0.4757552983616414, + "flos": 25847597560320.0, + "grad_norm": 1.5286204607670981, + "language_loss": 0.74198967, + "learning_rate": 2.1524920813926833e-06, + "loss": 0.76288915, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.421875, + "step": 7913, + "time_per_iteration": 3.8474714756011963 + }, + { + "auxiliary_loss_clip": 0.01064848, + "auxiliary_loss_mlp": 0.01026997, + "balance_loss_clip": 1.01418805, + "balance_loss_mlp": 1.0218358, + "epoch": 0.47581542161430934, + "flos": 18222042821760.0, + "grad_norm": 2.1742274314569663, + "language_loss": 0.72220588, + "learning_rate": 2.152115356174382e-06, + "loss": 0.74312425, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 7914, + "time_per_iteration": 2.378041982650757 + }, + { + "auxiliary_loss_clip": 0.01065241, + "auxiliary_loss_mlp": 0.01025871, + "balance_loss_clip": 1.01325846, + "balance_loss_mlp": 1.02215445, + "epoch": 0.4758755448669773, + "flos": 21578697400320.0, + "grad_norm": 2.305274160726579, + "language_loss": 0.6448797, + "learning_rate": 2.151738625527474e-06, + "loss": 0.6657908, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43164062, + "step": 7915, + "time_per_iteration": 2.3968522548675537 + }, + { + "auxiliary_loss_clip": 0.01063684, + "auxiliary_loss_mlp": 0.01024973, + "balance_loss_clip": 1.01286173, + "balance_loss_mlp": 1.02229762, + "epoch": 0.47593566811964527, + "flos": 15230265528960.0, + "grad_norm": 1.748935872342608, + "language_loss": 0.77036595, + "learning_rate": 2.151361889465405e-06, + "loss": 0.79125249, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4140625, + "step": 7916, + "time_per_iteration": 2.38972806930542 + }, + { + "auxiliary_loss_clip": 0.01063823, + "auxiliary_loss_mlp": 0.01021874, + "balance_loss_clip": 1.01024508, + "balance_loss_mlp": 1.02061582, + "epoch": 0.47599579137231324, + "flos": 21542178251520.0, + "grad_norm": 1.886360718394972, + "language_loss": 0.64360642, + "learning_rate": 2.1509851480016197e-06, + "loss": 0.6644634, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.43164062, + "step": 7917, + "time_per_iteration": 2.3920717239379883 + }, + { + "auxiliary_loss_clip": 0.01011263, + "auxiliary_loss_mlp": 0.01007839, + "balance_loss_clip": 1.0067662, + "balance_loss_mlp": 1.00267947, + "epoch": 0.4760559146249812, + "flos": 64551471598080.0, + "grad_norm": 0.8462385426310135, + "language_loss": 0.65691054, + "learning_rate": 2.150608401149563e-06, + "loss": 0.67710161, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.0859375, + "step": 7918, + "time_per_iteration": 3.048673629760742 + }, + { + "auxiliary_loss_clip": 0.01065173, + "auxiliary_loss_mlp": 0.01027126, + "balance_loss_clip": 1.01466906, + "balance_loss_mlp": 1.02226305, + "epoch": 0.47611603787764917, + "flos": 22232865144960.0, + "grad_norm": 4.60179458744552, + "language_loss": 0.68928725, + "learning_rate": 2.1502316489226796e-06, + "loss": 0.71021026, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.4296875, + "step": 7919, + "time_per_iteration": 2.3988280296325684 + }, + { + "auxiliary_loss_clip": 0.0106602, + "auxiliary_loss_mlp": 0.01024819, + "balance_loss_clip": 1.01250458, + "balance_loss_mlp": 1.02141786, + "epoch": 0.47617616113031713, + "flos": 22779011543040.0, + "grad_norm": 2.109426655792178, + "language_loss": 0.74882919, + "learning_rate": 2.149854891334415e-06, + "loss": 0.7697376, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4453125, + "step": 7920, + "time_per_iteration": 2.475311040878296 + }, + { + "auxiliary_loss_clip": 0.01067273, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.0149858, + "balance_loss_mlp": 1.02229857, + "epoch": 0.4762362843829851, + "flos": 24132663843840.0, + "grad_norm": 1.4967303965163088, + "language_loss": 0.77378374, + "learning_rate": 2.149478128398215e-06, + "loss": 0.79473937, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44921875, + "step": 7921, + "time_per_iteration": 2.430405855178833 + }, + { + "auxiliary_loss_clip": 0.01065103, + "auxiliary_loss_mlp": 0.01029718, + "balance_loss_clip": 1.01563907, + "balance_loss_mlp": 1.0222702, + "epoch": 0.47629640763565306, + "flos": 22451072342400.0, + "grad_norm": 1.705797356670972, + "language_loss": 0.77896452, + "learning_rate": 2.1491013601275244e-06, + "loss": 0.79991275, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.42773438, + "step": 7922, + "time_per_iteration": 3.8415377140045166 + }, + { + "auxiliary_loss_clip": 0.01065327, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.01571774, + "balance_loss_mlp": 1.0203861, + "epoch": 0.4763565308883211, + "flos": 11180619907200.0, + "grad_norm": 2.403984022443184, + "language_loss": 0.7253527, + "learning_rate": 2.148724586535791e-06, + "loss": 0.74629974, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44921875, + "step": 7923, + "time_per_iteration": 2.375302791595459 + }, + { + "auxiliary_loss_clip": 0.01063791, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01752257, + "balance_loss_mlp": 1.02005231, + "epoch": 0.47641665414098905, + "flos": 22381071333120.0, + "grad_norm": 1.8651951168182324, + "language_loss": 0.81874764, + "learning_rate": 2.1483478076364586e-06, + "loss": 0.83968031, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.4375, + "step": 7924, + "time_per_iteration": 2.4112462997436523 + }, + { + "auxiliary_loss_clip": 0.01070495, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01717472, + "balance_loss_mlp": 1.02321124, + "epoch": 0.476476777393657, + "flos": 25044979248000.0, + "grad_norm": 1.7043829847313743, + "language_loss": 0.8018111, + "learning_rate": 2.147971023442975e-06, + "loss": 0.82283491, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47265625, + "step": 7925, + "time_per_iteration": 2.4366507530212402 + }, + { + "auxiliary_loss_clip": 0.01064869, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.02038002, + "balance_loss_mlp": 1.02117634, + "epoch": 0.476536900646325, + "flos": 27268737252480.0, + "grad_norm": 1.5763781842963527, + "language_loss": 0.71985209, + "learning_rate": 2.147594233968787e-06, + "loss": 0.74082625, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4375, + "step": 7926, + "time_per_iteration": 2.4563374519348145 + }, + { + "auxiliary_loss_clip": 0.01068429, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.01541233, + "balance_loss_mlp": 1.02282739, + "epoch": 0.47659702389899294, + "flos": 25300229264640.0, + "grad_norm": 2.2520977884579776, + "language_loss": 0.6822294, + "learning_rate": 2.147217439227339e-06, + "loss": 0.70320702, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45507812, + "step": 7927, + "time_per_iteration": 5.286655426025391 + }, + { + "auxiliary_loss_clip": 0.01063728, + "auxiliary_loss_mlp": 0.01025282, + "balance_loss_clip": 1.01389754, + "balance_loss_mlp": 1.02118301, + "epoch": 0.4766571471516609, + "flos": 25991719119360.0, + "grad_norm": 1.5651136187507995, + "language_loss": 0.68028402, + "learning_rate": 2.1468406392320803e-06, + "loss": 0.70117414, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.42578125, + "step": 7928, + "time_per_iteration": 2.4361467361450195 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01025307, + "balance_loss_clip": 1.0124681, + "balance_loss_mlp": 1.02097774, + "epoch": 0.4767172704043289, + "flos": 16031347741440.0, + "grad_norm": 3.0363473177097475, + "language_loss": 0.78559083, + "learning_rate": 2.1464638339964564e-06, + "loss": 0.80648345, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 7929, + "time_per_iteration": 2.369135618209839 + }, + { + "auxiliary_loss_clip": 0.01065545, + "auxiliary_loss_mlp": 0.01026973, + "balance_loss_clip": 1.01511145, + "balance_loss_mlp": 1.02301717, + "epoch": 0.47677739365699684, + "flos": 39233891387520.0, + "grad_norm": 2.23190100597805, + "language_loss": 0.66477436, + "learning_rate": 2.1460870235339155e-06, + "loss": 0.68569952, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.42578125, + "step": 7930, + "time_per_iteration": 2.5804591178894043 + }, + { + "auxiliary_loss_clip": 0.01062505, + "auxiliary_loss_mlp": 0.01025373, + "balance_loss_clip": 1.01368439, + "balance_loss_mlp": 1.01975799, + "epoch": 0.4768375169096648, + "flos": 24716621111040.0, + "grad_norm": 2.0921298166880726, + "language_loss": 0.8030057, + "learning_rate": 2.1457102078579045e-06, + "loss": 0.82388443, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.42773438, + "step": 7931, + "time_per_iteration": 2.445435047149658 + }, + { + "auxiliary_loss_clip": 0.01064771, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02037525, + "balance_loss_mlp": 1.0216403, + "epoch": 0.47689764016233277, + "flos": 22527566864640.0, + "grad_norm": 1.738863260024768, + "language_loss": 0.75576091, + "learning_rate": 2.1453333869818702e-06, + "loss": 0.77674347, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43164062, + "step": 7932, + "time_per_iteration": 2.3978099822998047 + }, + { + "auxiliary_loss_clip": 0.01062835, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.0148716, + "balance_loss_mlp": 1.02171493, + "epoch": 0.47695776341500074, + "flos": 15119765475840.0, + "grad_norm": 1.622725162504741, + "language_loss": 0.79359972, + "learning_rate": 2.1449565609192617e-06, + "loss": 0.8145014, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 7933, + "time_per_iteration": 2.3728184700012207 + }, + { + "auxiliary_loss_clip": 0.01070631, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01527405, + "balance_loss_mlp": 1.02233696, + "epoch": 0.4770178866676687, + "flos": 14678184199680.0, + "grad_norm": 2.3633797460940653, + "language_loss": 0.74490029, + "learning_rate": 2.144579729683526e-06, + "loss": 0.76590693, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.48046875, + "step": 7934, + "time_per_iteration": 2.344356060028076 + }, + { + "auxiliary_loss_clip": 0.01065401, + "auxiliary_loss_mlp": 0.01025143, + "balance_loss_clip": 1.01296544, + "balance_loss_mlp": 1.02102447, + "epoch": 0.47707800992033667, + "flos": 22564470038400.0, + "grad_norm": 1.79167905328879, + "language_loss": 0.79772192, + "learning_rate": 2.1442028932881123e-06, + "loss": 0.81862736, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.44335938, + "step": 7935, + "time_per_iteration": 2.421065330505371 + }, + { + "auxiliary_loss_clip": 0.01070772, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01582432, + "balance_loss_mlp": 1.02292728, + "epoch": 0.4771381331730047, + "flos": 30916951528320.0, + "grad_norm": 1.7328157103032225, + "language_loss": 0.70943213, + "learning_rate": 2.143826051746468e-06, + "loss": 0.73043466, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.47851562, + "step": 7936, + "time_per_iteration": 2.4666833877563477 + }, + { + "auxiliary_loss_clip": 0.01067394, + "auxiliary_loss_mlp": 0.0102284, + "balance_loss_clip": 1.01025164, + "balance_loss_mlp": 1.02174139, + "epoch": 0.47719825642567265, + "flos": 25737725911680.0, + "grad_norm": 3.971027279638395, + "language_loss": 0.67891759, + "learning_rate": 2.143449205072042e-06, + "loss": 0.69981998, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.45703125, + "step": 7937, + "time_per_iteration": 2.4231832027435303 + }, + { + "auxiliary_loss_clip": 0.01014178, + "auxiliary_loss_mlp": 0.0100299, + "balance_loss_clip": 1.00183356, + "balance_loss_mlp": 1.00538921, + "epoch": 0.4772583796783406, + "flos": 66351592765440.0, + "grad_norm": 0.714769562613384, + "language_loss": 0.56433183, + "learning_rate": 2.1430723532782828e-06, + "loss": 0.58450353, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.08789062, + "step": 7938, + "time_per_iteration": 3.2145771980285645 + }, + { + "auxiliary_loss_clip": 0.01067317, + "auxiliary_loss_mlp": 0.01025449, + "balance_loss_clip": 1.01174581, + "balance_loss_mlp": 1.02223372, + "epoch": 0.4773185029310086, + "flos": 22050094844160.0, + "grad_norm": 1.6715483454351223, + "language_loss": 0.88716316, + "learning_rate": 2.142695496378639e-06, + "loss": 0.90809083, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45117188, + "step": 7939, + "time_per_iteration": 2.3954811096191406 + }, + { + "auxiliary_loss_clip": 0.01065711, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.01526213, + "balance_loss_mlp": 1.02074766, + "epoch": 0.47737862618367655, + "flos": 16726852402560.0, + "grad_norm": 1.8059418217125036, + "language_loss": 0.77118146, + "learning_rate": 2.14231863438656e-06, + "loss": 0.79211617, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44921875, + "step": 7940, + "time_per_iteration": 2.361955404281616 + }, + { + "auxiliary_loss_clip": 0.01065246, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.0180521, + "balance_loss_mlp": 1.02294266, + "epoch": 0.4774387494363445, + "flos": 19608443844480.0, + "grad_norm": 1.6298231059666275, + "language_loss": 0.85155737, + "learning_rate": 2.1419417673154954e-06, + "loss": 0.87250555, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.421875, + "step": 7941, + "time_per_iteration": 2.3817811012268066 + }, + { + "auxiliary_loss_clip": 0.0106787, + "auxiliary_loss_mlp": 0.01027978, + "balance_loss_clip": 1.01425672, + "balance_loss_mlp": 1.02265525, + "epoch": 0.4774988726890125, + "flos": 16653046055040.0, + "grad_norm": 1.7468241443938264, + "language_loss": 0.75787568, + "learning_rate": 2.1415648951788944e-06, + "loss": 0.77883416, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.453125, + "step": 7942, + "time_per_iteration": 2.362311840057373 + }, + { + "auxiliary_loss_clip": 0.01066897, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.02107263, + "balance_loss_mlp": 1.02286243, + "epoch": 0.47755899594168044, + "flos": 20484519390720.0, + "grad_norm": 2.4326819786563103, + "language_loss": 0.64656937, + "learning_rate": 2.1411880179902056e-06, + "loss": 0.66757822, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44140625, + "step": 7943, + "time_per_iteration": 2.3972740173339844 + }, + { + "auxiliary_loss_clip": 0.01067578, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.01651859, + "balance_loss_mlp": 1.02207589, + "epoch": 0.4776191191943484, + "flos": 21651735697920.0, + "grad_norm": 1.7305799756105886, + "language_loss": 0.73509586, + "learning_rate": 2.140811135762881e-06, + "loss": 0.75606847, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.45507812, + "step": 7944, + "time_per_iteration": 2.4049108028411865 + }, + { + "auxiliary_loss_clip": 0.01066854, + "auxiliary_loss_mlp": 0.01030661, + "balance_loss_clip": 1.01680863, + "balance_loss_mlp": 1.02207923, + "epoch": 0.4776792424470164, + "flos": 18769236560640.0, + "grad_norm": 1.8218788991594543, + "language_loss": 0.67931467, + "learning_rate": 2.1404342485103683e-06, + "loss": 0.70028985, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44726562, + "step": 7945, + "time_per_iteration": 2.3849294185638428 + }, + { + "auxiliary_loss_clip": 0.01063049, + "auxiliary_loss_mlp": 0.01026616, + "balance_loss_clip": 1.01408148, + "balance_loss_mlp": 1.01933551, + "epoch": 0.47773936569968434, + "flos": 29714542704000.0, + "grad_norm": 2.181512472281651, + "language_loss": 0.75457215, + "learning_rate": 2.1400573562461185e-06, + "loss": 0.77546883, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4375, + "step": 7946, + "time_per_iteration": 2.4526207447052 + }, + { + "auxiliary_loss_clip": 0.01068785, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.02104855, + "balance_loss_mlp": 1.02293205, + "epoch": 0.4777994889523523, + "flos": 24790357635840.0, + "grad_norm": 1.8115761084644935, + "language_loss": 0.78979951, + "learning_rate": 2.139680458983582e-06, + "loss": 0.81083882, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45898438, + "step": 7947, + "time_per_iteration": 2.431586265563965 + }, + { + "auxiliary_loss_clip": 0.01064334, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.01829886, + "balance_loss_mlp": 1.02163029, + "epoch": 0.47785961220502027, + "flos": 17857200447360.0, + "grad_norm": 6.259907486824126, + "language_loss": 0.73477507, + "learning_rate": 2.139303556736209e-06, + "loss": 0.75573087, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42773438, + "step": 7948, + "time_per_iteration": 2.3736538887023926 + }, + { + "auxiliary_loss_clip": 0.0106661, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.01733065, + "balance_loss_mlp": 1.02254522, + "epoch": 0.4779197354576883, + "flos": 20265509232000.0, + "grad_norm": 1.5853484683272503, + "language_loss": 0.7805323, + "learning_rate": 2.1389266495174507e-06, + "loss": 0.80150586, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43945312, + "step": 7949, + "time_per_iteration": 2.406538486480713 + }, + { + "auxiliary_loss_clip": 0.01062791, + "auxiliary_loss_mlp": 0.01025809, + "balance_loss_clip": 1.01362538, + "balance_loss_mlp": 1.02032769, + "epoch": 0.47797985871035625, + "flos": 17055629475840.0, + "grad_norm": 2.2359022876611774, + "language_loss": 0.74144566, + "learning_rate": 2.1385497373407574e-06, + "loss": 0.76233166, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.42382812, + "step": 7950, + "time_per_iteration": 2.4437789916992188 + }, + { + "auxiliary_loss_clip": 0.01064643, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.02039373, + "balance_loss_mlp": 1.02161324, + "epoch": 0.4780399819630242, + "flos": 31357066527360.0, + "grad_norm": 1.9558254350640514, + "language_loss": 0.74506211, + "learning_rate": 2.13817282021958e-06, + "loss": 0.76605201, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4296875, + "step": 7951, + "time_per_iteration": 2.505134105682373 + }, + { + "auxiliary_loss_clip": 0.01065694, + "auxiliary_loss_mlp": 0.01028337, + "balance_loss_clip": 1.01437783, + "balance_loss_mlp": 1.02023947, + "epoch": 0.4781001052156922, + "flos": 24898448805120.0, + "grad_norm": 1.974047461988999, + "language_loss": 0.79780209, + "learning_rate": 2.137795898167371e-06, + "loss": 0.81874239, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.453125, + "step": 7952, + "time_per_iteration": 3.8587307929992676 + }, + { + "auxiliary_loss_clip": 0.01066623, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.01892328, + "balance_loss_mlp": 1.02111268, + "epoch": 0.47816022846836015, + "flos": 18696721933440.0, + "grad_norm": 2.114341629522234, + "language_loss": 0.78231019, + "learning_rate": 2.1374189711975806e-06, + "loss": 0.80330163, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 7953, + "time_per_iteration": 2.3760244846343994 + }, + { + "auxiliary_loss_clip": 0.01065746, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.0144527, + "balance_loss_mlp": 1.01984572, + "epoch": 0.4782203517210281, + "flos": 11976954174720.0, + "grad_norm": 2.537514492243384, + "language_loss": 0.84332955, + "learning_rate": 2.1370420393236604e-06, + "loss": 0.86427176, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45898438, + "step": 7954, + "time_per_iteration": 2.341264247894287 + }, + { + "auxiliary_loss_clip": 0.01065695, + "auxiliary_loss_mlp": 0.01027911, + "balance_loss_clip": 1.0152148, + "balance_loss_mlp": 1.0211165, + "epoch": 0.4782804749736961, + "flos": 20812458591360.0, + "grad_norm": 1.4095113456509543, + "language_loss": 0.70563591, + "learning_rate": 2.136665102559062e-06, + "loss": 0.72657192, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4453125, + "step": 7955, + "time_per_iteration": 2.410383939743042 + }, + { + "auxiliary_loss_clip": 0.0106572, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01707864, + "balance_loss_mlp": 1.02117121, + "epoch": 0.47834059822636404, + "flos": 23839218933120.0, + "grad_norm": 1.4950130705454354, + "language_loss": 0.81722122, + "learning_rate": 2.136288160917238e-06, + "loss": 0.83818066, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4453125, + "step": 7956, + "time_per_iteration": 2.4118051528930664 + }, + { + "auxiliary_loss_clip": 0.01065954, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.01457345, + "balance_loss_mlp": 1.02160358, + "epoch": 0.478400721479032, + "flos": 22632795302400.0, + "grad_norm": 1.8347286827601343, + "language_loss": 0.84741366, + "learning_rate": 2.13591121441164e-06, + "loss": 0.86835563, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44335938, + "step": 7957, + "time_per_iteration": 2.3943865299224854 + }, + { + "auxiliary_loss_clip": 0.01065715, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01514041, + "balance_loss_mlp": 1.0214622, + "epoch": 0.4784608447317, + "flos": 19353926966400.0, + "grad_norm": 1.665673310361617, + "language_loss": 0.79557383, + "learning_rate": 2.135534263055721e-06, + "loss": 0.81650895, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 7958, + "time_per_iteration": 2.372213125228882 + }, + { + "auxiliary_loss_clip": 0.01064308, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.01525831, + "balance_loss_mlp": 1.01981628, + "epoch": 0.47852096798436794, + "flos": 24020069109120.0, + "grad_norm": 2.4253714839459692, + "language_loss": 0.82748008, + "learning_rate": 2.1351573068629324e-06, + "loss": 0.84842134, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4453125, + "step": 7959, + "time_per_iteration": 2.447054386138916 + }, + { + "auxiliary_loss_clip": 0.01063885, + "auxiliary_loss_mlp": 0.01021961, + "balance_loss_clip": 1.00958717, + "balance_loss_mlp": 1.02114308, + "epoch": 0.4785810912370359, + "flos": 25665246195840.0, + "grad_norm": 2.2367265757136785, + "language_loss": 0.72852647, + "learning_rate": 2.1347803458467268e-06, + "loss": 0.749385, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.42773438, + "step": 7960, + "time_per_iteration": 2.4188590049743652 + }, + { + "auxiliary_loss_clip": 0.01064831, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.01658487, + "balance_loss_mlp": 1.02149892, + "epoch": 0.47864121448970387, + "flos": 21431119616640.0, + "grad_norm": 1.6605813537161542, + "language_loss": 0.76922709, + "learning_rate": 2.1344033800205573e-06, + "loss": 0.79016709, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43359375, + "step": 7961, + "time_per_iteration": 3.9349327087402344 + }, + { + "auxiliary_loss_clip": 0.01063417, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.01505804, + "balance_loss_mlp": 1.02022541, + "epoch": 0.47870133774237184, + "flos": 16142964958080.0, + "grad_norm": 1.6384008208072116, + "language_loss": 0.77720994, + "learning_rate": 2.134026409397878e-06, + "loss": 0.79812711, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43164062, + "step": 7962, + "time_per_iteration": 2.3716816902160645 + }, + { + "auxiliary_loss_clip": 0.0106733, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.01447296, + "balance_loss_mlp": 1.02299809, + "epoch": 0.47876146099503986, + "flos": 26905570623360.0, + "grad_norm": 1.7310698248800382, + "language_loss": 0.64685392, + "learning_rate": 2.13364943399214e-06, + "loss": 0.66780025, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44335938, + "step": 7963, + "time_per_iteration": 2.4582276344299316 + }, + { + "auxiliary_loss_clip": 0.01064918, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.0197978, + "balance_loss_mlp": 1.0213182, + "epoch": 0.4788215842477078, + "flos": 45330354910080.0, + "grad_norm": 1.6690680227866714, + "language_loss": 0.77382076, + "learning_rate": 2.133272453816797e-06, + "loss": 0.79479885, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43554688, + "step": 7964, + "time_per_iteration": 2.6052098274230957 + }, + { + "auxiliary_loss_clip": 0.01067303, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.01507258, + "balance_loss_mlp": 1.02114773, + "epoch": 0.4788817075003758, + "flos": 22236076990080.0, + "grad_norm": 1.7467384371419368, + "language_loss": 0.76434064, + "learning_rate": 2.1328954688853036e-06, + "loss": 0.78531778, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.46289062, + "step": 7965, + "time_per_iteration": 2.4142847061157227 + }, + { + "auxiliary_loss_clip": 0.01065044, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.01766062, + "balance_loss_mlp": 1.02138197, + "epoch": 0.47894183075304375, + "flos": 16470275754240.0, + "grad_norm": 1.5351030477031797, + "language_loss": 0.74839604, + "learning_rate": 2.1325184792111125e-06, + "loss": 0.76935136, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 7966, + "time_per_iteration": 3.8266890048980713 + }, + { + "auxiliary_loss_clip": 0.01066658, + "auxiliary_loss_mlp": 0.01028726, + "balance_loss_clip": 1.01452255, + "balance_loss_mlp": 1.02196419, + "epoch": 0.4790019540057117, + "flos": 24281463525120.0, + "grad_norm": 1.6249585810485412, + "language_loss": 0.72909749, + "learning_rate": 2.132141484807678e-06, + "loss": 0.75005126, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.44726562, + "step": 7967, + "time_per_iteration": 3.9376394748687744 + }, + { + "auxiliary_loss_clip": 0.01063411, + "auxiliary_loss_mlp": 0.01025808, + "balance_loss_clip": 1.01305914, + "balance_loss_mlp": 1.02106953, + "epoch": 0.4790620772583797, + "flos": 25665281107200.0, + "grad_norm": 1.87514075991807, + "language_loss": 0.78884673, + "learning_rate": 2.131764485688454e-06, + "loss": 0.80973899, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42382812, + "step": 7968, + "time_per_iteration": 2.4252591133117676 + }, + { + "auxiliary_loss_clip": 0.01013268, + "auxiliary_loss_mlp": 0.01007734, + "balance_loss_clip": 1.00647044, + "balance_loss_mlp": 1.00408816, + "epoch": 0.47912220051104765, + "flos": 69424228500480.0, + "grad_norm": 0.7695444603732247, + "language_loss": 0.62283367, + "learning_rate": 2.131387481866894e-06, + "loss": 0.6430437, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.09179688, + "step": 7969, + "time_per_iteration": 3.2278764247894287 + }, + { + "auxiliary_loss_clip": 0.01064587, + "auxiliary_loss_mlp": 0.01024914, + "balance_loss_clip": 1.01271844, + "balance_loss_mlp": 1.02238894, + "epoch": 0.4791823237637156, + "flos": 24167821449600.0, + "grad_norm": 1.3191197092942217, + "language_loss": 0.77463198, + "learning_rate": 2.131010473356453e-06, + "loss": 0.79552698, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.421875, + "step": 7970, + "time_per_iteration": 2.4725534915924072 + }, + { + "auxiliary_loss_clip": 0.01065468, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.01438189, + "balance_loss_mlp": 1.02103662, + "epoch": 0.4792424470163836, + "flos": 24750382262400.0, + "grad_norm": 3.8244483932968647, + "language_loss": 0.70472383, + "learning_rate": 2.130633460170585e-06, + "loss": 0.72565508, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44335938, + "step": 7971, + "time_per_iteration": 2.4670300483703613 + }, + { + "auxiliary_loss_clip": 0.01066537, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.0179255, + "balance_loss_mlp": 1.02180314, + "epoch": 0.47930257026905154, + "flos": 23256797765760.0, + "grad_norm": 1.4070648899684468, + "language_loss": 0.79765499, + "learning_rate": 2.1302564423227453e-06, + "loss": 0.8186385, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44726562, + "step": 7972, + "time_per_iteration": 2.463724374771118 + }, + { + "auxiliary_loss_clip": 0.01065148, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.01522303, + "balance_loss_mlp": 1.02049398, + "epoch": 0.4793626935217195, + "flos": 14063223778560.0, + "grad_norm": 1.9929378189154, + "language_loss": 0.69547784, + "learning_rate": 2.129879419826387e-06, + "loss": 0.71642202, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4453125, + "step": 7973, + "time_per_iteration": 2.3906986713409424 + }, + { + "auxiliary_loss_clip": 0.01065649, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.01737976, + "balance_loss_mlp": 1.02324307, + "epoch": 0.4794228167743875, + "flos": 21797777381760.0, + "grad_norm": 2.2988575661913386, + "language_loss": 0.78925049, + "learning_rate": 2.129502392694968e-06, + "loss": 0.8102088, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42578125, + "step": 7974, + "time_per_iteration": 2.397998809814453 + }, + { + "auxiliary_loss_clip": 0.01011646, + "auxiliary_loss_mlp": 0.01001231, + "balance_loss_clip": 1.00006866, + "balance_loss_mlp": 1.00276518, + "epoch": 0.47948294002705544, + "flos": 66965436023040.0, + "grad_norm": 0.7500527468703956, + "language_loss": 0.54067218, + "learning_rate": 2.1291253609419415e-06, + "loss": 0.56080091, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.08886719, + "step": 7975, + "time_per_iteration": 3.107653856277466 + }, + { + "auxiliary_loss_clip": 0.01070425, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.02359796, + "balance_loss_mlp": 1.02143359, + "epoch": 0.47954306327972346, + "flos": 12421642561920.0, + "grad_norm": 2.495653600707414, + "language_loss": 0.90121222, + "learning_rate": 2.1287483245807622e-06, + "loss": 0.92230028, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.49023438, + "step": 7976, + "time_per_iteration": 2.3618814945220947 + }, + { + "auxiliary_loss_clip": 0.01069818, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.01659679, + "balance_loss_mlp": 1.02206242, + "epoch": 0.4796031865323914, + "flos": 18361172056320.0, + "grad_norm": 2.3213354890283635, + "language_loss": 0.73624253, + "learning_rate": 2.1283712836248866e-06, + "loss": 0.75726169, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.4765625, + "step": 7977, + "time_per_iteration": 2.4117624759674072 + }, + { + "auxiliary_loss_clip": 0.01063648, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.01829934, + "balance_loss_mlp": 1.02069998, + "epoch": 0.4796633097850594, + "flos": 21834017239680.0, + "grad_norm": 1.7089574129780971, + "language_loss": 0.78410041, + "learning_rate": 2.1279942380877694e-06, + "loss": 0.80505323, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4296875, + "step": 7978, + "time_per_iteration": 2.416456699371338 + }, + { + "auxiliary_loss_clip": 0.01065743, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01912355, + "balance_loss_mlp": 1.02027607, + "epoch": 0.47972343303772735, + "flos": 23436321310080.0, + "grad_norm": 1.7582120825935794, + "language_loss": 0.89358258, + "learning_rate": 2.127617187982868e-06, + "loss": 0.91457033, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.453125, + "step": 7979, + "time_per_iteration": 2.452105760574341 + }, + { + "auxiliary_loss_clip": 0.01068705, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.02146375, + "balance_loss_mlp": 1.022771, + "epoch": 0.4797835562903953, + "flos": 24898623361920.0, + "grad_norm": 1.536998795776256, + "language_loss": 0.76647019, + "learning_rate": 2.1272401333236377e-06, + "loss": 0.78751576, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.45898438, + "step": 7980, + "time_per_iteration": 2.475820302963257 + }, + { + "auxiliary_loss_clip": 0.01071143, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.02358747, + "epoch": 0.4798436795430633, + "flos": 35041555572480.0, + "grad_norm": 1.589778626162493, + "language_loss": 0.7157321, + "learning_rate": 2.1268630741235334e-06, + "loss": 0.73676264, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4765625, + "step": 7981, + "time_per_iteration": 2.5632855892181396 + }, + { + "auxiliary_loss_clip": 0.0106827, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.01301515, + "balance_loss_mlp": 1.02445889, + "epoch": 0.47990380279573125, + "flos": 20589293980800.0, + "grad_norm": 1.6476798736053628, + "language_loss": 0.70227385, + "learning_rate": 2.126486010396013e-06, + "loss": 0.72322118, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 7982, + "time_per_iteration": 2.4362292289733887 + }, + { + "auxiliary_loss_clip": 0.0106204, + "auxiliary_loss_mlp": 0.01022128, + "balance_loss_clip": 1.01017725, + "balance_loss_mlp": 1.02150571, + "epoch": 0.4799639260483992, + "flos": 26358202327680.0, + "grad_norm": 1.562833385655306, + "language_loss": 0.71423614, + "learning_rate": 2.126108942154532e-06, + "loss": 0.73507786, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40625, + "step": 7983, + "time_per_iteration": 2.4878129959106445 + }, + { + "auxiliary_loss_clip": 0.0101131, + "auxiliary_loss_mlp": 0.01000785, + "balance_loss_clip": 0.99966478, + "balance_loss_mlp": 1.00211167, + "epoch": 0.4800240493010672, + "flos": 70975629941760.0, + "grad_norm": 0.8076087777654382, + "language_loss": 0.59470379, + "learning_rate": 2.125731869412547e-06, + "loss": 0.61482471, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.09179688, + "step": 7984, + "time_per_iteration": 2.969733715057373 + }, + { + "auxiliary_loss_clip": 0.0106572, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.01936984, + "balance_loss_mlp": 1.02223301, + "epoch": 0.48008417255373514, + "flos": 17085864579840.0, + "grad_norm": 1.8293215793018265, + "language_loss": 0.66872394, + "learning_rate": 2.125354792183516e-06, + "loss": 0.68970692, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43554688, + "step": 7985, + "time_per_iteration": 2.4247756004333496 + }, + { + "auxiliary_loss_clip": 0.01067849, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01624382, + "balance_loss_mlp": 1.02193975, + "epoch": 0.4801442958064031, + "flos": 15412547070720.0, + "grad_norm": 1.9727523275712646, + "language_loss": 0.70207107, + "learning_rate": 2.124977710480894e-06, + "loss": 0.72305232, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.45898438, + "step": 7986, + "time_per_iteration": 2.3744397163391113 + }, + { + "auxiliary_loss_clip": 0.01072563, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.01701856, + "balance_loss_mlp": 1.02426112, + "epoch": 0.4802044190590711, + "flos": 11472947654400.0, + "grad_norm": 2.0012445582623, + "language_loss": 0.78882551, + "learning_rate": 2.124600624318139e-06, + "loss": 0.80986577, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.48242188, + "step": 7987, + "time_per_iteration": 2.4186458587646484 + }, + { + "auxiliary_loss_clip": 0.01066825, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01943994, + "balance_loss_mlp": 1.02203178, + "epoch": 0.48026454231173904, + "flos": 20950191371520.0, + "grad_norm": 1.9254979407345039, + "language_loss": 0.75214529, + "learning_rate": 2.124223533708708e-06, + "loss": 0.77314067, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44726562, + "step": 7988, + "time_per_iteration": 2.4076812267303467 + }, + { + "auxiliary_loss_clip": 0.01071016, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.00978017, + "balance_loss_mlp": 1.02529335, + "epoch": 0.48032466556440706, + "flos": 20447092546560.0, + "grad_norm": 1.8659088891973123, + "language_loss": 0.7906549, + "learning_rate": 2.1238464386660597e-06, + "loss": 0.81161261, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.45703125, + "step": 7989, + "time_per_iteration": 2.4274141788482666 + }, + { + "auxiliary_loss_clip": 0.01069119, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.01785111, + "balance_loss_mlp": 1.02132845, + "epoch": 0.480384788817075, + "flos": 37119376627200.0, + "grad_norm": 1.7503600850284895, + "language_loss": 0.73636281, + "learning_rate": 2.12346933920365e-06, + "loss": 0.75737995, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47851562, + "step": 7990, + "time_per_iteration": 2.5363504886627197 + }, + { + "auxiliary_loss_clip": 0.01010789, + "auxiliary_loss_mlp": 0.01002416, + "balance_loss_clip": 1.00115228, + "balance_loss_mlp": 1.00189424, + "epoch": 0.480444912069743, + "flos": 69549323368320.0, + "grad_norm": 0.7740723345132107, + "language_loss": 0.59056401, + "learning_rate": 2.123092235334937e-06, + "loss": 0.61069602, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.08886719, + "step": 7991, + "time_per_iteration": 3.117501974105835 + }, + { + "auxiliary_loss_clip": 0.01064304, + "auxiliary_loss_mlp": 0.01026446, + "balance_loss_clip": 1.01268291, + "balance_loss_mlp": 1.02060437, + "epoch": 0.48050503532241096, + "flos": 29821027950720.0, + "grad_norm": 1.8254742327228313, + "language_loss": 0.67438221, + "learning_rate": 2.1227151270733793e-06, + "loss": 0.69528973, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4375, + "step": 7992, + "time_per_iteration": 3.8480300903320312 + }, + { + "auxiliary_loss_clip": 0.01067845, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.01301241, + "balance_loss_mlp": 1.02232039, + "epoch": 0.4805651585750789, + "flos": 23947484659200.0, + "grad_norm": 1.664183931148251, + "language_loss": 0.76341051, + "learning_rate": 2.1223380144324332e-06, + "loss": 0.78435624, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45507812, + "step": 7993, + "time_per_iteration": 2.4409120082855225 + }, + { + "auxiliary_loss_clip": 0.01065168, + "auxiliary_loss_mlp": 0.01025651, + "balance_loss_clip": 1.01269925, + "balance_loss_mlp": 1.02211237, + "epoch": 0.4806252818277469, + "flos": 25664268677760.0, + "grad_norm": 1.5341435094303277, + "language_loss": 0.78667223, + "learning_rate": 2.121960897425559e-06, + "loss": 0.80758047, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4296875, + "step": 7994, + "time_per_iteration": 2.4683563709259033 + }, + { + "auxiliary_loss_clip": 0.01067152, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.01294804, + "balance_loss_mlp": 1.02311039, + "epoch": 0.48068540508041485, + "flos": 13151152753920.0, + "grad_norm": 2.1880919789138003, + "language_loss": 0.79667723, + "learning_rate": 2.1215837760662136e-06, + "loss": 0.81761098, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43945312, + "step": 7995, + "time_per_iteration": 2.3986990451812744 + }, + { + "auxiliary_loss_clip": 0.01065143, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.0149157, + "balance_loss_mlp": 1.02122474, + "epoch": 0.4807455283330828, + "flos": 21175729954560.0, + "grad_norm": 1.339622435762364, + "language_loss": 0.82787073, + "learning_rate": 2.1212066503678566e-06, + "loss": 0.84880686, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43945312, + "step": 7996, + "time_per_iteration": 2.4372525215148926 + }, + { + "auxiliary_loss_clip": 0.01065338, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.01280546, + "balance_loss_mlp": 1.02187097, + "epoch": 0.4808056515857508, + "flos": 12275181941760.0, + "grad_norm": 1.7079969231036523, + "language_loss": 0.80978191, + "learning_rate": 2.1208295203439462e-06, + "loss": 0.83069134, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43359375, + "step": 7997, + "time_per_iteration": 2.381704092025757 + }, + { + "auxiliary_loss_clip": 0.01065577, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.014184, + "balance_loss_mlp": 1.02059293, + "epoch": 0.48086577483841875, + "flos": 24824921748480.0, + "grad_norm": 1.6682401590227856, + "language_loss": 0.72690833, + "learning_rate": 2.12045238600794e-06, + "loss": 0.74783599, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44921875, + "step": 7998, + "time_per_iteration": 2.443178176879883 + }, + { + "auxiliary_loss_clip": 0.01066602, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.01430273, + "balance_loss_mlp": 1.02173877, + "epoch": 0.4809258980910867, + "flos": 24464129091840.0, + "grad_norm": 1.7406676780221588, + "language_loss": 0.70635056, + "learning_rate": 2.1200752473732984e-06, + "loss": 0.7272976, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44921875, + "step": 7999, + "time_per_iteration": 2.4275689125061035 + }, + { + "auxiliary_loss_clip": 0.01066388, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.02183521, + "balance_loss_mlp": 1.0210917, + "epoch": 0.4809860213437547, + "flos": 21214867455360.0, + "grad_norm": 1.9076228199221594, + "language_loss": 0.71140283, + "learning_rate": 2.11969810445348e-06, + "loss": 0.73241651, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.453125, + "step": 8000, + "time_per_iteration": 2.4259204864501953 + }, + { + "auxiliary_loss_clip": 0.01066281, + "auxiliary_loss_mlp": 0.01026297, + "balance_loss_clip": 1.01253986, + "balance_loss_mlp": 1.02194047, + "epoch": 0.48104614459642264, + "flos": 37630609799040.0, + "grad_norm": 1.3049470604852955, + "language_loss": 0.7473731, + "learning_rate": 2.119320957261945e-06, + "loss": 0.76829886, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44335938, + "step": 8001, + "time_per_iteration": 4.011104106903076 + }, + { + "auxiliary_loss_clip": 0.01067939, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.02197635, + "epoch": 0.48110626784909066, + "flos": 18405127324800.0, + "grad_norm": 1.7390506243988422, + "language_loss": 0.81299448, + "learning_rate": 2.118943805812151e-06, + "loss": 0.83400619, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4609375, + "step": 8002, + "time_per_iteration": 2.404702663421631 + }, + { + "auxiliary_loss_clip": 0.01068376, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.01457989, + "balance_loss_mlp": 1.02230954, + "epoch": 0.48116639110175863, + "flos": 28438537000320.0, + "grad_norm": 1.790233638395478, + "language_loss": 0.71225786, + "learning_rate": 2.1185666501175587e-06, + "loss": 0.73323536, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4609375, + "step": 8003, + "time_per_iteration": 2.452090263366699 + }, + { + "auxiliary_loss_clip": 0.01065556, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.0208993, + "balance_loss_mlp": 1.02244449, + "epoch": 0.4812265143544266, + "flos": 21724180502400.0, + "grad_norm": 1.8140797028186084, + "language_loss": 0.82512152, + "learning_rate": 2.1181894901916286e-06, + "loss": 0.84611166, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4296875, + "step": 8004, + "time_per_iteration": 2.4056458473205566 + }, + { + "auxiliary_loss_clip": 0.01073076, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.01633167, + "balance_loss_mlp": 1.02345848, + "epoch": 0.48128663760709456, + "flos": 13223841937920.0, + "grad_norm": 3.071539826986096, + "language_loss": 0.77146971, + "learning_rate": 2.1178123260478183e-06, + "loss": 0.79252899, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.49609375, + "step": 8005, + "time_per_iteration": 3.764984369277954 + }, + { + "auxiliary_loss_clip": 0.01066001, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.01720047, + "balance_loss_mlp": 1.02014339, + "epoch": 0.4813467608597625, + "flos": 24242291112960.0, + "grad_norm": 3.082689805036039, + "language_loss": 0.70498824, + "learning_rate": 2.1174351576995897e-06, + "loss": 0.72596306, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45898438, + "step": 8006, + "time_per_iteration": 3.8004310131073 + }, + { + "auxiliary_loss_clip": 0.01067844, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.01522982, + "balance_loss_mlp": 1.02253771, + "epoch": 0.4814068841124305, + "flos": 27479473418880.0, + "grad_norm": 3.4105605915740647, + "language_loss": 0.70938665, + "learning_rate": 2.117057985160403e-06, + "loss": 0.73036098, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.453125, + "step": 8007, + "time_per_iteration": 2.390446424484253 + }, + { + "auxiliary_loss_clip": 0.01067317, + "auxiliary_loss_mlp": 0.0102519, + "balance_loss_clip": 1.01154625, + "balance_loss_mlp": 1.02135229, + "epoch": 0.48146700736509845, + "flos": 19571889784320.0, + "grad_norm": 2.071712611469488, + "language_loss": 0.71684742, + "learning_rate": 2.1166808084437168e-06, + "loss": 0.73777246, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45898438, + "step": 8008, + "time_per_iteration": 2.394728660583496 + }, + { + "auxiliary_loss_clip": 0.01070103, + "auxiliary_loss_mlp": 0.01029038, + "balance_loss_clip": 1.01379704, + "balance_loss_mlp": 1.02319968, + "epoch": 0.4815271306177664, + "flos": 20626825559040.0, + "grad_norm": 52.82275546804722, + "language_loss": 0.60701555, + "learning_rate": 2.1163036275629933e-06, + "loss": 0.62800694, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.46875, + "step": 8009, + "time_per_iteration": 2.3870859146118164 + }, + { + "auxiliary_loss_clip": 0.01014067, + "auxiliary_loss_mlp": 0.01011124, + "balance_loss_clip": 1.00968146, + "balance_loss_mlp": 1.00511956, + "epoch": 0.4815872538704344, + "flos": 67687894120320.0, + "grad_norm": 0.8641573669754393, + "language_loss": 0.61398196, + "learning_rate": 2.1159264425316922e-06, + "loss": 0.63423383, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.08984375, + "step": 8010, + "time_per_iteration": 3.0499541759490967 + }, + { + "auxiliary_loss_clip": 0.01069178, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.02043867, + "balance_loss_mlp": 1.02345264, + "epoch": 0.48164737712310235, + "flos": 22819650232320.0, + "grad_norm": 1.6567486502305888, + "language_loss": 0.73339164, + "learning_rate": 2.115549253363275e-06, + "loss": 0.75444305, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.45703125, + "step": 8011, + "time_per_iteration": 2.411748170852661 + }, + { + "auxiliary_loss_clip": 0.01062715, + "auxiliary_loss_mlp": 0.01025802, + "balance_loss_clip": 1.01348138, + "balance_loss_mlp": 1.01995456, + "epoch": 0.4817075003757703, + "flos": 23732698775040.0, + "grad_norm": 2.277992579496314, + "language_loss": 0.78866446, + "learning_rate": 2.115172060071201e-06, + "loss": 0.80954963, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42773438, + "step": 8012, + "time_per_iteration": 2.4145030975341797 + }, + { + "auxiliary_loss_clip": 0.01069101, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.01549315, + "balance_loss_mlp": 1.02284348, + "epoch": 0.4817676236284383, + "flos": 28181681061120.0, + "grad_norm": 2.1177295259999847, + "language_loss": 0.73383749, + "learning_rate": 2.114794862668934e-06, + "loss": 0.75483429, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4609375, + "step": 8013, + "time_per_iteration": 2.4551827907562256 + }, + { + "auxiliary_loss_clip": 0.01064955, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.01553667, + "balance_loss_mlp": 1.02163315, + "epoch": 0.48182774688110624, + "flos": 17090821992960.0, + "grad_norm": 1.9695529100154923, + "language_loss": 0.90714669, + "learning_rate": 2.114417661169933e-06, + "loss": 0.92807502, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.43359375, + "step": 8014, + "time_per_iteration": 2.3900530338287354 + }, + { + "auxiliary_loss_clip": 0.01070464, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.01572084, + "balance_loss_mlp": 1.02295661, + "epoch": 0.4818878701337742, + "flos": 12567055841280.0, + "grad_norm": 2.971549921045136, + "language_loss": 0.74667853, + "learning_rate": 2.1140404555876595e-06, + "loss": 0.76768368, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.47460938, + "step": 8015, + "time_per_iteration": 2.4235363006591797 + }, + { + "auxiliary_loss_clip": 0.01068773, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.01970172, + "balance_loss_mlp": 1.02109361, + "epoch": 0.48194799338644223, + "flos": 24607342955520.0, + "grad_norm": 1.8441396250820288, + "language_loss": 0.80308074, + "learning_rate": 2.113663245935576e-06, + "loss": 0.82411426, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4765625, + "step": 8016, + "time_per_iteration": 2.430704355239868 + }, + { + "auxiliary_loss_clip": 0.01066366, + "auxiliary_loss_mlp": 0.01024171, + "balance_loss_clip": 1.01123714, + "balance_loss_mlp": 1.02349138, + "epoch": 0.4820081166391102, + "flos": 21104157934080.0, + "grad_norm": 1.8353209696822317, + "language_loss": 0.77086914, + "learning_rate": 2.1132860322271436e-06, + "loss": 0.79177451, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42773438, + "step": 8017, + "time_per_iteration": 2.42887806892395 + }, + { + "auxiliary_loss_clip": 0.01065586, + "auxiliary_loss_mlp": 0.01024752, + "balance_loss_clip": 1.01133549, + "balance_loss_mlp": 1.02318311, + "epoch": 0.48206823989177816, + "flos": 25263430824960.0, + "grad_norm": 1.9829478717201854, + "language_loss": 0.79583299, + "learning_rate": 2.112908814475824e-06, + "loss": 0.8167364, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42382812, + "step": 8018, + "time_per_iteration": 2.4374308586120605 + }, + { + "auxiliary_loss_clip": 0.01066813, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01182199, + "balance_loss_mlp": 1.0225029, + "epoch": 0.4821283631444461, + "flos": 24643897015680.0, + "grad_norm": 1.866775671298532, + "language_loss": 0.7581827, + "learning_rate": 2.1125315926950802e-06, + "loss": 0.77910805, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44335938, + "step": 8019, + "time_per_iteration": 2.4584178924560547 + }, + { + "auxiliary_loss_clip": 0.01067596, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01678669, + "balance_loss_mlp": 1.02168477, + "epoch": 0.4821884863971141, + "flos": 23950940883840.0, + "grad_norm": 1.8944275865410856, + "language_loss": 0.79447001, + "learning_rate": 2.1121543668983718e-06, + "loss": 0.81544864, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45898438, + "step": 8020, + "time_per_iteration": 2.4272782802581787 + }, + { + "auxiliary_loss_clip": 0.01066006, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.01679206, + "balance_loss_mlp": 1.02186465, + "epoch": 0.48224860964978206, + "flos": 17159845484160.0, + "grad_norm": 2.079040117915738, + "language_loss": 0.70002401, + "learning_rate": 2.1117771370991636e-06, + "loss": 0.72098744, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44140625, + "step": 8021, + "time_per_iteration": 2.395878314971924 + }, + { + "auxiliary_loss_clip": 0.0107038, + "auxiliary_loss_mlp": 0.01027881, + "balance_loss_clip": 1.0128423, + "balance_loss_mlp": 1.02262616, + "epoch": 0.48230873290245, + "flos": 23074725692160.0, + "grad_norm": 1.7404114811918097, + "language_loss": 0.62424964, + "learning_rate": 2.111399903310916e-06, + "loss": 0.6452322, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4765625, + "step": 8022, + "time_per_iteration": 2.4257583618164062 + }, + { + "auxiliary_loss_clip": 0.01063823, + "auxiliary_loss_mlp": 0.01021454, + "balance_loss_clip": 1.00904465, + "balance_loss_mlp": 1.02105641, + "epoch": 0.482368856155118, + "flos": 19352530512000.0, + "grad_norm": 2.1373451570586925, + "language_loss": 0.65777087, + "learning_rate": 2.1110226655470932e-06, + "loss": 0.67862368, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.42773438, + "step": 8023, + "time_per_iteration": 2.4039318561553955 + }, + { + "auxiliary_loss_clip": 0.01065441, + "auxiliary_loss_mlp": 0.01028992, + "balance_loss_clip": 1.01570034, + "balance_loss_mlp": 1.02123618, + "epoch": 0.48242897940778595, + "flos": 20078095720320.0, + "grad_norm": 1.9567216146022988, + "language_loss": 0.76182872, + "learning_rate": 2.1106454238211572e-06, + "loss": 0.78277302, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44140625, + "step": 8024, + "time_per_iteration": 2.4017598628997803 + }, + { + "auxiliary_loss_clip": 0.01064991, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.01313639, + "balance_loss_mlp": 1.01984024, + "epoch": 0.4824891026604539, + "flos": 23402874360960.0, + "grad_norm": 2.0610820155375142, + "language_loss": 0.75153172, + "learning_rate": 2.11026817814657e-06, + "loss": 0.77245355, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45117188, + "step": 8025, + "time_per_iteration": 2.4362258911132812 + }, + { + "auxiliary_loss_clip": 0.0106502, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.01629472, + "balance_loss_mlp": 1.02192068, + "epoch": 0.4825492259131219, + "flos": 20367840026880.0, + "grad_norm": 1.7613967153543735, + "language_loss": 0.70661783, + "learning_rate": 2.1098909285367953e-06, + "loss": 0.72756791, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4296875, + "step": 8026, + "time_per_iteration": 2.3907575607299805 + }, + { + "auxiliary_loss_clip": 0.0106704, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.02053273, + "epoch": 0.48260934916578985, + "flos": 14318159592960.0, + "grad_norm": 2.084034731519354, + "language_loss": 0.74668407, + "learning_rate": 2.1095136750052957e-06, + "loss": 0.76769674, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.46484375, + "step": 8027, + "time_per_iteration": 2.394293785095215 + }, + { + "auxiliary_loss_clip": 0.01068656, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.0184778, + "balance_loss_mlp": 1.02193248, + "epoch": 0.4826694724184578, + "flos": 22120235498880.0, + "grad_norm": 3.7869454556532554, + "language_loss": 0.67088187, + "learning_rate": 2.1091364175655352e-06, + "loss": 0.69189084, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46875, + "step": 8028, + "time_per_iteration": 2.4243106842041016 + }, + { + "auxiliary_loss_clip": 0.01068456, + "auxiliary_loss_mlp": 0.01028091, + "balance_loss_clip": 1.01503766, + "balance_loss_mlp": 1.02357554, + "epoch": 0.48272959567112583, + "flos": 16180217245440.0, + "grad_norm": 1.7457824505945467, + "language_loss": 0.73267615, + "learning_rate": 2.108759156230977e-06, + "loss": 0.75364161, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.44921875, + "step": 8029, + "time_per_iteration": 2.4220004081726074 + }, + { + "auxiliary_loss_clip": 0.01067652, + "auxiliary_loss_mlp": 0.01024113, + "balance_loss_clip": 1.01018977, + "balance_loss_mlp": 1.02237821, + "epoch": 0.4827897189237938, + "flos": 23179465370880.0, + "grad_norm": 2.2259533262294964, + "language_loss": 0.85025597, + "learning_rate": 2.1083818910150836e-06, + "loss": 0.87117362, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.453125, + "step": 8030, + "time_per_iteration": 2.422138214111328 + }, + { + "auxiliary_loss_clip": 0.01065048, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.01514578, + "balance_loss_mlp": 1.02082014, + "epoch": 0.48284984217646176, + "flos": 21651561141120.0, + "grad_norm": 1.89482516160822, + "language_loss": 0.73948812, + "learning_rate": 2.10800462193132e-06, + "loss": 0.76041496, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44140625, + "step": 8031, + "time_per_iteration": 2.4239063262939453 + }, + { + "auxiliary_loss_clip": 0.010681, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.01785398, + "balance_loss_mlp": 1.02149391, + "epoch": 0.48290996542912973, + "flos": 31466100303360.0, + "grad_norm": 1.917656732953379, + "language_loss": 0.71879017, + "learning_rate": 2.1076273489931483e-06, + "loss": 0.73979867, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.46484375, + "step": 8032, + "time_per_iteration": 3.8519763946533203 + }, + { + "auxiliary_loss_clip": 0.01066268, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.01376605, + "balance_loss_mlp": 1.02145875, + "epoch": 0.4829700886817977, + "flos": 24460812512640.0, + "grad_norm": 1.3972058071182, + "language_loss": 0.74344075, + "learning_rate": 2.107250072214034e-06, + "loss": 0.76437074, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44921875, + "step": 8033, + "time_per_iteration": 2.4485576152801514 + }, + { + "auxiliary_loss_clip": 0.01069424, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.01440883, + "balance_loss_mlp": 1.02313006, + "epoch": 0.48303021193446566, + "flos": 25700997294720.0, + "grad_norm": 1.6338876568391598, + "language_loss": 0.83244997, + "learning_rate": 2.1068727916074406e-06, + "loss": 0.85342741, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.46289062, + "step": 8034, + "time_per_iteration": 2.4517133235931396 + }, + { + "auxiliary_loss_clip": 0.01063187, + "auxiliary_loss_mlp": 0.01025632, + "balance_loss_clip": 1.01316261, + "balance_loss_mlp": 1.02158141, + "epoch": 0.4830903351871336, + "flos": 20084170296960.0, + "grad_norm": 1.8829165034747877, + "language_loss": 0.79312813, + "learning_rate": 2.106495507186832e-06, + "loss": 0.8140164, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41601562, + "step": 8035, + "time_per_iteration": 2.44840931892395 + }, + { + "auxiliary_loss_clip": 0.01066511, + "auxiliary_loss_mlp": 0.01034568, + "balance_loss_clip": 1.01949406, + "balance_loss_mlp": 1.02091515, + "epoch": 0.4831504584398016, + "flos": 39450806864640.0, + "grad_norm": 3.4304211760780534, + "language_loss": 0.69711381, + "learning_rate": 2.106118218965673e-06, + "loss": 0.71812463, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.45703125, + "step": 8036, + "time_per_iteration": 2.564058303833008 + }, + { + "auxiliary_loss_clip": 0.01064158, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.01621604, + "balance_loss_mlp": 1.01959634, + "epoch": 0.48321058169246955, + "flos": 20005685827200.0, + "grad_norm": 1.8425701689419403, + "language_loss": 0.72885561, + "learning_rate": 2.105740926957427e-06, + "loss": 0.74979496, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 8037, + "time_per_iteration": 2.4032833576202393 + }, + { + "auxiliary_loss_clip": 0.01070039, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.01571703, + "balance_loss_mlp": 1.02195024, + "epoch": 0.4832707049451375, + "flos": 20740397811840.0, + "grad_norm": 3.579661740327729, + "language_loss": 0.68946618, + "learning_rate": 2.1053636311755604e-06, + "loss": 0.710473, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.48046875, + "step": 8038, + "time_per_iteration": 2.4113616943359375 + }, + { + "auxiliary_loss_clip": 0.01065254, + "auxiliary_loss_mlp": 0.01026841, + "balance_loss_clip": 1.01283956, + "balance_loss_mlp": 1.02176833, + "epoch": 0.4833308281978055, + "flos": 33144200668800.0, + "grad_norm": 1.4871859153410016, + "language_loss": 0.79043955, + "learning_rate": 2.1049863316335356e-06, + "loss": 0.81136048, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.43554688, + "step": 8039, + "time_per_iteration": 2.514740228652954 + }, + { + "auxiliary_loss_clip": 0.01066385, + "auxiliary_loss_mlp": 0.01024909, + "balance_loss_clip": 1.01160562, + "balance_loss_mlp": 1.02197361, + "epoch": 0.48339095145047345, + "flos": 19098223102080.0, + "grad_norm": 1.5440683127526558, + "language_loss": 0.77329516, + "learning_rate": 2.1046090283448198e-06, + "loss": 0.79420817, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4453125, + "step": 8040, + "time_per_iteration": 2.406372547149658 + }, + { + "auxiliary_loss_clip": 0.01066224, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.01451337, + "balance_loss_mlp": 1.02144706, + "epoch": 0.4834510747031414, + "flos": 34458017241600.0, + "grad_norm": 1.4984521559815305, + "language_loss": 0.75931019, + "learning_rate": 2.104231721322876e-06, + "loss": 0.78025806, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.44726562, + "step": 8041, + "time_per_iteration": 3.980497121810913 + }, + { + "auxiliary_loss_clip": 0.01067588, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.01465583, + "balance_loss_mlp": 1.0236268, + "epoch": 0.48351119795580944, + "flos": 27379621330560.0, + "grad_norm": 2.004013063389775, + "language_loss": 0.66792476, + "learning_rate": 2.1038544105811704e-06, + "loss": 0.68888438, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43945312, + "step": 8042, + "time_per_iteration": 2.477031707763672 + }, + { + "auxiliary_loss_clip": 0.01012331, + "auxiliary_loss_mlp": 0.01002369, + "balance_loss_clip": 1.00103951, + "balance_loss_mlp": 1.00313866, + "epoch": 0.4835713212084774, + "flos": 67140770204160.0, + "grad_norm": 0.687950264432135, + "language_loss": 0.58528215, + "learning_rate": 2.103477096133168e-06, + "loss": 0.60542923, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.09179688, + "step": 8043, + "time_per_iteration": 3.1437458992004395 + }, + { + "auxiliary_loss_clip": 0.01065991, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.01736689, + "balance_loss_mlp": 1.02093923, + "epoch": 0.48363144446114537, + "flos": 17966513514240.0, + "grad_norm": 2.5476392188121437, + "language_loss": 0.72525036, + "learning_rate": 2.1030997779923344e-06, + "loss": 0.74622697, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45117188, + "step": 8044, + "time_per_iteration": 2.4050991535186768 + }, + { + "auxiliary_loss_clip": 0.01062994, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01544511, + "balance_loss_mlp": 1.01989007, + "epoch": 0.48369156771381333, + "flos": 20592505825920.0, + "grad_norm": 1.3407868384826338, + "language_loss": 0.75947756, + "learning_rate": 2.1027224561721352e-06, + "loss": 0.78040659, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.43164062, + "step": 8045, + "time_per_iteration": 5.351046800613403 + }, + { + "auxiliary_loss_clip": 0.01067813, + "auxiliary_loss_mlp": 0.01028403, + "balance_loss_clip": 1.01456857, + "balance_loss_mlp": 1.02187192, + "epoch": 0.4837516909664813, + "flos": 22673957662080.0, + "grad_norm": 1.507715556225489, + "language_loss": 0.68935955, + "learning_rate": 2.1023451306860355e-06, + "loss": 0.71032166, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4609375, + "step": 8046, + "time_per_iteration": 2.4154961109161377 + }, + { + "auxiliary_loss_clip": 0.01066655, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.01865673, + "balance_loss_mlp": 1.02251554, + "epoch": 0.48381181421914926, + "flos": 25517493855360.0, + "grad_norm": 1.7849324119041199, + "language_loss": 0.81987107, + "learning_rate": 2.101967801547501e-06, + "loss": 0.84086412, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44140625, + "step": 8047, + "time_per_iteration": 2.4514734745025635 + }, + { + "auxiliary_loss_clip": 0.0106416, + "auxiliary_loss_mlp": 0.01026579, + "balance_loss_clip": 1.01313818, + "balance_loss_mlp": 1.02056682, + "epoch": 0.4838719374718172, + "flos": 24206330545920.0, + "grad_norm": 1.517735643252098, + "language_loss": 0.80309558, + "learning_rate": 2.1015904687699988e-06, + "loss": 0.82400298, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43554688, + "step": 8048, + "time_per_iteration": 2.4431350231170654 + }, + { + "auxiliary_loss_clip": 0.01067384, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.0152024, + "balance_loss_mlp": 1.02138042, + "epoch": 0.4839320607244852, + "flos": 26723358904320.0, + "grad_norm": 1.7368168570198448, + "language_loss": 0.81214619, + "learning_rate": 2.101213132366993e-06, + "loss": 0.83311236, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45898438, + "step": 8049, + "time_per_iteration": 2.4724676609039307 + }, + { + "auxiliary_loss_clip": 0.01064438, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.01251721, + "balance_loss_mlp": 1.02318394, + "epoch": 0.48399218397715316, + "flos": 20447860596480.0, + "grad_norm": 1.862773449251293, + "language_loss": 0.72856975, + "learning_rate": 2.100835792351952e-06, + "loss": 0.74946165, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 8050, + "time_per_iteration": 2.4197020530700684 + }, + { + "auxiliary_loss_clip": 0.01013396, + "auxiliary_loss_mlp": 0.01001175, + "balance_loss_clip": 0.99985796, + "balance_loss_mlp": 1.00431848, + "epoch": 0.4840523072298211, + "flos": 67177394087040.0, + "grad_norm": 0.7033387513375722, + "language_loss": 0.56363773, + "learning_rate": 2.1004584487383405e-06, + "loss": 0.58378351, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.09082031, + "step": 8051, + "time_per_iteration": 3.1161468029022217 + }, + { + "auxiliary_loss_clip": 0.01069026, + "auxiliary_loss_mlp": 0.01025347, + "balance_loss_clip": 1.01179338, + "balance_loss_mlp": 1.02313614, + "epoch": 0.4841124304824891, + "flos": 22410608209920.0, + "grad_norm": 1.9571301401000842, + "language_loss": 0.75292635, + "learning_rate": 2.1000811015396248e-06, + "loss": 0.77387005, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45898438, + "step": 8052, + "time_per_iteration": 2.419300079345703 + }, + { + "auxiliary_loss_clip": 0.01065545, + "auxiliary_loss_mlp": 0.01021748, + "balance_loss_clip": 1.00930285, + "balance_loss_mlp": 1.02137434, + "epoch": 0.48417255373515705, + "flos": 13843131367680.0, + "grad_norm": 2.262750250119099, + "language_loss": 0.65813053, + "learning_rate": 2.0997037507692726e-06, + "loss": 0.67900348, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44140625, + "step": 8053, + "time_per_iteration": 2.3929433822631836 + }, + { + "auxiliary_loss_clip": 0.01064049, + "auxiliary_loss_mlp": 0.0102355, + "balance_loss_clip": 1.01021636, + "balance_loss_mlp": 1.02064133, + "epoch": 0.484232676987825, + "flos": 31648346933760.0, + "grad_norm": 2.1714815852335385, + "language_loss": 0.69034398, + "learning_rate": 2.0993263964407494e-06, + "loss": 0.71121997, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8054, + "time_per_iteration": 2.4843952655792236 + }, + { + "auxiliary_loss_clip": 0.01063915, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.01642418, + "balance_loss_mlp": 1.01973259, + "epoch": 0.48429280024049304, + "flos": 24094294392960.0, + "grad_norm": 1.5441217896876331, + "language_loss": 0.69701129, + "learning_rate": 2.0989490385675237e-06, + "loss": 0.71794641, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44140625, + "step": 8055, + "time_per_iteration": 2.444021463394165 + }, + { + "auxiliary_loss_clip": 0.0106743, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.01748538, + "balance_loss_mlp": 1.02278042, + "epoch": 0.484352923493161, + "flos": 17529121601280.0, + "grad_norm": 2.091292619166147, + "language_loss": 0.63352573, + "learning_rate": 2.0985716771630604e-06, + "loss": 0.65451717, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.44726562, + "step": 8056, + "time_per_iteration": 2.38543701171875 + }, + { + "auxiliary_loss_clip": 0.01063841, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.01679659, + "balance_loss_mlp": 1.01973784, + "epoch": 0.48441304674582897, + "flos": 29165638308480.0, + "grad_norm": 2.3994851247815623, + "language_loss": 0.6737448, + "learning_rate": 2.0981943122408278e-06, + "loss": 0.69468576, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44140625, + "step": 8057, + "time_per_iteration": 2.4865992069244385 + }, + { + "auxiliary_loss_clip": 0.01064423, + "auxiliary_loss_mlp": 0.01024011, + "balance_loss_clip": 1.01054633, + "balance_loss_mlp": 1.02099609, + "epoch": 0.48447316999849693, + "flos": 15885829728000.0, + "grad_norm": 2.6047146862353605, + "language_loss": 0.81482214, + "learning_rate": 2.097816943814293e-06, + "loss": 0.83570647, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 8058, + "time_per_iteration": 2.376267910003662 + }, + { + "auxiliary_loss_clip": 0.0106709, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.01298606, + "balance_loss_mlp": 1.02155209, + "epoch": 0.4845332932511649, + "flos": 24380477740800.0, + "grad_norm": 1.9422942171377875, + "language_loss": 0.80033463, + "learning_rate": 2.097439571896923e-06, + "loss": 0.82127559, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.45507812, + "step": 8059, + "time_per_iteration": 2.429425001144409 + }, + { + "auxiliary_loss_clip": 0.01068098, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.01720548, + "balance_loss_mlp": 1.02188575, + "epoch": 0.48459341650383286, + "flos": 37115152352640.0, + "grad_norm": 2.1205080976060144, + "language_loss": 0.66524339, + "learning_rate": 2.097062196502185e-06, + "loss": 0.68624353, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.46289062, + "step": 8060, + "time_per_iteration": 2.536776065826416 + }, + { + "auxiliary_loss_clip": 0.01067892, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.01285219, + "balance_loss_mlp": 1.02240181, + "epoch": 0.48465353975650083, + "flos": 22965657004800.0, + "grad_norm": 2.2163735050332143, + "language_loss": 0.67070806, + "learning_rate": 2.096684817643547e-06, + "loss": 0.69164801, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.453125, + "step": 8061, + "time_per_iteration": 2.425679922103882 + }, + { + "auxiliary_loss_clip": 0.01068473, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0216856, + "balance_loss_mlp": 1.02218187, + "epoch": 0.4847136630091688, + "flos": 17706864666240.0, + "grad_norm": 1.9433176674102541, + "language_loss": 0.82296443, + "learning_rate": 2.0963074353344765e-06, + "loss": 0.84401333, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46289062, + "step": 8062, + "time_per_iteration": 2.387420892715454 + }, + { + "auxiliary_loss_clip": 0.01065731, + "auxiliary_loss_mlp": 0.01027385, + "balance_loss_clip": 1.01421857, + "balance_loss_mlp": 1.02209592, + "epoch": 0.48477378626183676, + "flos": 22017171565440.0, + "grad_norm": 1.7169997641356023, + "language_loss": 0.74168885, + "learning_rate": 2.0959300495884416e-06, + "loss": 0.76262003, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43554688, + "step": 8063, + "time_per_iteration": 2.4281883239746094 + }, + { + "auxiliary_loss_clip": 0.01067026, + "auxiliary_loss_mlp": 0.01022939, + "balance_loss_clip": 1.00944483, + "balance_loss_mlp": 1.02162409, + "epoch": 0.4848339095145047, + "flos": 27961763207040.0, + "grad_norm": 1.6964675582902369, + "language_loss": 0.788899, + "learning_rate": 2.0955526604189104e-06, + "loss": 0.80979866, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45507812, + "step": 8064, + "time_per_iteration": 2.465191125869751 + }, + { + "auxiliary_loss_clip": 0.01063945, + "auxiliary_loss_mlp": 0.01025759, + "balance_loss_clip": 1.01376033, + "balance_loss_mlp": 1.02240419, + "epoch": 0.4848940327671727, + "flos": 21687696264960.0, + "grad_norm": 1.7579547778651041, + "language_loss": 0.78705037, + "learning_rate": 2.09517526783935e-06, + "loss": 0.8079474, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41601562, + "step": 8065, + "time_per_iteration": 2.4242734909057617 + }, + { + "auxiliary_loss_clip": 0.01058993, + "auxiliary_loss_mlp": 0.01027363, + "balance_loss_clip": 1.01520371, + "balance_loss_mlp": 1.01896429, + "epoch": 0.48495415601984065, + "flos": 20630526163200.0, + "grad_norm": 3.124525789352464, + "language_loss": 0.76052564, + "learning_rate": 2.094797871863229e-06, + "loss": 0.78138924, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40039062, + "step": 8066, + "time_per_iteration": 2.4067575931549072 + }, + { + "auxiliary_loss_clip": 0.01063448, + "auxiliary_loss_mlp": 0.01024929, + "balance_loss_clip": 1.01277018, + "balance_loss_mlp": 1.02167153, + "epoch": 0.4850142792725086, + "flos": 25627016390400.0, + "grad_norm": 1.625219735468426, + "language_loss": 0.71917474, + "learning_rate": 2.094420472504016e-06, + "loss": 0.7400586, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41796875, + "step": 8067, + "time_per_iteration": 2.4799962043762207 + }, + { + "auxiliary_loss_clip": 0.01065613, + "auxiliary_loss_mlp": 0.01025308, + "balance_loss_clip": 1.01240408, + "balance_loss_mlp": 1.02183795, + "epoch": 0.4850744025251766, + "flos": 13771105499520.0, + "grad_norm": 2.0740167039588853, + "language_loss": 0.79334867, + "learning_rate": 2.0940430697751796e-06, + "loss": 0.81425786, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 8068, + "time_per_iteration": 2.3748984336853027 + }, + { + "auxiliary_loss_clip": 0.01061719, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01433909, + "balance_loss_mlp": 1.01989067, + "epoch": 0.4851345257778446, + "flos": 20260447084800.0, + "grad_norm": 1.7404169132315053, + "language_loss": 0.84314895, + "learning_rate": 2.093665663690187e-06, + "loss": 0.86403006, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41796875, + "step": 8069, + "time_per_iteration": 2.410999298095703 + }, + { + "auxiliary_loss_clip": 0.0106381, + "auxiliary_loss_mlp": 0.01023368, + "balance_loss_clip": 1.01098251, + "balance_loss_mlp": 1.0209465, + "epoch": 0.48519464903051257, + "flos": 27088445658240.0, + "grad_norm": 1.8901690390397954, + "language_loss": 0.78063715, + "learning_rate": 2.0932882542625085e-06, + "loss": 0.80150896, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4296875, + "step": 8070, + "time_per_iteration": 2.4867634773254395 + }, + { + "auxiliary_loss_clip": 0.01064639, + "auxiliary_loss_mlp": 0.01025466, + "balance_loss_clip": 1.01229954, + "balance_loss_mlp": 1.02122879, + "epoch": 0.48525477228318054, + "flos": 17126328712320.0, + "grad_norm": 2.079709671853799, + "language_loss": 0.7740941, + "learning_rate": 2.0929108415056115e-06, + "loss": 0.79499519, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43359375, + "step": 8071, + "time_per_iteration": 3.7918081283569336 + }, + { + "auxiliary_loss_clip": 0.01064588, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.01169562, + "balance_loss_mlp": 1.02031434, + "epoch": 0.4853148955358485, + "flos": 28179167443200.0, + "grad_norm": 5.0376351055928295, + "language_loss": 0.71376568, + "learning_rate": 2.0925334254329667e-06, + "loss": 0.73465854, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44140625, + "step": 8072, + "time_per_iteration": 2.4643023014068604 + }, + { + "auxiliary_loss_clip": 0.01068057, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.01752937, + "balance_loss_mlp": 1.02202964, + "epoch": 0.48537501878851647, + "flos": 17492358072960.0, + "grad_norm": 1.9661691605125262, + "language_loss": 0.87713695, + "learning_rate": 2.092156006058041e-06, + "loss": 0.89812618, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4609375, + "step": 8073, + "time_per_iteration": 2.393359422683716 + }, + { + "auxiliary_loss_clip": 0.01062245, + "auxiliary_loss_mlp": 0.01025639, + "balance_loss_clip": 1.01303852, + "balance_loss_mlp": 1.02098358, + "epoch": 0.48543514204118443, + "flos": 28583601166080.0, + "grad_norm": 1.6495820977628664, + "language_loss": 0.60492313, + "learning_rate": 2.0917785833943044e-06, + "loss": 0.62580192, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41210938, + "step": 8074, + "time_per_iteration": 2.4651601314544678 + }, + { + "auxiliary_loss_clip": 0.01064916, + "auxiliary_loss_mlp": 0.01028333, + "balance_loss_clip": 1.0151068, + "balance_loss_mlp": 1.02045012, + "epoch": 0.4854952652938524, + "flos": 20958919211520.0, + "grad_norm": 1.5663760595893386, + "language_loss": 0.73293668, + "learning_rate": 2.091401157455227e-06, + "loss": 0.75386918, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4453125, + "step": 8075, + "time_per_iteration": 2.40632963180542 + }, + { + "auxiliary_loss_clip": 0.01061721, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.01931667, + "balance_loss_mlp": 1.02094543, + "epoch": 0.48555538854652036, + "flos": 66527243015040.0, + "grad_norm": 1.6579038033870388, + "language_loss": 0.81617087, + "learning_rate": 2.091023728254277e-06, + "loss": 0.83710146, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40820312, + "step": 8076, + "time_per_iteration": 2.807271957397461 + }, + { + "auxiliary_loss_clip": 0.01064282, + "auxiliary_loss_mlp": 0.01025685, + "balance_loss_clip": 1.01278067, + "balance_loss_mlp": 1.02165949, + "epoch": 0.4856155117991883, + "flos": 15924059533440.0, + "grad_norm": 2.0845727336561155, + "language_loss": 0.86168385, + "learning_rate": 2.0906462958049247e-06, + "loss": 0.8825835, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 8077, + "time_per_iteration": 2.3928747177124023 + }, + { + "auxiliary_loss_clip": 0.01014124, + "auxiliary_loss_mlp": 0.01003821, + "balance_loss_clip": 1.00249827, + "balance_loss_mlp": 1.0051918, + "epoch": 0.4856756350518563, + "flos": 71044129762560.0, + "grad_norm": 0.9037529404407679, + "language_loss": 0.58468181, + "learning_rate": 2.090268860120638e-06, + "loss": 0.60486126, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.08886719, + "step": 8078, + "time_per_iteration": 3.0348854064941406 + }, + { + "auxiliary_loss_clip": 0.01067062, + "auxiliary_loss_mlp": 0.01026871, + "balance_loss_clip": 1.01395452, + "balance_loss_mlp": 1.02259445, + "epoch": 0.48573575830452426, + "flos": 29824379441280.0, + "grad_norm": 1.7462907985579321, + "language_loss": 0.72263885, + "learning_rate": 2.0898914212148895e-06, + "loss": 0.7435782, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4453125, + "step": 8079, + "time_per_iteration": 2.4856016635894775 + }, + { + "auxiliary_loss_clip": 0.0106575, + "auxiliary_loss_mlp": 0.0102459, + "balance_loss_clip": 1.01187599, + "balance_loss_mlp": 1.02234101, + "epoch": 0.4857958815571922, + "flos": 17638539402240.0, + "grad_norm": 2.9090700191940764, + "language_loss": 0.79535502, + "learning_rate": 2.089513979101147e-06, + "loss": 0.81625849, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43359375, + "step": 8080, + "time_per_iteration": 2.376544237136841 + }, + { + "auxiliary_loss_clip": 0.01064015, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.01569867, + "balance_loss_mlp": 1.02058887, + "epoch": 0.4858560048098602, + "flos": 21104437224960.0, + "grad_norm": 1.990035452967306, + "language_loss": 0.84232545, + "learning_rate": 2.0891365337928803e-06, + "loss": 0.86325073, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43359375, + "step": 8081, + "time_per_iteration": 3.857828140258789 + }, + { + "auxiliary_loss_clip": 0.01063505, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.0135287, + "balance_loss_mlp": 1.02028298, + "epoch": 0.4859161280625282, + "flos": 22089756015360.0, + "grad_norm": 1.439279017182136, + "language_loss": 0.82912064, + "learning_rate": 2.0887590853035604e-06, + "loss": 0.85002244, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43359375, + "step": 8082, + "time_per_iteration": 2.4204251766204834 + }, + { + "auxiliary_loss_clip": 0.01063801, + "auxiliary_loss_mlp": 0.0102626, + "balance_loss_clip": 1.0147208, + "balance_loss_mlp": 1.022264, + "epoch": 0.4859762513151962, + "flos": 17492497718400.0, + "grad_norm": 1.9794609955333862, + "language_loss": 0.87300539, + "learning_rate": 2.0883816336466567e-06, + "loss": 0.893906, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.41601562, + "step": 8083, + "time_per_iteration": 2.4015729427337646 + }, + { + "auxiliary_loss_clip": 0.01062443, + "auxiliary_loss_mlp": 0.01026209, + "balance_loss_clip": 1.01440144, + "balance_loss_mlp": 1.02110934, + "epoch": 0.48603637456786414, + "flos": 18003277042560.0, + "grad_norm": 2.368821763764815, + "language_loss": 0.8070429, + "learning_rate": 2.0880041788356402e-06, + "loss": 0.82792938, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.4140625, + "step": 8084, + "time_per_iteration": 2.404780387878418 + }, + { + "auxiliary_loss_clip": 0.01064565, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.01617527, + "balance_loss_mlp": 1.02111626, + "epoch": 0.4860964978205321, + "flos": 22490942981760.0, + "grad_norm": 1.9147674030911406, + "language_loss": 0.68531144, + "learning_rate": 2.087626720883981e-06, + "loss": 0.70624954, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43359375, + "step": 8085, + "time_per_iteration": 5.367203235626221 + }, + { + "auxiliary_loss_clip": 0.01064346, + "auxiliary_loss_mlp": 0.01029463, + "balance_loss_clip": 1.01652312, + "balance_loss_mlp": 1.02141881, + "epoch": 0.48615662107320007, + "flos": 23371277713920.0, + "grad_norm": 1.494991306765901, + "language_loss": 0.77274895, + "learning_rate": 2.0872492598051486e-06, + "loss": 0.79368705, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4296875, + "step": 8086, + "time_per_iteration": 2.487104654312134 + }, + { + "auxiliary_loss_clip": 0.01063058, + "auxiliary_loss_mlp": 0.0102468, + "balance_loss_clip": 1.01195478, + "balance_loss_mlp": 1.01986563, + "epoch": 0.48621674432586803, + "flos": 34417518197760.0, + "grad_norm": 1.9872648663051247, + "language_loss": 0.69651723, + "learning_rate": 2.0868717956126155e-06, + "loss": 0.71739459, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43164062, + "step": 8087, + "time_per_iteration": 2.5177295207977295 + }, + { + "auxiliary_loss_clip": 0.01066166, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.0136385, + "balance_loss_mlp": 1.01983595, + "epoch": 0.486276867578536, + "flos": 33106215242880.0, + "grad_norm": 3.0091971355054423, + "language_loss": 0.72419745, + "learning_rate": 2.086494328319851e-06, + "loss": 0.74513698, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46289062, + "step": 8088, + "time_per_iteration": 2.5157318115234375 + }, + { + "auxiliary_loss_clip": 0.01064043, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.02225852, + "balance_loss_mlp": 1.02124834, + "epoch": 0.48633699083120396, + "flos": 21469628712960.0, + "grad_norm": 1.4485850493187817, + "language_loss": 0.75229859, + "learning_rate": 2.086116857940327e-06, + "loss": 0.77328706, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4296875, + "step": 8089, + "time_per_iteration": 2.459401845932007 + }, + { + "auxiliary_loss_clip": 0.0106562, + "auxiliary_loss_mlp": 0.01025158, + "balance_loss_clip": 1.01218843, + "balance_loss_mlp": 1.02100503, + "epoch": 0.48639711408387193, + "flos": 20083297512960.0, + "grad_norm": 1.5703004050128453, + "language_loss": 0.83923805, + "learning_rate": 2.0857393844875135e-06, + "loss": 0.86014581, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4453125, + "step": 8090, + "time_per_iteration": 2.401383399963379 + }, + { + "auxiliary_loss_clip": 0.01062463, + "auxiliary_loss_mlp": 0.01024924, + "balance_loss_clip": 1.01240087, + "balance_loss_mlp": 1.02120066, + "epoch": 0.4864572373365399, + "flos": 20777789744640.0, + "grad_norm": 1.9526199404857987, + "language_loss": 0.75640231, + "learning_rate": 2.0853619079748815e-06, + "loss": 0.77727616, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41210938, + "step": 8091, + "time_per_iteration": 2.4265434741973877 + }, + { + "auxiliary_loss_clip": 0.01064517, + "auxiliary_loss_mlp": 0.01025059, + "balance_loss_clip": 1.01248217, + "balance_loss_mlp": 1.02125406, + "epoch": 0.48651736058920786, + "flos": 26024328195840.0, + "grad_norm": 1.395145444955985, + "language_loss": 0.80335283, + "learning_rate": 2.0849844284159035e-06, + "loss": 0.82424855, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43359375, + "step": 8092, + "time_per_iteration": 2.4514262676239014 + }, + { + "auxiliary_loss_clip": 0.01065367, + "auxiliary_loss_mlp": 0.01023228, + "balance_loss_clip": 1.01077056, + "balance_loss_mlp": 1.02134418, + "epoch": 0.4865774838418758, + "flos": 20484554302080.0, + "grad_norm": 1.5205859254962397, + "language_loss": 0.71936399, + "learning_rate": 2.084606945824049e-06, + "loss": 0.74024993, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.44140625, + "step": 8093, + "time_per_iteration": 2.408280611038208 + }, + { + "auxiliary_loss_clip": 0.01063419, + "auxiliary_loss_mlp": 0.01025039, + "balance_loss_clip": 1.01248026, + "balance_loss_mlp": 1.02073336, + "epoch": 0.4866376070945438, + "flos": 23546646806400.0, + "grad_norm": 1.70818009050206, + "language_loss": 0.6783334, + "learning_rate": 2.0842294602127916e-06, + "loss": 0.69921798, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42578125, + "step": 8094, + "time_per_iteration": 2.431414842605591 + }, + { + "auxiliary_loss_clip": 0.01066472, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.0177536, + "balance_loss_mlp": 1.0207864, + "epoch": 0.4866977303472118, + "flos": 16689669937920.0, + "grad_norm": 2.124393031330406, + "language_loss": 0.66836953, + "learning_rate": 2.0838519715956006e-06, + "loss": 0.68934858, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 8095, + "time_per_iteration": 2.4066340923309326 + }, + { + "auxiliary_loss_clip": 0.01064282, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.01385283, + "balance_loss_mlp": 1.02050745, + "epoch": 0.4867578535998798, + "flos": 17895011316480.0, + "grad_norm": 1.7744958935305328, + "language_loss": 0.78440255, + "learning_rate": 2.0834744799859475e-06, + "loss": 0.80532157, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4375, + "step": 8096, + "time_per_iteration": 2.3900949954986572 + }, + { + "auxiliary_loss_clip": 0.0106165, + "auxiliary_loss_mlp": 0.0102746, + "balance_loss_clip": 1.01445436, + "balance_loss_mlp": 1.02005148, + "epoch": 0.48681797685254774, + "flos": 22636705374720.0, + "grad_norm": 1.7213493593802274, + "language_loss": 0.6764214, + "learning_rate": 2.0830969853973063e-06, + "loss": 0.69731247, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 8097, + "time_per_iteration": 2.426011562347412 + }, + { + "auxiliary_loss_clip": 0.01061193, + "auxiliary_loss_mlp": 0.01028068, + "balance_loss_clip": 1.015378, + "balance_loss_mlp": 1.01916182, + "epoch": 0.4868781001052157, + "flos": 20885043041280.0, + "grad_norm": 1.502991265301921, + "language_loss": 0.71577525, + "learning_rate": 2.0827194878431464e-06, + "loss": 0.73666787, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41992188, + "step": 8098, + "time_per_iteration": 2.4086408615112305 + }, + { + "auxiliary_loss_clip": 0.01068941, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.01357186, + "balance_loss_mlp": 1.02227187, + "epoch": 0.48693822335788367, + "flos": 41973316306560.0, + "grad_norm": 1.6327960411709845, + "language_loss": 0.6625638, + "learning_rate": 2.082341987336941e-06, + "loss": 0.68352997, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.46679688, + "step": 8099, + "time_per_iteration": 2.597210645675659 + }, + { + "auxiliary_loss_clip": 0.01060633, + "auxiliary_loss_mlp": 0.01025615, + "balance_loss_clip": 1.01328278, + "balance_loss_mlp": 1.01918781, + "epoch": 0.48699834661055164, + "flos": 24242151467520.0, + "grad_norm": 1.8252343023436033, + "language_loss": 0.66530263, + "learning_rate": 2.0819644838921618e-06, + "loss": 0.68616509, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4140625, + "step": 8100, + "time_per_iteration": 2.435896873474121 + }, + { + "auxiliary_loss_clip": 0.01060135, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.01587033, + "balance_loss_mlp": 1.01936781, + "epoch": 0.4870584698632196, + "flos": 25922625805440.0, + "grad_norm": 1.4719217703807386, + "language_loss": 0.76148349, + "learning_rate": 2.0815869775222816e-06, + "loss": 0.78236187, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40820312, + "step": 8101, + "time_per_iteration": 2.47476863861084 + }, + { + "auxiliary_loss_clip": 0.0101281, + "auxiliary_loss_mlp": 0.01001414, + "balance_loss_clip": 1.00015593, + "balance_loss_mlp": 1.00394106, + "epoch": 0.48711859311588757, + "flos": 70208588171520.0, + "grad_norm": 0.703892085156667, + "language_loss": 0.52241397, + "learning_rate": 2.0812094682407718e-06, + "loss": 0.54255617, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.08886719, + "step": 8102, + "time_per_iteration": 3.200195789337158 + }, + { + "auxiliary_loss_clip": 0.01064075, + "auxiliary_loss_mlp": 0.01026017, + "balance_loss_clip": 1.01381624, + "balance_loss_mlp": 1.02145755, + "epoch": 0.48717871636855553, + "flos": 12342320219520.0, + "grad_norm": 2.0021273463045337, + "language_loss": 0.73259115, + "learning_rate": 2.080831956061105e-06, + "loss": 0.753492, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.42578125, + "step": 8103, + "time_per_iteration": 2.375121593475342 + }, + { + "auxiliary_loss_clip": 0.01066903, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.01759362, + "balance_loss_mlp": 1.02258539, + "epoch": 0.4872388396212235, + "flos": 23476017392640.0, + "grad_norm": 1.5100138269862844, + "language_loss": 0.62656987, + "learning_rate": 2.0804544409967534e-06, + "loss": 0.64755261, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44335938, + "step": 8104, + "time_per_iteration": 2.4256398677825928 + }, + { + "auxiliary_loss_clip": 0.01062627, + "auxiliary_loss_mlp": 0.01027586, + "balance_loss_clip": 1.01474738, + "balance_loss_mlp": 1.02126241, + "epoch": 0.48729896287389146, + "flos": 31426334398080.0, + "grad_norm": 2.1275178845225002, + "language_loss": 0.69021749, + "learning_rate": 2.0800769230611897e-06, + "loss": 0.71111953, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 8105, + "time_per_iteration": 2.4989449977874756 + }, + { + "auxiliary_loss_clip": 0.01062153, + "auxiliary_loss_mlp": 0.01023326, + "balance_loss_clip": 1.01137543, + "balance_loss_mlp": 1.02019811, + "epoch": 0.4873590861265594, + "flos": 19057060742400.0, + "grad_norm": 1.5348783152224779, + "language_loss": 0.7308988, + "learning_rate": 2.079699402267887e-06, + "loss": 0.75175357, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41992188, + "step": 8106, + "time_per_iteration": 2.3937466144561768 + }, + { + "auxiliary_loss_clip": 0.01068106, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.01798201, + "balance_loss_mlp": 1.02240968, + "epoch": 0.4874192093792274, + "flos": 24347275171200.0, + "grad_norm": 1.8032154034113386, + "language_loss": 0.76931638, + "learning_rate": 2.0793218786303176e-06, + "loss": 0.79031789, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45703125, + "step": 8107, + "time_per_iteration": 2.457850694656372 + }, + { + "auxiliary_loss_clip": 0.01062535, + "auxiliary_loss_mlp": 0.01021123, + "balance_loss_clip": 1.00914896, + "balance_loss_mlp": 1.02165401, + "epoch": 0.4874793326318954, + "flos": 23111489220480.0, + "grad_norm": 1.5863172665559093, + "language_loss": 0.75184852, + "learning_rate": 2.0789443521619536e-06, + "loss": 0.77268511, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40820312, + "step": 8108, + "time_per_iteration": 2.4193687438964844 + }, + { + "auxiliary_loss_clip": 0.01061388, + "auxiliary_loss_mlp": 0.0102461, + "balance_loss_clip": 1.01375008, + "balance_loss_mlp": 1.02071619, + "epoch": 0.4875394558845634, + "flos": 19025149893120.0, + "grad_norm": 2.209327622983143, + "language_loss": 0.7342478, + "learning_rate": 2.07856682287627e-06, + "loss": 0.75510788, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.40625, + "step": 8109, + "time_per_iteration": 2.41934871673584 + }, + { + "auxiliary_loss_clip": 0.01063062, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.01224065, + "balance_loss_mlp": 1.02092111, + "epoch": 0.48759957913723134, + "flos": 21432550982400.0, + "grad_norm": 1.8344737831844857, + "language_loss": 0.78891349, + "learning_rate": 2.078189290786738e-06, + "loss": 0.80979025, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.421875, + "step": 8110, + "time_per_iteration": 3.841266632080078 + }, + { + "auxiliary_loss_clip": 0.01063291, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.02204132, + "balance_loss_mlp": 1.02100325, + "epoch": 0.4876597023898993, + "flos": 17747712823680.0, + "grad_norm": 2.067795183545082, + "language_loss": 0.72588682, + "learning_rate": 2.0778117559068307e-06, + "loss": 0.74686766, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42382812, + "step": 8111, + "time_per_iteration": 2.3841464519500732 + }, + { + "auxiliary_loss_clip": 0.01062443, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.01435971, + "balance_loss_mlp": 1.02099836, + "epoch": 0.4877198256425673, + "flos": 17018691390720.0, + "grad_norm": 1.9249180883317052, + "language_loss": 0.76094186, + "learning_rate": 2.077434218250023e-06, + "loss": 0.78183907, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 8112, + "time_per_iteration": 2.3842945098876953 + }, + { + "auxiliary_loss_clip": 0.01067785, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01342022, + "balance_loss_mlp": 1.02191412, + "epoch": 0.48777994889523524, + "flos": 22382956546560.0, + "grad_norm": 1.643251425519356, + "language_loss": 0.74934947, + "learning_rate": 2.0770566778297868e-06, + "loss": 0.77029669, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.45898438, + "step": 8113, + "time_per_iteration": 2.4278008937835693 + }, + { + "auxiliary_loss_clip": 0.01010957, + "auxiliary_loss_mlp": 0.01001024, + "balance_loss_clip": 1.00000465, + "balance_loss_mlp": 1.00200796, + "epoch": 0.4878400721479032, + "flos": 61238527908480.0, + "grad_norm": 0.7919375166188405, + "language_loss": 0.48818985, + "learning_rate": 2.076679134659596e-06, + "loss": 0.5083096, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.08935547, + "step": 8114, + "time_per_iteration": 2.9144294261932373 + }, + { + "auxiliary_loss_clip": 0.01064382, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.01676083, + "balance_loss_mlp": 1.02077806, + "epoch": 0.48790019540057117, + "flos": 24535421821440.0, + "grad_norm": 1.428697024654497, + "language_loss": 0.76378274, + "learning_rate": 2.0763015887529235e-06, + "loss": 0.78472924, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 8115, + "time_per_iteration": 2.4488282203674316 + }, + { + "auxiliary_loss_clip": 0.01068967, + "auxiliary_loss_mlp": 0.0102708, + "balance_loss_clip": 1.01300716, + "balance_loss_mlp": 1.02353168, + "epoch": 0.48796031865323913, + "flos": 21832900076160.0, + "grad_norm": 2.5119597059990673, + "language_loss": 0.71927917, + "learning_rate": 2.0759240401232444e-06, + "loss": 0.74023962, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.453125, + "step": 8116, + "time_per_iteration": 2.3859174251556396 + }, + { + "auxiliary_loss_clip": 0.01063128, + "auxiliary_loss_mlp": 0.01024239, + "balance_loss_clip": 1.01202011, + "balance_loss_mlp": 1.0208143, + "epoch": 0.4880204419059071, + "flos": 18587897625600.0, + "grad_norm": 2.224950589531497, + "language_loss": 0.63260061, + "learning_rate": 2.0755464887840314e-06, + "loss": 0.65347421, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.421875, + "step": 8117, + "time_per_iteration": 2.3813016414642334 + }, + { + "auxiliary_loss_clip": 0.01061759, + "auxiliary_loss_mlp": 0.01023891, + "balance_loss_clip": 1.01179147, + "balance_loss_mlp": 1.01979184, + "epoch": 0.48808056515857506, + "flos": 19171156665600.0, + "grad_norm": 4.294916802356386, + "language_loss": 0.79370588, + "learning_rate": 2.0751689347487583e-06, + "loss": 0.81456238, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41796875, + "step": 8118, + "time_per_iteration": 2.36600399017334 + }, + { + "auxiliary_loss_clip": 0.0106524, + "auxiliary_loss_mlp": 0.01026226, + "balance_loss_clip": 1.01290429, + "balance_loss_mlp": 1.02096689, + "epoch": 0.48814068841124303, + "flos": 20119467548160.0, + "grad_norm": 1.844929404802268, + "language_loss": 0.63217425, + "learning_rate": 2.0747913780308996e-06, + "loss": 0.65308893, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44335938, + "step": 8119, + "time_per_iteration": 2.3935341835021973 + }, + { + "auxiliary_loss_clip": 0.01063719, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.01484585, + "balance_loss_mlp": 1.021052, + "epoch": 0.488200811663911, + "flos": 22964504929920.0, + "grad_norm": 2.104521109289735, + "language_loss": 0.71799976, + "learning_rate": 2.0744138186439288e-06, + "loss": 0.73890704, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.42773438, + "step": 8120, + "time_per_iteration": 3.890923261642456 + }, + { + "auxiliary_loss_clip": 0.01064438, + "auxiliary_loss_mlp": 0.01026738, + "balance_loss_clip": 1.01336265, + "balance_loss_mlp": 1.02012777, + "epoch": 0.48826093491657896, + "flos": 33909322314240.0, + "grad_norm": 3.0064947924558605, + "language_loss": 0.63631952, + "learning_rate": 2.0740362566013207e-06, + "loss": 0.65723133, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 8121, + "time_per_iteration": 2.5041511058807373 + }, + { + "auxiliary_loss_clip": 0.01067548, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.0173564, + "balance_loss_mlp": 1.02222919, + "epoch": 0.488321058169247, + "flos": 23069349342720.0, + "grad_norm": 2.1575479176194463, + "language_loss": 0.82518673, + "learning_rate": 2.073658691916548e-06, + "loss": 0.84617519, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.453125, + "step": 8122, + "time_per_iteration": 2.404508113861084 + }, + { + "auxiliary_loss_clip": 0.01065158, + "auxiliary_loss_mlp": 0.01029237, + "balance_loss_clip": 1.01542044, + "balance_loss_mlp": 1.02042508, + "epoch": 0.48838118142191494, + "flos": 19316709590400.0, + "grad_norm": 1.8284325670042252, + "language_loss": 0.85183823, + "learning_rate": 2.073281124603087e-06, + "loss": 0.87278223, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44726562, + "step": 8123, + "time_per_iteration": 2.3708174228668213 + }, + { + "auxiliary_loss_clip": 0.01064156, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.01525593, + "balance_loss_mlp": 1.01928639, + "epoch": 0.4884413046745829, + "flos": 25409507420160.0, + "grad_norm": 1.4454044317305992, + "language_loss": 0.8548981, + "learning_rate": 2.0729035546744115e-06, + "loss": 0.87582678, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44921875, + "step": 8124, + "time_per_iteration": 3.8987529277801514 + }, + { + "auxiliary_loss_clip": 0.01064947, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.01447248, + "balance_loss_mlp": 1.02032256, + "epoch": 0.4885014279272509, + "flos": 20990620592640.0, + "grad_norm": 1.8058590733307116, + "language_loss": 0.79282242, + "learning_rate": 2.072525982143995e-06, + "loss": 0.81374538, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4453125, + "step": 8125, + "time_per_iteration": 2.368713617324829 + }, + { + "auxiliary_loss_clip": 0.01063564, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.01583934, + "balance_loss_mlp": 1.02048302, + "epoch": 0.48856155117991884, + "flos": 13770756385920.0, + "grad_norm": 3.203623235849076, + "language_loss": 0.72766447, + "learning_rate": 2.0721484070253127e-06, + "loss": 0.74857813, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.4296875, + "step": 8126, + "time_per_iteration": 2.36407470703125 + }, + { + "auxiliary_loss_clip": 0.01064403, + "auxiliary_loss_mlp": 0.01024566, + "balance_loss_clip": 1.01157188, + "balance_loss_mlp": 1.02047455, + "epoch": 0.4886216744325868, + "flos": 32086402162560.0, + "grad_norm": 1.8483338397856406, + "language_loss": 0.68602461, + "learning_rate": 2.0717708293318393e-06, + "loss": 0.70691431, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43945312, + "step": 8127, + "time_per_iteration": 2.461782217025757 + }, + { + "auxiliary_loss_clip": 0.01065625, + "auxiliary_loss_mlp": 0.01025527, + "balance_loss_clip": 1.01201487, + "balance_loss_mlp": 1.01902521, + "epoch": 0.48868179768525477, + "flos": 19609037337600.0, + "grad_norm": 2.5925656528511483, + "language_loss": 0.76313281, + "learning_rate": 2.07139324907705e-06, + "loss": 0.78404433, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46679688, + "step": 8128, + "time_per_iteration": 2.38606595993042 + }, + { + "auxiliary_loss_clip": 0.01066092, + "auxiliary_loss_mlp": 0.01023978, + "balance_loss_clip": 1.01168728, + "balance_loss_mlp": 1.02324152, + "epoch": 0.48874192093792274, + "flos": 21285880894080.0, + "grad_norm": 1.7935472636660992, + "language_loss": 0.75954938, + "learning_rate": 2.0710156662744192e-06, + "loss": 0.78045011, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4296875, + "step": 8129, + "time_per_iteration": 2.4140143394470215 + }, + { + "auxiliary_loss_clip": 0.01062331, + "auxiliary_loss_mlp": 0.01023468, + "balance_loss_clip": 1.00999749, + "balance_loss_mlp": 1.01941788, + "epoch": 0.4888020441905907, + "flos": 14172571756800.0, + "grad_norm": 2.023512428399292, + "language_loss": 0.64942086, + "learning_rate": 2.0706380809374213e-06, + "loss": 0.67027891, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4296875, + "step": 8130, + "time_per_iteration": 2.3915319442749023 + }, + { + "auxiliary_loss_clip": 0.01061023, + "auxiliary_loss_mlp": 0.01023622, + "balance_loss_clip": 1.01095605, + "balance_loss_mlp": 1.0189364, + "epoch": 0.48886216744325867, + "flos": 24096738188160.0, + "grad_norm": 1.9999155035724216, + "language_loss": 0.72404045, + "learning_rate": 2.070260493079533e-06, + "loss": 0.74488688, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 8131, + "time_per_iteration": 2.4096786975860596 + }, + { + "auxiliary_loss_clip": 0.01064743, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.01503897, + "balance_loss_mlp": 1.02062738, + "epoch": 0.48892229069592663, + "flos": 38430016266240.0, + "grad_norm": 1.4431224152117499, + "language_loss": 0.69547433, + "learning_rate": 2.0698829027142274e-06, + "loss": 0.71640992, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44140625, + "step": 8132, + "time_per_iteration": 2.543595314025879 + }, + { + "auxiliary_loss_clip": 0.01062899, + "auxiliary_loss_mlp": 0.01026044, + "balance_loss_clip": 1.01284742, + "balance_loss_mlp": 1.02063489, + "epoch": 0.4889824139485946, + "flos": 23842151487360.0, + "grad_norm": 1.3773352786614346, + "language_loss": 0.70255423, + "learning_rate": 2.0695053098549814e-06, + "loss": 0.72344369, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 8133, + "time_per_iteration": 2.414186954498291 + }, + { + "auxiliary_loss_clip": 0.01064957, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01250482, + "balance_loss_mlp": 1.02189016, + "epoch": 0.48904253720126256, + "flos": 24424677388800.0, + "grad_norm": 1.5770789070341689, + "language_loss": 0.70848596, + "learning_rate": 2.06912771451527e-06, + "loss": 0.72939253, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4296875, + "step": 8134, + "time_per_iteration": 2.417879819869995 + }, + { + "auxiliary_loss_clip": 0.01068134, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.01513851, + "balance_loss_mlp": 1.02275109, + "epoch": 0.4891026604539306, + "flos": 24169532106240.0, + "grad_norm": 1.8820806083374844, + "language_loss": 0.80144548, + "learning_rate": 2.068750116708567e-06, + "loss": 0.8224169, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.453125, + "step": 8135, + "time_per_iteration": 2.401287794113159 + }, + { + "auxiliary_loss_clip": 0.0106175, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.01412761, + "balance_loss_mlp": 1.02040565, + "epoch": 0.48916278370659855, + "flos": 21469873092480.0, + "grad_norm": 1.762411080104056, + "language_loss": 0.74446565, + "learning_rate": 2.0683725164483504e-06, + "loss": 0.76534092, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.4140625, + "step": 8136, + "time_per_iteration": 2.398501396179199 + }, + { + "auxiliary_loss_clip": 0.01062456, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.01473415, + "balance_loss_mlp": 1.02107, + "epoch": 0.4892229069592665, + "flos": 22308661440000.0, + "grad_norm": 1.6124579048807344, + "language_loss": 0.73243451, + "learning_rate": 2.067994913748094e-06, + "loss": 0.75332648, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 8137, + "time_per_iteration": 2.4085686206817627 + }, + { + "auxiliary_loss_clip": 0.01063873, + "auxiliary_loss_mlp": 0.01025825, + "balance_loss_clip": 1.0124917, + "balance_loss_mlp": 1.02034223, + "epoch": 0.4892830302119345, + "flos": 12786031088640.0, + "grad_norm": 2.0382560502694527, + "language_loss": 0.74766684, + "learning_rate": 2.0676173086212745e-06, + "loss": 0.76856375, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43359375, + "step": 8138, + "time_per_iteration": 2.4408349990844727 + }, + { + "auxiliary_loss_clip": 0.01065045, + "auxiliary_loss_mlp": 0.01028549, + "balance_loss_clip": 1.01670599, + "balance_loss_mlp": 1.02224159, + "epoch": 0.48934315346460244, + "flos": 20812842616320.0, + "grad_norm": 2.537499878511112, + "language_loss": 0.74334252, + "learning_rate": 2.067239701081367e-06, + "loss": 0.76427841, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.42773438, + "step": 8139, + "time_per_iteration": 2.3942410945892334 + }, + { + "auxiliary_loss_clip": 0.01062256, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.01542139, + "balance_loss_mlp": 1.01978922, + "epoch": 0.4894032767172704, + "flos": 19754520439680.0, + "grad_norm": 1.7490873589457323, + "language_loss": 0.62092775, + "learning_rate": 2.066862091141848e-06, + "loss": 0.64183193, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42578125, + "step": 8140, + "time_per_iteration": 2.4256443977355957 + }, + { + "auxiliary_loss_clip": 0.01065981, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.01739335, + "balance_loss_mlp": 1.02090144, + "epoch": 0.4894633999699384, + "flos": 17818097857920.0, + "grad_norm": 2.8428097395956957, + "language_loss": 0.82550693, + "learning_rate": 2.0664844788161923e-06, + "loss": 0.84648049, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45117188, + "step": 8141, + "time_per_iteration": 2.37553334236145 + }, + { + "auxiliary_loss_clip": 0.01066383, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.01413536, + "balance_loss_mlp": 1.02203524, + "epoch": 0.48952352322260634, + "flos": 25521962509440.0, + "grad_norm": 1.6863002223681327, + "language_loss": 0.72252589, + "learning_rate": 2.0661068641178764e-06, + "loss": 0.74346256, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44335938, + "step": 8142, + "time_per_iteration": 2.474276542663574 + }, + { + "auxiliary_loss_clip": 0.01064238, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.01569867, + "balance_loss_mlp": 1.02140784, + "epoch": 0.4895836464752743, + "flos": 29054335294080.0, + "grad_norm": 1.79425851857054, + "language_loss": 0.67177534, + "learning_rate": 2.065729247060377e-06, + "loss": 0.69270349, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42773438, + "step": 8143, + "time_per_iteration": 2.4448769092559814 + }, + { + "auxiliary_loss_clip": 0.01061907, + "auxiliary_loss_mlp": 0.010246, + "balance_loss_clip": 1.01256573, + "balance_loss_mlp": 1.01987767, + "epoch": 0.48964376972794227, + "flos": 33545562192000.0, + "grad_norm": 1.4242043799920732, + "language_loss": 0.74884295, + "learning_rate": 2.0653516276571694e-06, + "loss": 0.76970804, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41992188, + "step": 8144, + "time_per_iteration": 2.5011680126190186 + }, + { + "auxiliary_loss_clip": 0.01062258, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.01210332, + "balance_loss_mlp": 1.01935256, + "epoch": 0.48970389298061023, + "flos": 22052957575680.0, + "grad_norm": 1.4838183258777131, + "language_loss": 0.75894809, + "learning_rate": 2.0649740059217304e-06, + "loss": 0.77981973, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 8145, + "time_per_iteration": 2.421186685562134 + }, + { + "auxiliary_loss_clip": 0.01065766, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.01840019, + "balance_loss_mlp": 1.02090383, + "epoch": 0.4897640162332782, + "flos": 20083262601600.0, + "grad_norm": 1.7822805503273498, + "language_loss": 0.80415213, + "learning_rate": 2.064596381867537e-06, + "loss": 0.82513386, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.44921875, + "step": 8146, + "time_per_iteration": 2.417325019836426 + }, + { + "auxiliary_loss_clip": 0.01061436, + "auxiliary_loss_mlp": 0.01022604, + "balance_loss_clip": 1.0104146, + "balance_loss_mlp": 1.01995599, + "epoch": 0.48982413948594616, + "flos": 23805073756800.0, + "grad_norm": 5.1808653971114484, + "language_loss": 0.74070472, + "learning_rate": 2.064218755508064e-06, + "loss": 0.76154512, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 8147, + "time_per_iteration": 2.460737943649292 + }, + { + "auxiliary_loss_clip": 0.01065918, + "auxiliary_loss_mlp": 0.01022724, + "balance_loss_clip": 1.00934291, + "balance_loss_mlp": 1.0217942, + "epoch": 0.4898842627386142, + "flos": 17638679047680.0, + "grad_norm": 2.1928334611893123, + "language_loss": 0.82972848, + "learning_rate": 2.0638411268567894e-06, + "loss": 0.85061491, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44140625, + "step": 8148, + "time_per_iteration": 2.388871669769287 + }, + { + "auxiliary_loss_clip": 0.01061002, + "auxiliary_loss_mlp": 0.01025547, + "balance_loss_clip": 1.01361442, + "balance_loss_mlp": 1.02071106, + "epoch": 0.48994438599128215, + "flos": 16616980753920.0, + "grad_norm": 1.6953090631983774, + "language_loss": 0.73636121, + "learning_rate": 2.0634634959271886e-06, + "loss": 0.75722671, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 8149, + "time_per_iteration": 2.370102643966675 + }, + { + "auxiliary_loss_clip": 0.01067954, + "auxiliary_loss_mlp": 0.01024725, + "balance_loss_clip": 1.01108193, + "balance_loss_mlp": 1.02236974, + "epoch": 0.4900045092439501, + "flos": 26613626901120.0, + "grad_norm": 1.8026651997577552, + "language_loss": 0.7613048, + "learning_rate": 2.0630858627327394e-06, + "loss": 0.78223169, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 8150, + "time_per_iteration": 3.8718411922454834 + }, + { + "auxiliary_loss_clip": 0.0106714, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.01282334, + "balance_loss_mlp": 1.02276337, + "epoch": 0.4900646324966181, + "flos": 19901085793920.0, + "grad_norm": 1.9224456503890592, + "language_loss": 0.7248258, + "learning_rate": 2.0627082272869176e-06, + "loss": 0.74575931, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 8151, + "time_per_iteration": 2.46932315826416 + }, + { + "auxiliary_loss_clip": 0.0101015, + "auxiliary_loss_mlp": 0.01001527, + "balance_loss_clip": 1.00042462, + "balance_loss_mlp": 1.00173426, + "epoch": 0.49012475574928605, + "flos": 59186682771840.0, + "grad_norm": 0.8462339431742633, + "language_loss": 0.54379773, + "learning_rate": 2.062330589603201e-06, + "loss": 0.56391454, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.08398438, + "step": 8152, + "time_per_iteration": 3.0295605659484863 + }, + { + "auxiliary_loss_clip": 0.01064952, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.01338744, + "balance_loss_mlp": 1.02060461, + "epoch": 0.490184879001954, + "flos": 45258049751040.0, + "grad_norm": 2.8749376481268256, + "language_loss": 0.61412942, + "learning_rate": 2.0619529496950657e-06, + "loss": 0.63504082, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.44335938, + "step": 8153, + "time_per_iteration": 2.6041548252105713 + }, + { + "auxiliary_loss_clip": 0.01064176, + "auxiliary_loss_mlp": 0.01026718, + "balance_loss_clip": 1.01460588, + "balance_loss_mlp": 1.02121007, + "epoch": 0.490245002254622, + "flos": 28000865796480.0, + "grad_norm": 1.5825474014394112, + "language_loss": 0.76724309, + "learning_rate": 2.0615753075759894e-06, + "loss": 0.78815198, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4296875, + "step": 8154, + "time_per_iteration": 2.5358009338378906 + }, + { + "auxiliary_loss_clip": 0.01010327, + "auxiliary_loss_mlp": 0.01000888, + "balance_loss_clip": 0.99985725, + "balance_loss_mlp": 1.00170982, + "epoch": 0.49030512550728994, + "flos": 58947910917120.0, + "grad_norm": 0.9884087113585882, + "language_loss": 0.6706838, + "learning_rate": 2.0611976632594487e-06, + "loss": 0.69079596, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.0859375, + "step": 8155, + "time_per_iteration": 3.116466522216797 + }, + { + "auxiliary_loss_clip": 0.01064615, + "auxiliary_loss_mlp": 0.01026618, + "balance_loss_clip": 1.01465487, + "balance_loss_mlp": 1.02164793, + "epoch": 0.4903652487599579, + "flos": 19790830120320.0, + "grad_norm": 3.78355674751088, + "language_loss": 0.76698375, + "learning_rate": 2.0608200167589204e-06, + "loss": 0.7878961, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.4296875, + "step": 8156, + "time_per_iteration": 2.3860974311828613 + }, + { + "auxiliary_loss_clip": 0.01063552, + "auxiliary_loss_mlp": 0.01024251, + "balance_loss_clip": 1.0117873, + "balance_loss_mlp": 1.02069616, + "epoch": 0.49042537201262587, + "flos": 21761013853440.0, + "grad_norm": 2.269456446983779, + "language_loss": 0.83841634, + "learning_rate": 2.060442368087882e-06, + "loss": 0.85929435, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.4296875, + "step": 8157, + "time_per_iteration": 2.382884979248047 + }, + { + "auxiliary_loss_clip": 0.01063427, + "auxiliary_loss_mlp": 0.01025306, + "balance_loss_clip": 1.01251531, + "balance_loss_mlp": 1.02037632, + "epoch": 0.49048549526529384, + "flos": 18952041772800.0, + "grad_norm": 2.6045900458529383, + "language_loss": 0.79873562, + "learning_rate": 2.060064717259811e-06, + "loss": 0.81962299, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 8158, + "time_per_iteration": 2.380716562271118 + }, + { + "auxiliary_loss_clip": 0.01067092, + "auxiliary_loss_mlp": 0.0102383, + "balance_loss_clip": 1.01097345, + "balance_loss_mlp": 1.02184761, + "epoch": 0.4905456185179618, + "flos": 26905186598400.0, + "grad_norm": 6.068399376574761, + "language_loss": 0.69460583, + "learning_rate": 2.059687064288185e-06, + "loss": 0.71551502, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.453125, + "step": 8159, + "time_per_iteration": 2.4401438236236572 + }, + { + "auxiliary_loss_clip": 0.01061519, + "auxiliary_loss_mlp": 0.01024387, + "balance_loss_clip": 1.01141071, + "balance_loss_mlp": 1.01976013, + "epoch": 0.49060574177062977, + "flos": 20411306536320.0, + "grad_norm": 1.740982836213401, + "language_loss": 0.73290098, + "learning_rate": 2.059309409186481e-06, + "loss": 0.75376004, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 8160, + "time_per_iteration": 3.826754570007324 + }, + { + "auxiliary_loss_clip": 0.01064972, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.01483011, + "balance_loss_mlp": 1.02031159, + "epoch": 0.4906658650232978, + "flos": 17492742097920.0, + "grad_norm": 1.7763054721147844, + "language_loss": 0.71508241, + "learning_rate": 2.0589317519681773e-06, + "loss": 0.73601639, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44726562, + "step": 8161, + "time_per_iteration": 2.377476215362549 + }, + { + "auxiliary_loss_clip": 0.01063588, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.02045965, + "balance_loss_mlp": 1.0212512, + "epoch": 0.49072598827596575, + "flos": 26613242876160.0, + "grad_norm": 1.6656492392205824, + "language_loss": 0.68335259, + "learning_rate": 2.0585540926467507e-06, + "loss": 0.70431542, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.421875, + "step": 8162, + "time_per_iteration": 2.4335989952087402 + }, + { + "auxiliary_loss_clip": 0.01067203, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.0161922, + "balance_loss_mlp": 1.02074432, + "epoch": 0.4907861115286337, + "flos": 20411550915840.0, + "grad_norm": 2.1617252479863653, + "language_loss": 0.63189155, + "learning_rate": 2.058176431235679e-06, + "loss": 0.6528697, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.46484375, + "step": 8163, + "time_per_iteration": 2.4120795726776123 + }, + { + "auxiliary_loss_clip": 0.01062119, + "auxiliary_loss_mlp": 0.01024843, + "balance_loss_clip": 1.01193905, + "balance_loss_mlp": 1.02006531, + "epoch": 0.4908462347813017, + "flos": 14063398335360.0, + "grad_norm": 2.234669346252103, + "language_loss": 0.76373124, + "learning_rate": 2.05779876774844e-06, + "loss": 0.78460085, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.421875, + "step": 8164, + "time_per_iteration": 5.276147842407227 + }, + { + "auxiliary_loss_clip": 0.01065875, + "auxiliary_loss_mlp": 0.01023272, + "balance_loss_clip": 1.00993228, + "balance_loss_mlp": 1.0209446, + "epoch": 0.49090635803396965, + "flos": 18734078954880.0, + "grad_norm": 1.505414770628208, + "language_loss": 0.76697284, + "learning_rate": 2.057421102198512e-06, + "loss": 0.78786433, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44921875, + "step": 8165, + "time_per_iteration": 2.4168453216552734 + }, + { + "auxiliary_loss_clip": 0.01064839, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01825857, + "balance_loss_mlp": 1.02207983, + "epoch": 0.4909664812866376, + "flos": 20557452954240.0, + "grad_norm": 1.6063781731305717, + "language_loss": 0.7752977, + "learning_rate": 2.0570434345993717e-06, + "loss": 0.79625547, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42773438, + "step": 8166, + "time_per_iteration": 2.51198148727417 + }, + { + "auxiliary_loss_clip": 0.01009537, + "auxiliary_loss_mlp": 0.01004766, + "balance_loss_clip": 1.0036875, + "balance_loss_mlp": 1.00112653, + "epoch": 0.4910266045393056, + "flos": 54680686502400.0, + "grad_norm": 0.804031527522377, + "language_loss": 0.54208672, + "learning_rate": 2.056665764964499e-06, + "loss": 0.56222975, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.08398438, + "step": 8167, + "time_per_iteration": 3.0504355430603027 + }, + { + "auxiliary_loss_clip": 0.01062116, + "auxiliary_loss_mlp": 0.01023162, + "balance_loss_clip": 1.01063323, + "balance_loss_mlp": 1.01982164, + "epoch": 0.49108672779197354, + "flos": 16245714689280.0, + "grad_norm": 2.6165429677838477, + "language_loss": 0.85908794, + "learning_rate": 2.0562880933073705e-06, + "loss": 0.87994075, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.421875, + "step": 8168, + "time_per_iteration": 2.4596970081329346 + }, + { + "auxiliary_loss_clip": 0.0106086, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.01484799, + "balance_loss_mlp": 1.02048838, + "epoch": 0.4911468510446415, + "flos": 19824486537600.0, + "grad_norm": 1.7524677028165727, + "language_loss": 0.73733461, + "learning_rate": 2.055910419641465e-06, + "loss": 0.75821829, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 8169, + "time_per_iteration": 2.412365198135376 + }, + { + "auxiliary_loss_clip": 0.01060845, + "auxiliary_loss_mlp": 0.01021032, + "balance_loss_clip": 1.00942707, + "balance_loss_mlp": 1.01886606, + "epoch": 0.4912069742973095, + "flos": 21211690521600.0, + "grad_norm": 1.670944984689891, + "language_loss": 0.78166735, + "learning_rate": 2.05553274398026e-06, + "loss": 0.80248618, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.41992188, + "step": 8170, + "time_per_iteration": 2.4385812282562256 + }, + { + "auxiliary_loss_clip": 0.0106278, + "auxiliary_loss_mlp": 0.01025676, + "balance_loss_clip": 1.01315856, + "balance_loss_mlp": 1.02002454, + "epoch": 0.49126709754997744, + "flos": 19536103774080.0, + "grad_norm": 15.084722949339948, + "language_loss": 0.75787294, + "learning_rate": 2.055155066337235e-06, + "loss": 0.77875751, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42773438, + "step": 8171, + "time_per_iteration": 2.4022233486175537 + }, + { + "auxiliary_loss_clip": 0.01065648, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.01402187, + "balance_loss_mlp": 1.02199757, + "epoch": 0.4913272208026454, + "flos": 12238872261120.0, + "grad_norm": 2.036721091023354, + "language_loss": 0.81728929, + "learning_rate": 2.0547773867258667e-06, + "loss": 0.83820951, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4375, + "step": 8172, + "time_per_iteration": 2.410752058029175 + }, + { + "auxiliary_loss_clip": 0.01009162, + "auxiliary_loss_mlp": 0.01003595, + "balance_loss_clip": 1.00256419, + "balance_loss_mlp": 1.00091624, + "epoch": 0.49138734405531337, + "flos": 65462739661440.0, + "grad_norm": 0.6915893172067483, + "language_loss": 0.53296345, + "learning_rate": 2.054399705159635e-06, + "loss": 0.55309099, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.08203125, + "step": 8173, + "time_per_iteration": 3.0464820861816406 + }, + { + "auxiliary_loss_clip": 0.01066051, + "auxiliary_loss_mlp": 0.0102578, + "balance_loss_clip": 1.01213646, + "balance_loss_mlp": 1.02083194, + "epoch": 0.4914474673079814, + "flos": 18438155337600.0, + "grad_norm": 2.3795548405361013, + "language_loss": 0.62445641, + "learning_rate": 2.054022021652017e-06, + "loss": 0.64537477, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45117188, + "step": 8174, + "time_per_iteration": 2.442434787750244 + }, + { + "auxiliary_loss_clip": 0.01064979, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.01377261, + "balance_loss_mlp": 1.02147377, + "epoch": 0.49150759056064935, + "flos": 21684100394880.0, + "grad_norm": 1.8569408577975257, + "language_loss": 0.68928504, + "learning_rate": 2.0536443362164927e-06, + "loss": 0.71020269, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43554688, + "step": 8175, + "time_per_iteration": 2.408876895904541 + }, + { + "auxiliary_loss_clip": 0.01065522, + "auxiliary_loss_mlp": 0.01024693, + "balance_loss_clip": 1.01153255, + "balance_loss_mlp": 1.02252698, + "epoch": 0.4915677138133173, + "flos": 22381350624000.0, + "grad_norm": 1.5270810569019162, + "language_loss": 0.74496382, + "learning_rate": 2.0532666488665393e-06, + "loss": 0.76586592, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4296875, + "step": 8176, + "time_per_iteration": 2.400296449661255 + }, + { + "auxiliary_loss_clip": 0.01065737, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.01471412, + "balance_loss_mlp": 1.02207553, + "epoch": 0.4916278370659853, + "flos": 18401985302400.0, + "grad_norm": 1.738829526815047, + "language_loss": 0.69800466, + "learning_rate": 2.052888959615637e-06, + "loss": 0.71893591, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 8177, + "time_per_iteration": 2.38999342918396 + }, + { + "auxiliary_loss_clip": 0.01009474, + "auxiliary_loss_mlp": 0.01006261, + "balance_loss_clip": 1.00514078, + "balance_loss_mlp": 1.00108683, + "epoch": 0.49168796031865325, + "flos": 66605620884480.0, + "grad_norm": 0.6927347145242478, + "language_loss": 0.46238822, + "learning_rate": 2.0525112684772633e-06, + "loss": 0.48254555, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.08398438, + "step": 8178, + "time_per_iteration": 3.0346994400024414 + }, + { + "auxiliary_loss_clip": 0.01067614, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.0136919, + "balance_loss_mlp": 1.02225614, + "epoch": 0.4917480835713212, + "flos": 20958290807040.0, + "grad_norm": 2.5569828109532455, + "language_loss": 0.65796518, + "learning_rate": 2.052133575464898e-06, + "loss": 0.67891765, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.453125, + "step": 8179, + "time_per_iteration": 2.404388189315796 + }, + { + "auxiliary_loss_clip": 0.01067763, + "auxiliary_loss_mlp": 0.01023706, + "balance_loss_clip": 1.0101285, + "balance_loss_mlp": 1.02193546, + "epoch": 0.4918082068239892, + "flos": 15772152741120.0, + "grad_norm": 1.8576630458957946, + "language_loss": 0.70062751, + "learning_rate": 2.0517558805920193e-06, + "loss": 0.72154218, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45703125, + "step": 8180, + "time_per_iteration": 2.380949020385742 + }, + { + "auxiliary_loss_clip": 0.01064277, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.01568294, + "balance_loss_mlp": 1.02123606, + "epoch": 0.49186833007665715, + "flos": 24603747085440.0, + "grad_norm": 1.6712505577274752, + "language_loss": 0.84147775, + "learning_rate": 2.0513781838721057e-06, + "loss": 0.86240518, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 8181, + "time_per_iteration": 2.4243903160095215 + }, + { + "auxiliary_loss_clip": 0.01063471, + "auxiliary_loss_mlp": 0.01026255, + "balance_loss_clip": 1.01441169, + "balance_loss_mlp": 1.02154255, + "epoch": 0.4919284533293251, + "flos": 22089476724480.0, + "grad_norm": 1.8223440392103512, + "language_loss": 0.72348976, + "learning_rate": 2.051000485318637e-06, + "loss": 0.74438703, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41796875, + "step": 8182, + "time_per_iteration": 2.3953747749328613 + }, + { + "auxiliary_loss_clip": 0.01065942, + "auxiliary_loss_mlp": 0.01026654, + "balance_loss_clip": 1.0143398, + "balance_loss_mlp": 1.02279282, + "epoch": 0.4919885765819931, + "flos": 23366913793920.0, + "grad_norm": 1.6979046050592908, + "language_loss": 0.72803915, + "learning_rate": 2.050622784945093e-06, + "loss": 0.74896502, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.43164062, + "step": 8183, + "time_per_iteration": 2.4015893936157227 + }, + { + "auxiliary_loss_clip": 0.01067155, + "auxiliary_loss_mlp": 0.01025596, + "balance_loss_clip": 1.01198196, + "balance_loss_mlp": 1.02119005, + "epoch": 0.49204869983466104, + "flos": 21359442862080.0, + "grad_norm": 3.8963183210071657, + "language_loss": 0.77138603, + "learning_rate": 2.0502450827649514e-06, + "loss": 0.79231346, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4609375, + "step": 8184, + "time_per_iteration": 2.404146909713745 + }, + { + "auxiliary_loss_clip": 0.01064757, + "auxiliary_loss_mlp": 0.01026787, + "balance_loss_clip": 1.01450872, + "balance_loss_mlp": 1.0218327, + "epoch": 0.492108823087329, + "flos": 21141619689600.0, + "grad_norm": 1.6125584072570651, + "language_loss": 0.71022475, + "learning_rate": 2.049867378791693e-06, + "loss": 0.73114026, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4296875, + "step": 8185, + "time_per_iteration": 2.442793130874634 + }, + { + "auxiliary_loss_clip": 0.01066321, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.01722193, + "balance_loss_mlp": 1.02135158, + "epoch": 0.49216894633999697, + "flos": 25115503927680.0, + "grad_norm": 1.9002586468142024, + "language_loss": 0.7515918, + "learning_rate": 2.049489673038795e-06, + "loss": 0.7725625, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 8186, + "time_per_iteration": 2.4428927898406982 + }, + { + "auxiliary_loss_clip": 0.01065526, + "auxiliary_loss_mlp": 0.0102578, + "balance_loss_clip": 1.0129056, + "balance_loss_mlp": 1.0206964, + "epoch": 0.49222906959266494, + "flos": 22636845020160.0, + "grad_norm": 2.2817424534549238, + "language_loss": 0.67648727, + "learning_rate": 2.0491119655197382e-06, + "loss": 0.69740033, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.44921875, + "step": 8187, + "time_per_iteration": 2.4324986934661865 + }, + { + "auxiliary_loss_clip": 0.01065589, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.01931882, + "balance_loss_mlp": 1.02178526, + "epoch": 0.49228919284533296, + "flos": 20409560968320.0, + "grad_norm": 8.255755398201961, + "language_loss": 0.67012703, + "learning_rate": 2.0487342562480016e-06, + "loss": 0.69110489, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43945312, + "step": 8188, + "time_per_iteration": 2.429807186126709 + }, + { + "auxiliary_loss_clip": 0.01067179, + "auxiliary_loss_mlp": 0.01023796, + "balance_loss_clip": 1.01176143, + "balance_loss_mlp": 1.02417374, + "epoch": 0.4923493160980009, + "flos": 27121229291520.0, + "grad_norm": 1.976521861997495, + "language_loss": 0.75077224, + "learning_rate": 2.048356545237065e-06, + "loss": 0.77168196, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4296875, + "step": 8189, + "time_per_iteration": 4.005939722061157 + }, + { + "auxiliary_loss_clip": 0.0106219, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.01510811, + "balance_loss_mlp": 1.02033544, + "epoch": 0.4924094393506689, + "flos": 35735209931520.0, + "grad_norm": 1.6481414056088641, + "language_loss": 0.64309257, + "learning_rate": 2.0479788325004076e-06, + "loss": 0.66398847, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41796875, + "step": 8190, + "time_per_iteration": 2.564378023147583 + }, + { + "auxiliary_loss_clip": 0.01064148, + "auxiliary_loss_mlp": 0.01026225, + "balance_loss_clip": 1.01397061, + "balance_loss_mlp": 1.02144468, + "epoch": 0.49246956260333685, + "flos": 20411446181760.0, + "grad_norm": 3.788353622374919, + "language_loss": 0.71229386, + "learning_rate": 2.0476011180515086e-06, + "loss": 0.73319757, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.42578125, + "step": 8191, + "time_per_iteration": 2.401263952255249 + }, + { + "auxiliary_loss_clip": 0.01063751, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01794314, + "balance_loss_mlp": 1.02095973, + "epoch": 0.4925296858560048, + "flos": 38975569171200.0, + "grad_norm": 1.845667558418571, + "language_loss": 0.71865875, + "learning_rate": 2.047223401903849e-06, + "loss": 0.73960143, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42773438, + "step": 8192, + "time_per_iteration": 2.567293167114258 + }, + { + "auxiliary_loss_clip": 0.01009989, + "auxiliary_loss_mlp": 0.01013328, + "balance_loss_clip": 1.01228487, + "balance_loss_mlp": 1.00154757, + "epoch": 0.4925898091086728, + "flos": 64275342721920.0, + "grad_norm": 0.7298669848281718, + "language_loss": 0.52144849, + "learning_rate": 2.0468456840709066e-06, + "loss": 0.54168159, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.08398438, + "step": 8193, + "time_per_iteration": 3.087665557861328 + }, + { + "auxiliary_loss_clip": 0.01063415, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.01775742, + "balance_loss_mlp": 1.02140498, + "epoch": 0.49264993236134075, + "flos": 23035343811840.0, + "grad_norm": 1.8390476173280383, + "language_loss": 0.77140695, + "learning_rate": 2.0464679645661637e-06, + "loss": 0.79234713, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41992188, + "step": 8194, + "time_per_iteration": 2.406836748123169 + }, + { + "auxiliary_loss_clip": 0.01063389, + "auxiliary_loss_mlp": 0.01022892, + "balance_loss_clip": 1.01112032, + "balance_loss_mlp": 1.02138543, + "epoch": 0.4927100556140087, + "flos": 24717040047360.0, + "grad_norm": 99.50607217048996, + "language_loss": 0.70117754, + "learning_rate": 2.0460902434030975e-06, + "loss": 0.72204036, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.41992188, + "step": 8195, + "time_per_iteration": 2.4525606632232666 + }, + { + "auxiliary_loss_clip": 0.01067167, + "auxiliary_loss_mlp": 0.01021882, + "balance_loss_clip": 1.01009226, + "balance_loss_mlp": 1.02267873, + "epoch": 0.4927701788666767, + "flos": 23504646574080.0, + "grad_norm": 2.2055074975473037, + "language_loss": 0.80964696, + "learning_rate": 2.045712520595189e-06, + "loss": 0.83053744, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.4453125, + "step": 8196, + "time_per_iteration": 2.4533777236938477 + }, + { + "auxiliary_loss_clip": 0.01065798, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.01553226, + "balance_loss_mlp": 1.02100277, + "epoch": 0.49283030211934464, + "flos": 22927811224320.0, + "grad_norm": 1.9341526427021423, + "language_loss": 0.704162, + "learning_rate": 2.045334796155919e-06, + "loss": 0.72510672, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.44921875, + "step": 8197, + "time_per_iteration": 2.4367449283599854 + }, + { + "auxiliary_loss_clip": 0.01062336, + "auxiliary_loss_mlp": 0.01024313, + "balance_loss_clip": 1.0127672, + "balance_loss_mlp": 1.02104902, + "epoch": 0.4928904253720126, + "flos": 16872091125120.0, + "grad_norm": 1.6843954061622155, + "language_loss": 0.83848315, + "learning_rate": 2.044957070098766e-06, + "loss": 0.85934967, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.41210938, + "step": 8198, + "time_per_iteration": 2.3901617527008057 + }, + { + "auxiliary_loss_clip": 0.0106848, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.01742697, + "balance_loss_mlp": 1.02358603, + "epoch": 0.4929505486246806, + "flos": 14865667534080.0, + "grad_norm": 3.039774934233361, + "language_loss": 0.7617439, + "learning_rate": 2.0445793424372114e-06, + "loss": 0.78273594, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44921875, + "step": 8199, + "time_per_iteration": 3.806281566619873 + }, + { + "auxiliary_loss_clip": 0.01066911, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.01654339, + "balance_loss_mlp": 1.02145183, + "epoch": 0.49301067187734854, + "flos": 23841208880640.0, + "grad_norm": 1.4694356781768672, + "language_loss": 0.59847158, + "learning_rate": 2.044201613184735e-06, + "loss": 0.61944354, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.453125, + "step": 8200, + "time_per_iteration": 2.4580347537994385 + }, + { + "auxiliary_loss_clip": 0.01060487, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.01551449, + "balance_loss_mlp": 1.01992738, + "epoch": 0.49307079513001656, + "flos": 22490209843200.0, + "grad_norm": 1.4127296427420482, + "language_loss": 0.79204416, + "learning_rate": 2.0438238823548164e-06, + "loss": 0.81291747, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.40625, + "step": 8201, + "time_per_iteration": 2.4619975090026855 + }, + { + "auxiliary_loss_clip": 0.01068808, + "auxiliary_loss_mlp": 0.01025285, + "balance_loss_clip": 1.01131988, + "balance_loss_mlp": 1.02205861, + "epoch": 0.4931309183826845, + "flos": 15923675508480.0, + "grad_norm": 3.5196987222602103, + "language_loss": 0.67351878, + "learning_rate": 2.043446149960936e-06, + "loss": 0.69445968, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.46679688, + "step": 8202, + "time_per_iteration": 2.373882532119751 + }, + { + "auxiliary_loss_clip": 0.01062417, + "auxiliary_loss_mlp": 0.01025904, + "balance_loss_clip": 1.01361942, + "balance_loss_mlp": 1.01977253, + "epoch": 0.4931910416353525, + "flos": 27307804930560.0, + "grad_norm": 2.3516771113612753, + "language_loss": 0.7494778, + "learning_rate": 2.043068416016574e-06, + "loss": 0.77036107, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42578125, + "step": 8203, + "time_per_iteration": 3.9013712406158447 + }, + { + "auxiliary_loss_clip": 0.01061881, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.01784289, + "balance_loss_mlp": 1.02005804, + "epoch": 0.49325116488802045, + "flos": 20805301762560.0, + "grad_norm": 2.5170262575660054, + "language_loss": 0.80125821, + "learning_rate": 2.0426906805352113e-06, + "loss": 0.82217526, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41796875, + "step": 8204, + "time_per_iteration": 2.3742940425872803 + }, + { + "auxiliary_loss_clip": 0.01062493, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.01637983, + "balance_loss_mlp": 1.02143478, + "epoch": 0.4933112881406884, + "flos": 19864915758720.0, + "grad_norm": 2.0222308726041254, + "language_loss": 0.65713334, + "learning_rate": 2.0423129435303277e-06, + "loss": 0.6780346, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.41015625, + "step": 8205, + "time_per_iteration": 2.39680814743042 + }, + { + "auxiliary_loss_clip": 0.01069535, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.01690149, + "balance_loss_mlp": 1.02241921, + "epoch": 0.4933714113933564, + "flos": 21827104790400.0, + "grad_norm": 1.8741249668125988, + "language_loss": 0.66410482, + "learning_rate": 2.0419352050154046e-06, + "loss": 0.68511701, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.47070312, + "step": 8206, + "time_per_iteration": 2.3917930126190186 + }, + { + "auxiliary_loss_clip": 0.0106358, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.01700127, + "balance_loss_mlp": 1.02003002, + "epoch": 0.49343153464602435, + "flos": 27888934377600.0, + "grad_norm": 1.5944792706449604, + "language_loss": 0.76745933, + "learning_rate": 2.041557465003922e-06, + "loss": 0.78838599, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.43554688, + "step": 8207, + "time_per_iteration": 2.4562408924102783 + }, + { + "auxiliary_loss_clip": 0.01064304, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.0153265, + "balance_loss_mlp": 1.02104568, + "epoch": 0.4934916578986923, + "flos": 24679927405440.0, + "grad_norm": 5.5457356251893, + "language_loss": 0.60196787, + "learning_rate": 2.0411797235093593e-06, + "loss": 0.62290275, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.43359375, + "step": 8208, + "time_per_iteration": 2.4204134941101074 + }, + { + "auxiliary_loss_clip": 0.01064974, + "auxiliary_loss_mlp": 0.01023735, + "balance_loss_clip": 1.01024055, + "balance_loss_mlp": 1.02133036, + "epoch": 0.4935517811513603, + "flos": 23403991524480.0, + "grad_norm": 1.6568447848888432, + "language_loss": 0.78382522, + "learning_rate": 2.040801980545199e-06, + "loss": 0.8047123, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 8209, + "time_per_iteration": 2.421855926513672 + }, + { + "auxiliary_loss_clip": 0.01062674, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.01574564, + "balance_loss_mlp": 1.02040339, + "epoch": 0.49361190440402825, + "flos": 21943435040640.0, + "grad_norm": 2.046212910610994, + "language_loss": 0.78934789, + "learning_rate": 2.040424236124921e-06, + "loss": 0.81025451, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.42382812, + "step": 8210, + "time_per_iteration": 2.397672176361084 + }, + { + "auxiliary_loss_clip": 0.01068613, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.01785004, + "balance_loss_mlp": 1.02286124, + "epoch": 0.4936720276566962, + "flos": 25114596232320.0, + "grad_norm": 1.5832305658963357, + "language_loss": 0.81333274, + "learning_rate": 2.0400464902620057e-06, + "loss": 0.83433104, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45703125, + "step": 8211, + "time_per_iteration": 2.4249815940856934 + }, + { + "auxiliary_loss_clip": 0.0106622, + "auxiliary_loss_mlp": 0.01029974, + "balance_loss_clip": 1.01636648, + "balance_loss_mlp": 1.02259541, + "epoch": 0.4937321509093642, + "flos": 26357748480000.0, + "grad_norm": 2.639776274519796, + "language_loss": 0.91331041, + "learning_rate": 2.0396687429699345e-06, + "loss": 0.93427241, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4375, + "step": 8212, + "time_per_iteration": 2.437512159347534 + }, + { + "auxiliary_loss_clip": 0.01064205, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.0168457, + "balance_loss_mlp": 1.02009559, + "epoch": 0.49379227416203214, + "flos": 22960420300800.0, + "grad_norm": 1.679879483426227, + "language_loss": 0.64216214, + "learning_rate": 2.0392909942621875e-06, + "loss": 0.66309923, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44140625, + "step": 8213, + "time_per_iteration": 2.438624382019043 + }, + { + "auxiliary_loss_clip": 0.01062062, + "auxiliary_loss_mlp": 0.01024596, + "balance_loss_clip": 1.01203108, + "balance_loss_mlp": 1.01946449, + "epoch": 0.49385239741470016, + "flos": 32487728774400.0, + "grad_norm": 1.702066606560777, + "language_loss": 0.75563276, + "learning_rate": 2.0389132441522464e-06, + "loss": 0.77649939, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 8214, + "time_per_iteration": 2.478745222091675 + }, + { + "auxiliary_loss_clip": 0.01065367, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.01523113, + "balance_loss_mlp": 1.0216316, + "epoch": 0.4939125206673681, + "flos": 22491745943040.0, + "grad_norm": 1.7749253159977787, + "language_loss": 0.84200788, + "learning_rate": 2.0385354926535914e-06, + "loss": 0.86294568, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4375, + "step": 8215, + "time_per_iteration": 2.405867338180542 + }, + { + "auxiliary_loss_clip": 0.01060071, + "auxiliary_loss_mlp": 0.01026654, + "balance_loss_clip": 1.01541269, + "balance_loss_mlp": 1.01999235, + "epoch": 0.4939726439200361, + "flos": 31174994453760.0, + "grad_norm": 1.8725376706478372, + "language_loss": 0.83340448, + "learning_rate": 2.0381577397797043e-06, + "loss": 0.85427177, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.40039062, + "step": 8216, + "time_per_iteration": 2.522568941116333 + }, + { + "auxiliary_loss_clip": 0.01062355, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.01750195, + "balance_loss_mlp": 1.02086604, + "epoch": 0.49403276717270406, + "flos": 22673119789440.0, + "grad_norm": 1.3968684175721353, + "language_loss": 0.74838686, + "learning_rate": 2.0377799855440653e-06, + "loss": 0.76930296, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.4140625, + "step": 8217, + "time_per_iteration": 2.4564216136932373 + }, + { + "auxiliary_loss_clip": 0.01062401, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.01744699, + "balance_loss_mlp": 1.02119005, + "epoch": 0.494092890425372, + "flos": 20740013786880.0, + "grad_norm": 1.6114115505135012, + "language_loss": 0.78430867, + "learning_rate": 2.037402229960156e-06, + "loss": 0.80523729, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41210938, + "step": 8218, + "time_per_iteration": 2.39544677734375 + }, + { + "auxiliary_loss_clip": 0.01059596, + "auxiliary_loss_mlp": 0.01022154, + "balance_loss_clip": 1.01056075, + "balance_loss_mlp": 1.01887083, + "epoch": 0.49415301367804, + "flos": 18368049594240.0, + "grad_norm": 2.006201755386127, + "language_loss": 0.76920283, + "learning_rate": 2.0370244730414566e-06, + "loss": 0.79002035, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40820312, + "step": 8219, + "time_per_iteration": 2.372676372528076 + }, + { + "auxiliary_loss_clip": 0.01062719, + "auxiliary_loss_mlp": 0.01026119, + "balance_loss_clip": 1.01381683, + "balance_loss_mlp": 1.0200789, + "epoch": 0.49421313693070795, + "flos": 17529645271680.0, + "grad_norm": 1.7018454019764306, + "language_loss": 0.6221813, + "learning_rate": 2.03664671480145e-06, + "loss": 0.64306968, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42578125, + "step": 8220, + "time_per_iteration": 2.367541551589966 + }, + { + "auxiliary_loss_clip": 0.01062403, + "auxiliary_loss_mlp": 0.01025691, + "balance_loss_clip": 1.01304877, + "balance_loss_mlp": 1.01957965, + "epoch": 0.4942732601833759, + "flos": 20811166871040.0, + "grad_norm": 1.9341928831409219, + "language_loss": 0.72307479, + "learning_rate": 2.0362689552536152e-06, + "loss": 0.74395573, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42773438, + "step": 8221, + "time_per_iteration": 2.398932695388794 + }, + { + "auxiliary_loss_clip": 0.01062692, + "auxiliary_loss_mlp": 0.01024323, + "balance_loss_clip": 1.01271224, + "balance_loss_mlp": 1.02270126, + "epoch": 0.4943333834360439, + "flos": 15048053809920.0, + "grad_norm": 1.632349536180027, + "language_loss": 0.79448867, + "learning_rate": 2.035891194411436e-06, + "loss": 0.81535888, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40039062, + "step": 8222, + "time_per_iteration": 2.3953206539154053 + }, + { + "auxiliary_loss_clip": 0.01063356, + "auxiliary_loss_mlp": 0.01024977, + "balance_loss_clip": 1.01242435, + "balance_loss_mlp": 1.02081561, + "epoch": 0.49439350668871185, + "flos": 16069507724160.0, + "grad_norm": 2.6897808818154156, + "language_loss": 0.72667605, + "learning_rate": 2.0355134322883913e-06, + "loss": 0.74755937, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 8223, + "time_per_iteration": 2.400542736053467 + }, + { + "auxiliary_loss_clip": 0.01063966, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.01630795, + "balance_loss_mlp": 1.02023399, + "epoch": 0.4944536299413798, + "flos": 20879212844160.0, + "grad_norm": 1.8743037124070914, + "language_loss": 0.75224555, + "learning_rate": 2.035135668897964e-06, + "loss": 0.77318251, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 8224, + "time_per_iteration": 2.401432514190674 + }, + { + "auxiliary_loss_clip": 0.01063706, + "auxiliary_loss_mlp": 0.01024917, + "balance_loss_clip": 1.01184559, + "balance_loss_mlp": 1.02075076, + "epoch": 0.4945137531940478, + "flos": 26607866526720.0, + "grad_norm": 2.1754428632095992, + "language_loss": 0.63962519, + "learning_rate": 2.034757904253635e-06, + "loss": 0.66051143, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 8225, + "time_per_iteration": 2.441336154937744 + }, + { + "auxiliary_loss_clip": 0.01062192, + "auxiliary_loss_mlp": 0.01026584, + "balance_loss_clip": 1.0143652, + "balance_loss_mlp": 1.01955342, + "epoch": 0.49457387644671574, + "flos": 23987006184960.0, + "grad_norm": 2.2078009018946942, + "language_loss": 0.76166952, + "learning_rate": 2.034380138368886e-06, + "loss": 0.78255725, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.42578125, + "step": 8226, + "time_per_iteration": 2.398930311203003 + }, + { + "auxiliary_loss_clip": 0.01064802, + "auxiliary_loss_mlp": 0.01028775, + "balance_loss_clip": 1.0150423, + "balance_loss_mlp": 1.02093232, + "epoch": 0.49463399969938376, + "flos": 26975466898560.0, + "grad_norm": 1.6459776453504942, + "language_loss": 0.71069241, + "learning_rate": 2.034002371257198e-06, + "loss": 0.73162818, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4375, + "step": 8227, + "time_per_iteration": 2.4377036094665527 + }, + { + "auxiliary_loss_clip": 0.01065298, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.01690769, + "balance_loss_mlp": 1.02019167, + "epoch": 0.49469412295205173, + "flos": 29680188059520.0, + "grad_norm": 1.462353182573745, + "language_loss": 0.71589595, + "learning_rate": 2.033624602932053e-06, + "loss": 0.73685181, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.45117188, + "step": 8228, + "time_per_iteration": 2.4524760246276855 + }, + { + "auxiliary_loss_clip": 0.01061036, + "auxiliary_loss_mlp": 0.01022768, + "balance_loss_clip": 1.01078105, + "balance_loss_mlp": 1.01954341, + "epoch": 0.4947542462047197, + "flos": 24130708807680.0, + "grad_norm": 1.4368183612796175, + "language_loss": 0.83965302, + "learning_rate": 2.0332468334069327e-06, + "loss": 0.86049104, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 8229, + "time_per_iteration": 3.852625846862793 + }, + { + "auxiliary_loss_clip": 0.0106382, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.01312375, + "balance_loss_mlp": 1.02008092, + "epoch": 0.49481436945738766, + "flos": 20044090189440.0, + "grad_norm": 1.7875467508375242, + "language_loss": 0.74556625, + "learning_rate": 2.032869062695318e-06, + "loss": 0.76646835, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4375, + "step": 8230, + "time_per_iteration": 2.376420021057129 + }, + { + "auxiliary_loss_clip": 0.01063284, + "auxiliary_loss_mlp": 0.01025531, + "balance_loss_clip": 1.01251936, + "balance_loss_mlp": 1.0207212, + "epoch": 0.4948744927100556, + "flos": 15668634960000.0, + "grad_norm": 2.3006991703604536, + "language_loss": 0.79747379, + "learning_rate": 2.032491290810692e-06, + "loss": 0.818362, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42578125, + "step": 8231, + "time_per_iteration": 2.3835835456848145 + }, + { + "auxiliary_loss_clip": 0.01011323, + "auxiliary_loss_mlp": 0.010098, + "balance_loss_clip": 1.00878036, + "balance_loss_mlp": 1.00185955, + "epoch": 0.4949346159627236, + "flos": 68868481478400.0, + "grad_norm": 0.75289413590731, + "language_loss": 0.60201198, + "learning_rate": 2.0321135177665337e-06, + "loss": 0.6222232, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.09472656, + "step": 8232, + "time_per_iteration": 3.158092498779297 + }, + { + "auxiliary_loss_clip": 0.01066547, + "auxiliary_loss_mlp": 0.01026866, + "balance_loss_clip": 1.0132463, + "balance_loss_mlp": 1.02040052, + "epoch": 0.49499473921539155, + "flos": 24789135738240.0, + "grad_norm": 1.6304811232078535, + "language_loss": 0.69111013, + "learning_rate": 2.0317357435763277e-06, + "loss": 0.71204418, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 8233, + "time_per_iteration": 2.4324893951416016 + }, + { + "auxiliary_loss_clip": 0.01063744, + "auxiliary_loss_mlp": 0.01025311, + "balance_loss_clip": 1.01175725, + "balance_loss_mlp": 1.02019596, + "epoch": 0.4950548624680595, + "flos": 32706529464960.0, + "grad_norm": 1.7552182447512004, + "language_loss": 0.66472077, + "learning_rate": 2.0313579682535544e-06, + "loss": 0.68561125, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43554688, + "step": 8234, + "time_per_iteration": 2.4712636470794678 + }, + { + "auxiliary_loss_clip": 0.01010541, + "auxiliary_loss_mlp": 0.01002858, + "balance_loss_clip": 1.00181448, + "balance_loss_mlp": 1.00144076, + "epoch": 0.4951149857207275, + "flos": 50079099196800.0, + "grad_norm": 0.7996780638215686, + "language_loss": 0.57977408, + "learning_rate": 2.030980191811696e-06, + "loss": 0.59990811, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.09082031, + "step": 8235, + "time_per_iteration": 2.9794423580169678 + }, + { + "auxiliary_loss_clip": 0.01064859, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01593184, + "balance_loss_mlp": 1.02055264, + "epoch": 0.49517510897339545, + "flos": 22235692965120.0, + "grad_norm": 1.9992276104084787, + "language_loss": 0.77244353, + "learning_rate": 2.0306024142642338e-06, + "loss": 0.79337883, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 8236, + "time_per_iteration": 2.4173498153686523 + }, + { + "auxiliary_loss_clip": 0.01064633, + "auxiliary_loss_mlp": 0.01027798, + "balance_loss_clip": 1.01594901, + "balance_loss_mlp": 1.02158666, + "epoch": 0.4952352322260634, + "flos": 25372953360000.0, + "grad_norm": 1.7645621044227902, + "language_loss": 0.72337359, + "learning_rate": 2.03022463562465e-06, + "loss": 0.74429786, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.4296875, + "step": 8237, + "time_per_iteration": 2.4428317546844482 + }, + { + "auxiliary_loss_clip": 0.01061145, + "auxiliary_loss_mlp": 0.01025262, + "balance_loss_clip": 1.01292384, + "balance_loss_mlp": 1.01939237, + "epoch": 0.4952953554787314, + "flos": 24607552423680.0, + "grad_norm": 1.8845662364815072, + "language_loss": 0.75850999, + "learning_rate": 2.0298468559064276e-06, + "loss": 0.77937406, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41796875, + "step": 8238, + "time_per_iteration": 2.44551944732666 + }, + { + "auxiliary_loss_clip": 0.01064659, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.01772249, + "balance_loss_mlp": 1.02036905, + "epoch": 0.49535547873139935, + "flos": 17310320910720.0, + "grad_norm": 2.221843981894111, + "language_loss": 0.74090558, + "learning_rate": 2.0294690751230476e-06, + "loss": 0.76186025, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.44140625, + "step": 8239, + "time_per_iteration": 3.7911696434020996 + }, + { + "auxiliary_loss_clip": 0.01065782, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.01695561, + "balance_loss_mlp": 1.02105665, + "epoch": 0.4954156019840673, + "flos": 20739280648320.0, + "grad_norm": 2.2168423297046522, + "language_loss": 0.78414679, + "learning_rate": 2.0290912932879915e-06, + "loss": 0.8051157, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.44726562, + "step": 8240, + "time_per_iteration": 2.4090702533721924 + }, + { + "auxiliary_loss_clip": 0.01062397, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.01757598, + "balance_loss_mlp": 1.02153909, + "epoch": 0.49547572523673533, + "flos": 12819931885440.0, + "grad_norm": 1.8735403337362693, + "language_loss": 0.8547163, + "learning_rate": 2.0287135104147423e-06, + "loss": 0.87563443, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40820312, + "step": 8241, + "time_per_iteration": 2.363499164581299 + }, + { + "auxiliary_loss_clip": 0.01064436, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.02103245, + "balance_loss_mlp": 1.02172482, + "epoch": 0.4955358484894033, + "flos": 15596120332800.0, + "grad_norm": 1.74471833640638, + "language_loss": 0.77291209, + "learning_rate": 2.028335726516781e-06, + "loss": 0.79388773, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.42773438, + "step": 8242, + "time_per_iteration": 2.3805289268493652 + }, + { + "auxiliary_loss_clip": 0.01062733, + "auxiliary_loss_mlp": 0.01026338, + "balance_loss_clip": 1.01334417, + "balance_loss_mlp": 1.01997805, + "epoch": 0.49559597174207126, + "flos": 26463291120000.0, + "grad_norm": 1.6932526095227045, + "language_loss": 0.67154932, + "learning_rate": 2.0279579416075917e-06, + "loss": 0.69243997, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42773438, + "step": 8243, + "time_per_iteration": 5.326393127441406 + }, + { + "auxiliary_loss_clip": 0.0106422, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.01647508, + "balance_loss_mlp": 1.02212334, + "epoch": 0.4956560949947392, + "flos": 23147135585280.0, + "grad_norm": 1.6866616735646447, + "language_loss": 0.6791358, + "learning_rate": 2.027580155700655e-06, + "loss": 0.70006132, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.421875, + "step": 8244, + "time_per_iteration": 2.398118734359741 + }, + { + "auxiliary_loss_clip": 0.01066315, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.01964319, + "balance_loss_mlp": 1.02248645, + "epoch": 0.4957162182474072, + "flos": 20772518129280.0, + "grad_norm": 3.6122490049674774, + "language_loss": 0.75127602, + "learning_rate": 2.0272023688094534e-06, + "loss": 0.77226168, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4375, + "step": 8245, + "time_per_iteration": 2.3969554901123047 + }, + { + "auxiliary_loss_clip": 0.01063661, + "auxiliary_loss_mlp": 0.01025363, + "balance_loss_clip": 1.01266158, + "balance_loss_mlp": 1.02171397, + "epoch": 0.49577634150007516, + "flos": 18733206170880.0, + "grad_norm": 1.8966900004814085, + "language_loss": 0.81643987, + "learning_rate": 2.026824580947469e-06, + "loss": 0.83733016, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41992188, + "step": 8246, + "time_per_iteration": 2.3605618476867676 + }, + { + "auxiliary_loss_clip": 0.0106628, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02146721, + "balance_loss_mlp": 1.02172244, + "epoch": 0.4958364647527431, + "flos": 25553070397440.0, + "grad_norm": 1.6756097587662853, + "language_loss": 0.83925927, + "learning_rate": 2.0264467921281846e-06, + "loss": 0.86027157, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 8247, + "time_per_iteration": 2.4281818866729736 + }, + { + "auxiliary_loss_clip": 0.01065047, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.01619935, + "balance_loss_mlp": 1.02081656, + "epoch": 0.4958965880054111, + "flos": 24424188629760.0, + "grad_norm": 1.9340967437036771, + "language_loss": 0.83213043, + "learning_rate": 2.0260690023650818e-06, + "loss": 0.85306728, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.44140625, + "step": 8248, + "time_per_iteration": 2.418771266937256 + }, + { + "auxiliary_loss_clip": 0.01065445, + "auxiliary_loss_mlp": 0.01027716, + "balance_loss_clip": 1.01431656, + "balance_loss_mlp": 1.02131546, + "epoch": 0.49595671125807905, + "flos": 25082266446720.0, + "grad_norm": 1.8347939558402888, + "language_loss": 0.73832506, + "learning_rate": 2.0256912116716437e-06, + "loss": 0.75925666, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44140625, + "step": 8249, + "time_per_iteration": 2.4298501014709473 + }, + { + "auxiliary_loss_clip": 0.01068242, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01669419, + "balance_loss_mlp": 1.02278066, + "epoch": 0.496016834510747, + "flos": 16434943591680.0, + "grad_norm": 1.823266727813093, + "language_loss": 0.75825298, + "learning_rate": 2.0253134200613526e-06, + "loss": 0.77923298, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45507812, + "step": 8250, + "time_per_iteration": 2.3642523288726807 + }, + { + "auxiliary_loss_clip": 0.01063241, + "auxiliary_loss_mlp": 0.01028602, + "balance_loss_clip": 1.01601386, + "balance_loss_mlp": 1.02021277, + "epoch": 0.496076957763415, + "flos": 23436879891840.0, + "grad_norm": 1.984468976757176, + "language_loss": 0.70239651, + "learning_rate": 2.0249356275476903e-06, + "loss": 0.723315, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4296875, + "step": 8251, + "time_per_iteration": 2.4166903495788574 + }, + { + "auxiliary_loss_clip": 0.01063165, + "auxiliary_loss_mlp": 0.01022361, + "balance_loss_clip": 1.01051176, + "balance_loss_mlp": 1.02091563, + "epoch": 0.49613708101608295, + "flos": 16908575362560.0, + "grad_norm": 1.8024114175332597, + "language_loss": 0.8077147, + "learning_rate": 2.02455783414414e-06, + "loss": 0.82856995, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.421875, + "step": 8252, + "time_per_iteration": 2.357588052749634 + }, + { + "auxiliary_loss_clip": 0.01064712, + "auxiliary_loss_mlp": 0.01023893, + "balance_loss_clip": 1.01062524, + "balance_loss_mlp": 1.02089047, + "epoch": 0.4961972042687509, + "flos": 16617155310720.0, + "grad_norm": 1.7948085206928475, + "language_loss": 0.80986226, + "learning_rate": 2.0241800398641834e-06, + "loss": 0.83074832, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4375, + "step": 8253, + "time_per_iteration": 2.3893721103668213 + }, + { + "auxiliary_loss_clip": 0.01062258, + "auxiliary_loss_mlp": 0.01023737, + "balance_loss_clip": 1.01246548, + "balance_loss_mlp": 1.02033615, + "epoch": 0.49625732752141893, + "flos": 28955286167040.0, + "grad_norm": 1.8287555528126511, + "language_loss": 0.67422807, + "learning_rate": 2.023802244721303e-06, + "loss": 0.69508803, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.41796875, + "step": 8254, + "time_per_iteration": 2.50803279876709 + }, + { + "auxiliary_loss_clip": 0.01064322, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.0138433, + "balance_loss_mlp": 1.02023005, + "epoch": 0.4963174507740869, + "flos": 23111244840960.0, + "grad_norm": 1.7152402367856745, + "language_loss": 0.74118966, + "learning_rate": 2.023424448728982e-06, + "loss": 0.7620979, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44140625, + "step": 8255, + "time_per_iteration": 2.4196550846099854 + }, + { + "auxiliary_loss_clip": 0.01062592, + "auxiliary_loss_mlp": 0.01024533, + "balance_loss_clip": 1.01194441, + "balance_loss_mlp": 1.02033532, + "epoch": 0.49637757402675486, + "flos": 13917007537920.0, + "grad_norm": 2.571244413938419, + "language_loss": 0.82092714, + "learning_rate": 2.023046651900703e-06, + "loss": 0.84179842, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.421875, + "step": 8256, + "time_per_iteration": 2.365628480911255 + }, + { + "auxiliary_loss_clip": 0.01061532, + "auxiliary_loss_mlp": 0.01021777, + "balance_loss_clip": 1.00958753, + "balance_loss_mlp": 1.02018619, + "epoch": 0.49643769727942283, + "flos": 22307928301440.0, + "grad_norm": 1.444551116042385, + "language_loss": 0.80777586, + "learning_rate": 2.022668854249948e-06, + "loss": 0.82860899, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 8257, + "time_per_iteration": 2.4241559505462646 + }, + { + "auxiliary_loss_clip": 0.01066739, + "auxiliary_loss_mlp": 0.01026478, + "balance_loss_clip": 1.01239383, + "balance_loss_mlp": 1.0211215, + "epoch": 0.4964978205320908, + "flos": 19499235511680.0, + "grad_norm": 1.705353444386144, + "language_loss": 0.6594857, + "learning_rate": 2.0222910557902e-06, + "loss": 0.6804179, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45703125, + "step": 8258, + "time_per_iteration": 2.371925115585327 + }, + { + "auxiliary_loss_clip": 0.01061811, + "auxiliary_loss_mlp": 0.0102208, + "balance_loss_clip": 1.01034939, + "balance_loss_mlp": 1.01984954, + "epoch": 0.49655794378475876, + "flos": 23435518348800.0, + "grad_norm": 1.4204588618557794, + "language_loss": 0.77803195, + "learning_rate": 2.0219132565349414e-06, + "loss": 0.79887092, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.41992188, + "step": 8259, + "time_per_iteration": 2.4026355743408203 + }, + { + "auxiliary_loss_clip": 0.01062768, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.01209426, + "balance_loss_mlp": 1.02123535, + "epoch": 0.4966180670374267, + "flos": 26829983796480.0, + "grad_norm": 1.5052022262164508, + "language_loss": 0.785375, + "learning_rate": 2.0215354564976555e-06, + "loss": 0.80624855, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4140625, + "step": 8260, + "time_per_iteration": 2.4452292919158936 + }, + { + "auxiliary_loss_clip": 0.01062789, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.01407647, + "balance_loss_mlp": 1.01950824, + "epoch": 0.4966781902900947, + "flos": 22008478636800.0, + "grad_norm": 2.9802896189632206, + "language_loss": 0.7686497, + "learning_rate": 2.0211576556918244e-06, + "loss": 0.78954417, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43359375, + "step": 8261, + "time_per_iteration": 2.3612887859344482 + }, + { + "auxiliary_loss_clip": 0.01061807, + "auxiliary_loss_mlp": 0.01023484, + "balance_loss_clip": 1.01160479, + "balance_loss_mlp": 1.02044904, + "epoch": 0.49673831354276266, + "flos": 26212160643840.0, + "grad_norm": 1.8943679820166999, + "language_loss": 0.90246761, + "learning_rate": 2.0207798541309307e-06, + "loss": 0.92332053, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41210938, + "step": 8262, + "time_per_iteration": 2.4295995235443115 + }, + { + "auxiliary_loss_clip": 0.0106309, + "auxiliary_loss_mlp": 0.01022794, + "balance_loss_clip": 1.01105142, + "balance_loss_mlp": 1.02167833, + "epoch": 0.4967984367954306, + "flos": 23181245850240.0, + "grad_norm": 1.434545122307879, + "language_loss": 0.7398966, + "learning_rate": 2.0204020518284576e-06, + "loss": 0.76075548, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.4140625, + "step": 8263, + "time_per_iteration": 2.394153594970703 + }, + { + "auxiliary_loss_clip": 0.01067226, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.01773047, + "balance_loss_mlp": 1.02221811, + "epoch": 0.4968585600480986, + "flos": 19280434821120.0, + "grad_norm": 2.187543012575431, + "language_loss": 0.69617534, + "learning_rate": 2.0200242487978877e-06, + "loss": 0.71716046, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44921875, + "step": 8264, + "time_per_iteration": 2.372612714767456 + }, + { + "auxiliary_loss_clip": 0.01064455, + "auxiliary_loss_mlp": 0.01024392, + "balance_loss_clip": 1.01176739, + "balance_loss_mlp": 1.02018619, + "epoch": 0.49691868330076655, + "flos": 22527601776000.0, + "grad_norm": 1.363279966611396, + "language_loss": 0.65246606, + "learning_rate": 2.019646445052704e-06, + "loss": 0.67335451, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44335938, + "step": 8265, + "time_per_iteration": 2.403905153274536 + }, + { + "auxiliary_loss_clip": 0.01010337, + "auxiliary_loss_mlp": 0.0100933, + "balance_loss_clip": 1.00844789, + "balance_loss_mlp": 1.00194526, + "epoch": 0.4969788065534345, + "flos": 66573500567040.0, + "grad_norm": 1.2585273866496296, + "language_loss": 0.53464919, + "learning_rate": 2.0192686406063897e-06, + "loss": 0.55484587, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.08398438, + "step": 8266, + "time_per_iteration": 3.0708909034729004 + }, + { + "auxiliary_loss_clip": 0.01061202, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01567817, + "balance_loss_mlp": 1.02003145, + "epoch": 0.49703892980610254, + "flos": 24058403648640.0, + "grad_norm": 1.682977532335443, + "language_loss": 0.79537368, + "learning_rate": 2.018890835472426e-06, + "loss": 0.81627226, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41210938, + "step": 8267, + "time_per_iteration": 2.4088282585144043 + }, + { + "auxiliary_loss_clip": 0.01065343, + "auxiliary_loss_mlp": 0.01024595, + "balance_loss_clip": 1.0114764, + "balance_loss_mlp": 1.02135587, + "epoch": 0.4970990530587705, + "flos": 29125069441920.0, + "grad_norm": 2.2186491304495743, + "language_loss": 0.75311565, + "learning_rate": 2.0185130296642974e-06, + "loss": 0.77401507, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43945312, + "step": 8268, + "time_per_iteration": 3.8452250957489014 + }, + { + "auxiliary_loss_clip": 0.01062292, + "auxiliary_loss_mlp": 0.01027481, + "balance_loss_clip": 1.01387286, + "balance_loss_mlp": 1.01973581, + "epoch": 0.49715917631143847, + "flos": 46024393294080.0, + "grad_norm": 1.4842771581410306, + "language_loss": 0.67939776, + "learning_rate": 2.0181352231954865e-06, + "loss": 0.70029551, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42578125, + "step": 8269, + "time_per_iteration": 2.6045145988464355 + }, + { + "auxiliary_loss_clip": 0.01061461, + "auxiliary_loss_mlp": 0.0102484, + "balance_loss_clip": 1.01219785, + "balance_loss_mlp": 1.02030063, + "epoch": 0.49721929956410643, + "flos": 20190306430080.0, + "grad_norm": 1.544788203864861, + "language_loss": 0.85426599, + "learning_rate": 2.0177574160794768e-06, + "loss": 0.87512898, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41210938, + "step": 8270, + "time_per_iteration": 2.3840785026550293 + }, + { + "auxiliary_loss_clip": 0.01063583, + "auxiliary_loss_mlp": 0.0102182, + "balance_loss_clip": 1.00992298, + "balance_loss_mlp": 1.02106333, + "epoch": 0.4972794228167744, + "flos": 21652468836480.0, + "grad_norm": 1.625222347904302, + "language_loss": 0.70923054, + "learning_rate": 2.017379608329749e-06, + "loss": 0.7300846, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.42578125, + "step": 8271, + "time_per_iteration": 2.3981871604919434 + }, + { + "auxiliary_loss_clip": 0.01065225, + "auxiliary_loss_mlp": 0.01023977, + "balance_loss_clip": 1.01134133, + "balance_loss_mlp": 1.02143729, + "epoch": 0.49733954606944236, + "flos": 24278600793600.0, + "grad_norm": 1.4040051275551682, + "language_loss": 0.784823, + "learning_rate": 2.017001799959789e-06, + "loss": 0.80571502, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4375, + "step": 8272, + "time_per_iteration": 2.4261014461517334 + }, + { + "auxiliary_loss_clip": 0.01065721, + "auxiliary_loss_mlp": 0.01028875, + "balance_loss_clip": 1.01613712, + "balance_loss_mlp": 1.02191329, + "epoch": 0.4973996693221103, + "flos": 37851051323520.0, + "grad_norm": 2.0238516474138235, + "language_loss": 0.66241539, + "learning_rate": 2.0166239909830786e-06, + "loss": 0.68336135, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4375, + "step": 8273, + "time_per_iteration": 2.555712938308716 + }, + { + "auxiliary_loss_clip": 0.01061313, + "auxiliary_loss_mlp": 0.01024341, + "balance_loss_clip": 1.01159799, + "balance_loss_mlp": 1.02025282, + "epoch": 0.4974597925747783, + "flos": 21360350557440.0, + "grad_norm": 1.646804415183197, + "language_loss": 0.78163731, + "learning_rate": 2.0162461814130996e-06, + "loss": 0.80249387, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 8274, + "time_per_iteration": 2.3751280307769775 + }, + { + "auxiliary_loss_clip": 0.01065229, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01518679, + "balance_loss_mlp": 1.020702, + "epoch": 0.49751991582744626, + "flos": 30736799579520.0, + "grad_norm": 1.8179436852590198, + "language_loss": 0.75849771, + "learning_rate": 2.015868371263338e-06, + "loss": 0.77943563, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4453125, + "step": 8275, + "time_per_iteration": 2.4814600944519043 + }, + { + "auxiliary_loss_clip": 0.0106533, + "auxiliary_loss_mlp": 0.01029604, + "balance_loss_clip": 1.0152036, + "balance_loss_mlp": 1.02052021, + "epoch": 0.4975800390801142, + "flos": 14099673104640.0, + "grad_norm": 2.5264061067592927, + "language_loss": 0.83680987, + "learning_rate": 2.0154905605472736e-06, + "loss": 0.85775924, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.44726562, + "step": 8276, + "time_per_iteration": 2.386993169784546 + }, + { + "auxiliary_loss_clip": 0.01059606, + "auxiliary_loss_mlp": 0.01025066, + "balance_loss_clip": 1.01333535, + "balance_loss_mlp": 1.01907945, + "epoch": 0.4976401623327822, + "flos": 24206121077760.0, + "grad_norm": 1.4187983890464366, + "language_loss": 0.70037061, + "learning_rate": 2.0151127492783913e-06, + "loss": 0.72121727, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40625, + "step": 8277, + "time_per_iteration": 2.4289841651916504 + }, + { + "auxiliary_loss_clip": 0.01062293, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01376629, + "balance_loss_mlp": 1.02069616, + "epoch": 0.49770028558545015, + "flos": 21135859315200.0, + "grad_norm": 1.7437608287096868, + "language_loss": 0.82104975, + "learning_rate": 2.014734937470174e-06, + "loss": 0.84193695, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41601562, + "step": 8278, + "time_per_iteration": 2.3770694732666016 + }, + { + "auxiliary_loss_clip": 0.01063927, + "auxiliary_loss_mlp": 0.01027939, + "balance_loss_clip": 1.01467681, + "balance_loss_mlp": 1.02039862, + "epoch": 0.4977604088381181, + "flos": 16762987526400.0, + "grad_norm": 1.8807448943217984, + "language_loss": 0.67026877, + "learning_rate": 2.014357125136104e-06, + "loss": 0.69118744, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8279, + "time_per_iteration": 3.8164827823638916 + }, + { + "auxiliary_loss_clip": 0.01061594, + "auxiliary_loss_mlp": 0.01028964, + "balance_loss_clip": 1.0153687, + "balance_loss_mlp": 1.01913059, + "epoch": 0.49782053209078614, + "flos": 15702675402240.0, + "grad_norm": 2.197023631645862, + "language_loss": 0.70903468, + "learning_rate": 2.013979312289666e-06, + "loss": 0.72994024, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42578125, + "step": 8280, + "time_per_iteration": 2.3662502765655518 + }, + { + "auxiliary_loss_clip": 0.01062169, + "auxiliary_loss_mlp": 0.01023583, + "balance_loss_clip": 1.01153088, + "balance_loss_mlp": 1.01875567, + "epoch": 0.4978806553434541, + "flos": 24752546766720.0, + "grad_norm": 1.8504974963844372, + "language_loss": 0.64583027, + "learning_rate": 2.0136014989443416e-06, + "loss": 0.66668785, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.43359375, + "step": 8281, + "time_per_iteration": 2.417127847671509 + }, + { + "auxiliary_loss_clip": 0.01064725, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.01512146, + "balance_loss_mlp": 1.02083278, + "epoch": 0.49794077859612207, + "flos": 13114877984640.0, + "grad_norm": 2.276074207642254, + "language_loss": 0.73240566, + "learning_rate": 2.013223685113615e-06, + "loss": 0.75334281, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4375, + "step": 8282, + "time_per_iteration": 3.8130581378936768 + }, + { + "auxiliary_loss_clip": 0.01059469, + "auxiliary_loss_mlp": 0.01024411, + "balance_loss_clip": 1.01301432, + "balance_loss_mlp": 1.01937509, + "epoch": 0.49800090184879003, + "flos": 27523952357760.0, + "grad_norm": 1.525698892107428, + "language_loss": 0.73251522, + "learning_rate": 2.0128458708109694e-06, + "loss": 0.75335395, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40039062, + "step": 8283, + "time_per_iteration": 3.9020609855651855 + }, + { + "auxiliary_loss_clip": 0.01061957, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.01271296, + "balance_loss_mlp": 1.01949501, + "epoch": 0.498061025101458, + "flos": 19791458524800.0, + "grad_norm": 1.5405782105807417, + "language_loss": 0.78697634, + "learning_rate": 2.0124680560498877e-06, + "loss": 0.80785739, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42578125, + "step": 8284, + "time_per_iteration": 2.392714500427246 + }, + { + "auxiliary_loss_clip": 0.01067621, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.01120639, + "balance_loss_mlp": 1.02213275, + "epoch": 0.49812114835412596, + "flos": 29892739616640.0, + "grad_norm": 1.6714886158378848, + "language_loss": 0.73703915, + "learning_rate": 2.0120902408438527e-06, + "loss": 0.75796926, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.45507812, + "step": 8285, + "time_per_iteration": 2.445253849029541 + }, + { + "auxiliary_loss_clip": 0.01065368, + "auxiliary_loss_mlp": 0.0102332, + "balance_loss_clip": 1.0102067, + "balance_loss_mlp": 1.02229714, + "epoch": 0.49818127160679393, + "flos": 23145983510400.0, + "grad_norm": 13.218554922375853, + "language_loss": 0.74088073, + "learning_rate": 2.011712425206348e-06, + "loss": 0.76176763, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 8286, + "time_per_iteration": 2.4054460525512695 + }, + { + "auxiliary_loss_clip": 0.01065229, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.02040339, + "balance_loss_mlp": 1.02180243, + "epoch": 0.4982413948594619, + "flos": 21651735697920.0, + "grad_norm": 1.8513349514512842, + "language_loss": 0.85355568, + "learning_rate": 2.011334609150857e-06, + "loss": 0.87454402, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43359375, + "step": 8287, + "time_per_iteration": 2.3865091800689697 + }, + { + "auxiliary_loss_clip": 0.01063169, + "auxiliary_loss_mlp": 0.01025033, + "balance_loss_clip": 1.01212251, + "balance_loss_mlp": 1.01997828, + "epoch": 0.49830151811212986, + "flos": 32485669004160.0, + "grad_norm": 1.5888903736258249, + "language_loss": 0.72419554, + "learning_rate": 2.0109567926908636e-06, + "loss": 0.74507761, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 8288, + "time_per_iteration": 2.480376958847046 + }, + { + "auxiliary_loss_clip": 0.01068209, + "auxiliary_loss_mlp": 0.01029696, + "balance_loss_clip": 1.01503897, + "balance_loss_mlp": 1.02157593, + "epoch": 0.4983616413647978, + "flos": 18141603315840.0, + "grad_norm": 2.5944203482140473, + "language_loss": 0.80334568, + "learning_rate": 2.01057897583985e-06, + "loss": 0.82432473, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.46679688, + "step": 8289, + "time_per_iteration": 2.3573157787323 + }, + { + "auxiliary_loss_clip": 0.0106221, + "auxiliary_loss_mlp": 0.01025189, + "balance_loss_clip": 1.01288033, + "balance_loss_mlp": 1.01942933, + "epoch": 0.4984217646174658, + "flos": 19717826734080.0, + "grad_norm": 2.3001235458220273, + "language_loss": 0.73747778, + "learning_rate": 2.0102011586113003e-06, + "loss": 0.75835174, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42773438, + "step": 8290, + "time_per_iteration": 2.39393949508667 + }, + { + "auxiliary_loss_clip": 0.01066655, + "auxiliary_loss_mlp": 0.01027446, + "balance_loss_clip": 1.01242542, + "balance_loss_mlp": 1.02243876, + "epoch": 0.49848188787013376, + "flos": 24935386890240.0, + "grad_norm": 1.4912448351931957, + "language_loss": 0.84053004, + "learning_rate": 2.009823341018697e-06, + "loss": 0.86147106, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.44140625, + "step": 8291, + "time_per_iteration": 2.4760425090789795 + }, + { + "auxiliary_loss_clip": 0.01071043, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.01255143, + "balance_loss_mlp": 1.02392483, + "epoch": 0.4985420111228017, + "flos": 22381350624000.0, + "grad_norm": 1.7466761290202875, + "language_loss": 0.66940999, + "learning_rate": 2.0094455230755247e-06, + "loss": 0.69039965, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.47070312, + "step": 8292, + "time_per_iteration": 2.40769624710083 + }, + { + "auxiliary_loss_clip": 0.01065237, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.01333475, + "balance_loss_mlp": 1.02214861, + "epoch": 0.4986021343754697, + "flos": 16215549408000.0, + "grad_norm": 1.5741220944371193, + "language_loss": 0.66844761, + "learning_rate": 2.009067704795265e-06, + "loss": 0.68937063, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4296875, + "step": 8293, + "time_per_iteration": 2.363285779953003 + }, + { + "auxiliary_loss_clip": 0.01065762, + "auxiliary_loss_mlp": 0.01022735, + "balance_loss_clip": 1.01003957, + "balance_loss_mlp": 1.02206171, + "epoch": 0.4986622576281377, + "flos": 23402490336000.0, + "grad_norm": 2.6752375921081932, + "language_loss": 0.82041043, + "learning_rate": 2.0086898861914026e-06, + "loss": 0.84129542, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 8294, + "time_per_iteration": 2.4106600284576416 + }, + { + "auxiliary_loss_clip": 0.01065267, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.01191235, + "balance_loss_mlp": 1.02084708, + "epoch": 0.49872238088080567, + "flos": 19973530598400.0, + "grad_norm": 1.5973654022712644, + "language_loss": 0.73018312, + "learning_rate": 2.008312067277421e-06, + "loss": 0.75108677, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4453125, + "step": 8295, + "time_per_iteration": 2.3862719535827637 + }, + { + "auxiliary_loss_clip": 0.01064122, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.01601887, + "balance_loss_mlp": 1.02117944, + "epoch": 0.49878250413347364, + "flos": 22891920480000.0, + "grad_norm": 1.782459473413544, + "language_loss": 0.67489547, + "learning_rate": 2.007934248066802e-06, + "loss": 0.69582975, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4296875, + "step": 8296, + "time_per_iteration": 2.394148826599121 + }, + { + "auxiliary_loss_clip": 0.01066815, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.01732385, + "balance_loss_mlp": 1.02135611, + "epoch": 0.4988426273861416, + "flos": 32597076752640.0, + "grad_norm": 3.4126952071956262, + "language_loss": 0.64945382, + "learning_rate": 2.0075564285730313e-06, + "loss": 0.6704365, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45507812, + "step": 8297, + "time_per_iteration": 2.474010705947876 + }, + { + "auxiliary_loss_clip": 0.01063549, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.01446521, + "balance_loss_mlp": 1.02092695, + "epoch": 0.49890275063880957, + "flos": 20922539708160.0, + "grad_norm": 1.925460135327972, + "language_loss": 0.69735444, + "learning_rate": 2.00717860880959e-06, + "loss": 0.7182638, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 8298, + "time_per_iteration": 2.4137985706329346 + }, + { + "auxiliary_loss_clip": 0.01060735, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01488638, + "balance_loss_mlp": 1.01909637, + "epoch": 0.49896287389147753, + "flos": 18623474167680.0, + "grad_norm": 1.7167895270431772, + "language_loss": 0.84552658, + "learning_rate": 2.0068007887899636e-06, + "loss": 0.86641413, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 8299, + "time_per_iteration": 2.3763620853424072 + }, + { + "auxiliary_loss_clip": 0.01066213, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.01477945, + "balance_loss_mlp": 1.02046537, + "epoch": 0.4990229971441455, + "flos": 24825410507520.0, + "grad_norm": 2.0444947284320723, + "language_loss": 0.69042194, + "learning_rate": 2.0064229685276345e-06, + "loss": 0.71137762, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.45703125, + "step": 8300, + "time_per_iteration": 2.4565980434417725 + }, + { + "auxiliary_loss_clip": 0.0101121, + "auxiliary_loss_mlp": 0.01000145, + "balance_loss_clip": 0.99911362, + "balance_loss_mlp": 1.00245619, + "epoch": 0.49908312039681346, + "flos": 71381006271360.0, + "grad_norm": 0.7589139195743744, + "language_loss": 0.51153231, + "learning_rate": 2.0060451480360855e-06, + "loss": 0.53164589, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.08789062, + "step": 8301, + "time_per_iteration": 3.125663995742798 + }, + { + "auxiliary_loss_clip": 0.01063268, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.01761699, + "balance_loss_mlp": 1.02099681, + "epoch": 0.4991432436494814, + "flos": 19827628560000.0, + "grad_norm": 2.1079218843738268, + "language_loss": 0.84567523, + "learning_rate": 2.005667327328801e-06, + "loss": 0.86661136, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 8302, + "time_per_iteration": 2.4024007320404053 + }, + { + "auxiliary_loss_clip": 0.01063536, + "auxiliary_loss_mlp": 0.0102135, + "balance_loss_clip": 1.00916696, + "balance_loss_mlp": 1.02189851, + "epoch": 0.4992033669021494, + "flos": 15121022284800.0, + "grad_norm": 1.6576430011845633, + "language_loss": 0.69254827, + "learning_rate": 2.005289506419264e-06, + "loss": 0.71339715, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41601562, + "step": 8303, + "time_per_iteration": 2.3847341537475586 + }, + { + "auxiliary_loss_clip": 0.01064762, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.01332355, + "balance_loss_mlp": 1.02069664, + "epoch": 0.49926349015481736, + "flos": 31206730746240.0, + "grad_norm": 1.6610557899394975, + "language_loss": 0.72653002, + "learning_rate": 2.0049116853209586e-06, + "loss": 0.7474426, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44140625, + "step": 8304, + "time_per_iteration": 2.484576940536499 + }, + { + "auxiliary_loss_clip": 0.01065231, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01832986, + "balance_loss_mlp": 1.02159691, + "epoch": 0.4993236134074853, + "flos": 24899042298240.0, + "grad_norm": 1.99890301913112, + "language_loss": 0.80790079, + "learning_rate": 2.0045338640473683e-06, + "loss": 0.82885873, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.43554688, + "step": 8305, + "time_per_iteration": 2.405923366546631 + }, + { + "auxiliary_loss_clip": 0.01009549, + "auxiliary_loss_mlp": 0.01002141, + "balance_loss_clip": 1.00114548, + "balance_loss_mlp": 1.00084388, + "epoch": 0.4993837366601533, + "flos": 70417508947200.0, + "grad_norm": 0.7117448502161117, + "language_loss": 0.56864309, + "learning_rate": 2.0041560426119747e-06, + "loss": 0.58875996, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.08691406, + "step": 8306, + "time_per_iteration": 3.1527178287506104 + }, + { + "auxiliary_loss_clip": 0.01064663, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01624966, + "balance_loss_mlp": 1.02114069, + "epoch": 0.4994438599128213, + "flos": 15960299391360.0, + "grad_norm": 1.6939697392355382, + "language_loss": 0.82625782, + "learning_rate": 2.0037782210282632e-06, + "loss": 0.84720671, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.43554688, + "step": 8307, + "time_per_iteration": 3.8962244987487793 + }, + { + "auxiliary_loss_clip": 0.01064226, + "auxiliary_loss_mlp": 0.01025326, + "balance_loss_clip": 1.0117662, + "balance_loss_mlp": 1.02101862, + "epoch": 0.4995039831654893, + "flos": 27927059448960.0, + "grad_norm": 1.8630792931173346, + "language_loss": 0.60777968, + "learning_rate": 2.0034003993097168e-06, + "loss": 0.62867522, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 8308, + "time_per_iteration": 2.4405760765075684 + }, + { + "auxiliary_loss_clip": 0.01061182, + "auxiliary_loss_mlp": 0.0102581, + "balance_loss_clip": 1.01343012, + "balance_loss_mlp": 1.01977885, + "epoch": 0.49956410641815724, + "flos": 24203712193920.0, + "grad_norm": 1.5209373483786925, + "language_loss": 0.74867254, + "learning_rate": 2.0030225774698184e-06, + "loss": 0.76954246, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4140625, + "step": 8309, + "time_per_iteration": 2.4293088912963867 + }, + { + "auxiliary_loss_clip": 0.01063378, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.01491976, + "balance_loss_mlp": 1.02076864, + "epoch": 0.4996242296708252, + "flos": 16179204816000.0, + "grad_norm": 4.424544309095664, + "language_loss": 0.79694057, + "learning_rate": 2.002644755522053e-06, + "loss": 0.81784815, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42578125, + "step": 8310, + "time_per_iteration": 2.370014190673828 + }, + { + "auxiliary_loss_clip": 0.01065624, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.01679242, + "balance_loss_mlp": 1.02186227, + "epoch": 0.49968435292349317, + "flos": 16872579884160.0, + "grad_norm": 2.974697409269726, + "language_loss": 0.70642543, + "learning_rate": 2.0022669334799023e-06, + "loss": 0.7273789, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 8311, + "time_per_iteration": 2.3641228675842285 + }, + { + "auxiliary_loss_clip": 0.01065396, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.01624656, + "balance_loss_mlp": 1.02235341, + "epoch": 0.49974447617616113, + "flos": 14938636008960.0, + "grad_norm": 3.26129359837468, + "language_loss": 0.77289563, + "learning_rate": 2.0018891113568506e-06, + "loss": 0.793841, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4296875, + "step": 8312, + "time_per_iteration": 2.3752686977386475 + }, + { + "auxiliary_loss_clip": 0.01063802, + "auxiliary_loss_mlp": 0.0102608, + "balance_loss_clip": 1.01289535, + "balance_loss_mlp": 1.02086854, + "epoch": 0.4998045994288291, + "flos": 26650320606720.0, + "grad_norm": 1.7960669760505534, + "language_loss": 0.84444332, + "learning_rate": 2.0015112891663814e-06, + "loss": 0.8653422, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4296875, + "step": 8313, + "time_per_iteration": 2.416395902633667 + }, + { + "auxiliary_loss_clip": 0.01064415, + "auxiliary_loss_mlp": 0.01022931, + "balance_loss_clip": 1.00941849, + "balance_loss_mlp": 1.02093983, + "epoch": 0.49986472268149706, + "flos": 20952879546240.0, + "grad_norm": 2.1200426298836583, + "language_loss": 0.80884373, + "learning_rate": 2.0011334669219787e-06, + "loss": 0.82971716, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43554688, + "step": 8314, + "time_per_iteration": 2.444504976272583 + }, + { + "auxiliary_loss_clip": 0.01068281, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.02085328, + "balance_loss_mlp": 1.02293539, + "epoch": 0.49992484593416503, + "flos": 22782781969920.0, + "grad_norm": 1.7210320839634696, + "language_loss": 0.79211551, + "learning_rate": 2.000755644637124e-06, + "loss": 0.81315601, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.453125, + "step": 8315, + "time_per_iteration": 2.4122416973114014 + }, + { + "auxiliary_loss_clip": 0.0106441, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.01343799, + "balance_loss_mlp": 1.02209711, + "epoch": 0.499984969186833, + "flos": 46785325576320.0, + "grad_norm": 1.5714760442126712, + "language_loss": 0.74550819, + "learning_rate": 2.000377822325304e-06, + "loss": 0.76641154, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42382812, + "step": 8316, + "time_per_iteration": 2.615898609161377 + }, + { + "auxiliary_loss_clip": 0.01062612, + "auxiliary_loss_mlp": 0.01025579, + "balance_loss_clip": 1.01358712, + "balance_loss_mlp": 1.02164185, + "epoch": 0.500045092439501, + "flos": 25555793483520.0, + "grad_norm": 1.6204128981026233, + "language_loss": 0.813003, + "learning_rate": 2e-06, + "loss": 0.83388495, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41015625, + "step": 8317, + "time_per_iteration": 2.4186623096466064 + }, + { + "auxiliary_loss_clip": 0.01063147, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.01443946, + "balance_loss_mlp": 1.01938796, + "epoch": 0.5001052156921689, + "flos": 20703704106240.0, + "grad_norm": 1.6753958473987465, + "language_loss": 0.82978302, + "learning_rate": 1.9996221776746954e-06, + "loss": 0.85070199, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4375, + "step": 8318, + "time_per_iteration": 2.388617753982544 + }, + { + "auxiliary_loss_clip": 0.0106375, + "auxiliary_loss_mlp": 0.01022914, + "balance_loss_clip": 1.01046896, + "balance_loss_mlp": 1.02150345, + "epoch": 0.500165338944837, + "flos": 21250059972480.0, + "grad_norm": 3.090025102065958, + "language_loss": 0.81653643, + "learning_rate": 1.999244355362875e-06, + "loss": 0.83740312, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.421875, + "step": 8319, + "time_per_iteration": 3.823045015335083 + }, + { + "auxiliary_loss_clip": 0.01064575, + "auxiliary_loss_mlp": 0.01022774, + "balance_loss_clip": 1.01079369, + "balance_loss_mlp": 1.02126479, + "epoch": 0.5002254621975049, + "flos": 27853183278720.0, + "grad_norm": 1.6274842634938755, + "language_loss": 0.718831, + "learning_rate": 1.9988665330780216e-06, + "loss": 0.73970455, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.43359375, + "step": 8320, + "time_per_iteration": 2.458753824234009 + }, + { + "auxiliary_loss_clip": 0.01069772, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.01752996, + "balance_loss_mlp": 1.02129054, + "epoch": 0.5002855854501729, + "flos": 15551257368960.0, + "grad_norm": 3.5817087070498577, + "language_loss": 0.76733804, + "learning_rate": 1.998488710833619e-06, + "loss": 0.78836071, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.484375, + "step": 8321, + "time_per_iteration": 2.3650574684143066 + }, + { + "auxiliary_loss_clip": 0.01067036, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.01801538, + "balance_loss_mlp": 1.02198422, + "epoch": 0.5003457087028408, + "flos": 16106480720640.0, + "grad_norm": 1.5508289900681336, + "language_loss": 0.82725966, + "learning_rate": 1.9981108886431497e-06, + "loss": 0.8482396, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44921875, + "step": 8322, + "time_per_iteration": 4.982910394668579 + }, + { + "auxiliary_loss_clip": 0.01066361, + "auxiliary_loss_mlp": 0.01025787, + "balance_loss_clip": 1.01272798, + "balance_loss_mlp": 1.02133095, + "epoch": 0.5004058319555088, + "flos": 22709429470080.0, + "grad_norm": 2.680160551063949, + "language_loss": 0.74088901, + "learning_rate": 1.997733066520098e-06, + "loss": 0.76181054, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.45117188, + "step": 8323, + "time_per_iteration": 2.3756463527679443 + }, + { + "auxiliary_loss_clip": 0.01064787, + "auxiliary_loss_mlp": 0.01028451, + "balance_loss_clip": 1.01471233, + "balance_loss_mlp": 1.02120638, + "epoch": 0.5004659552081767, + "flos": 30116637365760.0, + "grad_norm": 1.5489141979388505, + "language_loss": 0.6910131, + "learning_rate": 1.9973552444779477e-06, + "loss": 0.71194553, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43554688, + "step": 8324, + "time_per_iteration": 2.4673707485198975 + }, + { + "auxiliary_loss_clip": 0.01062665, + "auxiliary_loss_mlp": 0.01030661, + "balance_loss_clip": 1.01776886, + "balance_loss_mlp": 1.01950169, + "epoch": 0.5005260784608447, + "flos": 18623718547200.0, + "grad_norm": 1.8716822254218757, + "language_loss": 0.72203302, + "learning_rate": 1.9969774225301814e-06, + "loss": 0.74296629, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43164062, + "step": 8325, + "time_per_iteration": 2.378615140914917 + }, + { + "auxiliary_loss_clip": 0.01065728, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.01888204, + "balance_loss_mlp": 1.02107096, + "epoch": 0.5005862017135126, + "flos": 24858927279360.0, + "grad_norm": 1.5982837014807212, + "language_loss": 0.73634386, + "learning_rate": 1.9965996006902835e-06, + "loss": 0.75733972, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4453125, + "step": 8326, + "time_per_iteration": 2.4204022884368896 + }, + { + "auxiliary_loss_clip": 0.01063407, + "auxiliary_loss_mlp": 0.01028678, + "balance_loss_clip": 1.01621509, + "balance_loss_mlp": 1.02077138, + "epoch": 0.5006463249661807, + "flos": 18733380727680.0, + "grad_norm": 1.4371832969662792, + "language_loss": 0.77896118, + "learning_rate": 1.996221778971737e-06, + "loss": 0.79988199, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42578125, + "step": 8327, + "time_per_iteration": 2.4008874893188477 + }, + { + "auxiliary_loss_clip": 0.01067622, + "auxiliary_loss_mlp": 0.01036165, + "balance_loss_clip": 1.02254522, + "balance_loss_mlp": 1.02360463, + "epoch": 0.5007064482188487, + "flos": 13041316016640.0, + "grad_norm": 2.260004619101643, + "language_loss": 0.87335098, + "learning_rate": 1.995843957388025e-06, + "loss": 0.89438879, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43945312, + "step": 8328, + "time_per_iteration": 2.4036848545074463 + }, + { + "auxiliary_loss_clip": 0.01067349, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.01401091, + "balance_loss_mlp": 1.02164698, + "epoch": 0.5007665714715166, + "flos": 21287591550720.0, + "grad_norm": 1.945897026678486, + "language_loss": 0.74441862, + "learning_rate": 1.9954661359526324e-06, + "loss": 0.76537126, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45703125, + "step": 8329, + "time_per_iteration": 2.4084339141845703 + }, + { + "auxiliary_loss_clip": 0.01065302, + "auxiliary_loss_mlp": 0.01027495, + "balance_loss_clip": 1.01438785, + "balance_loss_mlp": 1.0223546, + "epoch": 0.5008266947241846, + "flos": 29753226357120.0, + "grad_norm": 1.5635491927261564, + "language_loss": 0.79502738, + "learning_rate": 1.9950883146790413e-06, + "loss": 0.81595534, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 8330, + "time_per_iteration": 2.4660441875457764 + }, + { + "auxiliary_loss_clip": 0.01009788, + "auxiliary_loss_mlp": 0.01002533, + "balance_loss_clip": 1.00147176, + "balance_loss_mlp": 1.00137699, + "epoch": 0.5008868179768525, + "flos": 63555050995200.0, + "grad_norm": 0.7266759233269988, + "language_loss": 0.56081307, + "learning_rate": 1.9947104935807355e-06, + "loss": 0.58093631, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.08398438, + "step": 8331, + "time_per_iteration": 2.8880603313446045 + }, + { + "auxiliary_loss_clip": 0.01064947, + "auxiliary_loss_mlp": 0.01027071, + "balance_loss_clip": 1.01359987, + "balance_loss_mlp": 1.02205396, + "epoch": 0.5009469412295205, + "flos": 27374559183360.0, + "grad_norm": 1.8202233068473077, + "language_loss": 0.6522392, + "learning_rate": 1.9943326726711987e-06, + "loss": 0.67315936, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4296875, + "step": 8332, + "time_per_iteration": 2.471418619155884 + }, + { + "auxiliary_loss_clip": 0.01063802, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.01183259, + "balance_loss_mlp": 1.02128196, + "epoch": 0.5010070644821885, + "flos": 27377666294400.0, + "grad_norm": 1.5628991095675804, + "language_loss": 0.75035697, + "learning_rate": 1.9939548519639143e-06, + "loss": 0.77124059, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42578125, + "step": 8333, + "time_per_iteration": 2.4456632137298584 + }, + { + "auxiliary_loss_clip": 0.01063426, + "auxiliary_loss_mlp": 0.01024848, + "balance_loss_clip": 1.01126981, + "balance_loss_mlp": 1.01996565, + "epoch": 0.5010671877348565, + "flos": 20661843519360.0, + "grad_norm": 2.3099332775236183, + "language_loss": 0.80290663, + "learning_rate": 1.9935770314723658e-06, + "loss": 0.82378936, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43359375, + "step": 8334, + "time_per_iteration": 2.4105489253997803 + }, + { + "auxiliary_loss_clip": 0.01009363, + "auxiliary_loss_mlp": 0.01002, + "balance_loss_clip": 1.00090337, + "balance_loss_mlp": 1.00085378, + "epoch": 0.5011273109875244, + "flos": 59125374691200.0, + "grad_norm": 0.8993598504765632, + "language_loss": 0.62719846, + "learning_rate": 1.9931992112100362e-06, + "loss": 0.64731205, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.08496094, + "step": 8335, + "time_per_iteration": 3.013087272644043 + }, + { + "auxiliary_loss_clip": 0.01063341, + "auxiliary_loss_mlp": 0.01026549, + "balance_loss_clip": 1.01375759, + "balance_loss_mlp": 1.02088499, + "epoch": 0.5011874342401924, + "flos": 25335212313600.0, + "grad_norm": 1.3805935235549365, + "language_loss": 0.79724765, + "learning_rate": 1.9928213911904096e-06, + "loss": 0.81814659, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42382812, + "step": 8336, + "time_per_iteration": 2.4275362491607666 + }, + { + "auxiliary_loss_clip": 0.01065177, + "auxiliary_loss_mlp": 0.0102761, + "balance_loss_clip": 1.0137403, + "balance_loss_mlp": 1.02073622, + "epoch": 0.5012475574928603, + "flos": 20228920260480.0, + "grad_norm": 1.7720140651886318, + "language_loss": 0.79645491, + "learning_rate": 1.992443571426969e-06, + "loss": 0.81738281, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44335938, + "step": 8337, + "time_per_iteration": 2.3787128925323486 + }, + { + "auxiliary_loss_clip": 0.01067113, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.01329136, + "balance_loss_mlp": 1.02096367, + "epoch": 0.5013076807455283, + "flos": 22709045445120.0, + "grad_norm": 2.74494321369497, + "language_loss": 0.792795, + "learning_rate": 1.9920657519331977e-06, + "loss": 0.81373727, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4609375, + "step": 8338, + "time_per_iteration": 2.3757309913635254 + }, + { + "auxiliary_loss_clip": 0.01063947, + "auxiliary_loss_mlp": 0.010227, + "balance_loss_clip": 1.00917554, + "balance_loss_mlp": 1.02074039, + "epoch": 0.5013678039981962, + "flos": 24243966858240.0, + "grad_norm": 1.6985272936696911, + "language_loss": 0.84981948, + "learning_rate": 1.9916879327225794e-06, + "loss": 0.87068594, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 8339, + "time_per_iteration": 2.417890787124634 + }, + { + "auxiliary_loss_clip": 0.0106407, + "auxiliary_loss_mlp": 0.01027156, + "balance_loss_clip": 1.01412082, + "balance_loss_mlp": 1.02111578, + "epoch": 0.5014279272508643, + "flos": 26175501849600.0, + "grad_norm": 1.4659782487399313, + "language_loss": 0.70918888, + "learning_rate": 1.991310113808597e-06, + "loss": 0.73010111, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4296875, + "step": 8340, + "time_per_iteration": 2.431886911392212 + }, + { + "auxiliary_loss_clip": 0.01066502, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.01577389, + "balance_loss_mlp": 1.02119446, + "epoch": 0.5014880505035323, + "flos": 21429478782720.0, + "grad_norm": 2.2405816959755036, + "language_loss": 0.78786993, + "learning_rate": 1.9909322952047353e-06, + "loss": 0.80882829, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.453125, + "step": 8341, + "time_per_iteration": 2.382690191268921 + }, + { + "auxiliary_loss_clip": 0.01068499, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.01341617, + "balance_loss_mlp": 1.02209568, + "epoch": 0.5015481737562002, + "flos": 15770058059520.0, + "grad_norm": 2.4263520574036677, + "language_loss": 0.89352536, + "learning_rate": 1.9905544769244756e-06, + "loss": 0.91448629, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.46484375, + "step": 8342, + "time_per_iteration": 2.3486762046813965 + }, + { + "auxiliary_loss_clip": 0.01063194, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01554489, + "balance_loss_mlp": 1.02001476, + "epoch": 0.5016082970088682, + "flos": 26829669594240.0, + "grad_norm": 1.6725691391925048, + "language_loss": 0.76573324, + "learning_rate": 1.9901766589813028e-06, + "loss": 0.78665531, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43164062, + "step": 8343, + "time_per_iteration": 2.4177136421203613 + }, + { + "auxiliary_loss_clip": 0.01064333, + "auxiliary_loss_mlp": 0.01023635, + "balance_loss_clip": 1.01217294, + "balance_loss_mlp": 1.02263641, + "epoch": 0.5016684202615361, + "flos": 21469523978880.0, + "grad_norm": 2.4205684157923404, + "language_loss": 0.67113906, + "learning_rate": 1.9897988413887e-06, + "loss": 0.69201875, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.41796875, + "step": 8344, + "time_per_iteration": 2.39961314201355 + }, + { + "auxiliary_loss_clip": 0.0106442, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.0142293, + "balance_loss_mlp": 1.02080154, + "epoch": 0.5017285435142042, + "flos": 26245712327040.0, + "grad_norm": 1.9760820952270155, + "language_loss": 0.75567108, + "learning_rate": 1.9894210241601498e-06, + "loss": 0.77658916, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43554688, + "step": 8345, + "time_per_iteration": 2.4119668006896973 + }, + { + "auxiliary_loss_clip": 0.01066345, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.01530552, + "balance_loss_mlp": 1.0225234, + "epoch": 0.5017886667668721, + "flos": 20776498024320.0, + "grad_norm": 1.887337463729423, + "language_loss": 0.78516948, + "learning_rate": 1.989043207309136e-06, + "loss": 0.80611634, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4375, + "step": 8346, + "time_per_iteration": 2.36750864982605 + }, + { + "auxiliary_loss_clip": 0.01063563, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01911831, + "balance_loss_mlp": 1.0196265, + "epoch": 0.5018487900195401, + "flos": 20155393203840.0, + "grad_norm": 1.431313152579607, + "language_loss": 0.69339049, + "learning_rate": 1.988665390849142e-06, + "loss": 0.71434355, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43945312, + "step": 8347, + "time_per_iteration": 3.780440092086792 + }, + { + "auxiliary_loss_clip": 0.01065661, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.01751924, + "balance_loss_mlp": 1.02103329, + "epoch": 0.501908913272208, + "flos": 18149702751360.0, + "grad_norm": 2.1268924562026736, + "language_loss": 0.71289569, + "learning_rate": 1.9882875747936518e-06, + "loss": 0.73386276, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 8348, + "time_per_iteration": 2.3823468685150146 + }, + { + "auxiliary_loss_clip": 0.01063225, + "auxiliary_loss_mlp": 0.01022341, + "balance_loss_clip": 1.01012826, + "balance_loss_mlp": 1.02147937, + "epoch": 0.501969036524876, + "flos": 23111175018240.0, + "grad_norm": 1.477445275759104, + "language_loss": 0.74507588, + "learning_rate": 1.9879097591561475e-06, + "loss": 0.76593155, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41796875, + "step": 8349, + "time_per_iteration": 2.428532123565674 + }, + { + "auxiliary_loss_clip": 0.01064536, + "auxiliary_loss_mlp": 0.01023626, + "balance_loss_clip": 1.0101676, + "balance_loss_mlp": 1.02045703, + "epoch": 0.5020291597775439, + "flos": 11362447601280.0, + "grad_norm": 1.9972240462937627, + "language_loss": 0.63718271, + "learning_rate": 1.987531943950113e-06, + "loss": 0.65806437, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44140625, + "step": 8350, + "time_per_iteration": 2.3788702487945557 + }, + { + "auxiliary_loss_clip": 0.01010267, + "auxiliary_loss_mlp": 0.01001587, + "balance_loss_clip": 1.00070465, + "balance_loss_mlp": 1.00170827, + "epoch": 0.5020892830302119, + "flos": 64007873729280.0, + "grad_norm": 0.7792682896246632, + "language_loss": 0.59843838, + "learning_rate": 1.9871541291890312e-06, + "loss": 0.61855698, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.0859375, + "step": 8351, + "time_per_iteration": 3.1547012329101562 + }, + { + "auxiliary_loss_clip": 0.01065357, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01489115, + "balance_loss_mlp": 1.02061272, + "epoch": 0.5021494062828799, + "flos": 23731721256960.0, + "grad_norm": 4.055086601928735, + "language_loss": 0.74866533, + "learning_rate": 1.986776314886385e-06, + "loss": 0.76960242, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44726562, + "step": 8352, + "time_per_iteration": 2.4193108081817627 + }, + { + "auxiliary_loss_clip": 0.0106562, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.01429749, + "balance_loss_mlp": 1.02198851, + "epoch": 0.5022095295355479, + "flos": 21575764846080.0, + "grad_norm": 1.613294697490489, + "language_loss": 0.71687007, + "learning_rate": 1.9863985010556587e-06, + "loss": 0.73779917, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43554688, + "step": 8353, + "time_per_iteration": 2.384147882461548 + }, + { + "auxiliary_loss_clip": 0.01068257, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.01541948, + "balance_loss_mlp": 1.02392268, + "epoch": 0.5022696527882159, + "flos": 21396171479040.0, + "grad_norm": 1.6195559124235008, + "language_loss": 0.73874706, + "learning_rate": 1.9860206877103344e-06, + "loss": 0.75972342, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44335938, + "step": 8354, + "time_per_iteration": 2.4360899925231934 + }, + { + "auxiliary_loss_clip": 0.01064642, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.01594853, + "balance_loss_mlp": 1.02229548, + "epoch": 0.5023297760408838, + "flos": 27159528919680.0, + "grad_norm": 1.824391481185833, + "language_loss": 0.72378051, + "learning_rate": 1.9856428748638957e-06, + "loss": 0.74471802, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 8355, + "time_per_iteration": 2.411675214767456 + }, + { + "auxiliary_loss_clip": 0.010653, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.01387811, + "balance_loss_mlp": 1.02041769, + "epoch": 0.5023898992935518, + "flos": 26212614491520.0, + "grad_norm": 1.6382586152683498, + "language_loss": 0.81255531, + "learning_rate": 1.9852650625298267e-06, + "loss": 0.83348548, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44921875, + "step": 8356, + "time_per_iteration": 2.420945167541504 + }, + { + "auxiliary_loss_clip": 0.01066653, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.01858711, + "balance_loss_mlp": 1.02002192, + "epoch": 0.5024500225462197, + "flos": 13439570428800.0, + "grad_norm": 1.8389217258266635, + "language_loss": 0.7605927, + "learning_rate": 1.984887250721609e-06, + "loss": 0.78158891, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46679688, + "step": 8357, + "time_per_iteration": 2.346778631210327 + }, + { + "auxiliary_loss_clip": 0.01065079, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.01774251, + "balance_loss_mlp": 1.02078843, + "epoch": 0.5025101457988878, + "flos": 21578522843520.0, + "grad_norm": 1.8460603325475649, + "language_loss": 0.76373196, + "learning_rate": 1.9845094394527267e-06, + "loss": 0.78470504, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.44335938, + "step": 8358, + "time_per_iteration": 3.8050715923309326 + }, + { + "auxiliary_loss_clip": 0.01067147, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.01797819, + "balance_loss_mlp": 1.0221076, + "epoch": 0.5025702690515557, + "flos": 24643966838400.0, + "grad_norm": 2.120549779088496, + "language_loss": 0.72146136, + "learning_rate": 1.984131628736662e-06, + "loss": 0.74244851, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.45117188, + "step": 8359, + "time_per_iteration": 2.397411823272705 + }, + { + "auxiliary_loss_clip": 0.01065797, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.02064848, + "balance_loss_mlp": 1.02240205, + "epoch": 0.5026303923042237, + "flos": 22089092699520.0, + "grad_norm": 1.5201688419082853, + "language_loss": 0.76464045, + "learning_rate": 1.9837538185868998e-06, + "loss": 0.78563505, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43359375, + "step": 8360, + "time_per_iteration": 2.408262252807617 + }, + { + "auxiliary_loss_clip": 0.01066085, + "auxiliary_loss_mlp": 0.01029514, + "balance_loss_clip": 1.01688337, + "balance_loss_mlp": 1.02248907, + "epoch": 0.5026905155568916, + "flos": 23696039980800.0, + "grad_norm": 1.5782610420675, + "language_loss": 0.76735175, + "learning_rate": 1.9833760090169216e-06, + "loss": 0.78830779, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43554688, + "step": 8361, + "time_per_iteration": 3.9266576766967773 + }, + { + "auxiliary_loss_clip": 0.0106444, + "auxiliary_loss_mlp": 0.01031799, + "balance_loss_clip": 1.01809621, + "balance_loss_mlp": 1.01979649, + "epoch": 0.5027506388095596, + "flos": 25811218056960.0, + "grad_norm": 1.670364095841688, + "language_loss": 0.84657657, + "learning_rate": 1.9829982000402105e-06, + "loss": 0.86753893, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44726562, + "step": 8362, + "time_per_iteration": 3.8608627319335938 + }, + { + "auxiliary_loss_clip": 0.01063483, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01662636, + "balance_loss_mlp": 1.01921999, + "epoch": 0.5028107620622275, + "flos": 27525383723520.0, + "grad_norm": 1.5651688209729113, + "language_loss": 0.78627062, + "learning_rate": 1.9826203916702502e-06, + "loss": 0.80719799, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44335938, + "step": 8363, + "time_per_iteration": 2.4406139850616455 + }, + { + "auxiliary_loss_clip": 0.01062191, + "auxiliary_loss_mlp": 0.01024607, + "balance_loss_clip": 1.0117209, + "balance_loss_mlp": 1.02043247, + "epoch": 0.5028708853148955, + "flos": 24533152583040.0, + "grad_norm": 2.3302434608734046, + "language_loss": 0.76219952, + "learning_rate": 1.982242583920523e-06, + "loss": 0.78306746, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 8364, + "time_per_iteration": 2.39467716217041 + }, + { + "auxiliary_loss_clip": 0.01062414, + "auxiliary_loss_mlp": 0.010254, + "balance_loss_clip": 1.01275182, + "balance_loss_mlp": 1.02060664, + "epoch": 0.5029310085675635, + "flos": 20812563325440.0, + "grad_norm": 1.4953455503459165, + "language_loss": 0.69360262, + "learning_rate": 1.9818647768045137e-06, + "loss": 0.71448076, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 8365, + "time_per_iteration": 2.3991012573242188 + }, + { + "auxiliary_loss_clip": 0.01068472, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01437342, + "balance_loss_mlp": 1.02151036, + "epoch": 0.5029911318202315, + "flos": 22341479984640.0, + "grad_norm": 1.6592696175373811, + "language_loss": 0.72878098, + "learning_rate": 1.981486970335703e-06, + "loss": 0.74975455, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.46875, + "step": 8366, + "time_per_iteration": 2.426133632659912 + }, + { + "auxiliary_loss_clip": 0.01065281, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.01444435, + "balance_loss_mlp": 1.02254689, + "epoch": 0.5030512550728995, + "flos": 24351569268480.0, + "grad_norm": 1.4673157860468349, + "language_loss": 0.78174365, + "learning_rate": 1.9811091645275742e-06, + "loss": 0.80266547, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 8367, + "time_per_iteration": 2.42598032951355 + }, + { + "auxiliary_loss_clip": 0.01063645, + "auxiliary_loss_mlp": 0.01026199, + "balance_loss_clip": 1.01363444, + "balance_loss_mlp": 1.02012515, + "epoch": 0.5031113783255674, + "flos": 18258945995520.0, + "grad_norm": 1.6553257591479997, + "language_loss": 0.69801891, + "learning_rate": 1.9807313593936114e-06, + "loss": 0.71891737, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43359375, + "step": 8368, + "time_per_iteration": 2.388026237487793 + }, + { + "auxiliary_loss_clip": 0.01009611, + "auxiliary_loss_mlp": 0.01001326, + "balance_loss_clip": 1.00030041, + "balance_loss_mlp": 1.00078678, + "epoch": 0.5031715015782354, + "flos": 57250364924160.0, + "grad_norm": 1.1516043809027952, + "language_loss": 0.63356096, + "learning_rate": 1.9803535549472962e-06, + "loss": 0.65367037, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.08837891, + "step": 8369, + "time_per_iteration": 2.980738878250122 + }, + { + "auxiliary_loss_clip": 0.01064071, + "auxiliary_loss_mlp": 0.01027319, + "balance_loss_clip": 1.01520753, + "balance_loss_mlp": 1.02126718, + "epoch": 0.5032316248309033, + "flos": 27526116862080.0, + "grad_norm": 1.9029026147370882, + "language_loss": 0.60851073, + "learning_rate": 1.9799757512021126e-06, + "loss": 0.62942463, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.42773438, + "step": 8370, + "time_per_iteration": 2.4542763233184814 + }, + { + "auxiliary_loss_clip": 0.01065184, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.01620269, + "balance_loss_mlp": 1.02057445, + "epoch": 0.5032917480835714, + "flos": 34494396744960.0, + "grad_norm": 1.4556643516419339, + "language_loss": 0.68174261, + "learning_rate": 1.9795979481715426e-06, + "loss": 0.70270169, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4453125, + "step": 8371, + "time_per_iteration": 2.485039472579956 + }, + { + "auxiliary_loss_clip": 0.01065479, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.01435447, + "balance_loss_mlp": 1.0211556, + "epoch": 0.5033518713362393, + "flos": 33655364017920.0, + "grad_norm": 2.012195416606541, + "language_loss": 0.67454302, + "learning_rate": 1.9792201458690695e-06, + "loss": 0.69547814, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44335938, + "step": 8372, + "time_per_iteration": 2.4857780933380127 + }, + { + "auxiliary_loss_clip": 0.01064201, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.01258492, + "balance_loss_mlp": 1.02094483, + "epoch": 0.5034119945889073, + "flos": 28184194679040.0, + "grad_norm": 1.6360929888871407, + "language_loss": 0.66784441, + "learning_rate": 1.978842344308176e-06, + "loss": 0.68875921, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.43359375, + "step": 8373, + "time_per_iteration": 2.45308256149292 + }, + { + "auxiliary_loss_clip": 0.01062379, + "auxiliary_loss_mlp": 0.01024704, + "balance_loss_clip": 1.01184702, + "balance_loss_mlp": 1.02002776, + "epoch": 0.5034721178415752, + "flos": 21357697294080.0, + "grad_norm": 1.5849549291691505, + "language_loss": 0.74672878, + "learning_rate": 1.9784645435023443e-06, + "loss": 0.76759958, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42382812, + "step": 8374, + "time_per_iteration": 2.383108139038086 + }, + { + "auxiliary_loss_clip": 0.01065372, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.01401329, + "balance_loss_mlp": 1.02156925, + "epoch": 0.5035322410942432, + "flos": 22673713282560.0, + "grad_norm": 1.5440268404935285, + "language_loss": 0.67786592, + "learning_rate": 1.9780867434650584e-06, + "loss": 0.69879746, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4375, + "step": 8375, + "time_per_iteration": 2.3947808742523193 + }, + { + "auxiliary_loss_clip": 0.01067201, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.01237345, + "balance_loss_mlp": 1.02144909, + "epoch": 0.5035923643469111, + "flos": 19827698382720.0, + "grad_norm": 1.661790814943288, + "language_loss": 0.78663039, + "learning_rate": 1.9777089442098e-06, + "loss": 0.8075797, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.45703125, + "step": 8376, + "time_per_iteration": 2.3663976192474365 + }, + { + "auxiliary_loss_clip": 0.01065818, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.01450253, + "balance_loss_mlp": 1.01939726, + "epoch": 0.5036524875995791, + "flos": 30513425500800.0, + "grad_norm": 2.103909340013547, + "language_loss": 0.59754705, + "learning_rate": 1.977331145750052e-06, + "loss": 0.61849409, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.46484375, + "step": 8377, + "time_per_iteration": 2.465972423553467 + }, + { + "auxiliary_loss_clip": 0.01063075, + "auxiliary_loss_mlp": 0.0102768, + "balance_loss_clip": 1.01374412, + "balance_loss_mlp": 1.01951122, + "epoch": 0.5037126108522471, + "flos": 14719695672960.0, + "grad_norm": 1.9483794919908484, + "language_loss": 0.76164955, + "learning_rate": 1.976953348099297e-06, + "loss": 0.78255713, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.43554688, + "step": 8378, + "time_per_iteration": 2.3437018394470215 + }, + { + "auxiliary_loss_clip": 0.01064944, + "auxiliary_loss_mlp": 0.01027551, + "balance_loss_clip": 1.01451516, + "balance_loss_mlp": 1.02107382, + "epoch": 0.5037727341049151, + "flos": 25296633394560.0, + "grad_norm": 1.582194966451142, + "language_loss": 0.7501182, + "learning_rate": 1.9765755512710173e-06, + "loss": 0.77104318, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4375, + "step": 8379, + "time_per_iteration": 2.45980167388916 + }, + { + "auxiliary_loss_clip": 0.01062781, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.01626122, + "balance_loss_mlp": 1.02095056, + "epoch": 0.5038328573575831, + "flos": 28540693238400.0, + "grad_norm": 1.91504466365508, + "language_loss": 0.66752386, + "learning_rate": 1.9761977552786974e-06, + "loss": 0.68844354, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 8380, + "time_per_iteration": 2.43107533454895 + }, + { + "auxiliary_loss_clip": 0.01063642, + "auxiliary_loss_mlp": 0.01026975, + "balance_loss_clip": 1.01485729, + "balance_loss_mlp": 1.02071214, + "epoch": 0.503892980610251, + "flos": 31648521490560.0, + "grad_norm": 4.058159809920885, + "language_loss": 0.63628602, + "learning_rate": 1.975819960135817e-06, + "loss": 0.65719223, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4296875, + "step": 8381, + "time_per_iteration": 2.4642887115478516 + }, + { + "auxiliary_loss_clip": 0.01062409, + "auxiliary_loss_mlp": 0.01027303, + "balance_loss_clip": 1.01423192, + "balance_loss_mlp": 1.01973438, + "epoch": 0.503953103862919, + "flos": 27088131456000.0, + "grad_norm": 1.5288235876273442, + "language_loss": 0.6783132, + "learning_rate": 1.9754421658558604e-06, + "loss": 0.69921029, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 8382, + "time_per_iteration": 2.4315826892852783 + }, + { + "auxiliary_loss_clip": 0.01070045, + "auxiliary_loss_mlp": 0.01036142, + "balance_loss_clip": 1.02163422, + "balance_loss_mlp": 1.02253687, + "epoch": 0.5040132271155869, + "flos": 15632045988480.0, + "grad_norm": 1.6154840750360036, + "language_loss": 0.6218406, + "learning_rate": 1.97506437245231e-06, + "loss": 0.64290237, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.47460938, + "step": 8383, + "time_per_iteration": 2.3606581687927246 + }, + { + "auxiliary_loss_clip": 0.01062175, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.0171032, + "balance_loss_mlp": 1.02083611, + "epoch": 0.504073350368255, + "flos": 13589242894080.0, + "grad_norm": 3.050355839092931, + "language_loss": 0.67354363, + "learning_rate": 1.9746865799386476e-06, + "loss": 0.69446075, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4140625, + "step": 8384, + "time_per_iteration": 2.356374979019165 + }, + { + "auxiliary_loss_clip": 0.01063099, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.01659584, + "balance_loss_mlp": 1.02029479, + "epoch": 0.5041334736209229, + "flos": 29056918734720.0, + "grad_norm": 1.5766478790881568, + "language_loss": 0.70888859, + "learning_rate": 1.974308788328356e-06, + "loss": 0.7298125, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42773438, + "step": 8385, + "time_per_iteration": 2.4611220359802246 + }, + { + "auxiliary_loss_clip": 0.01061098, + "auxiliary_loss_mlp": 0.01021127, + "balance_loss_clip": 1.00948048, + "balance_loss_mlp": 1.02030826, + "epoch": 0.5041935968735909, + "flos": 24607203310080.0, + "grad_norm": 2.3617212746078087, + "language_loss": 0.82994735, + "learning_rate": 1.973930997634918e-06, + "loss": 0.85076964, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40820312, + "step": 8386, + "time_per_iteration": 3.9338924884796143 + }, + { + "auxiliary_loss_clip": 0.01065083, + "auxiliary_loss_mlp": 0.0102648, + "balance_loss_clip": 1.01329565, + "balance_loss_mlp": 1.02152562, + "epoch": 0.5042537201262588, + "flos": 26285722611840.0, + "grad_norm": 1.6751537442283655, + "language_loss": 0.66584957, + "learning_rate": 1.9735532078718157e-06, + "loss": 0.68676519, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43554688, + "step": 8387, + "time_per_iteration": 2.4409191608428955 + }, + { + "auxiliary_loss_clip": 0.01064856, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.01777983, + "balance_loss_mlp": 1.02220702, + "epoch": 0.5043138433789268, + "flos": 22016298781440.0, + "grad_norm": 2.0177801732340748, + "language_loss": 0.80250698, + "learning_rate": 1.973175419052531e-06, + "loss": 0.82345319, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.42578125, + "step": 8388, + "time_per_iteration": 2.391280174255371 + }, + { + "auxiliary_loss_clip": 0.01064539, + "auxiliary_loss_mlp": 0.01026088, + "balance_loss_clip": 1.01329672, + "balance_loss_mlp": 1.02148819, + "epoch": 0.5043739666315947, + "flos": 28765847796480.0, + "grad_norm": 2.2370082631553743, + "language_loss": 0.70344102, + "learning_rate": 1.972797631190547e-06, + "loss": 0.72434729, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 8389, + "time_per_iteration": 2.433605909347534 + }, + { + "auxiliary_loss_clip": 0.01065409, + "auxiliary_loss_mlp": 0.01028602, + "balance_loss_clip": 1.01597762, + "balance_loss_mlp": 1.021402, + "epoch": 0.5044340898842627, + "flos": 27598037996160.0, + "grad_norm": 1.7984742258065087, + "language_loss": 0.6966151, + "learning_rate": 1.972419844299345e-06, + "loss": 0.71755528, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44140625, + "step": 8390, + "time_per_iteration": 2.428912878036499 + }, + { + "auxiliary_loss_clip": 0.01009904, + "auxiliary_loss_mlp": 0.0100326, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00143123, + "epoch": 0.5044942131369307, + "flos": 67455268531200.0, + "grad_norm": 0.8136963754874427, + "language_loss": 0.52948153, + "learning_rate": 1.972042058392408e-06, + "loss": 0.54961312, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.08496094, + "step": 8391, + "time_per_iteration": 2.847377300262451 + }, + { + "auxiliary_loss_clip": 0.01009532, + "auxiliary_loss_mlp": 0.01002046, + "balance_loss_clip": 1.00110972, + "balance_loss_mlp": 1.00099492, + "epoch": 0.5045543363895987, + "flos": 58628247575040.0, + "grad_norm": 0.8688862793842582, + "language_loss": 0.60741663, + "learning_rate": 1.9716642734832183e-06, + "loss": 0.62753236, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.08544922, + "step": 8392, + "time_per_iteration": 2.892364263534546 + }, + { + "auxiliary_loss_clip": 0.01060151, + "auxiliary_loss_mlp": 0.01024227, + "balance_loss_clip": 1.0118711, + "balance_loss_mlp": 1.01962519, + "epoch": 0.5046144596422667, + "flos": 22525576917120.0, + "grad_norm": 1.5677128242672427, + "language_loss": 0.84329462, + "learning_rate": 1.971286489585258e-06, + "loss": 0.86413848, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40625, + "step": 8393, + "time_per_iteration": 2.380943775177002 + }, + { + "auxiliary_loss_clip": 0.01064249, + "auxiliary_loss_mlp": 0.01023785, + "balance_loss_clip": 1.01144075, + "balance_loss_mlp": 1.02123725, + "epoch": 0.5046745828949346, + "flos": 27453008741760.0, + "grad_norm": 2.395930811140495, + "language_loss": 0.50634509, + "learning_rate": 1.9709087067120084e-06, + "loss": 0.52722549, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4296875, + "step": 8394, + "time_per_iteration": 2.4787323474884033 + }, + { + "auxiliary_loss_clip": 0.01062251, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.01307487, + "balance_loss_mlp": 1.01949525, + "epoch": 0.5047347061476026, + "flos": 17273592293760.0, + "grad_norm": 1.5985779145850327, + "language_loss": 0.7479912, + "learning_rate": 1.970530924876953e-06, + "loss": 0.76888299, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.42773438, + "step": 8395, + "time_per_iteration": 2.393223524093628 + }, + { + "auxiliary_loss_clip": 0.01063371, + "auxiliary_loss_mlp": 0.0102532, + "balance_loss_clip": 1.01268339, + "balance_loss_mlp": 1.02114725, + "epoch": 0.5047948294002705, + "flos": 16648717046400.0, + "grad_norm": 1.746193205308842, + "language_loss": 0.67739236, + "learning_rate": 1.9701531440935726e-06, + "loss": 0.69827932, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 8396, + "time_per_iteration": 2.3453943729400635 + }, + { + "auxiliary_loss_clip": 0.01064546, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.01617193, + "balance_loss_mlp": 1.02132928, + "epoch": 0.5048549526529386, + "flos": 26864617731840.0, + "grad_norm": 1.7782798924493306, + "language_loss": 0.79031312, + "learning_rate": 1.9697753643753497e-06, + "loss": 0.81124318, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.43359375, + "step": 8397, + "time_per_iteration": 3.866243839263916 + }, + { + "auxiliary_loss_clip": 0.01060816, + "auxiliary_loss_mlp": 0.01025136, + "balance_loss_clip": 1.01279795, + "balance_loss_mlp": 1.01925349, + "epoch": 0.5049150759056065, + "flos": 21832900076160.0, + "grad_norm": 1.6906652695777649, + "language_loss": 0.66408432, + "learning_rate": 1.9693975857357665e-06, + "loss": 0.6849438, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41601562, + "step": 8398, + "time_per_iteration": 2.4129557609558105 + }, + { + "auxiliary_loss_clip": 0.01063092, + "auxiliary_loss_mlp": 0.01025562, + "balance_loss_clip": 1.01231766, + "balance_loss_mlp": 1.02085567, + "epoch": 0.5049751991582745, + "flos": 21684833533440.0, + "grad_norm": 1.7996165827877901, + "language_loss": 0.71977913, + "learning_rate": 1.9690198081883043e-06, + "loss": 0.74066567, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.421875, + "step": 8399, + "time_per_iteration": 2.377622604370117 + }, + { + "auxiliary_loss_clip": 0.01063414, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.01418948, + "balance_loss_mlp": 1.02085924, + "epoch": 0.5050353224109424, + "flos": 21358360609920.0, + "grad_norm": 1.5908711853199746, + "language_loss": 0.8139689, + "learning_rate": 1.968642031746446e-06, + "loss": 0.83488131, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42578125, + "step": 8400, + "time_per_iteration": 2.410210371017456 + }, + { + "auxiliary_loss_clip": 0.0106582, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.01270223, + "balance_loss_mlp": 1.02196169, + "epoch": 0.5050954456636104, + "flos": 22818986916480.0, + "grad_norm": 1.9342614474336124, + "language_loss": 0.78459275, + "learning_rate": 1.9682642564236725e-06, + "loss": 0.80551243, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 8401, + "time_per_iteration": 3.87377667427063 + }, + { + "auxiliary_loss_clip": 0.01064597, + "auxiliary_loss_mlp": 0.01024042, + "balance_loss_clip": 1.0110364, + "balance_loss_mlp": 1.02021098, + "epoch": 0.5051555689162783, + "flos": 30446845804800.0, + "grad_norm": 1.633610908462229, + "language_loss": 0.69915384, + "learning_rate": 1.967886482233466e-06, + "loss": 0.7200402, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4453125, + "step": 8402, + "time_per_iteration": 3.846214532852173 + }, + { + "auxiliary_loss_clip": 0.01062178, + "auxiliary_loss_mlp": 0.01020942, + "balance_loss_clip": 1.0084846, + "balance_loss_mlp": 1.01970232, + "epoch": 0.5052156921689464, + "flos": 21286893323520.0, + "grad_norm": 1.8407154233768737, + "language_loss": 0.69584292, + "learning_rate": 1.9675087091893084e-06, + "loss": 0.71667415, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42382812, + "step": 8403, + "time_per_iteration": 2.4305102825164795 + }, + { + "auxiliary_loss_clip": 0.01061446, + "auxiliary_loss_mlp": 0.01026338, + "balance_loss_clip": 1.01407099, + "balance_loss_mlp": 1.02060175, + "epoch": 0.5052758154216143, + "flos": 25080171765120.0, + "grad_norm": 1.3869892076558763, + "language_loss": 0.8560738, + "learning_rate": 1.9671309373046816e-06, + "loss": 0.87695163, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 8404, + "time_per_iteration": 2.4138121604919434 + }, + { + "auxiliary_loss_clip": 0.01063806, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.01716542, + "balance_loss_mlp": 1.02120996, + "epoch": 0.5053359386742823, + "flos": 20884484459520.0, + "grad_norm": 1.564060571155798, + "language_loss": 0.75650454, + "learning_rate": 1.9667531665930676e-06, + "loss": 0.77743983, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42578125, + "step": 8405, + "time_per_iteration": 2.4042153358459473 + }, + { + "auxiliary_loss_clip": 0.01064211, + "auxiliary_loss_mlp": 0.01027224, + "balance_loss_clip": 1.01318753, + "balance_loss_mlp": 1.02024436, + "epoch": 0.5053960619269503, + "flos": 37741808079360.0, + "grad_norm": 1.6387831190158155, + "language_loss": 0.72723043, + "learning_rate": 1.966375397067947e-06, + "loss": 0.74814481, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.43945312, + "step": 8406, + "time_per_iteration": 2.524808645248413 + }, + { + "auxiliary_loss_clip": 0.01060602, + "auxiliary_loss_mlp": 0.01025566, + "balance_loss_clip": 1.01282895, + "balance_loss_mlp": 1.01985681, + "epoch": 0.5054561851796182, + "flos": 23512711098240.0, + "grad_norm": 1.7003254107614392, + "language_loss": 0.76609272, + "learning_rate": 1.965997628742802e-06, + "loss": 0.7869544, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 8407, + "time_per_iteration": 2.421942949295044 + }, + { + "auxiliary_loss_clip": 0.01063357, + "auxiliary_loss_mlp": 0.01021454, + "balance_loss_clip": 1.0087347, + "balance_loss_mlp": 1.01918602, + "epoch": 0.5055163084322862, + "flos": 30408895290240.0, + "grad_norm": 1.980624977694934, + "language_loss": 0.62966055, + "learning_rate": 1.965619861631114e-06, + "loss": 0.65050864, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44140625, + "step": 8408, + "time_per_iteration": 2.452329158782959 + }, + { + "auxiliary_loss_clip": 0.01061719, + "auxiliary_loss_mlp": 0.01025977, + "balance_loss_clip": 1.0121125, + "balance_loss_mlp": 1.02005601, + "epoch": 0.5055764316849541, + "flos": 20258806250880.0, + "grad_norm": 2.2176496456218726, + "language_loss": 0.63737267, + "learning_rate": 1.9652420957463645e-06, + "loss": 0.65824968, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.41601562, + "step": 8409, + "time_per_iteration": 2.390472173690796 + }, + { + "auxiliary_loss_clip": 0.01062644, + "auxiliary_loss_mlp": 0.01025516, + "balance_loss_clip": 1.01377964, + "balance_loss_mlp": 1.02064919, + "epoch": 0.5056365549376222, + "flos": 26069610096000.0, + "grad_norm": 1.320740966756984, + "language_loss": 0.70896649, + "learning_rate": 1.9648643311020365e-06, + "loss": 0.72984809, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.41992188, + "step": 8410, + "time_per_iteration": 2.4454362392425537 + }, + { + "auxiliary_loss_clip": 0.01064925, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.01548982, + "balance_loss_mlp": 1.02202058, + "epoch": 0.5056966781902901, + "flos": 19278130671360.0, + "grad_norm": 1.4371606605636416, + "language_loss": 0.79150653, + "learning_rate": 1.9644865677116086e-06, + "loss": 0.81244045, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4296875, + "step": 8411, + "time_per_iteration": 2.382700204849243 + }, + { + "auxiliary_loss_clip": 0.01063066, + "auxiliary_loss_mlp": 0.01024663, + "balance_loss_clip": 1.011127, + "balance_loss_mlp": 1.02083468, + "epoch": 0.5057568014429581, + "flos": 21322295308800.0, + "grad_norm": 1.688296494220886, + "language_loss": 0.82000679, + "learning_rate": 1.9641088055885647e-06, + "loss": 0.84088409, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.421875, + "step": 8412, + "time_per_iteration": 2.377617597579956 + }, + { + "auxiliary_loss_clip": 0.0106498, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.01544583, + "balance_loss_mlp": 1.02254367, + "epoch": 0.505816924695626, + "flos": 17492637363840.0, + "grad_norm": 1.9976625224750586, + "language_loss": 0.80412126, + "learning_rate": 1.9637310447463846e-06, + "loss": 0.8250581, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42382812, + "step": 8413, + "time_per_iteration": 2.405055284500122 + }, + { + "auxiliary_loss_clip": 0.01064734, + "auxiliary_loss_mlp": 0.01027591, + "balance_loss_clip": 1.01412082, + "balance_loss_mlp": 1.02098167, + "epoch": 0.505877047948294, + "flos": 21141026196480.0, + "grad_norm": 2.1327197523768766, + "language_loss": 0.80336833, + "learning_rate": 1.9633532851985504e-06, + "loss": 0.82429159, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 8414, + "time_per_iteration": 2.3941268920898438 + }, + { + "auxiliary_loss_clip": 0.01062454, + "auxiliary_loss_mlp": 0.01022797, + "balance_loss_clip": 1.00977314, + "balance_loss_mlp": 1.01918173, + "epoch": 0.5059371712009619, + "flos": 36348738986880.0, + "grad_norm": 2.0664252532409475, + "language_loss": 0.61827141, + "learning_rate": 1.9629755269585436e-06, + "loss": 0.63912392, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43359375, + "step": 8415, + "time_per_iteration": 2.5048248767852783 + }, + { + "auxiliary_loss_clip": 0.01060803, + "auxiliary_loss_mlp": 0.01024771, + "balance_loss_clip": 1.01262355, + "balance_loss_mlp": 1.01888621, + "epoch": 0.50599729445363, + "flos": 22672316828160.0, + "grad_norm": 1.504583630270518, + "language_loss": 0.81715596, + "learning_rate": 1.9625977700398442e-06, + "loss": 0.83801168, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41992188, + "step": 8416, + "time_per_iteration": 2.4135067462921143 + }, + { + "auxiliary_loss_clip": 0.01061717, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.01442838, + "balance_loss_mlp": 1.02069497, + "epoch": 0.5060574177062979, + "flos": 22746751580160.0, + "grad_norm": 1.6757918946857153, + "language_loss": 0.7921378, + "learning_rate": 1.962220014455935e-06, + "loss": 0.81301826, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.41015625, + "step": 8417, + "time_per_iteration": 2.393808603286743 + }, + { + "auxiliary_loss_clip": 0.01059557, + "auxiliary_loss_mlp": 0.01028212, + "balance_loss_clip": 1.01552248, + "balance_loss_mlp": 1.01976526, + "epoch": 0.5061175409589659, + "flos": 21652119722880.0, + "grad_norm": 1.6729230950075595, + "language_loss": 0.62493908, + "learning_rate": 1.9618422602202955e-06, + "loss": 0.6458168, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 8418, + "time_per_iteration": 2.4222495555877686 + }, + { + "auxiliary_loss_clip": 0.01064261, + "auxiliary_loss_mlp": 0.0102639, + "balance_loss_clip": 1.01403975, + "balance_loss_mlp": 1.02061653, + "epoch": 0.5061776642116339, + "flos": 21615181637760.0, + "grad_norm": 2.1149203419261067, + "language_loss": 0.77049136, + "learning_rate": 1.9614645073464084e-06, + "loss": 0.79139787, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4375, + "step": 8419, + "time_per_iteration": 2.391892433166504 + }, + { + "auxiliary_loss_clip": 0.01062767, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.01434457, + "balance_loss_mlp": 1.01928806, + "epoch": 0.5062377874643018, + "flos": 24425131236480.0, + "grad_norm": 1.864736043406879, + "language_loss": 0.76961982, + "learning_rate": 1.9610867558477534e-06, + "loss": 0.79052365, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8420, + "time_per_iteration": 2.454824924468994 + }, + { + "auxiliary_loss_clip": 0.01062788, + "auxiliary_loss_mlp": 0.01028262, + "balance_loss_clip": 1.01470792, + "balance_loss_mlp": 1.02052462, + "epoch": 0.5062979107169698, + "flos": 22523447324160.0, + "grad_norm": 1.633664916164348, + "language_loss": 0.77296662, + "learning_rate": 1.960709005737812e-06, + "loss": 0.79387712, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.421875, + "step": 8421, + "time_per_iteration": 2.3933303356170654 + }, + { + "auxiliary_loss_clip": 0.01059994, + "auxiliary_loss_mlp": 0.01023803, + "balance_loss_clip": 1.01164341, + "balance_loss_mlp": 1.01864982, + "epoch": 0.5063580339696377, + "flos": 24570823806720.0, + "grad_norm": 1.4754902968837873, + "language_loss": 0.6659658, + "learning_rate": 1.9603312570300653e-06, + "loss": 0.68680376, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4140625, + "step": 8422, + "time_per_iteration": 2.455075979232788 + }, + { + "auxiliary_loss_clip": 0.01062101, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.01336122, + "balance_loss_mlp": 1.01986122, + "epoch": 0.5064181572223058, + "flos": 22595193901440.0, + "grad_norm": 1.7225768084716389, + "language_loss": 0.81521958, + "learning_rate": 1.9599535097379936e-06, + "loss": 0.83610809, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.421875, + "step": 8423, + "time_per_iteration": 2.4502899646759033 + }, + { + "auxiliary_loss_clip": 0.01065205, + "auxiliary_loss_mlp": 0.01026519, + "balance_loss_clip": 1.01307797, + "balance_loss_mlp": 1.02127361, + "epoch": 0.5064782804749737, + "flos": 25993743978240.0, + "grad_norm": 2.325141571056703, + "language_loss": 0.74717593, + "learning_rate": 1.9595757638750787e-06, + "loss": 0.76809317, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43945312, + "step": 8424, + "time_per_iteration": 2.44850754737854 + }, + { + "auxiliary_loss_clip": 0.0106153, + "auxiliary_loss_mlp": 0.01028864, + "balance_loss_clip": 1.01533341, + "balance_loss_mlp": 1.0191288, + "epoch": 0.5065384037276417, + "flos": 28551655405440.0, + "grad_norm": 1.3788266131602946, + "language_loss": 0.71445262, + "learning_rate": 1.9591980194548007e-06, + "loss": 0.73535657, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42382812, + "step": 8425, + "time_per_iteration": 2.4718079566955566 + }, + { + "auxiliary_loss_clip": 0.01062926, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.01495194, + "balance_loss_mlp": 1.01990843, + "epoch": 0.5065985269803096, + "flos": 22964923866240.0, + "grad_norm": 1.9320212428985457, + "language_loss": 0.78795111, + "learning_rate": 1.9588202764906405e-06, + "loss": 0.80885661, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4296875, + "step": 8426, + "time_per_iteration": 3.8544609546661377 + }, + { + "auxiliary_loss_clip": 0.01066819, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.01664782, + "balance_loss_mlp": 1.02233338, + "epoch": 0.5066586502329776, + "flos": 21607710606720.0, + "grad_norm": 2.129062818045728, + "language_loss": 0.77670068, + "learning_rate": 1.9584425349960787e-06, + "loss": 0.79766923, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4453125, + "step": 8427, + "time_per_iteration": 2.3980913162231445 + }, + { + "auxiliary_loss_clip": 0.01060192, + "auxiliary_loss_mlp": 0.01024698, + "balance_loss_clip": 1.01173425, + "balance_loss_mlp": 1.01921439, + "epoch": 0.5067187734856455, + "flos": 20338861731840.0, + "grad_norm": 1.5907934661272507, + "language_loss": 0.71313417, + "learning_rate": 1.9580647949845953e-06, + "loss": 0.73398316, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 8428, + "time_per_iteration": 2.408596992492676 + }, + { + "auxiliary_loss_clip": 0.01061534, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.01582825, + "balance_loss_mlp": 1.0200094, + "epoch": 0.5067788967383136, + "flos": 28839793789440.0, + "grad_norm": 1.617872172209847, + "language_loss": 0.8297599, + "learning_rate": 1.957687056469672e-06, + "loss": 0.85065603, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41601562, + "step": 8429, + "time_per_iteration": 2.4650137424468994 + }, + { + "auxiliary_loss_clip": 0.01066575, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.01757288, + "balance_loss_mlp": 1.02094221, + "epoch": 0.5068390199909815, + "flos": 32448870564480.0, + "grad_norm": 1.8936853742332393, + "language_loss": 0.72939157, + "learning_rate": 1.957309319464789e-06, + "loss": 0.75037152, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45703125, + "step": 8430, + "time_per_iteration": 2.5159096717834473 + }, + { + "auxiliary_loss_clip": 0.01064322, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.01537561, + "balance_loss_mlp": 1.02107608, + "epoch": 0.5068991432436495, + "flos": 23145529662720.0, + "grad_norm": 1.6125654922512163, + "language_loss": 0.76916069, + "learning_rate": 1.956931583983426e-06, + "loss": 0.79008293, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43359375, + "step": 8431, + "time_per_iteration": 2.4200146198272705 + }, + { + "auxiliary_loss_clip": 0.01062484, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.01449335, + "balance_loss_mlp": 1.02090812, + "epoch": 0.5069592664963174, + "flos": 19935126236160.0, + "grad_norm": 1.4733615370053716, + "language_loss": 0.72226936, + "learning_rate": 1.9565538500390644e-06, + "loss": 0.7431668, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 8432, + "time_per_iteration": 2.408000946044922 + }, + { + "auxiliary_loss_clip": 0.01009592, + "auxiliary_loss_mlp": 0.01010472, + "balance_loss_clip": 1.00966144, + "balance_loss_mlp": 1.00140786, + "epoch": 0.5070193897489854, + "flos": 65790643950720.0, + "grad_norm": 0.7784177942270085, + "language_loss": 0.63632166, + "learning_rate": 1.956176117645184e-06, + "loss": 0.65652227, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.00811768, + "router_z_loss_mlp": 0.08203125, + "step": 8433, + "time_per_iteration": 3.080568552017212 + }, + { + "auxiliary_loss_clip": 0.01063199, + "auxiliary_loss_mlp": 0.0103192, + "balance_loss_clip": 1.01903307, + "balance_loss_mlp": 1.02072597, + "epoch": 0.5070795130016534, + "flos": 17274360343680.0, + "grad_norm": 1.7250251853255187, + "language_loss": 0.77609587, + "learning_rate": 1.9557983868152652e-06, + "loss": 0.79704702, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 8434, + "time_per_iteration": 2.3948135375976562 + }, + { + "auxiliary_loss_clip": 0.01063081, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.01789165, + "balance_loss_mlp": 1.01971555, + "epoch": 0.5071396362543213, + "flos": 21068860682880.0, + "grad_norm": 1.782546340880156, + "language_loss": 0.70668942, + "learning_rate": 1.955420657562788e-06, + "loss": 0.72762942, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43359375, + "step": 8435, + "time_per_iteration": 2.3950724601745605 + }, + { + "auxiliary_loss_clip": 0.01065022, + "auxiliary_loss_mlp": 0.01032134, + "balance_loss_clip": 1.0183953, + "balance_loss_mlp": 1.02102757, + "epoch": 0.5071997595069894, + "flos": 23143819006080.0, + "grad_norm": 2.4499437088433353, + "language_loss": 0.72375405, + "learning_rate": 1.9550429299012334e-06, + "loss": 0.74472558, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43945312, + "step": 8436, + "time_per_iteration": 2.4293394088745117 + }, + { + "auxiliary_loss_clip": 0.01065593, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.01177669, + "balance_loss_mlp": 1.02186179, + "epoch": 0.5072598827596573, + "flos": 22746088264320.0, + "grad_norm": 1.9276289234401085, + "language_loss": 0.84193444, + "learning_rate": 1.954665203844081e-06, + "loss": 0.86284673, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4375, + "step": 8437, + "time_per_iteration": 3.7914528846740723 + }, + { + "auxiliary_loss_clip": 0.01059928, + "auxiliary_loss_mlp": 0.01024235, + "balance_loss_clip": 1.01158714, + "balance_loss_mlp": 1.01929617, + "epoch": 0.5073200060123253, + "flos": 22565168265600.0, + "grad_norm": 1.457463449267251, + "language_loss": 0.80298489, + "learning_rate": 1.9542874794048103e-06, + "loss": 0.82382655, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 8438, + "time_per_iteration": 2.479721784591675 + }, + { + "auxiliary_loss_clip": 0.01065035, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.01744998, + "balance_loss_mlp": 1.02061629, + "epoch": 0.5073801292649932, + "flos": 25805318037120.0, + "grad_norm": 1.4860368332112635, + "language_loss": 0.79163969, + "learning_rate": 1.9539097565969023e-06, + "loss": 0.81260473, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4453125, + "step": 8439, + "time_per_iteration": 2.4299771785736084 + }, + { + "auxiliary_loss_clip": 0.01062296, + "auxiliary_loss_mlp": 0.01025046, + "balance_loss_clip": 1.01246333, + "balance_loss_mlp": 1.01948094, + "epoch": 0.5074402525176612, + "flos": 25372778803200.0, + "grad_norm": 1.6158478262610703, + "language_loss": 0.7115494, + "learning_rate": 1.9535320354338366e-06, + "loss": 0.73242283, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42773438, + "step": 8440, + "time_per_iteration": 2.4538967609405518 + }, + { + "auxiliary_loss_clip": 0.01064239, + "auxiliary_loss_mlp": 0.01025508, + "balance_loss_clip": 1.01225829, + "balance_loss_mlp": 1.02025294, + "epoch": 0.5075003757703291, + "flos": 26063326051200.0, + "grad_norm": 2.0759134304758953, + "language_loss": 0.70926321, + "learning_rate": 1.9531543159290933e-06, + "loss": 0.73016071, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43945312, + "step": 8441, + "time_per_iteration": 5.25331974029541 + }, + { + "auxiliary_loss_clip": 0.01064714, + "auxiliary_loss_mlp": 0.0103681, + "balance_loss_clip": 1.0234344, + "balance_loss_mlp": 1.02039504, + "epoch": 0.5075604990229972, + "flos": 21834366353280.0, + "grad_norm": 1.531635407084006, + "language_loss": 0.63756704, + "learning_rate": 1.9527765980961516e-06, + "loss": 0.65858233, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 8442, + "time_per_iteration": 2.41365647315979 + }, + { + "auxiliary_loss_clip": 0.01062636, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01305497, + "balance_loss_mlp": 1.02119279, + "epoch": 0.5076206222756651, + "flos": 31977333475200.0, + "grad_norm": 1.4523849867278962, + "language_loss": 0.71185076, + "learning_rate": 1.952398881948491e-06, + "loss": 0.7327283, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.4140625, + "step": 8443, + "time_per_iteration": 2.469493865966797 + }, + { + "auxiliary_loss_clip": 0.01064705, + "auxiliary_loss_mlp": 0.01026706, + "balance_loss_clip": 1.01321793, + "balance_loss_mlp": 1.01993942, + "epoch": 0.5076807455283331, + "flos": 36902530972800.0, + "grad_norm": 1.3732697010067447, + "language_loss": 0.62091798, + "learning_rate": 1.9520211674995927e-06, + "loss": 0.64183211, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44921875, + "step": 8444, + "time_per_iteration": 2.565833806991577 + }, + { + "auxiliary_loss_clip": 0.01061941, + "auxiliary_loss_mlp": 0.01024108, + "balance_loss_clip": 1.01181769, + "balance_loss_mlp": 1.01962686, + "epoch": 0.507740868781001, + "flos": 29861108058240.0, + "grad_norm": 1.6793466967982271, + "language_loss": 0.63795471, + "learning_rate": 1.951643454762935e-06, + "loss": 0.65881515, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.421875, + "step": 8445, + "time_per_iteration": 2.4627528190612793 + }, + { + "auxiliary_loss_clip": 0.01009559, + "auxiliary_loss_mlp": 0.01001127, + "balance_loss_clip": 1.00020933, + "balance_loss_mlp": 1.00134659, + "epoch": 0.507800992033669, + "flos": 61916157953280.0, + "grad_norm": 0.8949565647779318, + "language_loss": 0.61913443, + "learning_rate": 1.9512657437519986e-06, + "loss": 0.63924134, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.08203125, + "step": 8446, + "time_per_iteration": 3.0809547901153564 + }, + { + "auxiliary_loss_clip": 0.01061634, + "auxiliary_loss_mlp": 0.01024065, + "balance_loss_clip": 1.01225126, + "balance_loss_mlp": 1.01942098, + "epoch": 0.507861115286337, + "flos": 20699549654400.0, + "grad_norm": 1.8010415486621536, + "language_loss": 0.80343491, + "learning_rate": 1.9508880344802616e-06, + "loss": 0.82429188, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.421875, + "step": 8447, + "time_per_iteration": 2.405841588973999 + }, + { + "auxiliary_loss_clip": 0.0106334, + "auxiliary_loss_mlp": 0.01026612, + "balance_loss_clip": 1.01443529, + "balance_loss_mlp": 1.02163208, + "epoch": 0.507921238539005, + "flos": 30845728621440.0, + "grad_norm": 1.4227678346700452, + "language_loss": 0.70555288, + "learning_rate": 1.950510326961205e-06, + "loss": 0.72645235, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41601562, + "step": 8448, + "time_per_iteration": 2.5026588439941406 + }, + { + "auxiliary_loss_clip": 0.01062539, + "auxiliary_loss_mlp": 0.01027611, + "balance_loss_clip": 1.01554108, + "balance_loss_mlp": 1.02107334, + "epoch": 0.507981361791673, + "flos": 35698725694080.0, + "grad_norm": 1.340018665988135, + "language_loss": 0.71817869, + "learning_rate": 1.9501326212083077e-06, + "loss": 0.73908019, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.4140625, + "step": 8449, + "time_per_iteration": 2.5515923500061035 + }, + { + "auxiliary_loss_clip": 0.01060911, + "auxiliary_loss_mlp": 0.01025991, + "balance_loss_clip": 1.0131942, + "balance_loss_mlp": 1.01863885, + "epoch": 0.5080414850443409, + "flos": 27160262058240.0, + "grad_norm": 1.5015520574805528, + "language_loss": 0.7314347, + "learning_rate": 1.949754917235048e-06, + "loss": 0.75230372, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.421875, + "step": 8450, + "time_per_iteration": 2.4675447940826416 + }, + { + "auxiliary_loss_clip": 0.01065573, + "auxiliary_loss_mlp": 0.01022595, + "balance_loss_clip": 1.01002979, + "balance_loss_mlp": 1.02111983, + "epoch": 0.5081016082970089, + "flos": 27084081738240.0, + "grad_norm": 1.5846530044303087, + "language_loss": 0.76901591, + "learning_rate": 1.9493772150549068e-06, + "loss": 0.78989762, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4453125, + "step": 8451, + "time_per_iteration": 2.473761558532715 + }, + { + "auxiliary_loss_clip": 0.01062958, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.01893234, + "balance_loss_mlp": 1.02048004, + "epoch": 0.5081617315496768, + "flos": 22855436242560.0, + "grad_norm": 1.6928768854391545, + "language_loss": 0.84617853, + "learning_rate": 1.9489995146813622e-06, + "loss": 0.86712432, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42578125, + "step": 8452, + "time_per_iteration": 2.4216113090515137 + }, + { + "auxiliary_loss_clip": 0.01068117, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.01620698, + "balance_loss_mlp": 1.02197218, + "epoch": 0.5082218548023448, + "flos": 16281186497280.0, + "grad_norm": 3.2685746905255826, + "language_loss": 0.77706218, + "learning_rate": 1.948621816127894e-06, + "loss": 0.79804587, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4609375, + "step": 8453, + "time_per_iteration": 2.332880973815918 + }, + { + "auxiliary_loss_clip": 0.01060998, + "auxiliary_loss_mlp": 0.01025399, + "balance_loss_clip": 1.01371074, + "balance_loss_mlp": 1.01987028, + "epoch": 0.5082819780550127, + "flos": 24459660437760.0, + "grad_norm": 1.6904042004203337, + "language_loss": 0.75970489, + "learning_rate": 1.948244119407981e-06, + "loss": 0.78056884, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.41210938, + "step": 8454, + "time_per_iteration": 2.454442024230957 + }, + { + "auxiliary_loss_clip": 0.01064115, + "auxiliary_loss_mlp": 0.01023674, + "balance_loss_clip": 1.01019764, + "balance_loss_mlp": 1.02070475, + "epoch": 0.5083421013076808, + "flos": 23402176133760.0, + "grad_norm": 1.7312257079201105, + "language_loss": 0.89403832, + "learning_rate": 1.947866424535102e-06, + "loss": 0.91491628, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 8455, + "time_per_iteration": 2.4019088745117188 + }, + { + "auxiliary_loss_clip": 0.0106623, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.01246178, + "balance_loss_mlp": 1.02100277, + "epoch": 0.5084022245603487, + "flos": 23871723275520.0, + "grad_norm": 2.126827829949818, + "language_loss": 0.62522924, + "learning_rate": 1.947488731522737e-06, + "loss": 0.64616644, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.453125, + "step": 8456, + "time_per_iteration": 2.4268174171447754 + }, + { + "auxiliary_loss_clip": 0.01009713, + "auxiliary_loss_mlp": 0.01001194, + "balance_loss_clip": 1.00021613, + "balance_loss_mlp": 1.00151491, + "epoch": 0.5084623478130167, + "flos": 62870333944320.0, + "grad_norm": 0.809455374317216, + "language_loss": 0.62410522, + "learning_rate": 1.947111040384363e-06, + "loss": 0.64421433, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.08203125, + "step": 8457, + "time_per_iteration": 2.946321725845337 + }, + { + "auxiliary_loss_clip": 0.010665, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.01751661, + "balance_loss_mlp": 1.02251887, + "epoch": 0.5085224710656846, + "flos": 22345040943360.0, + "grad_norm": 1.5206405397244813, + "language_loss": 0.6760391, + "learning_rate": 1.9467333511334605e-06, + "loss": 0.69701481, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44140625, + "step": 8458, + "time_per_iteration": 2.430699110031128 + }, + { + "auxiliary_loss_clip": 0.01063152, + "auxiliary_loss_mlp": 0.01026473, + "balance_loss_clip": 1.01291251, + "balance_loss_mlp": 1.01881695, + "epoch": 0.5085825943183526, + "flos": 26065106530560.0, + "grad_norm": 2.899104561378975, + "language_loss": 0.77605927, + "learning_rate": 1.946355663783508e-06, + "loss": 0.79695547, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44335938, + "step": 8459, + "time_per_iteration": 2.429638624191284 + }, + { + "auxiliary_loss_clip": 0.01070161, + "auxiliary_loss_mlp": 0.01022979, + "balance_loss_clip": 1.00924039, + "balance_loss_mlp": 1.02334666, + "epoch": 0.5086427175710206, + "flos": 17419773623040.0, + "grad_norm": 1.9182135900895623, + "language_loss": 0.80443591, + "learning_rate": 1.945977978347983e-06, + "loss": 0.82536733, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.46875, + "step": 8460, + "time_per_iteration": 2.3886377811431885 + }, + { + "auxiliary_loss_clip": 0.01065556, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01706028, + "balance_loss_mlp": 1.02182615, + "epoch": 0.5087028408236886, + "flos": 20630700720000.0, + "grad_norm": 1.4332395825013515, + "language_loss": 0.68939948, + "learning_rate": 1.9456002948403656e-06, + "loss": 0.71035445, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 8461, + "time_per_iteration": 2.386747360229492 + }, + { + "auxiliary_loss_clip": 0.01062192, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.0115267, + "balance_loss_mlp": 1.02017617, + "epoch": 0.5087629640763566, + "flos": 25592626834560.0, + "grad_norm": 1.5700864789040956, + "language_loss": 0.76283419, + "learning_rate": 1.945222613274133e-06, + "loss": 0.78370363, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41992188, + "step": 8462, + "time_per_iteration": 2.4508368968963623 + }, + { + "auxiliary_loss_clip": 0.0106413, + "auxiliary_loss_mlp": 0.01026278, + "balance_loss_clip": 1.01367164, + "balance_loss_mlp": 1.02165711, + "epoch": 0.5088230873290245, + "flos": 13260780023040.0, + "grad_norm": 2.7951884428711464, + "language_loss": 0.81607467, + "learning_rate": 1.9448449336627654e-06, + "loss": 0.83697879, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42382812, + "step": 8463, + "time_per_iteration": 2.3566112518310547 + }, + { + "auxiliary_loss_clip": 0.01064165, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.01900983, + "balance_loss_mlp": 1.02158141, + "epoch": 0.5088832105816925, + "flos": 20042554089600.0, + "grad_norm": 1.6838994665330582, + "language_loss": 0.76050359, + "learning_rate": 1.94446725601974e-06, + "loss": 0.78146398, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 8464, + "time_per_iteration": 2.4273600578308105 + }, + { + "auxiliary_loss_clip": 0.0100909, + "auxiliary_loss_mlp": 0.01001038, + "balance_loss_clip": 1.00012636, + "balance_loss_mlp": 1.00088632, + "epoch": 0.5089433338343604, + "flos": 67418363491200.0, + "grad_norm": 0.6856002002949038, + "language_loss": 0.59420729, + "learning_rate": 1.9440895803585347e-06, + "loss": 0.61430848, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.08203125, + "step": 8465, + "time_per_iteration": 3.162677526473999 + }, + { + "auxiliary_loss_clip": 0.01062965, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.01776099, + "balance_loss_mlp": 1.02084708, + "epoch": 0.5090034570870284, + "flos": 22709254913280.0, + "grad_norm": 1.8527272016456464, + "language_loss": 0.80142784, + "learning_rate": 1.9437119066926293e-06, + "loss": 0.82237291, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.421875, + "step": 8466, + "time_per_iteration": 3.860405206680298 + }, + { + "auxiliary_loss_clip": 0.01061394, + "auxiliary_loss_mlp": 0.01028127, + "balance_loss_clip": 1.01535416, + "balance_loss_mlp": 1.01942551, + "epoch": 0.5090635803396963, + "flos": 20444858219520.0, + "grad_norm": 1.6341753036940152, + "language_loss": 0.78435183, + "learning_rate": 1.9433342350355007e-06, + "loss": 0.80524707, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41992188, + "step": 8467, + "time_per_iteration": 2.3919601440429688 + }, + { + "auxiliary_loss_clip": 0.0106232, + "auxiliary_loss_mlp": 0.0102524, + "balance_loss_clip": 1.01286614, + "balance_loss_mlp": 1.02074194, + "epoch": 0.5091237035923644, + "flos": 23767751646720.0, + "grad_norm": 1.6411643085688188, + "language_loss": 0.7407136, + "learning_rate": 1.9429565654006277e-06, + "loss": 0.76158929, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41601562, + "step": 8468, + "time_per_iteration": 2.423982858657837 + }, + { + "auxiliary_loss_clip": 0.01063155, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_clip": 1.01504564, + "balance_loss_mlp": 1.02041209, + "epoch": 0.5091838268450323, + "flos": 18327061791360.0, + "grad_norm": 2.3431916387852483, + "language_loss": 0.77907205, + "learning_rate": 1.942578897801488e-06, + "loss": 0.79998302, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42773438, + "step": 8469, + "time_per_iteration": 2.3925423622131348 + }, + { + "auxiliary_loss_clip": 0.01066697, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.01576853, + "balance_loss_mlp": 1.02337623, + "epoch": 0.5092439500977003, + "flos": 29056395064320.0, + "grad_norm": 1.4686852918309319, + "language_loss": 0.69452369, + "learning_rate": 1.94220123225156e-06, + "loss": 0.71548545, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43359375, + "step": 8470, + "time_per_iteration": 2.4805729389190674 + }, + { + "auxiliary_loss_clip": 0.01066333, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.01945674, + "balance_loss_mlp": 1.02150369, + "epoch": 0.5093040733503682, + "flos": 13553037947520.0, + "grad_norm": 1.8950462015241143, + "language_loss": 0.73755521, + "learning_rate": 1.9418235687643216e-06, + "loss": 0.75854772, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44726562, + "step": 8471, + "time_per_iteration": 2.409515857696533 + }, + { + "auxiliary_loss_clip": 0.01009079, + "auxiliary_loss_mlp": 0.01000767, + "balance_loss_clip": 0.99965847, + "balance_loss_mlp": 1.00091255, + "epoch": 0.5093641966030362, + "flos": 68903080502400.0, + "grad_norm": 0.7509793837593367, + "language_loss": 0.58085585, + "learning_rate": 1.9414459073532495e-06, + "loss": 0.60095435, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.08203125, + "step": 8472, + "time_per_iteration": 3.029125213623047 + }, + { + "auxiliary_loss_clip": 0.01063037, + "auxiliary_loss_mlp": 0.01026126, + "balance_loss_clip": 1.01366854, + "balance_loss_mlp": 1.0205915, + "epoch": 0.5094243198557042, + "flos": 21579849475200.0, + "grad_norm": 1.9100401992482405, + "language_loss": 0.6889267, + "learning_rate": 1.941068248031823e-06, + "loss": 0.7098183, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42382812, + "step": 8473, + "time_per_iteration": 2.3963303565979004 + }, + { + "auxiliary_loss_clip": 0.01063702, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.01728177, + "balance_loss_mlp": 1.02096725, + "epoch": 0.5094844431083722, + "flos": 28839444675840.0, + "grad_norm": 2.2331377850260274, + "language_loss": 0.84884274, + "learning_rate": 1.9406905908135187e-06, + "loss": 0.86978519, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42773438, + "step": 8474, + "time_per_iteration": 2.439709424972534 + }, + { + "auxiliary_loss_clip": 0.01063363, + "auxiliary_loss_mlp": 0.01024264, + "balance_loss_clip": 1.01195598, + "balance_loss_mlp": 1.02061665, + "epoch": 0.5095445663610402, + "flos": 14975224980480.0, + "grad_norm": 1.9803391070889478, + "language_loss": 0.64764696, + "learning_rate": 1.940312935711815e-06, + "loss": 0.66852319, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42773438, + "step": 8475, + "time_per_iteration": 2.3479957580566406 + }, + { + "auxiliary_loss_clip": 0.01068348, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.01832306, + "balance_loss_mlp": 1.02176046, + "epoch": 0.5096046896137081, + "flos": 20776044176640.0, + "grad_norm": 3.1028268501915384, + "language_loss": 0.81715459, + "learning_rate": 1.939935282740189e-06, + "loss": 0.83815658, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.46484375, + "step": 8476, + "time_per_iteration": 2.3895623683929443 + }, + { + "auxiliary_loss_clip": 0.01065313, + "auxiliary_loss_mlp": 0.01027484, + "balance_loss_clip": 1.01171303, + "balance_loss_mlp": 1.01971173, + "epoch": 0.5096648128663761, + "flos": 23183968936320.0, + "grad_norm": 2.4069644650361774, + "language_loss": 0.81095707, + "learning_rate": 1.939557631912118e-06, + "loss": 0.83188504, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.45703125, + "step": 8477, + "time_per_iteration": 3.8003320693969727 + }, + { + "auxiliary_loss_clip": 0.01067166, + "auxiliary_loss_mlp": 0.01024427, + "balance_loss_clip": 1.01042557, + "balance_loss_mlp": 1.02214289, + "epoch": 0.509724936119044, + "flos": 22308347237760.0, + "grad_norm": 1.97272487672938, + "language_loss": 0.61969411, + "learning_rate": 1.9391799832410803e-06, + "loss": 0.64061004, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.45117188, + "step": 8478, + "time_per_iteration": 2.389512062072754 + }, + { + "auxiliary_loss_clip": 0.01063535, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.01993513, + "balance_loss_mlp": 1.02089131, + "epoch": 0.509785059371712, + "flos": 26285862257280.0, + "grad_norm": 1.536512749087618, + "language_loss": 0.65577471, + "learning_rate": 1.9388023367405516e-06, + "loss": 0.67674011, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 8479, + "time_per_iteration": 2.4235026836395264 + }, + { + "auxiliary_loss_clip": 0.01063314, + "auxiliary_loss_mlp": 0.01025853, + "balance_loss_clip": 1.01384258, + "balance_loss_mlp": 1.02099156, + "epoch": 0.50984518262438, + "flos": 22963527411840.0, + "grad_norm": 1.5895822288077255, + "language_loss": 0.69462502, + "learning_rate": 1.938424692424011e-06, + "loss": 0.71551675, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.42382812, + "step": 8480, + "time_per_iteration": 3.8436577320098877 + }, + { + "auxiliary_loss_clip": 0.01065826, + "auxiliary_loss_mlp": 0.01022734, + "balance_loss_clip": 1.00952506, + "balance_loss_mlp": 1.02059126, + "epoch": 0.509905305877048, + "flos": 26212195555200.0, + "grad_norm": 1.840077035033676, + "language_loss": 0.79773277, + "learning_rate": 1.938047050304934e-06, + "loss": 0.8186183, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.453125, + "step": 8481, + "time_per_iteration": 3.8562698364257812 + }, + { + "auxiliary_loss_clip": 0.01060069, + "auxiliary_loss_mlp": 0.01027667, + "balance_loss_clip": 1.01469135, + "balance_loss_mlp": 1.01889408, + "epoch": 0.5099654291297159, + "flos": 20339001377280.0, + "grad_norm": 1.6357926354104824, + "language_loss": 0.83308607, + "learning_rate": 1.937669410396799e-06, + "loss": 0.85396343, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41210938, + "step": 8482, + "time_per_iteration": 2.397336721420288 + }, + { + "auxiliary_loss_clip": 0.01066329, + "auxiliary_loss_mlp": 0.01027985, + "balance_loss_clip": 1.01413345, + "balance_loss_mlp": 1.0202831, + "epoch": 0.5100255523823839, + "flos": 29053671978240.0, + "grad_norm": 1.7279670676683965, + "language_loss": 0.85236168, + "learning_rate": 1.937291772713082e-06, + "loss": 0.87330484, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4609375, + "step": 8483, + "time_per_iteration": 2.448607921600342 + }, + { + "auxiliary_loss_clip": 0.01063646, + "auxiliary_loss_mlp": 0.01024073, + "balance_loss_clip": 1.01183665, + "balance_loss_mlp": 1.0195092, + "epoch": 0.5100856756350518, + "flos": 22454807857920.0, + "grad_norm": 2.8228399429525597, + "language_loss": 0.82935965, + "learning_rate": 1.93691413726726e-06, + "loss": 0.85023689, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.44140625, + "step": 8484, + "time_per_iteration": 2.4067959785461426 + }, + { + "auxiliary_loss_clip": 0.01065375, + "auxiliary_loss_mlp": 0.01025478, + "balance_loss_clip": 1.01155996, + "balance_loss_mlp": 1.02160239, + "epoch": 0.5101457988877198, + "flos": 19170074413440.0, + "grad_norm": 1.8756157346771156, + "language_loss": 0.81169295, + "learning_rate": 1.936536504072811e-06, + "loss": 0.83260155, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4375, + "step": 8485, + "time_per_iteration": 2.358070135116577 + }, + { + "auxiliary_loss_clip": 0.01060837, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.01554537, + "balance_loss_mlp": 1.02014399, + "epoch": 0.5102059221403878, + "flos": 14865492977280.0, + "grad_norm": 1.9530145918650361, + "language_loss": 0.73889709, + "learning_rate": 1.9361588731432112e-06, + "loss": 0.75978523, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40625, + "step": 8486, + "time_per_iteration": 2.3645479679107666 + }, + { + "auxiliary_loss_clip": 0.01066542, + "auxiliary_loss_mlp": 0.0102262, + "balance_loss_clip": 1.00950098, + "balance_loss_mlp": 1.02195358, + "epoch": 0.5102660453930558, + "flos": 22960141009920.0, + "grad_norm": 1.6797324603442987, + "language_loss": 0.70084023, + "learning_rate": 1.9357812444919363e-06, + "loss": 0.72173178, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4453125, + "step": 8487, + "time_per_iteration": 2.396305561065674 + }, + { + "auxiliary_loss_clip": 0.01064698, + "auxiliary_loss_mlp": 0.01024748, + "balance_loss_clip": 1.01199889, + "balance_loss_mlp": 1.02173901, + "epoch": 0.5103261686457238, + "flos": 23148182926080.0, + "grad_norm": 1.9061065685948777, + "language_loss": 0.72522044, + "learning_rate": 1.9354036181324636e-06, + "loss": 0.74611497, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 8488, + "time_per_iteration": 2.404505968093872 + }, + { + "auxiliary_loss_clip": 0.01063584, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01457894, + "balance_loss_mlp": 1.01984143, + "epoch": 0.5103862918983917, + "flos": 14318369061120.0, + "grad_norm": 8.189356627475595, + "language_loss": 0.69357663, + "learning_rate": 1.9350259940782694e-06, + "loss": 0.71448773, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 8489, + "time_per_iteration": 2.3626224994659424 + }, + { + "auxiliary_loss_clip": 0.0106213, + "auxiliary_loss_mlp": 0.01026295, + "balance_loss_clip": 1.01351571, + "balance_loss_mlp": 1.01951814, + "epoch": 0.5104464151510597, + "flos": 25847353180800.0, + "grad_norm": 1.6094707248800082, + "language_loss": 0.72587043, + "learning_rate": 1.934648372342831e-06, + "loss": 0.74675471, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42578125, + "step": 8490, + "time_per_iteration": 2.422778606414795 + }, + { + "auxiliary_loss_clip": 0.01060895, + "auxiliary_loss_mlp": 0.01022314, + "balance_loss_clip": 1.01014233, + "balance_loss_mlp": 1.01959991, + "epoch": 0.5105065384037276, + "flos": 21651840432000.0, + "grad_norm": 2.8182800631569918, + "language_loss": 0.80326247, + "learning_rate": 1.934270752939623e-06, + "loss": 0.82409459, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4140625, + "step": 8491, + "time_per_iteration": 2.377026081085205 + }, + { + "auxiliary_loss_clip": 0.01062596, + "auxiliary_loss_mlp": 0.01023318, + "balance_loss_clip": 1.01133776, + "balance_loss_mlp": 1.02166843, + "epoch": 0.5105666616563956, + "flos": 22490489134080.0, + "grad_norm": 6.5173616274763875, + "language_loss": 0.78874695, + "learning_rate": 1.933893135882124e-06, + "loss": 0.80960608, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41015625, + "step": 8492, + "time_per_iteration": 2.3901963233947754 + }, + { + "auxiliary_loss_clip": 0.01067014, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.01493669, + "balance_loss_mlp": 1.02258825, + "epoch": 0.5106267849090635, + "flos": 22454668212480.0, + "grad_norm": 1.826353901003699, + "language_loss": 0.77573192, + "learning_rate": 1.9335155211838083e-06, + "loss": 0.79669178, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.44335938, + "step": 8493, + "time_per_iteration": 2.400700330734253 + }, + { + "auxiliary_loss_clip": 0.01063733, + "auxiliary_loss_mlp": 0.01026091, + "balance_loss_clip": 1.0129658, + "balance_loss_mlp": 1.02117324, + "epoch": 0.5106869081617316, + "flos": 23546053313280.0, + "grad_norm": 2.2801170345571653, + "language_loss": 0.83676577, + "learning_rate": 1.9331379088581524e-06, + "loss": 0.85766405, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 8494, + "time_per_iteration": 2.4155118465423584 + }, + { + "auxiliary_loss_clip": 0.01066859, + "auxiliary_loss_mlp": 0.01028072, + "balance_loss_clip": 1.01495957, + "balance_loss_mlp": 1.02234316, + "epoch": 0.5107470314143995, + "flos": 26791893636480.0, + "grad_norm": 2.2187897354962565, + "language_loss": 0.79207158, + "learning_rate": 1.932760298918633e-06, + "loss": 0.81302094, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4453125, + "step": 8495, + "time_per_iteration": 2.411024808883667 + }, + { + "auxiliary_loss_clip": 0.01063223, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01600635, + "balance_loss_mlp": 1.02053428, + "epoch": 0.5108071546670675, + "flos": 25738493961600.0, + "grad_norm": 1.626354107889346, + "language_loss": 0.78204405, + "learning_rate": 1.9323826913787253e-06, + "loss": 0.80296516, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 8496, + "time_per_iteration": 2.460472345352173 + }, + { + "auxiliary_loss_clip": 0.01060641, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.01460457, + "balance_loss_mlp": 1.01888371, + "epoch": 0.5108672779197354, + "flos": 18696547376640.0, + "grad_norm": 2.0790606880531595, + "language_loss": 0.78686994, + "learning_rate": 1.9320050862519054e-06, + "loss": 0.80774701, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41796875, + "step": 8497, + "time_per_iteration": 2.378371000289917 + }, + { + "auxiliary_loss_clip": 0.01061681, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.01596999, + "balance_loss_mlp": 1.01998103, + "epoch": 0.5109274011724034, + "flos": 26686944489600.0, + "grad_norm": 2.21184055388675, + "language_loss": 0.81617367, + "learning_rate": 1.9316274835516494e-06, + "loss": 0.83707434, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 8498, + "time_per_iteration": 2.4413959980010986 + }, + { + "auxiliary_loss_clip": 0.01064075, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.01303148, + "balance_loss_mlp": 1.0214119, + "epoch": 0.5109875244250714, + "flos": 22782921615360.0, + "grad_norm": 1.8795295013544169, + "language_loss": 0.70648795, + "learning_rate": 1.9312498832914323e-06, + "loss": 0.72738481, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42578125, + "step": 8499, + "time_per_iteration": 2.408784866333008 + }, + { + "auxiliary_loss_clip": 0.01062035, + "auxiliary_loss_mlp": 0.01023834, + "balance_loss_clip": 1.01133513, + "balance_loss_mlp": 1.01997638, + "epoch": 0.5110476476777394, + "flos": 35587108477440.0, + "grad_norm": 1.4875539537118998, + "language_loss": 0.72256732, + "learning_rate": 1.9308722854847304e-06, + "loss": 0.74342608, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.421875, + "step": 8500, + "time_per_iteration": 2.5124402046203613 + }, + { + "auxiliary_loss_clip": 0.01069052, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01423812, + "balance_loss_mlp": 1.0222764, + "epoch": 0.5111077709304074, + "flos": 19279806416640.0, + "grad_norm": 3.4776659221289874, + "language_loss": 0.63439888, + "learning_rate": 1.930494690145019e-06, + "loss": 0.65538365, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.46875, + "step": 8501, + "time_per_iteration": 2.3649606704711914 + }, + { + "auxiliary_loss_clip": 0.01062622, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.016361, + "balance_loss_mlp": 1.01936281, + "epoch": 0.5111678941830753, + "flos": 20667150046080.0, + "grad_norm": 1.7112570986019287, + "language_loss": 0.87383521, + "learning_rate": 1.930117097285773e-06, + "loss": 0.89476025, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 8502, + "time_per_iteration": 2.394882917404175 + }, + { + "auxiliary_loss_clip": 0.01061036, + "auxiliary_loss_mlp": 0.01022812, + "balance_loss_clip": 1.01060522, + "balance_loss_mlp": 1.01915503, + "epoch": 0.5112280174357433, + "flos": 26286665218560.0, + "grad_norm": 17.72841255355305, + "language_loss": 0.8220886, + "learning_rate": 1.929739506920468e-06, + "loss": 0.8429271, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41796875, + "step": 8503, + "time_per_iteration": 2.434370994567871 + }, + { + "auxiliary_loss_clip": 0.0100932, + "auxiliary_loss_mlp": 0.01000779, + "balance_loss_clip": 0.99971187, + "balance_loss_mlp": 1.00133026, + "epoch": 0.5112881406884112, + "flos": 59471504576640.0, + "grad_norm": 0.8533367027927226, + "language_loss": 0.63870674, + "learning_rate": 1.9293619190625785e-06, + "loss": 0.65880775, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.08007812, + "step": 8504, + "time_per_iteration": 3.0794594287872314 + }, + { + "auxiliary_loss_clip": 0.01062321, + "auxiliary_loss_mlp": 0.01024517, + "balance_loss_clip": 1.01147509, + "balance_loss_mlp": 1.02016282, + "epoch": 0.5113482639410792, + "flos": 26467655040000.0, + "grad_norm": 1.9454083967980318, + "language_loss": 0.84395397, + "learning_rate": 1.9289843337255814e-06, + "loss": 0.86482227, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.421875, + "step": 8505, + "time_per_iteration": 2.4208948612213135 + }, + { + "auxiliary_loss_clip": 0.01063166, + "auxiliary_loss_mlp": 0.01027411, + "balance_loss_clip": 1.01480508, + "balance_loss_mlp": 1.02087188, + "epoch": 0.5114083871937471, + "flos": 29894624830080.0, + "grad_norm": 1.8758554768515223, + "language_loss": 0.75596738, + "learning_rate": 1.92860675092295e-06, + "loss": 0.77687311, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.421875, + "step": 8506, + "time_per_iteration": 3.8731508255004883 + }, + { + "auxiliary_loss_clip": 0.01065508, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01646614, + "balance_loss_mlp": 1.02119732, + "epoch": 0.5114685104464152, + "flos": 24313479108480.0, + "grad_norm": 1.6110698747329057, + "language_loss": 0.57921857, + "learning_rate": 1.928229170668161e-06, + "loss": 0.60017824, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.44335938, + "step": 8507, + "time_per_iteration": 2.4119484424591064 + }, + { + "auxiliary_loss_clip": 0.01066867, + "auxiliary_loss_mlp": 0.01029324, + "balance_loss_clip": 1.0157764, + "balance_loss_mlp": 1.02270055, + "epoch": 0.5115286336990831, + "flos": 17018342277120.0, + "grad_norm": 2.7806616437069716, + "language_loss": 0.85575837, + "learning_rate": 1.9278515929746875e-06, + "loss": 0.87672031, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.44140625, + "step": 8508, + "time_per_iteration": 2.393714427947998 + }, + { + "auxiliary_loss_clip": 0.01065849, + "auxiliary_loss_mlp": 0.01026138, + "balance_loss_clip": 1.01194048, + "balance_loss_mlp": 1.02107, + "epoch": 0.5115887569517511, + "flos": 23658264023040.0, + "grad_norm": 3.139759208913173, + "language_loss": 0.72314703, + "learning_rate": 1.9274740178560054e-06, + "loss": 0.74406689, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.44726562, + "step": 8509, + "time_per_iteration": 2.420238733291626 + }, + { + "auxiliary_loss_clip": 0.01062586, + "auxiliary_loss_mlp": 0.0102595, + "balance_loss_clip": 1.01375449, + "balance_loss_mlp": 1.02050543, + "epoch": 0.511648880204419, + "flos": 16106271252480.0, + "grad_norm": 1.6884959256513705, + "language_loss": 0.7604031, + "learning_rate": 1.9270964453255887e-06, + "loss": 0.7812885, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.421875, + "step": 8510, + "time_per_iteration": 2.368346691131592 + }, + { + "auxiliary_loss_clip": 0.01063483, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01310825, + "balance_loss_mlp": 1.0215745, + "epoch": 0.511709003457087, + "flos": 32633595901440.0, + "grad_norm": 1.3952018329359945, + "language_loss": 0.7098695, + "learning_rate": 1.9267188753969125e-06, + "loss": 0.73075545, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41992188, + "step": 8511, + "time_per_iteration": 2.4762139320373535 + }, + { + "auxiliary_loss_clip": 0.01062622, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.01497817, + "balance_loss_mlp": 1.02150428, + "epoch": 0.511769126709755, + "flos": 21761013853440.0, + "grad_norm": 1.98310947362673, + "language_loss": 0.6645326, + "learning_rate": 1.9263413080834514e-06, + "loss": 0.68543333, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41210938, + "step": 8512, + "time_per_iteration": 2.4042553901672363 + }, + { + "auxiliary_loss_clip": 0.01064742, + "auxiliary_loss_mlp": 0.01027119, + "balance_loss_clip": 1.01335025, + "balance_loss_mlp": 1.02065837, + "epoch": 0.511829249962423, + "flos": 23914212266880.0, + "grad_norm": 1.8513289741826338, + "language_loss": 0.66630423, + "learning_rate": 1.925963743398679e-06, + "loss": 0.68722284, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44140625, + "step": 8513, + "time_per_iteration": 2.4009952545166016 + }, + { + "auxiliary_loss_clip": 0.01062232, + "auxiliary_loss_mlp": 0.01029236, + "balance_loss_clip": 1.01583099, + "balance_loss_mlp": 1.02066755, + "epoch": 0.511889373215091, + "flos": 23726030705280.0, + "grad_norm": 1.998285375814287, + "language_loss": 0.69145334, + "learning_rate": 1.9255861813560706e-06, + "loss": 0.71236801, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41601562, + "step": 8514, + "time_per_iteration": 2.4253036975860596 + }, + { + "auxiliary_loss_clip": 0.01059732, + "auxiliary_loss_mlp": 0.0102756, + "balance_loss_clip": 1.01574636, + "balance_loss_mlp": 1.0189693, + "epoch": 0.5119494964677589, + "flos": 28110248686080.0, + "grad_norm": 4.542866921777, + "language_loss": 0.65953934, + "learning_rate": 1.9252086219691e-06, + "loss": 0.68041229, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40820312, + "step": 8515, + "time_per_iteration": 2.4372267723083496 + }, + { + "auxiliary_loss_clip": 0.0106279, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.01929665, + "balance_loss_mlp": 1.0195775, + "epoch": 0.5120096197204269, + "flos": 24972045684480.0, + "grad_norm": 1.7659855395017925, + "language_loss": 0.7514444, + "learning_rate": 1.9248310652512415e-06, + "loss": 0.77239084, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43164062, + "step": 8516, + "time_per_iteration": 2.4224483966827393 + }, + { + "auxiliary_loss_clip": 0.0106571, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.01420021, + "balance_loss_mlp": 1.02156556, + "epoch": 0.5120697429730948, + "flos": 17967037184640.0, + "grad_norm": 2.1028613910547143, + "language_loss": 0.77270085, + "learning_rate": 1.924453511215969e-06, + "loss": 0.79363394, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44140625, + "step": 8517, + "time_per_iteration": 3.8657989501953125 + }, + { + "auxiliary_loss_clip": 0.01065444, + "auxiliary_loss_mlp": 0.01031458, + "balance_loss_clip": 1.01801157, + "balance_loss_mlp": 1.02071607, + "epoch": 0.5121298662257628, + "flos": 23291292055680.0, + "grad_norm": 2.201576312463353, + "language_loss": 0.74161333, + "learning_rate": 1.9240759598767554e-06, + "loss": 0.7625823, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44726562, + "step": 8518, + "time_per_iteration": 2.404710054397583 + }, + { + "auxiliary_loss_clip": 0.01062414, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.0167892, + "balance_loss_mlp": 1.01988864, + "epoch": 0.5121899894784308, + "flos": 17310111442560.0, + "grad_norm": 2.11421896117481, + "language_loss": 0.78265756, + "learning_rate": 1.9236984112470763e-06, + "loss": 0.80358136, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42578125, + "step": 8519, + "time_per_iteration": 2.3721566200256348 + }, + { + "auxiliary_loss_clip": 0.01065398, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.01449776, + "balance_loss_mlp": 1.02182841, + "epoch": 0.5122501127310988, + "flos": 24929102845440.0, + "grad_norm": 1.5458478082935563, + "language_loss": 0.79538751, + "learning_rate": 1.923320865340405e-06, + "loss": 0.81632257, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43554688, + "step": 8520, + "time_per_iteration": 3.863050937652588 + }, + { + "auxiliary_loss_clip": 0.01066628, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.02158213, + "balance_loss_mlp": 1.02013934, + "epoch": 0.5123102359837667, + "flos": 18441855941760.0, + "grad_norm": 20.561874080422275, + "language_loss": 0.81665701, + "learning_rate": 1.9229433221702135e-06, + "loss": 0.8376888, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.46484375, + "step": 8521, + "time_per_iteration": 2.3886399269104004 + }, + { + "auxiliary_loss_clip": 0.01059972, + "auxiliary_loss_mlp": 0.01026279, + "balance_loss_clip": 1.0125761, + "balance_loss_mlp": 1.01810026, + "epoch": 0.5123703592364347, + "flos": 26683732644480.0, + "grad_norm": 2.94224765203729, + "language_loss": 0.68078494, + "learning_rate": 1.9225657817499773e-06, + "loss": 0.7016474, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.41796875, + "step": 8522, + "time_per_iteration": 2.4165496826171875 + }, + { + "auxiliary_loss_clip": 0.01068719, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.01721632, + "balance_loss_mlp": 1.02164555, + "epoch": 0.5124304824891026, + "flos": 28802681147520.0, + "grad_norm": 9.271852505377732, + "language_loss": 0.61551368, + "learning_rate": 1.922188244093169e-06, + "loss": 0.63654411, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.47070312, + "step": 8523, + "time_per_iteration": 2.4490303993225098 + }, + { + "auxiliary_loss_clip": 0.01065285, + "auxiliary_loss_mlp": 0.01025626, + "balance_loss_clip": 1.01451564, + "balance_loss_mlp": 1.02155352, + "epoch": 0.5124906057417706, + "flos": 21760769473920.0, + "grad_norm": 2.433673894471895, + "language_loss": 0.77660263, + "learning_rate": 1.9218107092132623e-06, + "loss": 0.7975117, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.4375, + "step": 8524, + "time_per_iteration": 2.3809690475463867 + }, + { + "auxiliary_loss_clip": 0.01065694, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.01486695, + "balance_loss_mlp": 1.02360821, + "epoch": 0.5125507289944387, + "flos": 18879527145600.0, + "grad_norm": 1.7773964067133012, + "language_loss": 0.74885154, + "learning_rate": 1.9214331771237307e-06, + "loss": 0.76977921, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.421875, + "step": 8525, + "time_per_iteration": 2.3877780437469482 + }, + { + "auxiliary_loss_clip": 0.01065674, + "auxiliary_loss_mlp": 0.01026098, + "balance_loss_clip": 1.01240063, + "balance_loss_mlp": 1.02207947, + "epoch": 0.5126108522471066, + "flos": 35626350712320.0, + "grad_norm": 1.701640275408998, + "language_loss": 0.74173129, + "learning_rate": 1.9210556478380458e-06, + "loss": 0.76264894, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4375, + "step": 8526, + "time_per_iteration": 2.503108263015747 + }, + { + "auxiliary_loss_clip": 0.01064359, + "auxiliary_loss_mlp": 0.01028337, + "balance_loss_clip": 1.0149976, + "balance_loss_mlp": 1.02099276, + "epoch": 0.5126709754997746, + "flos": 20189957316480.0, + "grad_norm": 1.5655098189524723, + "language_loss": 0.6543811, + "learning_rate": 1.9206781213696827e-06, + "loss": 0.67530811, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43359375, + "step": 8527, + "time_per_iteration": 2.416719913482666 + }, + { + "auxiliary_loss_clip": 0.01063171, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.02347159, + "balance_loss_mlp": 1.02246237, + "epoch": 0.5127310987524425, + "flos": 18587548512000.0, + "grad_norm": 1.4386080025761456, + "language_loss": 0.74290192, + "learning_rate": 1.920300597732113e-06, + "loss": 0.7638911, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40820312, + "step": 8528, + "time_per_iteration": 2.443207263946533 + }, + { + "auxiliary_loss_clip": 0.01061171, + "auxiliary_loss_mlp": 0.01024891, + "balance_loss_clip": 1.01184332, + "balance_loss_mlp": 1.01883745, + "epoch": 0.5127912220051105, + "flos": 22453620871680.0, + "grad_norm": 1.7057196131659278, + "language_loss": 0.82952058, + "learning_rate": 1.91992307693881e-06, + "loss": 0.85038114, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42382812, + "step": 8529, + "time_per_iteration": 2.4164958000183105 + }, + { + "auxiliary_loss_clip": 0.01061248, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.01642227, + "balance_loss_mlp": 1.02059329, + "epoch": 0.5128513452577784, + "flos": 19092846752640.0, + "grad_norm": 1.8256412209633106, + "language_loss": 0.73729241, + "learning_rate": 1.919545559003247e-06, + "loss": 0.75820255, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.40625, + "step": 8530, + "time_per_iteration": 2.3623945713043213 + }, + { + "auxiliary_loss_clip": 0.0100963, + "auxiliary_loss_mlp": 0.0100117, + "balance_loss_clip": 0.99998391, + "balance_loss_mlp": 1.00188637, + "epoch": 0.5129114685104464, + "flos": 67888573948800.0, + "grad_norm": 0.7545522450990743, + "language_loss": 0.60804641, + "learning_rate": 1.9191680439388954e-06, + "loss": 0.62815446, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.07763672, + "step": 8531, + "time_per_iteration": 3.1450273990631104 + }, + { + "auxiliary_loss_clip": 0.01063307, + "auxiliary_loss_mlp": 0.01023207, + "balance_loss_clip": 1.01036787, + "balance_loss_mlp": 1.02148438, + "epoch": 0.5129715917631144, + "flos": 20448104976000.0, + "grad_norm": 1.8573689553969177, + "language_loss": 0.76849818, + "learning_rate": 1.9187905317592285e-06, + "loss": 0.78936338, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 8532, + "time_per_iteration": 2.3933935165405273 + }, + { + "auxiliary_loss_clip": 0.0106157, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.01400781, + "balance_loss_mlp": 1.02060997, + "epoch": 0.5130317150157824, + "flos": 35114698604160.0, + "grad_norm": 2.8745581343432858, + "language_loss": 0.67426264, + "learning_rate": 1.9184130224777183e-06, + "loss": 0.6951474, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 8533, + "time_per_iteration": 2.52349853515625 + }, + { + "auxiliary_loss_clip": 0.01065245, + "auxiliary_loss_mlp": 0.01025966, + "balance_loss_clip": 1.01213145, + "balance_loss_mlp": 1.02126884, + "epoch": 0.5130918382684503, + "flos": 19790620652160.0, + "grad_norm": 1.819093728501652, + "language_loss": 0.81735611, + "learning_rate": 1.918035516107838e-06, + "loss": 0.83826828, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44140625, + "step": 8534, + "time_per_iteration": 2.3711957931518555 + }, + { + "auxiliary_loss_clip": 0.01066288, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.0158906, + "balance_loss_mlp": 1.02094793, + "epoch": 0.5131519615211183, + "flos": 26321892647040.0, + "grad_norm": 1.7734347549706013, + "language_loss": 0.63493872, + "learning_rate": 1.9176580126630587e-06, + "loss": 0.65591079, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.453125, + "step": 8535, + "time_per_iteration": 2.450385570526123 + }, + { + "auxiliary_loss_clip": 0.01066008, + "auxiliary_loss_mlp": 0.01023394, + "balance_loss_clip": 1.01048958, + "balance_loss_mlp": 1.02253056, + "epoch": 0.5132120847737862, + "flos": 19170912286080.0, + "grad_norm": 1.54593730818312, + "language_loss": 0.69547915, + "learning_rate": 1.917280512156854e-06, + "loss": 0.71637321, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 8536, + "time_per_iteration": 2.3800113201141357 + }, + { + "auxiliary_loss_clip": 0.01062597, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.01519418, + "balance_loss_mlp": 1.01951718, + "epoch": 0.5132722080264542, + "flos": 20229374108160.0, + "grad_norm": 1.9238946617675587, + "language_loss": 0.714082, + "learning_rate": 1.9169030146026944e-06, + "loss": 0.73499954, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4296875, + "step": 8537, + "time_per_iteration": 2.399336576461792 + }, + { + "auxiliary_loss_clip": 0.01065177, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.01435018, + "balance_loss_mlp": 1.02105498, + "epoch": 0.5133323312791223, + "flos": 16468600008960.0, + "grad_norm": 2.0900245905746173, + "language_loss": 0.80211598, + "learning_rate": 1.9165255200140523e-06, + "loss": 0.82304901, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44140625, + "step": 8538, + "time_per_iteration": 2.3456430435180664 + }, + { + "auxiliary_loss_clip": 0.01063001, + "auxiliary_loss_mlp": 0.01025731, + "balance_loss_clip": 1.01302934, + "balance_loss_mlp": 1.01959682, + "epoch": 0.5133924545317902, + "flos": 26066887009920.0, + "grad_norm": 2.001260702780254, + "language_loss": 0.78136873, + "learning_rate": 1.9161480284044e-06, + "loss": 0.80225605, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43359375, + "step": 8539, + "time_per_iteration": 2.4316279888153076 + }, + { + "auxiliary_loss_clip": 0.0106486, + "auxiliary_loss_mlp": 0.01025929, + "balance_loss_clip": 1.01224375, + "balance_loss_mlp": 1.0207181, + "epoch": 0.5134525777844582, + "flos": 29129782475520.0, + "grad_norm": 5.107660588660739, + "language_loss": 0.75887036, + "learning_rate": 1.915770539787209e-06, + "loss": 0.77977824, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44140625, + "step": 8540, + "time_per_iteration": 2.451059579849243 + }, + { + "auxiliary_loss_clip": 0.01065691, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.01708031, + "balance_loss_mlp": 1.0208894, + "epoch": 0.5135127010371261, + "flos": 17453883888000.0, + "grad_norm": 2.7684738333906336, + "language_loss": 0.74663866, + "learning_rate": 1.9153930541759507e-06, + "loss": 0.7676155, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.44726562, + "step": 8541, + "time_per_iteration": 2.3806538581848145 + }, + { + "auxiliary_loss_clip": 0.01062794, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.01488459, + "balance_loss_mlp": 1.02056456, + "epoch": 0.5135728242897941, + "flos": 21943888888320.0, + "grad_norm": 1.5453558434131913, + "language_loss": 0.70710969, + "learning_rate": 1.9150155715840967e-06, + "loss": 0.72802323, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.421875, + "step": 8542, + "time_per_iteration": 2.3881945610046387 + }, + { + "auxiliary_loss_clip": 0.01063736, + "auxiliary_loss_mlp": 0.01024715, + "balance_loss_clip": 1.01194739, + "balance_loss_mlp": 1.01979136, + "epoch": 0.513632947542462, + "flos": 22673748193920.0, + "grad_norm": 1.9881356606234617, + "language_loss": 0.79219329, + "learning_rate": 1.914638092025118e-06, + "loss": 0.81307775, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43945312, + "step": 8543, + "time_per_iteration": 2.39534330368042 + }, + { + "auxiliary_loss_clip": 0.01065733, + "auxiliary_loss_mlp": 0.01031297, + "balance_loss_clip": 1.01752245, + "balance_loss_mlp": 1.02252293, + "epoch": 0.51369307079513, + "flos": 29455976108160.0, + "grad_norm": 1.6774955516140162, + "language_loss": 0.68027627, + "learning_rate": 1.9142606155124863e-06, + "loss": 0.70124662, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43359375, + "step": 8544, + "time_per_iteration": 2.4444215297698975 + }, + { + "auxiliary_loss_clip": 0.01064589, + "auxiliary_loss_mlp": 0.01032462, + "balance_loss_clip": 1.0197897, + "balance_loss_mlp": 1.02022505, + "epoch": 0.513753194047798, + "flos": 18988351453440.0, + "grad_norm": 2.4554472278293114, + "language_loss": 0.72607183, + "learning_rate": 1.9138831420596727e-06, + "loss": 0.7470423, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.44335938, + "step": 8545, + "time_per_iteration": 3.8102076053619385 + }, + { + "auxiliary_loss_clip": 0.01067136, + "auxiliary_loss_mlp": 0.01026853, + "balance_loss_clip": 1.0126847, + "balance_loss_mlp": 1.02228403, + "epoch": 0.513813317300466, + "flos": 17820890766720.0, + "grad_norm": 1.9459127000992726, + "language_loss": 0.82008976, + "learning_rate": 1.9135056716801487e-06, + "loss": 0.84102964, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.44921875, + "step": 8546, + "time_per_iteration": 2.370330810546875 + }, + { + "auxiliary_loss_clip": 0.01063739, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.01552129, + "balance_loss_mlp": 1.02042007, + "epoch": 0.5138734405531339, + "flos": 24060044482560.0, + "grad_norm": 1.8078131920203655, + "language_loss": 0.73479533, + "learning_rate": 1.9131282043873848e-06, + "loss": 0.75572073, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8547, + "time_per_iteration": 2.414874315261841 + }, + { + "auxiliary_loss_clip": 0.01062934, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.01741862, + "balance_loss_mlp": 1.01958776, + "epoch": 0.5139335638058019, + "flos": 26096249329920.0, + "grad_norm": 1.6641729905138687, + "language_loss": 0.72895014, + "learning_rate": 1.912750740194851e-06, + "loss": 0.74988705, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8548, + "time_per_iteration": 2.4236886501312256 + }, + { + "auxiliary_loss_clip": 0.01063694, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01588774, + "balance_loss_mlp": 1.02025104, + "epoch": 0.5139936870584698, + "flos": 18916081205760.0, + "grad_norm": 1.6572310081502495, + "language_loss": 0.81953251, + "learning_rate": 1.9123732791160196e-06, + "loss": 0.84046435, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43359375, + "step": 8549, + "time_per_iteration": 2.3980817794799805 + }, + { + "auxiliary_loss_clip": 0.01062578, + "auxiliary_loss_mlp": 0.01023972, + "balance_loss_clip": 1.01192629, + "balance_loss_mlp": 1.02100444, + "epoch": 0.5140538103111378, + "flos": 16143069692160.0, + "grad_norm": 1.7635594372843486, + "language_loss": 0.71711171, + "learning_rate": 1.91199582116436e-06, + "loss": 0.73797727, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41601562, + "step": 8550, + "time_per_iteration": 2.3784308433532715 + }, + { + "auxiliary_loss_clip": 0.01064121, + "auxiliary_loss_mlp": 0.01027069, + "balance_loss_clip": 1.01383126, + "balance_loss_mlp": 1.01981342, + "epoch": 0.5141139335638057, + "flos": 22419021847680.0, + "grad_norm": 1.5144012874217363, + "language_loss": 0.75147486, + "learning_rate": 1.9116183663533436e-06, + "loss": 0.77238679, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44335938, + "step": 8551, + "time_per_iteration": 2.409386157989502 + }, + { + "auxiliary_loss_clip": 0.01062354, + "auxiliary_loss_mlp": 0.01025562, + "balance_loss_clip": 1.01216912, + "balance_loss_mlp": 1.02059734, + "epoch": 0.5141740568164738, + "flos": 27088410746880.0, + "grad_norm": 1.53778292307916, + "language_loss": 0.69425702, + "learning_rate": 1.9112409146964402e-06, + "loss": 0.71513617, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41796875, + "step": 8552, + "time_per_iteration": 2.424651861190796 + }, + { + "auxiliary_loss_clip": 0.01065173, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.01537561, + "balance_loss_mlp": 1.02064133, + "epoch": 0.5142341800691418, + "flos": 24972080595840.0, + "grad_norm": 2.074024351694409, + "language_loss": 0.74886262, + "learning_rate": 1.9108634662071195e-06, + "loss": 0.76979589, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4453125, + "step": 8553, + "time_per_iteration": 2.4206511974334717 + }, + { + "auxiliary_loss_clip": 0.01063964, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.01567447, + "balance_loss_mlp": 1.02085912, + "epoch": 0.5142943033218097, + "flos": 20703459726720.0, + "grad_norm": 1.6109760562604571, + "language_loss": 0.67313612, + "learning_rate": 1.9104860208988534e-06, + "loss": 0.69405204, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.43164062, + "step": 8554, + "time_per_iteration": 2.385693073272705 + }, + { + "auxiliary_loss_clip": 0.01067456, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.01588571, + "balance_loss_mlp": 1.02167022, + "epoch": 0.5143544265744777, + "flos": 22924494645120.0, + "grad_norm": 2.246356472366354, + "language_loss": 0.73733974, + "learning_rate": 1.9101085787851103e-06, + "loss": 0.75832027, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.45898438, + "step": 8555, + "time_per_iteration": 2.4007205963134766 + }, + { + "auxiliary_loss_clip": 0.01064045, + "auxiliary_loss_mlp": 0.01026645, + "balance_loss_clip": 1.01425362, + "balance_loss_mlp": 1.02157629, + "epoch": 0.5144145498271456, + "flos": 15920568397440.0, + "grad_norm": 2.876843086680328, + "language_loss": 0.74242473, + "learning_rate": 1.9097311398793613e-06, + "loss": 0.76333165, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.42578125, + "step": 8556, + "time_per_iteration": 3.7697930335998535 + }, + { + "auxiliary_loss_clip": 0.01062632, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.01486886, + "balance_loss_mlp": 1.02086973, + "epoch": 0.5144746730798136, + "flos": 19680260244480.0, + "grad_norm": 1.912658718880087, + "language_loss": 0.86125326, + "learning_rate": 1.909353704195075e-06, + "loss": 0.88215703, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 8557, + "time_per_iteration": 2.37068510055542 + }, + { + "auxiliary_loss_clip": 0.01060777, + "auxiliary_loss_mlp": 0.01027597, + "balance_loss_clip": 1.0154314, + "balance_loss_mlp": 1.01995122, + "epoch": 0.5145347963324816, + "flos": 23913583862400.0, + "grad_norm": 1.903926739605707, + "language_loss": 0.89089829, + "learning_rate": 1.9089762717457226e-06, + "loss": 0.91178203, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40820312, + "step": 8558, + "time_per_iteration": 2.4034390449523926 + }, + { + "auxiliary_loss_clip": 0.01062168, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.01472592, + "balance_loss_mlp": 1.02036011, + "epoch": 0.5145949195851496, + "flos": 18259015818240.0, + "grad_norm": 1.8673007462117255, + "language_loss": 0.65070242, + "learning_rate": 1.908598842544773e-06, + "loss": 0.67159867, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 8559, + "time_per_iteration": 2.3857059478759766 + }, + { + "auxiliary_loss_clip": 0.0106312, + "auxiliary_loss_mlp": 0.01024616, + "balance_loss_clip": 1.01239681, + "balance_loss_mlp": 1.02170682, + "epoch": 0.5146550428378175, + "flos": 26212230466560.0, + "grad_norm": 1.785114216506022, + "language_loss": 0.63306248, + "learning_rate": 1.908221416605695e-06, + "loss": 0.65393984, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 8560, + "time_per_iteration": 5.320660352706909 + }, + { + "auxiliary_loss_clip": 0.01061878, + "auxiliary_loss_mlp": 0.01020799, + "balance_loss_clip": 1.00856233, + "balance_loss_mlp": 1.02036142, + "epoch": 0.5147151660904855, + "flos": 22673084878080.0, + "grad_norm": 1.6388572559083328, + "language_loss": 0.70026952, + "learning_rate": 1.9078439939419595e-06, + "loss": 0.72109628, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.4140625, + "step": 8561, + "time_per_iteration": 2.392577886581421 + }, + { + "auxiliary_loss_clip": 0.01064439, + "auxiliary_loss_mlp": 0.01025598, + "balance_loss_clip": 1.01352179, + "balance_loss_mlp": 1.0211643, + "epoch": 0.5147752893431534, + "flos": 24971242723200.0, + "grad_norm": 2.623983103189185, + "language_loss": 0.66800368, + "learning_rate": 1.907466574567034e-06, + "loss": 0.68890405, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.43359375, + "step": 8562, + "time_per_iteration": 2.424085855484009 + }, + { + "auxiliary_loss_clip": 0.01065695, + "auxiliary_loss_mlp": 0.01025462, + "balance_loss_clip": 1.01302266, + "balance_loss_mlp": 1.0219003, + "epoch": 0.5148354125958214, + "flos": 22743644469120.0, + "grad_norm": 1.6061877930612711, + "language_loss": 0.77788877, + "learning_rate": 1.9070891584943885e-06, + "loss": 0.79880035, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4375, + "step": 8563, + "time_per_iteration": 2.3933091163635254 + }, + { + "auxiliary_loss_clip": 0.01066543, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.0134697, + "balance_loss_mlp": 1.02042615, + "epoch": 0.5148955358484893, + "flos": 23067848154240.0, + "grad_norm": 2.0403167993548617, + "language_loss": 0.82782602, + "learning_rate": 1.9067117457374921e-06, + "loss": 0.84877455, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4609375, + "step": 8564, + "time_per_iteration": 2.3840675354003906 + }, + { + "auxiliary_loss_clip": 0.01064485, + "auxiliary_loss_mlp": 0.01025886, + "balance_loss_clip": 1.01270175, + "balance_loss_mlp": 1.01963472, + "epoch": 0.5149556591011574, + "flos": 20339071200000.0, + "grad_norm": 1.9819403982000598, + "language_loss": 0.72747576, + "learning_rate": 1.9063343363098132e-06, + "loss": 0.74837947, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44921875, + "step": 8565, + "time_per_iteration": 2.382279396057129 + }, + { + "auxiliary_loss_clip": 0.01067292, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.01119947, + "balance_loss_mlp": 1.02208757, + "epoch": 0.5150157823538254, + "flos": 22637124311040.0, + "grad_norm": 3.3483394809113958, + "language_loss": 0.68132526, + "learning_rate": 1.9059569302248213e-06, + "loss": 0.70225859, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.45117188, + "step": 8566, + "time_per_iteration": 2.4113473892211914 + }, + { + "auxiliary_loss_clip": 0.01062613, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.01383424, + "balance_loss_mlp": 1.01816559, + "epoch": 0.5150759056064933, + "flos": 26066433162240.0, + "grad_norm": 1.5704462838076656, + "language_loss": 0.7792542, + "learning_rate": 1.9055795274959841e-06, + "loss": 0.80015552, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4453125, + "step": 8567, + "time_per_iteration": 2.4280288219451904 + }, + { + "auxiliary_loss_clip": 0.01065977, + "auxiliary_loss_mlp": 0.01027835, + "balance_loss_clip": 1.01371455, + "balance_loss_mlp": 1.02155542, + "epoch": 0.5151360288591613, + "flos": 25951604100480.0, + "grad_norm": 1.8868865822393368, + "language_loss": 0.78597939, + "learning_rate": 1.9052021281367711e-06, + "loss": 0.80691743, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.4453125, + "step": 8568, + "time_per_iteration": 2.4889495372772217 + }, + { + "auxiliary_loss_clip": 0.01061719, + "auxiliary_loss_mlp": 0.01026762, + "balance_loss_clip": 1.0140717, + "balance_loss_mlp": 1.01902366, + "epoch": 0.5151961521118292, + "flos": 18506480601600.0, + "grad_norm": 2.3799054721551, + "language_loss": 0.90540063, + "learning_rate": 1.9048247321606505e-06, + "loss": 0.92628551, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42578125, + "step": 8569, + "time_per_iteration": 2.3648598194122314 + }, + { + "auxiliary_loss_clip": 0.01063607, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.01455557, + "balance_loss_mlp": 1.02003515, + "epoch": 0.5152562753644973, + "flos": 22232690588160.0, + "grad_norm": 1.6831260865204882, + "language_loss": 0.76631021, + "learning_rate": 1.90444733958109e-06, + "loss": 0.78722525, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4375, + "step": 8570, + "time_per_iteration": 2.4034836292266846 + }, + { + "auxiliary_loss_clip": 0.01063192, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.01728261, + "balance_loss_mlp": 1.019526, + "epoch": 0.5153163986171652, + "flos": 38435008590720.0, + "grad_norm": 1.75254414902045, + "language_loss": 0.7372635, + "learning_rate": 1.9040699504115584e-06, + "loss": 0.75821877, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.4375, + "step": 8571, + "time_per_iteration": 2.529325246810913 + }, + { + "auxiliary_loss_clip": 0.01009398, + "auxiliary_loss_mlp": 0.01001244, + "balance_loss_clip": 1.00025415, + "balance_loss_mlp": 1.00133014, + "epoch": 0.5153765218698332, + "flos": 66381164553600.0, + "grad_norm": 0.9370379429218981, + "language_loss": 0.52937949, + "learning_rate": 1.9036925646655231e-06, + "loss": 0.54948592, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.08056641, + "step": 8572, + "time_per_iteration": 3.116089105606079 + }, + { + "auxiliary_loss_clip": 0.01063499, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.01369679, + "balance_loss_mlp": 1.02210116, + "epoch": 0.5154366451225011, + "flos": 24023525333760.0, + "grad_norm": 1.6395898963832904, + "language_loss": 0.76106381, + "learning_rate": 1.9033151823564531e-06, + "loss": 0.78195918, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4140625, + "step": 8573, + "time_per_iteration": 2.401439666748047 + }, + { + "auxiliary_loss_clip": 0.01061019, + "auxiliary_loss_mlp": 0.01020963, + "balance_loss_clip": 1.00825477, + "balance_loss_mlp": 1.01958668, + "epoch": 0.5154967683751691, + "flos": 23467952868480.0, + "grad_norm": 4.01394674618122, + "language_loss": 0.77255023, + "learning_rate": 1.9029378034978153e-06, + "loss": 0.79337013, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4140625, + "step": 8574, + "time_per_iteration": 2.4092581272125244 + }, + { + "auxiliary_loss_clip": 0.01064052, + "auxiliary_loss_mlp": 0.0102642, + "balance_loss_clip": 1.01424909, + "balance_loss_mlp": 1.02098906, + "epoch": 0.515556891627837, + "flos": 23804515175040.0, + "grad_norm": 1.6290847244847881, + "language_loss": 0.8510235, + "learning_rate": 1.9025604281030772e-06, + "loss": 0.87192822, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4296875, + "step": 8575, + "time_per_iteration": 2.3891799449920654 + }, + { + "auxiliary_loss_clip": 0.01064071, + "auxiliary_loss_mlp": 0.01023288, + "balance_loss_clip": 1.01007366, + "balance_loss_mlp": 1.02086866, + "epoch": 0.515617014880505, + "flos": 19827523825920.0, + "grad_norm": 1.6365487320643097, + "language_loss": 0.76508152, + "learning_rate": 1.9021830561857074e-06, + "loss": 0.78595513, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43164062, + "step": 8576, + "time_per_iteration": 2.396468162536621 + }, + { + "auxiliary_loss_clip": 0.01065928, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.01294541, + "balance_loss_mlp": 1.02106845, + "epoch": 0.515677138133173, + "flos": 14245051472640.0, + "grad_norm": 3.218605678400676, + "language_loss": 0.75692672, + "learning_rate": 1.9018056877591725e-06, + "loss": 0.77786231, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.44921875, + "step": 8577, + "time_per_iteration": 2.347573757171631 + }, + { + "auxiliary_loss_clip": 0.01067566, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.01544833, + "balance_loss_mlp": 1.02147841, + "epoch": 0.515737261385841, + "flos": 28288550332800.0, + "grad_norm": 1.6303528698340897, + "language_loss": 0.81102747, + "learning_rate": 1.9014283228369399e-06, + "loss": 0.83200455, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.4609375, + "step": 8578, + "time_per_iteration": 2.437626838684082 + }, + { + "auxiliary_loss_clip": 0.01061875, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.01619434, + "balance_loss_mlp": 1.0198369, + "epoch": 0.515797384638509, + "flos": 27890679945600.0, + "grad_norm": 1.9308402979918513, + "language_loss": 0.76163745, + "learning_rate": 1.9010509614324766e-06, + "loss": 0.78254533, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 8579, + "time_per_iteration": 2.4334070682525635 + }, + { + "auxiliary_loss_clip": 0.01065551, + "auxiliary_loss_mlp": 0.01024606, + "balance_loss_clip": 1.0119698, + "balance_loss_mlp": 1.02184522, + "epoch": 0.5158575078911769, + "flos": 23038939681920.0, + "grad_norm": 1.6152653013643594, + "language_loss": 0.79055429, + "learning_rate": 1.9006736035592505e-06, + "loss": 0.81145585, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 8580, + "time_per_iteration": 2.4248812198638916 + }, + { + "auxiliary_loss_clip": 0.01067077, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.01682472, + "balance_loss_mlp": 1.02118945, + "epoch": 0.5159176311438449, + "flos": 12640513075200.0, + "grad_norm": 2.1231701638768157, + "language_loss": 0.72449744, + "learning_rate": 1.900296249230728e-06, + "loss": 0.74547899, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45898438, + "step": 8581, + "time_per_iteration": 2.352936267852783 + }, + { + "auxiliary_loss_clip": 0.0106539, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.01442552, + "balance_loss_mlp": 1.02240801, + "epoch": 0.5159777543965128, + "flos": 15557297034240.0, + "grad_norm": 2.1011479805531694, + "language_loss": 0.74290919, + "learning_rate": 1.8999188984603753e-06, + "loss": 0.7638256, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.4296875, + "step": 8582, + "time_per_iteration": 2.383453607559204 + }, + { + "auxiliary_loss_clip": 0.01062763, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.0145328, + "balance_loss_mlp": 1.02022612, + "epoch": 0.5160378776491809, + "flos": 23220557907840.0, + "grad_norm": 2.512810825956543, + "language_loss": 0.65958428, + "learning_rate": 1.8995415512616602e-06, + "loss": 0.68048209, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 8583, + "time_per_iteration": 2.4067630767822266 + }, + { + "auxiliary_loss_clip": 0.01064544, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.0152123, + "balance_loss_mlp": 1.02116799, + "epoch": 0.5160980009018488, + "flos": 21943539774720.0, + "grad_norm": 1.3346747131386665, + "language_loss": 0.76014125, + "learning_rate": 1.8991642076480482e-06, + "loss": 0.78107202, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8584, + "time_per_iteration": 3.8281962871551514 + }, + { + "auxiliary_loss_clip": 0.01064969, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.01597762, + "balance_loss_mlp": 1.02060127, + "epoch": 0.5161581241545168, + "flos": 22782956526720.0, + "grad_norm": 2.462984557863417, + "language_loss": 0.78994662, + "learning_rate": 1.8987868676330068e-06, + "loss": 0.81088769, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.44335938, + "step": 8585, + "time_per_iteration": 2.3828964233398438 + }, + { + "auxiliary_loss_clip": 0.01059194, + "auxiliary_loss_mlp": 0.01025925, + "balance_loss_clip": 1.01471376, + "balance_loss_mlp": 1.01869035, + "epoch": 0.5162182474071847, + "flos": 19674569692800.0, + "grad_norm": 2.1027604714819095, + "language_loss": 0.80815399, + "learning_rate": 1.8984095312300017e-06, + "loss": 0.82900512, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.40429688, + "step": 8586, + "time_per_iteration": 2.3655002117156982 + }, + { + "auxiliary_loss_clip": 0.01063652, + "auxiliary_loss_mlp": 0.01024408, + "balance_loss_clip": 1.01189137, + "balance_loss_mlp": 1.01999187, + "epoch": 0.5162783706598527, + "flos": 20045207352960.0, + "grad_norm": 1.5823194989407463, + "language_loss": 0.71199477, + "learning_rate": 1.8980321984524988e-06, + "loss": 0.73287535, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4375, + "step": 8587, + "time_per_iteration": 2.381086826324463 + }, + { + "auxiliary_loss_clip": 0.01009263, + "auxiliary_loss_mlp": 0.01002177, + "balance_loss_clip": 1.00113404, + "balance_loss_mlp": 1.001526, + "epoch": 0.5163384939125206, + "flos": 69956131063680.0, + "grad_norm": 0.7290897968694928, + "language_loss": 0.57772815, + "learning_rate": 1.8976548693139648e-06, + "loss": 0.59784245, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.07714844, + "step": 8588, + "time_per_iteration": 3.053659677505493 + }, + { + "auxiliary_loss_clip": 0.01064303, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.01270247, + "balance_loss_mlp": 1.02113605, + "epoch": 0.5163986171651886, + "flos": 17416177752960.0, + "grad_norm": 1.771245965545318, + "language_loss": 0.7544713, + "learning_rate": 1.8972775438278646e-06, + "loss": 0.77537906, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43164062, + "step": 8589, + "time_per_iteration": 2.4007463455200195 + }, + { + "auxiliary_loss_clip": 0.01063194, + "auxiliary_loss_mlp": 0.01023497, + "balance_loss_clip": 1.01086116, + "balance_loss_mlp": 1.01953793, + "epoch": 0.5164587404178566, + "flos": 21321666904320.0, + "grad_norm": 2.016888732819929, + "language_loss": 0.67259318, + "learning_rate": 1.8969002220076654e-06, + "loss": 0.69346011, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 8590, + "time_per_iteration": 2.381500720977783 + }, + { + "auxiliary_loss_clip": 0.01009493, + "auxiliary_loss_mlp": 0.01001053, + "balance_loss_clip": 1.00011158, + "balance_loss_mlp": 1.0015955, + "epoch": 0.5165188636705246, + "flos": 68053923480960.0, + "grad_norm": 0.7856117623290707, + "language_loss": 0.55872887, + "learning_rate": 1.8965229038668323e-06, + "loss": 0.57883435, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.07910156, + "step": 8591, + "time_per_iteration": 2.9615724086761475 + }, + { + "auxiliary_loss_clip": 0.01057826, + "auxiliary_loss_mlp": 0.01020022, + "balance_loss_clip": 1.00892961, + "balance_loss_mlp": 1.01898217, + "epoch": 0.5165789869231926, + "flos": 19384790474880.0, + "grad_norm": 1.9169421080911417, + "language_loss": 0.81306082, + "learning_rate": 1.8961455894188297e-06, + "loss": 0.8338393, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.38867188, + "step": 8592, + "time_per_iteration": 2.3862154483795166 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.01442671, + "balance_loss_mlp": 1.02160466, + "epoch": 0.5166391101758605, + "flos": 20959128679680.0, + "grad_norm": 1.8535244972404188, + "language_loss": 0.83033639, + "learning_rate": 1.8957682786771243e-06, + "loss": 0.85121787, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40429688, + "step": 8593, + "time_per_iteration": 2.3878390789031982 + }, + { + "auxiliary_loss_clip": 0.01064666, + "auxiliary_loss_mlp": 0.01026335, + "balance_loss_clip": 1.01422358, + "balance_loss_mlp": 1.02191043, + "epoch": 0.5166992334285285, + "flos": 29461073166720.0, + "grad_norm": 1.7052146213371429, + "language_loss": 0.69269955, + "learning_rate": 1.8953909716551807e-06, + "loss": 0.71360958, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.42773438, + "step": 8594, + "time_per_iteration": 2.486936569213867 + }, + { + "auxiliary_loss_clip": 0.0106174, + "auxiliary_loss_mlp": 0.01025627, + "balance_loss_clip": 1.01325917, + "balance_loss_mlp": 1.01914716, + "epoch": 0.5167593566811964, + "flos": 20303285189760.0, + "grad_norm": 1.4257163743274672, + "language_loss": 0.77230102, + "learning_rate": 1.8950136683664645e-06, + "loss": 0.79317474, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.42578125, + "step": 8595, + "time_per_iteration": 3.8270492553710938 + }, + { + "auxiliary_loss_clip": 0.01062036, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.01699138, + "balance_loss_mlp": 1.02030897, + "epoch": 0.5168194799338645, + "flos": 14610487340160.0, + "grad_norm": 1.6420491820493845, + "language_loss": 0.65254849, + "learning_rate": 1.8946363688244405e-06, + "loss": 0.67345715, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41796875, + "step": 8596, + "time_per_iteration": 2.376950979232788 + }, + { + "auxiliary_loss_clip": 0.0106402, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.01532984, + "balance_loss_mlp": 1.02205253, + "epoch": 0.5168796031865324, + "flos": 25006155949440.0, + "grad_norm": 1.679185956822758, + "language_loss": 0.7540834, + "learning_rate": 1.894259073042573e-06, + "loss": 0.77502048, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.41992188, + "step": 8597, + "time_per_iteration": 2.4293720722198486 + }, + { + "auxiliary_loss_clip": 0.01061237, + "auxiliary_loss_mlp": 0.01021967, + "balance_loss_clip": 1.00971222, + "balance_loss_mlp": 1.01898766, + "epoch": 0.5169397264392004, + "flos": 26938843015680.0, + "grad_norm": 2.0301453587200364, + "language_loss": 0.81287086, + "learning_rate": 1.8938817810343276e-06, + "loss": 0.83370292, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.421875, + "step": 8598, + "time_per_iteration": 2.4291484355926514 + }, + { + "auxiliary_loss_clip": 0.01062443, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.01511407, + "balance_loss_mlp": 1.01947618, + "epoch": 0.5169998496918683, + "flos": 25406714511360.0, + "grad_norm": 1.4788456172953721, + "language_loss": 0.74948013, + "learning_rate": 1.8935044928131679e-06, + "loss": 0.77038515, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4296875, + "step": 8599, + "time_per_iteration": 3.903721570968628 + }, + { + "auxiliary_loss_clip": 0.01060402, + "auxiliary_loss_mlp": 0.01025186, + "balance_loss_clip": 1.01305652, + "balance_loss_mlp": 1.01953316, + "epoch": 0.5170599729445363, + "flos": 24679648114560.0, + "grad_norm": 2.729002102285012, + "language_loss": 0.7187227, + "learning_rate": 1.8931272083925593e-06, + "loss": 0.73957855, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40820312, + "step": 8600, + "time_per_iteration": 3.8609766960144043 + }, + { + "auxiliary_loss_clip": 0.01062463, + "auxiliary_loss_mlp": 0.01025062, + "balance_loss_clip": 1.01197243, + "balance_loss_mlp": 1.02054715, + "epoch": 0.5171200961972042, + "flos": 20993448412800.0, + "grad_norm": 1.619537225059684, + "language_loss": 0.74561775, + "learning_rate": 1.8927499277859655e-06, + "loss": 0.76649296, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41992188, + "step": 8601, + "time_per_iteration": 2.3999106884002686 + }, + { + "auxiliary_loss_clip": 0.0106349, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01734698, + "balance_loss_mlp": 1.02085733, + "epoch": 0.5171802194498722, + "flos": 22744587075840.0, + "grad_norm": 1.9563425605984894, + "language_loss": 0.83817935, + "learning_rate": 1.8923726510068513e-06, + "loss": 0.85912222, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42578125, + "step": 8602, + "time_per_iteration": 2.4158003330230713 + }, + { + "auxiliary_loss_clip": 0.01063534, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01669669, + "balance_loss_mlp": 1.0203445, + "epoch": 0.5172403427025402, + "flos": 28175676307200.0, + "grad_norm": 2.1680591718674105, + "language_loss": 0.74477601, + "learning_rate": 1.8919953780686804e-06, + "loss": 0.76571363, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43164062, + "step": 8603, + "time_per_iteration": 2.44724440574646 + }, + { + "auxiliary_loss_clip": 0.01064571, + "auxiliary_loss_mlp": 0.01028319, + "balance_loss_clip": 1.01557517, + "balance_loss_mlp": 1.02156329, + "epoch": 0.5173004659552082, + "flos": 20336836872960.0, + "grad_norm": 2.1844962998612, + "language_loss": 0.7254473, + "learning_rate": 1.8916181089849162e-06, + "loss": 0.74637628, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4296875, + "step": 8604, + "time_per_iteration": 2.39406681060791 + }, + { + "auxiliary_loss_clip": 0.01067449, + "auxiliary_loss_mlp": 0.01027273, + "balance_loss_clip": 1.01305151, + "balance_loss_mlp": 1.02033842, + "epoch": 0.5173605892078762, + "flos": 19062297446400.0, + "grad_norm": 1.8992652360090818, + "language_loss": 0.84961462, + "learning_rate": 1.8912408437690234e-06, + "loss": 0.87056184, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.47070312, + "step": 8605, + "time_per_iteration": 2.407050609588623 + }, + { + "auxiliary_loss_clip": 0.01061414, + "auxiliary_loss_mlp": 0.01022617, + "balance_loss_clip": 1.01018941, + "balance_loss_mlp": 1.01973724, + "epoch": 0.5174207124605441, + "flos": 27994092992640.0, + "grad_norm": 1.5959544150121643, + "language_loss": 0.64720392, + "learning_rate": 1.8908635824344648e-06, + "loss": 0.66804427, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41601562, + "step": 8606, + "time_per_iteration": 2.498565673828125 + }, + { + "auxiliary_loss_clip": 0.01061419, + "auxiliary_loss_mlp": 0.01021795, + "balance_loss_clip": 1.00944459, + "balance_loss_mlp": 1.02066326, + "epoch": 0.5174808357132121, + "flos": 19495744375680.0, + "grad_norm": 1.555303228270742, + "language_loss": 0.77861547, + "learning_rate": 1.8904863249947043e-06, + "loss": 0.7994476, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40820312, + "step": 8607, + "time_per_iteration": 2.3904807567596436 + }, + { + "auxiliary_loss_clip": 0.01064333, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.01745725, + "balance_loss_mlp": 1.02074707, + "epoch": 0.51754095896588, + "flos": 22783061260800.0, + "grad_norm": 1.9806704328168818, + "language_loss": 0.72279507, + "learning_rate": 1.8901090714632054e-06, + "loss": 0.74375051, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43554688, + "step": 8608, + "time_per_iteration": 2.440455436706543 + }, + { + "auxiliary_loss_clip": 0.01066632, + "auxiliary_loss_mlp": 0.01027224, + "balance_loss_clip": 1.01339626, + "balance_loss_mlp": 1.02107573, + "epoch": 0.5176010822185481, + "flos": 22668302021760.0, + "grad_norm": 1.9635397234723746, + "language_loss": 0.86536765, + "learning_rate": 1.8897318218534304e-06, + "loss": 0.88630623, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45507812, + "step": 8609, + "time_per_iteration": 2.4082303047180176 + }, + { + "auxiliary_loss_clip": 0.01061546, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.01383519, + "balance_loss_mlp": 1.01994228, + "epoch": 0.517661205471216, + "flos": 23950068099840.0, + "grad_norm": 1.5290675816201513, + "language_loss": 0.73418075, + "learning_rate": 1.8893545761788436e-06, + "loss": 0.75505507, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41601562, + "step": 8610, + "time_per_iteration": 2.4440243244171143 + }, + { + "auxiliary_loss_clip": 0.01066817, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.01373744, + "balance_loss_mlp": 1.02108836, + "epoch": 0.517721328723884, + "flos": 15595177726080.0, + "grad_norm": 1.7610990976355607, + "language_loss": 0.67004204, + "learning_rate": 1.8889773344529068e-06, + "loss": 0.69099063, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.45703125, + "step": 8611, + "time_per_iteration": 2.3665382862091064 + }, + { + "auxiliary_loss_clip": 0.01063493, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.01536047, + "balance_loss_mlp": 1.02048242, + "epoch": 0.5177814519765519, + "flos": 20959128679680.0, + "grad_norm": 2.1109098903846157, + "language_loss": 0.76966476, + "learning_rate": 1.888600096689084e-06, + "loss": 0.79058278, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4296875, + "step": 8612, + "time_per_iteration": 2.4196343421936035 + }, + { + "auxiliary_loss_clip": 0.0106529, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.0186187, + "balance_loss_mlp": 1.02166688, + "epoch": 0.5178415752292199, + "flos": 17966862627840.0, + "grad_norm": 1.9859941169621491, + "language_loss": 0.82401711, + "learning_rate": 1.888222862900837e-06, + "loss": 0.84498549, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43554688, + "step": 8613, + "time_per_iteration": 2.3658063411712646 + }, + { + "auxiliary_loss_clip": 0.01063731, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.01408076, + "balance_loss_mlp": 1.02126718, + "epoch": 0.5179016984818878, + "flos": 17820541653120.0, + "grad_norm": 2.119453451787835, + "language_loss": 0.65792572, + "learning_rate": 1.887845633101628e-06, + "loss": 0.67883813, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42578125, + "step": 8614, + "time_per_iteration": 2.3360679149627686 + }, + { + "auxiliary_loss_clip": 0.01063629, + "auxiliary_loss_mlp": 0.01026968, + "balance_loss_clip": 1.01406932, + "balance_loss_mlp": 1.02011037, + "epoch": 0.5179618217345558, + "flos": 17819529223680.0, + "grad_norm": 1.8927633916916444, + "language_loss": 0.86430585, + "learning_rate": 1.8874684073049204e-06, + "loss": 0.88521183, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43554688, + "step": 8615, + "time_per_iteration": 2.355849266052246 + }, + { + "auxiliary_loss_clip": 0.01061315, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.02075911, + "balance_loss_mlp": 1.02035213, + "epoch": 0.5180219449872238, + "flos": 22521212997120.0, + "grad_norm": 1.6302433417745683, + "language_loss": 0.81043673, + "learning_rate": 1.8870911855241755e-06, + "loss": 0.83137691, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41015625, + "step": 8616, + "time_per_iteration": 2.391752004623413 + }, + { + "auxiliary_loss_clip": 0.01068516, + "auxiliary_loss_mlp": 0.0103044, + "balance_loss_clip": 1.01658773, + "balance_loss_mlp": 1.0232507, + "epoch": 0.5180820682398918, + "flos": 23914317000960.0, + "grad_norm": 1.7554669255341897, + "language_loss": 0.74738073, + "learning_rate": 1.8867139677728564e-06, + "loss": 0.76837027, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.453125, + "step": 8617, + "time_per_iteration": 2.411194324493408 + }, + { + "auxiliary_loss_clip": 0.01063547, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.01574373, + "balance_loss_mlp": 1.02001143, + "epoch": 0.5181421914925598, + "flos": 16979065130880.0, + "grad_norm": 1.6186562906011936, + "language_loss": 0.6996032, + "learning_rate": 1.886336754064424e-06, + "loss": 0.72053206, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43554688, + "step": 8618, + "time_per_iteration": 2.347165822982788 + }, + { + "auxiliary_loss_clip": 0.01009283, + "auxiliary_loss_mlp": 0.0100231, + "balance_loss_clip": 1.00129628, + "balance_loss_mlp": 1.00128889, + "epoch": 0.5182023147452277, + "flos": 66056437198080.0, + "grad_norm": 0.9545044140787943, + "language_loss": 0.6725986, + "learning_rate": 1.8859595444123401e-06, + "loss": 0.69271457, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.08007812, + "step": 8619, + "time_per_iteration": 2.9871037006378174 + }, + { + "auxiliary_loss_clip": 0.01061169, + "auxiliary_loss_mlp": 0.01025951, + "balance_loss_clip": 1.01292729, + "balance_loss_mlp": 1.02050734, + "epoch": 0.5182624379978957, + "flos": 18186745570560.0, + "grad_norm": 2.0995267137577467, + "language_loss": 0.79690659, + "learning_rate": 1.8855823388300672e-06, + "loss": 0.81777787, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 8620, + "time_per_iteration": 2.391857385635376 + }, + { + "auxiliary_loss_clip": 0.01061795, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.01460409, + "balance_loss_mlp": 1.02065063, + "epoch": 0.5183225612505636, + "flos": 14025866757120.0, + "grad_norm": 2.1545571705244075, + "language_loss": 0.8243289, + "learning_rate": 1.8852051373310665e-06, + "loss": 0.84521556, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41210938, + "step": 8621, + "time_per_iteration": 2.3703715801239014 + }, + { + "auxiliary_loss_clip": 0.01063947, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.01305127, + "balance_loss_mlp": 1.0227139, + "epoch": 0.5183826845032317, + "flos": 23658648048000.0, + "grad_norm": 2.071264847593946, + "language_loss": 0.65391964, + "learning_rate": 1.8848279399287987e-06, + "loss": 0.67480552, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.41210938, + "step": 8622, + "time_per_iteration": 2.3912174701690674 + }, + { + "auxiliary_loss_clip": 0.01065463, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.01658499, + "balance_loss_mlp": 1.02109003, + "epoch": 0.5184428077558996, + "flos": 15887680030080.0, + "grad_norm": 2.0474295735739467, + "language_loss": 0.60746336, + "learning_rate": 1.8844507466367254e-06, + "loss": 0.62842155, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4453125, + "step": 8623, + "time_per_iteration": 2.3761682510375977 + }, + { + "auxiliary_loss_clip": 0.01062265, + "auxiliary_loss_mlp": 0.01022299, + "balance_loss_clip": 1.01019907, + "balance_loss_mlp": 1.01954389, + "epoch": 0.5185029310085676, + "flos": 21029827916160.0, + "grad_norm": 1.810903248728098, + "language_loss": 0.76347339, + "learning_rate": 1.8840735574683082e-06, + "loss": 0.78431898, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.42773438, + "step": 8624, + "time_per_iteration": 3.7959046363830566 + }, + { + "auxiliary_loss_clip": 0.0106302, + "auxiliary_loss_mlp": 0.01026136, + "balance_loss_clip": 1.01403022, + "balance_loss_mlp": 1.02025187, + "epoch": 0.5185630542612355, + "flos": 26541461387520.0, + "grad_norm": 1.7838146114836777, + "language_loss": 0.71333724, + "learning_rate": 1.8836963724370074e-06, + "loss": 0.73422885, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.42773438, + "step": 8625, + "time_per_iteration": 2.421334743499756 + }, + { + "auxiliary_loss_clip": 0.01063336, + "auxiliary_loss_mlp": 0.0102595, + "balance_loss_clip": 1.01359987, + "balance_loss_mlp": 1.02045393, + "epoch": 0.5186231775139035, + "flos": 20667359514240.0, + "grad_norm": 2.1183555355707053, + "language_loss": 0.67892081, + "learning_rate": 1.8833191915562835e-06, + "loss": 0.69981372, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42773438, + "step": 8626, + "time_per_iteration": 2.391024351119995 + }, + { + "auxiliary_loss_clip": 0.01063099, + "auxiliary_loss_mlp": 0.01025703, + "balance_loss_clip": 1.01353741, + "balance_loss_mlp": 1.02117598, + "epoch": 0.5186833007665714, + "flos": 20884484459520.0, + "grad_norm": 1.9710364172466657, + "language_loss": 0.70196998, + "learning_rate": 1.8829420148395978e-06, + "loss": 0.72285795, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41992188, + "step": 8627, + "time_per_iteration": 2.3716142177581787 + }, + { + "auxiliary_loss_clip": 0.01066828, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.01516104, + "balance_loss_mlp": 1.02147627, + "epoch": 0.5187434240192395, + "flos": 20885846002560.0, + "grad_norm": 2.1692563884686926, + "language_loss": 0.73085022, + "learning_rate": 1.8825648423004101e-06, + "loss": 0.7518062, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.453125, + "step": 8628, + "time_per_iteration": 2.4150121212005615 + }, + { + "auxiliary_loss_clip": 0.01063559, + "auxiliary_loss_mlp": 0.01026696, + "balance_loss_clip": 1.01488197, + "balance_loss_mlp": 1.02211094, + "epoch": 0.5188035472719074, + "flos": 19859050650240.0, + "grad_norm": 1.622088273274129, + "language_loss": 0.77663988, + "learning_rate": 1.8821876739521815e-06, + "loss": 0.79754239, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.4140625, + "step": 8629, + "time_per_iteration": 2.4136555194854736 + }, + { + "auxiliary_loss_clip": 0.01064558, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.01674318, + "balance_loss_mlp": 1.0212661, + "epoch": 0.5188636705245754, + "flos": 21137360503680.0, + "grad_norm": 3.3410542119409747, + "language_loss": 0.74862528, + "learning_rate": 1.881810509808372e-06, + "loss": 0.76956499, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43164062, + "step": 8630, + "time_per_iteration": 2.3841729164123535 + }, + { + "auxiliary_loss_clip": 0.01063602, + "auxiliary_loss_mlp": 0.01023372, + "balance_loss_clip": 1.01040244, + "balance_loss_mlp": 1.01967597, + "epoch": 0.5189237937772434, + "flos": 22418672734080.0, + "grad_norm": 2.449283581462629, + "language_loss": 0.80216712, + "learning_rate": 1.8814333498824409e-06, + "loss": 0.82303685, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43945312, + "step": 8631, + "time_per_iteration": 2.3836545944213867 + }, + { + "auxiliary_loss_clip": 0.01064193, + "auxiliary_loss_mlp": 0.01025645, + "balance_loss_clip": 1.01212704, + "balance_loss_mlp": 1.02089179, + "epoch": 0.5189839170299113, + "flos": 25445537809920.0, + "grad_norm": 1.459117878235685, + "language_loss": 0.73941684, + "learning_rate": 1.8810561941878488e-06, + "loss": 0.76031518, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 8632, + "time_per_iteration": 2.437873363494873 + }, + { + "auxiliary_loss_clip": 0.0105897, + "auxiliary_loss_mlp": 0.01021674, + "balance_loss_clip": 1.00991392, + "balance_loss_mlp": 1.01907563, + "epoch": 0.5190440402825793, + "flos": 18586745550720.0, + "grad_norm": 2.0606110712961483, + "language_loss": 0.78963155, + "learning_rate": 1.880679042738055e-06, + "loss": 0.81043804, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3984375, + "step": 8633, + "time_per_iteration": 2.361757516860962 + }, + { + "auxiliary_loss_clip": 0.0106298, + "auxiliary_loss_mlp": 0.01023341, + "balance_loss_clip": 1.01091325, + "balance_loss_mlp": 1.02102065, + "epoch": 0.5191041635352472, + "flos": 21907544296320.0, + "grad_norm": 1.604433784814693, + "language_loss": 0.80687696, + "learning_rate": 1.8803018955465194e-06, + "loss": 0.82774019, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 8634, + "time_per_iteration": 2.382355213165283 + }, + { + "auxiliary_loss_clip": 0.01061109, + "auxiliary_loss_mlp": 0.01022725, + "balance_loss_clip": 1.01076245, + "balance_loss_mlp": 1.0203979, + "epoch": 0.5191642867879153, + "flos": 27526710355200.0, + "grad_norm": 1.9797386386800828, + "language_loss": 0.69017166, + "learning_rate": 1.8799247526267015e-06, + "loss": 0.71100998, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 8635, + "time_per_iteration": 3.8485634326934814 + }, + { + "auxiliary_loss_clip": 0.01064935, + "auxiliary_loss_mlp": 0.01022915, + "balance_loss_clip": 1.00977778, + "balance_loss_mlp": 1.02194524, + "epoch": 0.5192244100405832, + "flos": 15705084286080.0, + "grad_norm": 1.9430890962799163, + "language_loss": 0.77290529, + "learning_rate": 1.87954761399206e-06, + "loss": 0.79378378, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4296875, + "step": 8636, + "time_per_iteration": 2.368393898010254 + }, + { + "auxiliary_loss_clip": 0.01064287, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.01615429, + "balance_loss_mlp": 1.02057505, + "epoch": 0.5192845332932512, + "flos": 12056276517120.0, + "grad_norm": 2.3635678353034217, + "language_loss": 0.72188997, + "learning_rate": 1.8791704796560547e-06, + "loss": 0.74282265, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4375, + "step": 8637, + "time_per_iteration": 2.3557796478271484 + }, + { + "auxiliary_loss_clip": 0.01061806, + "auxiliary_loss_mlp": 0.0102362, + "balance_loss_clip": 1.01110291, + "balance_loss_mlp": 1.02016902, + "epoch": 0.5193446565459191, + "flos": 18952181418240.0, + "grad_norm": 1.9739700471928023, + "language_loss": 0.76263416, + "learning_rate": 1.8787933496321433e-06, + "loss": 0.78348839, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41601562, + "step": 8638, + "time_per_iteration": 2.376089572906494 + }, + { + "auxiliary_loss_clip": 0.0106055, + "auxiliary_loss_mlp": 0.01023201, + "balance_loss_clip": 1.01116073, + "balance_loss_mlp": 1.01949227, + "epoch": 0.5194047797985871, + "flos": 20373949514880.0, + "grad_norm": 1.8231990719268913, + "language_loss": 0.74778438, + "learning_rate": 1.8784162239337862e-06, + "loss": 0.76862186, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41015625, + "step": 8639, + "time_per_iteration": 3.8090946674346924 + }, + { + "auxiliary_loss_clip": 0.01062088, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.01244676, + "balance_loss_mlp": 1.02082968, + "epoch": 0.519464903051255, + "flos": 24351848559360.0, + "grad_norm": 1.872244248991094, + "language_loss": 0.77478528, + "learning_rate": 1.8780391025744413e-06, + "loss": 0.79565853, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 8640, + "time_per_iteration": 2.4176156520843506 + }, + { + "auxiliary_loss_clip": 0.01061919, + "auxiliary_loss_mlp": 0.01026946, + "balance_loss_clip": 1.01488829, + "balance_loss_mlp": 1.02064562, + "epoch": 0.519525026303923, + "flos": 14061024362880.0, + "grad_norm": 3.8566300972879493, + "language_loss": 0.66148591, + "learning_rate": 1.8776619855675666e-06, + "loss": 0.6823746, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41210938, + "step": 8641, + "time_per_iteration": 2.371037244796753 + }, + { + "auxiliary_loss_clip": 0.01062717, + "auxiliary_loss_mlp": 0.01025203, + "balance_loss_clip": 1.01280487, + "balance_loss_mlp": 1.021191, + "epoch": 0.519585149556591, + "flos": 28834731642240.0, + "grad_norm": 1.7775720173240823, + "language_loss": 0.76335937, + "learning_rate": 1.8772848729266212e-06, + "loss": 0.78423858, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41601562, + "step": 8642, + "time_per_iteration": 2.4615001678466797 + }, + { + "auxiliary_loss_clip": 0.01060347, + "auxiliary_loss_mlp": 0.01021382, + "balance_loss_clip": 1.0096637, + "balance_loss_mlp": 1.01935172, + "epoch": 0.519645272809259, + "flos": 25371871107840.0, + "grad_norm": 1.5942641359303724, + "language_loss": 0.8293404, + "learning_rate": 1.8769077646650631e-06, + "loss": 0.85015768, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.41015625, + "step": 8643, + "time_per_iteration": 2.413005828857422 + }, + { + "auxiliary_loss_clip": 0.01064755, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.02174258, + "balance_loss_mlp": 1.02140784, + "epoch": 0.519705396061927, + "flos": 25371731462400.0, + "grad_norm": 2.268436372002959, + "language_loss": 0.69916809, + "learning_rate": 1.8765306607963503e-06, + "loss": 0.72016722, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43359375, + "step": 8644, + "time_per_iteration": 2.4282376766204834 + }, + { + "auxiliary_loss_clip": 0.01062817, + "auxiliary_loss_mlp": 0.01023955, + "balance_loss_clip": 1.01177812, + "balance_loss_mlp": 1.02004123, + "epoch": 0.5197655193145949, + "flos": 28474951415040.0, + "grad_norm": 1.6185494522112474, + "language_loss": 0.80275333, + "learning_rate": 1.8761535613339401e-06, + "loss": 0.82362103, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.42773438, + "step": 8645, + "time_per_iteration": 2.4354748725891113 + }, + { + "auxiliary_loss_clip": 0.01060687, + "auxiliary_loss_mlp": 0.01027279, + "balance_loss_clip": 1.014691, + "balance_loss_mlp": 1.01877189, + "epoch": 0.5198256425672629, + "flos": 20008164533760.0, + "grad_norm": 1.6435800435135166, + "language_loss": 0.78260285, + "learning_rate": 1.8757764662912913e-06, + "loss": 0.80348253, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41992188, + "step": 8646, + "time_per_iteration": 2.4106597900390625 + }, + { + "auxiliary_loss_clip": 0.01064682, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.01510382, + "balance_loss_mlp": 1.0219022, + "epoch": 0.5198857658199308, + "flos": 19827838028160.0, + "grad_norm": 1.95622286535158, + "language_loss": 0.65953279, + "learning_rate": 1.875399375681861e-06, + "loss": 0.68045533, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42773438, + "step": 8647, + "time_per_iteration": 2.3786911964416504 + }, + { + "auxiliary_loss_clip": 0.01066558, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.01667476, + "balance_loss_mlp": 1.02145624, + "epoch": 0.5199458890725989, + "flos": 24460777601280.0, + "grad_norm": 1.6992387361766252, + "language_loss": 0.70972759, + "learning_rate": 1.875022289519106e-06, + "loss": 0.73070091, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45117188, + "step": 8648, + "time_per_iteration": 2.4276959896087646 + }, + { + "auxiliary_loss_clip": 0.01065644, + "auxiliary_loss_mlp": 0.01026561, + "balance_loss_clip": 1.01279795, + "balance_loss_mlp": 1.02129555, + "epoch": 0.5200060123252668, + "flos": 23403642410880.0, + "grad_norm": 1.842346391556131, + "language_loss": 0.64651954, + "learning_rate": 1.8746452078164843e-06, + "loss": 0.66744155, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44335938, + "step": 8649, + "time_per_iteration": 2.422222852706909 + }, + { + "auxiliary_loss_clip": 0.01065845, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.01360703, + "balance_loss_mlp": 1.02063847, + "epoch": 0.5200661355779348, + "flos": 17200414350720.0, + "grad_norm": 1.6457338837076716, + "language_loss": 0.70345056, + "learning_rate": 1.8742681305874523e-06, + "loss": 0.72439051, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.45117188, + "step": 8650, + "time_per_iteration": 2.3564651012420654 + }, + { + "auxiliary_loss_clip": 0.01060392, + "auxiliary_loss_mlp": 0.0102254, + "balance_loss_clip": 1.01020217, + "balance_loss_mlp": 1.01909351, + "epoch": 0.5201262588306027, + "flos": 18514091278080.0, + "grad_norm": 1.7440618052307222, + "language_loss": 0.77656054, + "learning_rate": 1.873891057845468e-06, + "loss": 0.79738986, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4140625, + "step": 8651, + "time_per_iteration": 2.3627829551696777 + }, + { + "auxiliary_loss_clip": 0.01064796, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.01453662, + "balance_loss_mlp": 1.02113008, + "epoch": 0.5201863820832707, + "flos": 18618551665920.0, + "grad_norm": 1.7800427917686212, + "language_loss": 0.78859115, + "learning_rate": 1.8735139896039874e-06, + "loss": 0.80951852, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43554688, + "step": 8652, + "time_per_iteration": 2.377971887588501 + }, + { + "auxiliary_loss_clip": 0.0106397, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.01458395, + "balance_loss_mlp": 1.02044845, + "epoch": 0.5202465053359386, + "flos": 22856029735680.0, + "grad_norm": 2.4785451833764873, + "language_loss": 0.69873577, + "learning_rate": 1.8731369258764664e-06, + "loss": 0.71966994, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.43554688, + "step": 8653, + "time_per_iteration": 2.395881414413452 + }, + { + "auxiliary_loss_clip": 0.01066694, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.0219357, + "balance_loss_mlp": 1.02208519, + "epoch": 0.5203066285886067, + "flos": 21980442948480.0, + "grad_norm": 1.6007577016721133, + "language_loss": 0.78486359, + "learning_rate": 1.8727598666763628e-06, + "loss": 0.80588436, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 8654, + "time_per_iteration": 2.3977952003479004 + }, + { + "auxiliary_loss_clip": 0.01067106, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.0188148, + "balance_loss_mlp": 1.02107334, + "epoch": 0.5203667518412746, + "flos": 20232201928320.0, + "grad_norm": 2.2207815693749504, + "language_loss": 0.84136212, + "learning_rate": 1.8723828120171316e-06, + "loss": 0.86235827, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 8655, + "time_per_iteration": 2.417703628540039 + }, + { + "auxiliary_loss_clip": 0.01061225, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.01642108, + "balance_loss_mlp": 1.02050579, + "epoch": 0.5204268750939426, + "flos": 15704560615680.0, + "grad_norm": 2.4195671520159165, + "language_loss": 0.6586979, + "learning_rate": 1.8720057619122302e-06, + "loss": 0.67959476, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40625, + "step": 8656, + "time_per_iteration": 2.367649793624878 + }, + { + "auxiliary_loss_clip": 0.01065162, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.01618755, + "balance_loss_mlp": 1.02155638, + "epoch": 0.5204869983466105, + "flos": 27448365530880.0, + "grad_norm": 2.495997828093742, + "language_loss": 0.73044431, + "learning_rate": 1.871628716375114e-06, + "loss": 0.75138503, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43554688, + "step": 8657, + "time_per_iteration": 2.4340476989746094 + }, + { + "auxiliary_loss_clip": 0.01063804, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.015203, + "balance_loss_mlp": 1.02112639, + "epoch": 0.5205471215992785, + "flos": 20594391039360.0, + "grad_norm": 1.9414138450230911, + "language_loss": 0.71124339, + "learning_rate": 1.8712516754192382e-06, + "loss": 0.7321682, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42578125, + "step": 8658, + "time_per_iteration": 2.393345355987549 + }, + { + "auxiliary_loss_clip": 0.01062809, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.01472449, + "balance_loss_mlp": 1.01991308, + "epoch": 0.5206072448519465, + "flos": 22126798834560.0, + "grad_norm": 2.503296209494289, + "language_loss": 0.82979023, + "learning_rate": 1.8708746390580592e-06, + "loss": 0.85068822, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4296875, + "step": 8659, + "time_per_iteration": 2.406036615371704 + }, + { + "auxiliary_loss_clip": 0.01067425, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01701629, + "balance_loss_mlp": 1.02160978, + "epoch": 0.5206673681046144, + "flos": 18329505586560.0, + "grad_norm": 1.9353703058295364, + "language_loss": 0.75297624, + "learning_rate": 1.8704976073050318e-06, + "loss": 0.77396488, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.45703125, + "step": 8660, + "time_per_iteration": 2.337801218032837 + }, + { + "auxiliary_loss_clip": 0.01060802, + "auxiliary_loss_mlp": 0.01027186, + "balance_loss_clip": 1.01523483, + "balance_loss_mlp": 1.01948535, + "epoch": 0.5207274913572825, + "flos": 20229199551360.0, + "grad_norm": 1.6946607849972808, + "language_loss": 0.77489352, + "learning_rate": 1.8701205801736121e-06, + "loss": 0.79577333, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.4140625, + "step": 8661, + "time_per_iteration": 2.3757808208465576 + }, + { + "auxiliary_loss_clip": 0.01065445, + "auxiliary_loss_mlp": 0.01026123, + "balance_loss_clip": 1.01317704, + "balance_loss_mlp": 1.02064991, + "epoch": 0.5207876146099504, + "flos": 22125960961920.0, + "grad_norm": 1.6964727279324012, + "language_loss": 0.70333862, + "learning_rate": 1.8697435576772551e-06, + "loss": 0.72425431, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.44921875, + "step": 8662, + "time_per_iteration": 2.4099271297454834 + }, + { + "auxiliary_loss_clip": 0.01062595, + "auxiliary_loss_mlp": 0.01025048, + "balance_loss_clip": 1.01217294, + "balance_loss_mlp": 1.02060711, + "epoch": 0.5208477378626184, + "flos": 23877762940800.0, + "grad_norm": 1.7067638992129643, + "language_loss": 0.69257319, + "learning_rate": 1.8693665398294148e-06, + "loss": 0.7134496, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41992188, + "step": 8663, + "time_per_iteration": 2.4266889095306396 + }, + { + "auxiliary_loss_clip": 0.01062556, + "auxiliary_loss_mlp": 0.01025903, + "balance_loss_clip": 1.01274252, + "balance_loss_mlp": 1.01978111, + "epoch": 0.5209078611152863, + "flos": 20960420400000.0, + "grad_norm": 1.6150252889777634, + "language_loss": 0.76657224, + "learning_rate": 1.868989526643547e-06, + "loss": 0.78745687, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42773438, + "step": 8664, + "time_per_iteration": 3.910113573074341 + }, + { + "auxiliary_loss_clip": 0.01063627, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.01796961, + "balance_loss_mlp": 1.02128196, + "epoch": 0.5209679843679543, + "flos": 20666696198400.0, + "grad_norm": 2.407890249925382, + "language_loss": 0.76361215, + "learning_rate": 1.8686125181331056e-06, + "loss": 0.78455341, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42382812, + "step": 8665, + "time_per_iteration": 2.37300443649292 + }, + { + "auxiliary_loss_clip": 0.01064679, + "auxiliary_loss_mlp": 0.0102677, + "balance_loss_clip": 1.01447964, + "balance_loss_mlp": 1.02145767, + "epoch": 0.5210281076206222, + "flos": 20226336819840.0, + "grad_norm": 1.8736872275041265, + "language_loss": 0.72122383, + "learning_rate": 1.8682355143115464e-06, + "loss": 0.74213833, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.43164062, + "step": 8666, + "time_per_iteration": 2.3800251483917236 + }, + { + "auxiliary_loss_clip": 0.01067942, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.01425838, + "balance_loss_mlp": 1.02051342, + "epoch": 0.5210882308732903, + "flos": 16069088787840.0, + "grad_norm": 2.529041748769032, + "language_loss": 0.6828438, + "learning_rate": 1.867858515192322e-06, + "loss": 0.70381719, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.47460938, + "step": 8667, + "time_per_iteration": 2.3566620349884033 + }, + { + "auxiliary_loss_clip": 0.01060792, + "auxiliary_loss_mlp": 0.0102547, + "balance_loss_clip": 1.01285148, + "balance_loss_mlp": 1.0194869, + "epoch": 0.5211483541259582, + "flos": 24824188609920.0, + "grad_norm": 1.4246450706194869, + "language_loss": 0.80847436, + "learning_rate": 1.8674815207888875e-06, + "loss": 0.82933694, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41210938, + "step": 8668, + "time_per_iteration": 2.416766881942749 + }, + { + "auxiliary_loss_clip": 0.01062964, + "auxiliary_loss_mlp": 0.01022924, + "balance_loss_clip": 1.01088965, + "balance_loss_mlp": 1.02094698, + "epoch": 0.5212084773786262, + "flos": 20369760151680.0, + "grad_norm": 2.46985818693014, + "language_loss": 0.63430142, + "learning_rate": 1.8671045311146966e-06, + "loss": 0.65516031, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41992188, + "step": 8669, + "time_per_iteration": 2.3835222721099854 + }, + { + "auxiliary_loss_clip": 0.01063797, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.01909101, + "balance_loss_mlp": 1.02216649, + "epoch": 0.5212686006312941, + "flos": 23144447410560.0, + "grad_norm": 1.6011358114393994, + "language_loss": 0.66279918, + "learning_rate": 1.866727546183203e-06, + "loss": 0.68375576, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41601562, + "step": 8670, + "time_per_iteration": 2.4091551303863525 + }, + { + "auxiliary_loss_clip": 0.01059841, + "auxiliary_loss_mlp": 0.01024516, + "balance_loss_clip": 1.01204109, + "balance_loss_mlp": 1.02030945, + "epoch": 0.5213287238839621, + "flos": 27773023063680.0, + "grad_norm": 2.7556210681304596, + "language_loss": 0.76045072, + "learning_rate": 1.8663505660078608e-06, + "loss": 0.78129435, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 8671, + "time_per_iteration": 2.395078659057617 + }, + { + "auxiliary_loss_clip": 0.01068346, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.01806867, + "balance_loss_mlp": 1.02443016, + "epoch": 0.5213888471366301, + "flos": 19936662336000.0, + "grad_norm": 2.270832442298721, + "language_loss": 0.78638721, + "learning_rate": 1.8659735906021226e-06, + "loss": 0.80738699, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43945312, + "step": 8672, + "time_per_iteration": 2.3769643306732178 + }, + { + "auxiliary_loss_clip": 0.01060074, + "auxiliary_loss_mlp": 0.01021361, + "balance_loss_clip": 1.0101552, + "balance_loss_mlp": 1.01967001, + "epoch": 0.521448970389298, + "flos": 16981788216960.0, + "grad_norm": 2.0230914586362796, + "language_loss": 0.71988165, + "learning_rate": 1.8655966199794427e-06, + "loss": 0.74069595, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.40429688, + "step": 8673, + "time_per_iteration": 2.342364549636841 + }, + { + "auxiliary_loss_clip": 0.01063202, + "auxiliary_loss_mlp": 0.01027033, + "balance_loss_clip": 1.01392615, + "balance_loss_mlp": 1.02049077, + "epoch": 0.5215090936419661, + "flos": 18988700567040.0, + "grad_norm": 1.6471209359371901, + "language_loss": 0.74626207, + "learning_rate": 1.8652196541532735e-06, + "loss": 0.76716447, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 8674, + "time_per_iteration": 2.361753463745117 + }, + { + "auxiliary_loss_clip": 0.01062349, + "auxiliary_loss_mlp": 0.01026138, + "balance_loss_clip": 1.01349628, + "balance_loss_mlp": 1.01933193, + "epoch": 0.521569216894634, + "flos": 16142511110400.0, + "grad_norm": 2.0693327100893346, + "language_loss": 0.85763144, + "learning_rate": 1.8648426931370678e-06, + "loss": 0.87851632, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43164062, + "step": 8675, + "time_per_iteration": 3.780143976211548 + }, + { + "auxiliary_loss_clip": 0.01010663, + "auxiliary_loss_mlp": 0.01003932, + "balance_loss_clip": 1.00280511, + "balance_loss_mlp": 1.00229061, + "epoch": 0.521629340147302, + "flos": 57576733113600.0, + "grad_norm": 0.8822145341954948, + "language_loss": 0.63112187, + "learning_rate": 1.8644657369442794e-06, + "loss": 0.65126789, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.08398438, + "step": 8676, + "time_per_iteration": 3.005274772644043 + }, + { + "auxiliary_loss_clip": 0.0106112, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.01397955, + "balance_loss_mlp": 1.02066028, + "epoch": 0.5216894633999699, + "flos": 26795698974720.0, + "grad_norm": 4.00148411390434, + "language_loss": 0.68620849, + "learning_rate": 1.8640887855883594e-06, + "loss": 0.70708048, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40429688, + "step": 8677, + "time_per_iteration": 2.4221465587615967 + }, + { + "auxiliary_loss_clip": 0.01062026, + "auxiliary_loss_mlp": 0.01024841, + "balance_loss_clip": 1.01281834, + "balance_loss_mlp": 1.01984644, + "epoch": 0.5217495866526379, + "flos": 26357539011840.0, + "grad_norm": 1.5761653552168706, + "language_loss": 0.65346521, + "learning_rate": 1.8637118390827618e-06, + "loss": 0.67433387, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.421875, + "step": 8678, + "time_per_iteration": 2.443568229675293 + }, + { + "auxiliary_loss_clip": 0.01064185, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01387429, + "balance_loss_mlp": 1.0209403, + "epoch": 0.5218097099053058, + "flos": 23582956487040.0, + "grad_norm": 2.0619665597909784, + "language_loss": 0.68434954, + "learning_rate": 1.8633348974409377e-06, + "loss": 0.70526224, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43359375, + "step": 8679, + "time_per_iteration": 5.1812238693237305 + }, + { + "auxiliary_loss_clip": 0.01063533, + "auxiliary_loss_mlp": 0.01027161, + "balance_loss_clip": 1.01503766, + "balance_loss_mlp": 1.02114427, + "epoch": 0.5218698331579739, + "flos": 18076420074240.0, + "grad_norm": 1.5247038018549328, + "language_loss": 0.66503739, + "learning_rate": 1.8629579606763395e-06, + "loss": 0.68594432, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.42382812, + "step": 8680, + "time_per_iteration": 2.3893930912017822 + }, + { + "auxiliary_loss_clip": 0.01063432, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.01805568, + "balance_loss_mlp": 1.02101767, + "epoch": 0.5219299564106418, + "flos": 19280120618880.0, + "grad_norm": 2.4449053634641875, + "language_loss": 0.83253169, + "learning_rate": 1.86258102880242e-06, + "loss": 0.85348243, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42382812, + "step": 8681, + "time_per_iteration": 2.3695831298828125 + }, + { + "auxiliary_loss_clip": 0.0106167, + "auxiliary_loss_mlp": 0.01022796, + "balance_loss_clip": 1.01053572, + "balance_loss_mlp": 1.01967847, + "epoch": 0.5219900796633098, + "flos": 26650146049920.0, + "grad_norm": 1.8609817031856872, + "language_loss": 0.77627969, + "learning_rate": 1.862204101832629e-06, + "loss": 0.79712439, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41992188, + "step": 8682, + "time_per_iteration": 2.4231386184692383 + }, + { + "auxiliary_loss_clip": 0.01062643, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.01843619, + "balance_loss_mlp": 1.0209322, + "epoch": 0.5220502029159777, + "flos": 34311312241920.0, + "grad_norm": 1.5641493509116313, + "language_loss": 0.68937826, + "learning_rate": 1.8618271797804197e-06, + "loss": 0.71030653, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.41796875, + "step": 8683, + "time_per_iteration": 2.500208854675293 + }, + { + "auxiliary_loss_clip": 0.010638, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.01625419, + "balance_loss_mlp": 1.02076471, + "epoch": 0.5221103261686457, + "flos": 22155602572800.0, + "grad_norm": 1.4809232308086435, + "language_loss": 0.75735605, + "learning_rate": 1.861450262659243e-06, + "loss": 0.77828747, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 8684, + "time_per_iteration": 2.412447452545166 + }, + { + "auxiliary_loss_clip": 0.01059362, + "auxiliary_loss_mlp": 0.01022646, + "balance_loss_clip": 1.01067126, + "balance_loss_mlp": 1.01840234, + "epoch": 0.5221704494213137, + "flos": 19207396523520.0, + "grad_norm": 1.634542404206708, + "language_loss": 0.76214719, + "learning_rate": 1.8610733504825495e-06, + "loss": 0.78296721, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41015625, + "step": 8685, + "time_per_iteration": 2.3980259895324707 + }, + { + "auxiliary_loss_clip": 0.0106559, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.01303291, + "balance_loss_mlp": 1.02245951, + "epoch": 0.5222305726739817, + "flos": 19353054182400.0, + "grad_norm": 1.6808565397637838, + "language_loss": 0.80795974, + "learning_rate": 1.8606964432637912e-06, + "loss": 0.82886982, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4296875, + "step": 8686, + "time_per_iteration": 2.3758232593536377 + }, + { + "auxiliary_loss_clip": 0.01058445, + "auxiliary_loss_mlp": 0.01023228, + "balance_loss_clip": 1.01217139, + "balance_loss_mlp": 1.0193665, + "epoch": 0.5222906959266497, + "flos": 27813661752960.0, + "grad_norm": 1.7257770924516977, + "language_loss": 0.69958127, + "learning_rate": 1.8603195410164183e-06, + "loss": 0.72039795, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.390625, + "step": 8687, + "time_per_iteration": 2.439145565032959 + }, + { + "auxiliary_loss_clip": 0.01058705, + "auxiliary_loss_mlp": 0.01026891, + "balance_loss_clip": 1.01588786, + "balance_loss_mlp": 1.01896763, + "epoch": 0.5223508191793176, + "flos": 12712189829760.0, + "grad_norm": 2.2773727117004583, + "language_loss": 0.83495891, + "learning_rate": 1.859942643753882e-06, + "loss": 0.85581493, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.39648438, + "step": 8688, + "time_per_iteration": 2.346292495727539 + }, + { + "auxiliary_loss_clip": 0.01062396, + "auxiliary_loss_mlp": 0.01024751, + "balance_loss_clip": 1.01260936, + "balance_loss_mlp": 1.02121818, + "epoch": 0.5224109424319856, + "flos": 15631347761280.0, + "grad_norm": 2.3480398139402894, + "language_loss": 0.73097134, + "learning_rate": 1.859565751489632e-06, + "loss": 0.75184286, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 8689, + "time_per_iteration": 2.3883590698242188 + }, + { + "auxiliary_loss_clip": 0.01059497, + "auxiliary_loss_mlp": 0.01024421, + "balance_loss_clip": 1.0121839, + "balance_loss_mlp": 1.01970935, + "epoch": 0.5224710656846535, + "flos": 15741324144000.0, + "grad_norm": 2.005806960107113, + "language_loss": 0.82860672, + "learning_rate": 1.8591888642371194e-06, + "loss": 0.84944588, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3984375, + "step": 8690, + "time_per_iteration": 2.368793487548828 + }, + { + "auxiliary_loss_clip": 0.01062911, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01346815, + "balance_loss_mlp": 1.01981485, + "epoch": 0.5225311889373215, + "flos": 26029809279360.0, + "grad_norm": 2.1737157378091396, + "language_loss": 0.78870076, + "learning_rate": 1.858811982009794e-06, + "loss": 0.80959415, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43164062, + "step": 8691, + "time_per_iteration": 2.427111864089966 + }, + { + "auxiliary_loss_clip": 0.01064066, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.01567769, + "balance_loss_mlp": 1.02033389, + "epoch": 0.5225913121899894, + "flos": 18368293973760.0, + "grad_norm": 4.079148029740356, + "language_loss": 0.7666707, + "learning_rate": 1.8584351048211056e-06, + "loss": 0.78760874, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4375, + "step": 8692, + "time_per_iteration": 2.3451695442199707 + }, + { + "auxiliary_loss_clip": 0.01059106, + "auxiliary_loss_mlp": 0.01020785, + "balance_loss_clip": 1.00988913, + "balance_loss_mlp": 1.01933801, + "epoch": 0.5226514354426575, + "flos": 29272367934720.0, + "grad_norm": 1.7312634406952134, + "language_loss": 0.72780937, + "learning_rate": 1.8580582326845044e-06, + "loss": 0.74860823, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.3984375, + "step": 8693, + "time_per_iteration": 2.454498291015625 + }, + { + "auxiliary_loss_clip": 0.01065641, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.0165025, + "balance_loss_mlp": 1.02134061, + "epoch": 0.5227115586953254, + "flos": 22852294220160.0, + "grad_norm": 2.3786738885860608, + "language_loss": 0.81991065, + "learning_rate": 1.8576813656134393e-06, + "loss": 0.84086585, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 8694, + "time_per_iteration": 2.3878014087677 + }, + { + "auxiliary_loss_clip": 0.01010801, + "auxiliary_loss_mlp": 0.01000654, + "balance_loss_clip": 0.99964035, + "balance_loss_mlp": 1.0023675, + "epoch": 0.5227716819479934, + "flos": 57471539587200.0, + "grad_norm": 0.7742806383699335, + "language_loss": 0.55644798, + "learning_rate": 1.8573045036213608e-06, + "loss": 0.57656252, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.08447266, + "step": 8695, + "time_per_iteration": 3.06724214553833 + }, + { + "auxiliary_loss_clip": 0.01061896, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.01535904, + "balance_loss_mlp": 1.0201931, + "epoch": 0.5228318052006613, + "flos": 13807415180160.0, + "grad_norm": 1.7099714170416143, + "language_loss": 0.72956407, + "learning_rate": 1.8569276467217175e-06, + "loss": 0.75046295, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41601562, + "step": 8696, + "time_per_iteration": 2.3548147678375244 + }, + { + "auxiliary_loss_clip": 0.0106277, + "auxiliary_loss_mlp": 0.01025278, + "balance_loss_clip": 1.013304, + "balance_loss_mlp": 1.02110684, + "epoch": 0.5228919284533293, + "flos": 15595282460160.0, + "grad_norm": 1.593447121808996, + "language_loss": 0.71276855, + "learning_rate": 1.8565507949279584e-06, + "loss": 0.73364902, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41796875, + "step": 8697, + "time_per_iteration": 2.367253065109253 + }, + { + "auxiliary_loss_clip": 0.01060764, + "auxiliary_loss_mlp": 0.01022678, + "balance_loss_clip": 1.01067936, + "balance_loss_mlp": 1.02077937, + "epoch": 0.5229520517059973, + "flos": 22490419311360.0, + "grad_norm": 1.674226897853648, + "language_loss": 0.8037079, + "learning_rate": 1.8561739482535323e-06, + "loss": 0.82454234, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 8698, + "time_per_iteration": 2.383469820022583 + }, + { + "auxiliary_loss_clip": 0.01063135, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.01480913, + "balance_loss_mlp": 1.02144337, + "epoch": 0.5230121749586653, + "flos": 22089790926720.0, + "grad_norm": 2.330845969545342, + "language_loss": 0.74390793, + "learning_rate": 1.8557971067118877e-06, + "loss": 0.76480567, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41796875, + "step": 8699, + "time_per_iteration": 2.3941915035247803 + }, + { + "auxiliary_loss_clip": 0.01066725, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01466668, + "balance_loss_mlp": 1.02155316, + "epoch": 0.5230722982113333, + "flos": 22126065696000.0, + "grad_norm": 1.9980525488751848, + "language_loss": 0.73442125, + "learning_rate": 1.8554202703164739e-06, + "loss": 0.75536525, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.453125, + "step": 8700, + "time_per_iteration": 2.376199245452881 + }, + { + "auxiliary_loss_clip": 0.01067246, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.01639879, + "balance_loss_mlp": 1.02264047, + "epoch": 0.5231324214640012, + "flos": 25008110985600.0, + "grad_norm": 2.4369024510589643, + "language_loss": 0.67035496, + "learning_rate": 1.8550434390807387e-06, + "loss": 0.69132525, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4453125, + "step": 8701, + "time_per_iteration": 2.421910047531128 + }, + { + "auxiliary_loss_clip": 0.01061486, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.01523495, + "balance_loss_mlp": 1.020123, + "epoch": 0.5231925447166692, + "flos": 25739296922880.0, + "grad_norm": 1.7274064076223172, + "language_loss": 0.66563141, + "learning_rate": 1.8546666130181298e-06, + "loss": 0.68652868, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4140625, + "step": 8702, + "time_per_iteration": 2.4149303436279297 + }, + { + "auxiliary_loss_clip": 0.01062554, + "auxiliary_loss_mlp": 0.0102524, + "balance_loss_clip": 1.01246083, + "balance_loss_mlp": 1.0210427, + "epoch": 0.5232526679693371, + "flos": 21432865184640.0, + "grad_norm": 2.0952696047672466, + "language_loss": 0.76462728, + "learning_rate": 1.8542897921420961e-06, + "loss": 0.78550524, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41601562, + "step": 8703, + "time_per_iteration": 2.371110200881958 + }, + { + "auxiliary_loss_clip": 0.01067803, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.01874518, + "balance_loss_mlp": 1.0218823, + "epoch": 0.5233127912220051, + "flos": 35296945234560.0, + "grad_norm": 2.3859509670508987, + "language_loss": 0.77557242, + "learning_rate": 1.8539129764660845e-06, + "loss": 0.79659212, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.45898438, + "step": 8704, + "time_per_iteration": 3.9181127548217773 + }, + { + "auxiliary_loss_clip": 0.01060858, + "auxiliary_loss_mlp": 0.01028486, + "balance_loss_clip": 1.01612949, + "balance_loss_mlp": 1.02063203, + "epoch": 0.523372914474673, + "flos": 17050497505920.0, + "grad_norm": 2.963418091422421, + "language_loss": 0.78289747, + "learning_rate": 1.8535361660035436e-06, + "loss": 0.80379093, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40234375, + "step": 8705, + "time_per_iteration": 2.3686020374298096 + }, + { + "auxiliary_loss_clip": 0.01063298, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.01392055, + "balance_loss_mlp": 1.02060342, + "epoch": 0.5234330377273411, + "flos": 18405301881600.0, + "grad_norm": 2.254233071050047, + "language_loss": 0.69553161, + "learning_rate": 1.8531593607679195e-06, + "loss": 0.71642911, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42773438, + "step": 8706, + "time_per_iteration": 2.3617613315582275 + }, + { + "auxiliary_loss_clip": 0.01010129, + "auxiliary_loss_mlp": 0.01000779, + "balance_loss_clip": 0.99984962, + "balance_loss_mlp": 1.00156736, + "epoch": 0.523493160980009, + "flos": 65846608727040.0, + "grad_norm": 0.6769688573377586, + "language_loss": 0.52426207, + "learning_rate": 1.8527825607726606e-06, + "loss": 0.54437113, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.0859375, + "step": 8707, + "time_per_iteration": 3.0919058322906494 + }, + { + "auxiliary_loss_clip": 0.01063282, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.01757622, + "balance_loss_mlp": 1.02018118, + "epoch": 0.523553284232677, + "flos": 21870990236160.0, + "grad_norm": 1.7969903278277632, + "language_loss": 0.75034571, + "learning_rate": 1.8524057660312134e-06, + "loss": 0.77127957, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4296875, + "step": 8708, + "time_per_iteration": 2.3926284313201904 + }, + { + "auxiliary_loss_clip": 0.01061766, + "auxiliary_loss_mlp": 0.01024232, + "balance_loss_clip": 1.01179838, + "balance_loss_mlp": 1.02082729, + "epoch": 0.5236134074853449, + "flos": 20847197260800.0, + "grad_norm": 2.196316891067605, + "language_loss": 0.77565312, + "learning_rate": 1.8520289765570242e-06, + "loss": 0.79651308, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 8709, + "time_per_iteration": 2.458695650100708 + }, + { + "auxiliary_loss_clip": 0.01065098, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.0120945, + "balance_loss_mlp": 1.01954865, + "epoch": 0.5236735307380129, + "flos": 25519239423360.0, + "grad_norm": 2.156441051545963, + "language_loss": 0.83786446, + "learning_rate": 1.8516521923635408e-06, + "loss": 0.85878676, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.45703125, + "step": 8710, + "time_per_iteration": 2.4056522846221924 + }, + { + "auxiliary_loss_clip": 0.01061407, + "auxiliary_loss_mlp": 0.01024043, + "balance_loss_clip": 1.01157355, + "balance_loss_mlp": 1.02039647, + "epoch": 0.523733653990681, + "flos": 23582083703040.0, + "grad_norm": 1.7263958262971666, + "language_loss": 0.69631559, + "learning_rate": 1.8512754134642092e-06, + "loss": 0.71717006, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41015625, + "step": 8711, + "time_per_iteration": 2.424436569213867 + }, + { + "auxiliary_loss_clip": 0.0106365, + "auxiliary_loss_mlp": 0.01029279, + "balance_loss_clip": 1.01669598, + "balance_loss_mlp": 1.02054906, + "epoch": 0.5237937772433489, + "flos": 21105170363520.0, + "grad_norm": 1.6935345485225535, + "language_loss": 0.83177662, + "learning_rate": 1.8508986398724752e-06, + "loss": 0.85270596, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43164062, + "step": 8712, + "time_per_iteration": 2.3852548599243164 + }, + { + "auxiliary_loss_clip": 0.01064682, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01672649, + "balance_loss_mlp": 1.02035308, + "epoch": 0.5238539004960169, + "flos": 19171854892800.0, + "grad_norm": 2.1638487069571037, + "language_loss": 0.69318438, + "learning_rate": 1.8505218716017857e-06, + "loss": 0.71413386, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44335938, + "step": 8713, + "time_per_iteration": 2.422886371612549 + }, + { + "auxiliary_loss_clip": 0.01065248, + "auxiliary_loss_mlp": 0.01028507, + "balance_loss_clip": 1.01318836, + "balance_loss_mlp": 1.01926124, + "epoch": 0.5239140237486848, + "flos": 17887435551360.0, + "grad_norm": 3.5512307965763816, + "language_loss": 0.76475084, + "learning_rate": 1.8501451086655852e-06, + "loss": 0.7856884, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.45898438, + "step": 8714, + "time_per_iteration": 2.3482069969177246 + }, + { + "auxiliary_loss_clip": 0.01065044, + "auxiliary_loss_mlp": 0.01024416, + "balance_loss_clip": 1.01078463, + "balance_loss_mlp": 1.02141869, + "epoch": 0.5239741470013528, + "flos": 17929470695040.0, + "grad_norm": 2.219020050734622, + "language_loss": 0.76363212, + "learning_rate": 1.8497683510773207e-06, + "loss": 0.78452671, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4375, + "step": 8715, + "time_per_iteration": 3.7989871501922607 + }, + { + "auxiliary_loss_clip": 0.01060624, + "auxiliary_loss_mlp": 0.01024647, + "balance_loss_clip": 1.01221335, + "balance_loss_mlp": 1.01983738, + "epoch": 0.5240342702540207, + "flos": 30992049596160.0, + "grad_norm": 1.6198679223949821, + "language_loss": 0.71041858, + "learning_rate": 1.8493915988504372e-06, + "loss": 0.73127133, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40820312, + "step": 8716, + "time_per_iteration": 2.45976185798645 + }, + { + "auxiliary_loss_clip": 0.01060204, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.01701808, + "balance_loss_mlp": 1.01931179, + "epoch": 0.5240943935066887, + "flos": 25004026356480.0, + "grad_norm": 1.8684369041150533, + "language_loss": 0.86769074, + "learning_rate": 1.8490148519983804e-06, + "loss": 0.8885771, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.41015625, + "step": 8717, + "time_per_iteration": 2.3889567852020264 + }, + { + "auxiliary_loss_clip": 0.01062337, + "auxiliary_loss_mlp": 0.01025281, + "balance_loss_clip": 1.01315713, + "balance_loss_mlp": 1.02099383, + "epoch": 0.5241545167593566, + "flos": 23657984732160.0, + "grad_norm": 3.918028109751091, + "language_loss": 0.60901475, + "learning_rate": 1.8486381105345953e-06, + "loss": 0.62989086, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4140625, + "step": 8718, + "time_per_iteration": 3.834669351577759 + }, + { + "auxiliary_loss_clip": 0.01063677, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.02178001, + "balance_loss_mlp": 1.02000451, + "epoch": 0.5242146400120247, + "flos": 23399383224960.0, + "grad_norm": 1.9317234240236707, + "language_loss": 0.71779788, + "learning_rate": 1.848261374472526e-06, + "loss": 0.73879308, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4375, + "step": 8719, + "time_per_iteration": 3.8007664680480957 + }, + { + "auxiliary_loss_clip": 0.01057918, + "auxiliary_loss_mlp": 0.01021116, + "balance_loss_clip": 1.00953496, + "balance_loss_mlp": 1.01875758, + "epoch": 0.5242747632646926, + "flos": 17748096848640.0, + "grad_norm": 1.7558093352208288, + "language_loss": 0.73290288, + "learning_rate": 1.8478846438256183e-06, + "loss": 0.75369322, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.390625, + "step": 8720, + "time_per_iteration": 2.370110273361206 + }, + { + "auxiliary_loss_clip": 0.01062701, + "auxiliary_loss_mlp": 0.01026637, + "balance_loss_clip": 1.0136553, + "balance_loss_mlp": 1.02081382, + "epoch": 0.5243348865173606, + "flos": 32596378525440.0, + "grad_norm": 1.5882477054911817, + "language_loss": 0.70582479, + "learning_rate": 1.847507918607316e-06, + "loss": 0.72671819, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41992188, + "step": 8721, + "time_per_iteration": 2.5017364025115967 + }, + { + "auxiliary_loss_clip": 0.01061812, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.01429117, + "balance_loss_mlp": 1.01975715, + "epoch": 0.5243950097700285, + "flos": 25482929742720.0, + "grad_norm": 1.7239445203086505, + "language_loss": 0.86304712, + "learning_rate": 1.8471311988310646e-06, + "loss": 0.88394052, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 8722, + "time_per_iteration": 2.409658908843994 + }, + { + "auxiliary_loss_clip": 0.01061867, + "auxiliary_loss_mlp": 0.01026473, + "balance_loss_clip": 1.01425397, + "balance_loss_mlp": 1.02136898, + "epoch": 0.5244551330226965, + "flos": 15267482904960.0, + "grad_norm": 2.3525569207404464, + "language_loss": 0.79768723, + "learning_rate": 1.8467544845103074e-06, + "loss": 0.81857061, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40429688, + "step": 8723, + "time_per_iteration": 2.371000051498413 + }, + { + "auxiliary_loss_clip": 0.01064289, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02132845, + "balance_loss_mlp": 1.02047133, + "epoch": 0.5245152562753645, + "flos": 22236007167360.0, + "grad_norm": 1.7730055274936236, + "language_loss": 0.74618816, + "learning_rate": 1.8463777756584878e-06, + "loss": 0.76718289, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4375, + "step": 8724, + "time_per_iteration": 2.3973562717437744 + }, + { + "auxiliary_loss_clip": 0.01065936, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01443326, + "balance_loss_mlp": 1.02081954, + "epoch": 0.5245753795280325, + "flos": 29425112599680.0, + "grad_norm": 1.5444917013563333, + "language_loss": 0.70186293, + "learning_rate": 1.8460010722890507e-06, + "loss": 0.72280973, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.44921875, + "step": 8725, + "time_per_iteration": 2.4634759426116943 + }, + { + "auxiliary_loss_clip": 0.01011289, + "auxiliary_loss_mlp": 0.0100698, + "balance_loss_clip": 1.00594842, + "balance_loss_mlp": 1.00294733, + "epoch": 0.5246355027807005, + "flos": 58831196641920.0, + "grad_norm": 0.7587219206316503, + "language_loss": 0.55680317, + "learning_rate": 1.8456243744154392e-06, + "loss": 0.57698584, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.08349609, + "step": 8726, + "time_per_iteration": 3.022136688232422 + }, + { + "auxiliary_loss_clip": 0.01010479, + "auxiliary_loss_mlp": 0.01001851, + "balance_loss_clip": 1.00080788, + "balance_loss_mlp": 1.00205064, + "epoch": 0.5246956260333684, + "flos": 64523226441600.0, + "grad_norm": 0.7961194585024417, + "language_loss": 0.58370513, + "learning_rate": 1.8452476820510967e-06, + "loss": 0.60382843, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.08398438, + "step": 8727, + "time_per_iteration": 3.035306692123413 + }, + { + "auxiliary_loss_clip": 0.01064676, + "auxiliary_loss_mlp": 0.01025589, + "balance_loss_clip": 1.01225519, + "balance_loss_mlp": 1.02095485, + "epoch": 0.5247557492860364, + "flos": 24532524178560.0, + "grad_norm": 1.3184703270642604, + "language_loss": 0.70700455, + "learning_rate": 1.844870995209466e-06, + "loss": 0.72790718, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4375, + "step": 8728, + "time_per_iteration": 2.429145574569702 + }, + { + "auxiliary_loss_clip": 0.0106112, + "auxiliary_loss_mlp": 0.01022093, + "balance_loss_clip": 1.01005912, + "balance_loss_mlp": 1.02026212, + "epoch": 0.5248158725387043, + "flos": 18805162216320.0, + "grad_norm": 1.46948587110685, + "language_loss": 0.69820046, + "learning_rate": 1.8444943139039907e-06, + "loss": 0.71903265, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40820312, + "step": 8729, + "time_per_iteration": 2.387162685394287 + }, + { + "auxiliary_loss_clip": 0.01065939, + "auxiliary_loss_mlp": 0.01024009, + "balance_loss_clip": 1.011415, + "balance_loss_mlp": 1.02186203, + "epoch": 0.5248759957913723, + "flos": 20954904405120.0, + "grad_norm": 1.5835485845545847, + "language_loss": 0.8349728, + "learning_rate": 1.8441176381481135e-06, + "loss": 0.85587227, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.44140625, + "step": 8730, + "time_per_iteration": 2.4007067680358887 + }, + { + "auxiliary_loss_clip": 0.01063449, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_clip": 1.01165271, + "balance_loss_mlp": 1.02138782, + "epoch": 0.5249361190440403, + "flos": 18659993316480.0, + "grad_norm": 2.159882484728315, + "language_loss": 0.79240102, + "learning_rate": 1.8437409679552762e-06, + "loss": 0.81327635, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.421875, + "step": 8731, + "time_per_iteration": 2.350849151611328 + }, + { + "auxiliary_loss_clip": 0.01060274, + "auxiliary_loss_mlp": 0.01024946, + "balance_loss_clip": 1.01328707, + "balance_loss_mlp": 1.02032876, + "epoch": 0.5249962422967083, + "flos": 24862174035840.0, + "grad_norm": 1.6316202087593208, + "language_loss": 0.82868654, + "learning_rate": 1.8433643033389227e-06, + "loss": 0.84953868, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3984375, + "step": 8732, + "time_per_iteration": 2.42507266998291 + }, + { + "auxiliary_loss_clip": 0.01063465, + "auxiliary_loss_mlp": 0.01026947, + "balance_loss_clip": 1.0156405, + "balance_loss_mlp": 1.02115536, + "epoch": 0.5250563655493762, + "flos": 15261931998720.0, + "grad_norm": 1.522920732154093, + "language_loss": 0.77562279, + "learning_rate": 1.8429876443124934e-06, + "loss": 0.79652691, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.421875, + "step": 8733, + "time_per_iteration": 2.3422863483428955 + }, + { + "auxiliary_loss_clip": 0.01065028, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.01958585, + "balance_loss_mlp": 1.0213716, + "epoch": 0.5251164888020442, + "flos": 18624172394880.0, + "grad_norm": 1.9538327357214325, + "language_loss": 0.81405628, + "learning_rate": 1.8426109908894316e-06, + "loss": 0.83503485, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4375, + "step": 8734, + "time_per_iteration": 2.3684659004211426 + }, + { + "auxiliary_loss_clip": 0.01061771, + "auxiliary_loss_mlp": 0.01027698, + "balance_loss_clip": 1.01542521, + "balance_loss_mlp": 1.02081156, + "epoch": 0.5251766120547121, + "flos": 29709620202240.0, + "grad_norm": 1.623779268361182, + "language_loss": 0.7685858, + "learning_rate": 1.8422343430831791e-06, + "loss": 0.78948045, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41015625, + "step": 8735, + "time_per_iteration": 2.4542341232299805 + }, + { + "auxiliary_loss_clip": 0.01063989, + "auxiliary_loss_mlp": 0.01024481, + "balance_loss_clip": 1.01312077, + "balance_loss_mlp": 1.02121007, + "epoch": 0.5252367353073801, + "flos": 23439184041600.0, + "grad_norm": 1.645528612870944, + "language_loss": 0.80689955, + "learning_rate": 1.8418577009071763e-06, + "loss": 0.8277843, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.42773438, + "step": 8736, + "time_per_iteration": 2.4154062271118164 + }, + { + "auxiliary_loss_clip": 0.01063051, + "auxiliary_loss_mlp": 0.01026787, + "balance_loss_clip": 1.01453209, + "balance_loss_mlp": 1.02145207, + "epoch": 0.5252968585600482, + "flos": 30809384029440.0, + "grad_norm": 2.0230408781095335, + "language_loss": 0.70136535, + "learning_rate": 1.8414810643748656e-06, + "loss": 0.72226375, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41601562, + "step": 8737, + "time_per_iteration": 2.463508129119873 + }, + { + "auxiliary_loss_clip": 0.01063137, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.01485312, + "balance_loss_mlp": 1.02049136, + "epoch": 0.5253569818127161, + "flos": 20627314318080.0, + "grad_norm": 1.9680441470693886, + "language_loss": 0.77201653, + "learning_rate": 1.841104433499688e-06, + "loss": 0.79292107, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42578125, + "step": 8738, + "time_per_iteration": 2.4415194988250732 + }, + { + "auxiliary_loss_clip": 0.0100922, + "auxiliary_loss_mlp": 0.01007098, + "balance_loss_clip": 1.00629354, + "balance_loss_mlp": 1.0014962, + "epoch": 0.5254171050653841, + "flos": 63425452561920.0, + "grad_norm": 0.7482008623706926, + "language_loss": 0.54526943, + "learning_rate": 1.8407278082950846e-06, + "loss": 0.56543261, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.00805664, + "router_z_loss_mlp": 0.07714844, + "step": 8739, + "time_per_iteration": 3.001358985900879 + }, + { + "auxiliary_loss_clip": 0.01061188, + "auxiliary_loss_mlp": 0.01027229, + "balance_loss_clip": 1.01576746, + "balance_loss_mlp": 1.0204947, + "epoch": 0.525477228318052, + "flos": 34669556369280.0, + "grad_norm": 1.9319011240234714, + "language_loss": 0.84384435, + "learning_rate": 1.840351188774496e-06, + "loss": 0.86472857, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40625, + "step": 8740, + "time_per_iteration": 2.5147783756256104 + }, + { + "auxiliary_loss_clip": 0.01062118, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01674283, + "balance_loss_mlp": 1.02054965, + "epoch": 0.52553735157072, + "flos": 17929889631360.0, + "grad_norm": 2.1255727045867725, + "language_loss": 0.69676512, + "learning_rate": 1.8399745749513627e-06, + "loss": 0.71767402, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41601562, + "step": 8741, + "time_per_iteration": 2.361222982406616 + }, + { + "auxiliary_loss_clip": 0.01065046, + "auxiliary_loss_mlp": 0.01025448, + "balance_loss_clip": 1.01210809, + "balance_loss_mlp": 1.02193928, + "epoch": 0.5255974748233879, + "flos": 9940120922880.0, + "grad_norm": 1.79628986509709, + "language_loss": 0.66361976, + "learning_rate": 1.8395979668391256e-06, + "loss": 0.68452471, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43164062, + "step": 8742, + "time_per_iteration": 2.3790605068206787 + }, + { + "auxiliary_loss_clip": 0.0106499, + "auxiliary_loss_mlp": 0.01029331, + "balance_loss_clip": 1.01522827, + "balance_loss_mlp": 1.02118921, + "epoch": 0.5256575980760559, + "flos": 16867622471040.0, + "grad_norm": 3.0480608190166243, + "language_loss": 0.6318928, + "learning_rate": 1.839221364451224e-06, + "loss": 0.65283602, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4375, + "step": 8743, + "time_per_iteration": 3.7844862937927246 + }, + { + "auxiliary_loss_clip": 0.01011735, + "auxiliary_loss_mlp": 0.01003073, + "balance_loss_clip": 1.00219691, + "balance_loss_mlp": 1.00393486, + "epoch": 0.5257177213287239, + "flos": 62382561206400.0, + "grad_norm": 0.7705770972892789, + "language_loss": 0.54120922, + "learning_rate": 1.8388447678010985e-06, + "loss": 0.56135726, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.078125, + "step": 8744, + "time_per_iteration": 3.1449272632598877 + }, + { + "auxiliary_loss_clip": 0.01067441, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01353216, + "balance_loss_mlp": 1.02233446, + "epoch": 0.5257778445813919, + "flos": 20775869619840.0, + "grad_norm": 2.0754296327311947, + "language_loss": 0.75969917, + "learning_rate": 1.8384681769021888e-06, + "loss": 0.78064948, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.45117188, + "step": 8745, + "time_per_iteration": 2.3786160945892334 + }, + { + "auxiliary_loss_clip": 0.01063723, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.01544952, + "balance_loss_mlp": 1.02271783, + "epoch": 0.5258379678340598, + "flos": 17017678961280.0, + "grad_norm": 2.0375754811011753, + "language_loss": 0.8099829, + "learning_rate": 1.8380915917679337e-06, + "loss": 0.83089232, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41015625, + "step": 8746, + "time_per_iteration": 2.363778591156006 + }, + { + "auxiliary_loss_clip": 0.01061698, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.01599932, + "balance_loss_mlp": 1.02068949, + "epoch": 0.5258980910867278, + "flos": 21067708608000.0, + "grad_norm": 1.957194596911714, + "language_loss": 0.75227284, + "learning_rate": 1.8377150124117739e-06, + "loss": 0.77317286, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41015625, + "step": 8747, + "time_per_iteration": 2.4216129779815674 + }, + { + "auxiliary_loss_clip": 0.01059107, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.0178802, + "balance_loss_mlp": 1.01983142, + "epoch": 0.5259582143393957, + "flos": 18003486510720.0, + "grad_norm": 1.9678046331106718, + "language_loss": 0.82890749, + "learning_rate": 1.8373384388471474e-06, + "loss": 0.84980118, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 8748, + "time_per_iteration": 2.375405788421631 + }, + { + "auxiliary_loss_clip": 0.01060696, + "auxiliary_loss_mlp": 0.01028697, + "balance_loss_clip": 1.0154115, + "balance_loss_mlp": 1.01948547, + "epoch": 0.5260183375920637, + "flos": 22782747058560.0, + "grad_norm": 2.1183767016273856, + "language_loss": 0.8068701, + "learning_rate": 1.836961871087494e-06, + "loss": 0.82776409, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41210938, + "step": 8749, + "time_per_iteration": 2.381593942642212 + }, + { + "auxiliary_loss_clip": 0.01063804, + "auxiliary_loss_mlp": 0.01026834, + "balance_loss_clip": 1.01474631, + "balance_loss_mlp": 1.02237749, + "epoch": 0.5260784608447318, + "flos": 27051193370880.0, + "grad_norm": 1.6520745733779105, + "language_loss": 0.72222161, + "learning_rate": 1.8365853091462516e-06, + "loss": 0.74312806, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4140625, + "step": 8750, + "time_per_iteration": 2.4263296127319336 + }, + { + "auxiliary_loss_clip": 0.01062072, + "auxiliary_loss_mlp": 0.01023887, + "balance_loss_clip": 1.01204336, + "balance_loss_mlp": 1.02032804, + "epoch": 0.5261385840973997, + "flos": 20661913342080.0, + "grad_norm": 1.3356833560382648, + "language_loss": 0.76410699, + "learning_rate": 1.8362087530368597e-06, + "loss": 0.78496647, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41796875, + "step": 8751, + "time_per_iteration": 2.3906936645507812 + }, + { + "auxiliary_loss_clip": 0.01059456, + "auxiliary_loss_mlp": 0.01032348, + "balance_loss_clip": 1.02076101, + "balance_loss_mlp": 1.02036524, + "epoch": 0.5261987073500677, + "flos": 23621535406080.0, + "grad_norm": 1.435111997101803, + "language_loss": 0.75770795, + "learning_rate": 1.835832202772756e-06, + "loss": 0.77862597, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.390625, + "step": 8752, + "time_per_iteration": 2.4119069576263428 + }, + { + "auxiliary_loss_clip": 0.01064345, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01577401, + "balance_loss_mlp": 1.0215795, + "epoch": 0.5262588306027356, + "flos": 27635010992640.0, + "grad_norm": 3.2093554558793125, + "language_loss": 0.67783117, + "learning_rate": 1.8354556583673782e-06, + "loss": 0.69876939, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42773438, + "step": 8753, + "time_per_iteration": 2.4455368518829346 + }, + { + "auxiliary_loss_clip": 0.01064276, + "auxiliary_loss_mlp": 0.0102886, + "balance_loss_clip": 1.01698661, + "balance_loss_mlp": 1.02336454, + "epoch": 0.5263189538554036, + "flos": 21758709703680.0, + "grad_norm": 1.4465333567335317, + "language_loss": 0.83872807, + "learning_rate": 1.835079119834165e-06, + "loss": 0.85965943, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41015625, + "step": 8754, + "time_per_iteration": 3.831928253173828 + }, + { + "auxiliary_loss_clip": 0.01063478, + "auxiliary_loss_mlp": 0.01025463, + "balance_loss_clip": 1.01317835, + "balance_loss_mlp": 1.02135611, + "epoch": 0.5263790771080715, + "flos": 14275670601600.0, + "grad_norm": 1.9678495897271924, + "language_loss": 0.87093592, + "learning_rate": 1.8347025871865537e-06, + "loss": 0.89182532, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.421875, + "step": 8755, + "time_per_iteration": 2.364396333694458 + }, + { + "auxiliary_loss_clip": 0.01009293, + "auxiliary_loss_mlp": 0.01003299, + "balance_loss_clip": 1.0022496, + "balance_loss_mlp": 1.00165033, + "epoch": 0.5264392003607395, + "flos": 65802932749440.0, + "grad_norm": 0.7211210285997481, + "language_loss": 0.58830935, + "learning_rate": 1.834326060437982e-06, + "loss": 0.60843527, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.07617188, + "step": 8756, + "time_per_iteration": 3.085657835006714 + }, + { + "auxiliary_loss_clip": 0.0106361, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.01466155, + "balance_loss_mlp": 1.02084208, + "epoch": 0.5264993236134075, + "flos": 21031364016000.0, + "grad_norm": 2.9563796862465503, + "language_loss": 0.70516944, + "learning_rate": 1.8339495396018876e-06, + "loss": 0.72607505, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.42773438, + "step": 8757, + "time_per_iteration": 2.3850505352020264 + }, + { + "auxiliary_loss_clip": 0.01062359, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01427245, + "balance_loss_mlp": 1.01942348, + "epoch": 0.5265594468660755, + "flos": 16617260044800.0, + "grad_norm": 1.6923172057622473, + "language_loss": 0.87546158, + "learning_rate": 1.8335730246917063e-06, + "loss": 0.89635444, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4296875, + "step": 8758, + "time_per_iteration": 5.195640802383423 + }, + { + "auxiliary_loss_clip": 0.01063745, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.01683831, + "balance_loss_mlp": 1.02004337, + "epoch": 0.5266195701187434, + "flos": 24132978046080.0, + "grad_norm": 1.4048761533037357, + "language_loss": 0.7619319, + "learning_rate": 1.8331965157208757e-06, + "loss": 0.78287661, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4375, + "step": 8759, + "time_per_iteration": 2.4106040000915527 + }, + { + "auxiliary_loss_clip": 0.0105955, + "auxiliary_loss_mlp": 0.01024066, + "balance_loss_clip": 1.01241291, + "balance_loss_mlp": 1.01977015, + "epoch": 0.5266796933714114, + "flos": 15843410559360.0, + "grad_norm": 3.647926283636705, + "language_loss": 0.71834695, + "learning_rate": 1.8328200127028324e-06, + "loss": 0.73918307, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3984375, + "step": 8760, + "time_per_iteration": 2.3800275325775146 + }, + { + "auxiliary_loss_clip": 0.01064676, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.01822424, + "balance_loss_mlp": 1.0204488, + "epoch": 0.5267398166240793, + "flos": 20950610307840.0, + "grad_norm": 1.8136638949512567, + "language_loss": 0.75578916, + "learning_rate": 1.832443515651013e-06, + "loss": 0.77675617, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44140625, + "step": 8761, + "time_per_iteration": 2.3745639324188232 + }, + { + "auxiliary_loss_clip": 0.01060733, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.01642609, + "balance_loss_mlp": 1.02076805, + "epoch": 0.5267999398767473, + "flos": 20995333626240.0, + "grad_norm": 1.6508874888545662, + "language_loss": 0.70851314, + "learning_rate": 1.8320670245788534e-06, + "loss": 0.72940207, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40039062, + "step": 8762, + "time_per_iteration": 2.3851420879364014 + }, + { + "auxiliary_loss_clip": 0.01009676, + "auxiliary_loss_mlp": 0.01004, + "balance_loss_clip": 1.00302827, + "balance_loss_mlp": 1.0018388, + "epoch": 0.5268600631294152, + "flos": 66846312864000.0, + "grad_norm": 0.9257065826530112, + "language_loss": 0.65495265, + "learning_rate": 1.8316905394997895e-06, + "loss": 0.67508948, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.078125, + "step": 8763, + "time_per_iteration": 2.927121162414551 + }, + { + "auxiliary_loss_clip": 0.0106219, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.0171895, + "balance_loss_mlp": 1.02072132, + "epoch": 0.5269201863820833, + "flos": 17164593429120.0, + "grad_norm": 1.6332696877464412, + "language_loss": 0.71324569, + "learning_rate": 1.8313140604272577e-06, + "loss": 0.73416263, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4140625, + "step": 8764, + "time_per_iteration": 2.38730788230896 + }, + { + "auxiliary_loss_clip": 0.0106209, + "auxiliary_loss_mlp": 0.01024639, + "balance_loss_clip": 1.01151967, + "balance_loss_mlp": 1.01998353, + "epoch": 0.5269803096347513, + "flos": 20521527298560.0, + "grad_norm": 2.0530986783335123, + "language_loss": 0.74187487, + "learning_rate": 1.8309375873746926e-06, + "loss": 0.76274216, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 8765, + "time_per_iteration": 2.3777058124542236 + }, + { + "auxiliary_loss_clip": 0.01062603, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.01517081, + "balance_loss_mlp": 1.01923537, + "epoch": 0.5270404328874192, + "flos": 27229879042560.0, + "grad_norm": 1.5435428134795297, + "language_loss": 0.85629481, + "learning_rate": 1.8305611203555307e-06, + "loss": 0.87720656, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43359375, + "step": 8766, + "time_per_iteration": 2.4467809200286865 + }, + { + "auxiliary_loss_clip": 0.01063913, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.01538885, + "balance_loss_mlp": 1.02133226, + "epoch": 0.5271005561400872, + "flos": 23109429450240.0, + "grad_norm": 2.1538948561162754, + "language_loss": 0.78874433, + "learning_rate": 1.8301846593832064e-06, + "loss": 0.80966979, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42578125, + "step": 8767, + "time_per_iteration": 2.464409828186035 + }, + { + "auxiliary_loss_clip": 0.01064922, + "auxiliary_loss_mlp": 0.01027704, + "balance_loss_clip": 1.01423311, + "balance_loss_mlp": 1.02152634, + "epoch": 0.5271606793927551, + "flos": 22563701988480.0, + "grad_norm": 2.0071039978097667, + "language_loss": 0.73135722, + "learning_rate": 1.8298082044711544e-06, + "loss": 0.75228345, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43359375, + "step": 8768, + "time_per_iteration": 2.406036615371704 + }, + { + "auxiliary_loss_clip": 0.01061083, + "auxiliary_loss_mlp": 0.01025088, + "balance_loss_clip": 1.01411438, + "balance_loss_mlp": 1.02099621, + "epoch": 0.5272208026454231, + "flos": 18763441274880.0, + "grad_norm": 1.9395350724274167, + "language_loss": 0.74202091, + "learning_rate": 1.8294317556328102e-06, + "loss": 0.76288259, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.40039062, + "step": 8769, + "time_per_iteration": 2.364230155944824 + }, + { + "auxiliary_loss_clip": 0.01060308, + "auxiliary_loss_mlp": 0.01022222, + "balance_loss_clip": 1.01106381, + "balance_loss_mlp": 1.02014637, + "epoch": 0.5272809258980911, + "flos": 20411131979520.0, + "grad_norm": 2.498545561143705, + "language_loss": 0.73804581, + "learning_rate": 1.8290553128816077e-06, + "loss": 0.75887114, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.40234375, + "step": 8770, + "time_per_iteration": 2.397458553314209 + }, + { + "auxiliary_loss_clip": 0.01062203, + "auxiliary_loss_mlp": 0.01024452, + "balance_loss_clip": 1.01167846, + "balance_loss_mlp": 1.02013135, + "epoch": 0.5273410491507591, + "flos": 28255487408640.0, + "grad_norm": 1.9202466922612045, + "language_loss": 0.68304497, + "learning_rate": 1.8286788762309816e-06, + "loss": 0.70391154, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41992188, + "step": 8771, + "time_per_iteration": 2.456984758377075 + }, + { + "auxiliary_loss_clip": 0.01060985, + "auxiliary_loss_mlp": 0.01023187, + "balance_loss_clip": 1.01108146, + "balance_loss_mlp": 1.02043653, + "epoch": 0.527401172403427, + "flos": 22454074719360.0, + "grad_norm": 1.7094094200787504, + "language_loss": 0.75762409, + "learning_rate": 1.8283024456943659e-06, + "loss": 0.77846581, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 8772, + "time_per_iteration": 2.402803897857666 + }, + { + "auxiliary_loss_clip": 0.01064146, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.01568532, + "balance_loss_mlp": 1.02139401, + "epoch": 0.527461295656095, + "flos": 21030072295680.0, + "grad_norm": 1.748619955220301, + "language_loss": 0.65631425, + "learning_rate": 1.8279260212851938e-06, + "loss": 0.67724442, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42773438, + "step": 8773, + "time_per_iteration": 2.390746593475342 + }, + { + "auxiliary_loss_clip": 0.01062303, + "auxiliary_loss_mlp": 0.01026871, + "balance_loss_clip": 1.01483703, + "balance_loss_mlp": 1.02001226, + "epoch": 0.5275214189087629, + "flos": 26320845306240.0, + "grad_norm": 1.7945705194372417, + "language_loss": 0.86238176, + "learning_rate": 1.8275496030169e-06, + "loss": 0.88327348, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.421875, + "step": 8774, + "time_per_iteration": 2.443553924560547 + }, + { + "auxiliary_loss_clip": 0.0106024, + "auxiliary_loss_mlp": 0.01020704, + "balance_loss_clip": 1.00886703, + "balance_loss_mlp": 1.01981568, + "epoch": 0.5275815421614309, + "flos": 20046010314240.0, + "grad_norm": 1.5707097223629698, + "language_loss": 0.79692709, + "learning_rate": 1.8271731909029164e-06, + "loss": 0.81773651, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40429688, + "step": 8775, + "time_per_iteration": 2.387338161468506 + }, + { + "auxiliary_loss_clip": 0.01061044, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.01376367, + "balance_loss_mlp": 1.02043927, + "epoch": 0.5276416654140988, + "flos": 21431189439360.0, + "grad_norm": 1.8217170731224177, + "language_loss": 0.83606082, + "learning_rate": 1.8267967849566776e-06, + "loss": 0.85694039, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40625, + "step": 8776, + "time_per_iteration": 2.397451877593994 + }, + { + "auxiliary_loss_clip": 0.01061493, + "auxiliary_loss_mlp": 0.01022201, + "balance_loss_clip": 1.01047719, + "balance_loss_mlp": 1.01971054, + "epoch": 0.5277017886667669, + "flos": 17164139581440.0, + "grad_norm": 1.7386991902636, + "language_loss": 0.68124008, + "learning_rate": 1.8264203851916155e-06, + "loss": 0.70207703, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.41796875, + "step": 8777, + "time_per_iteration": 2.349332332611084 + }, + { + "auxiliary_loss_clip": 0.01063733, + "auxiliary_loss_mlp": 0.01029443, + "balance_loss_clip": 1.01805282, + "balance_loss_mlp": 1.02258158, + "epoch": 0.5277619119194349, + "flos": 20447127457920.0, + "grad_norm": 1.6042840595959538, + "language_loss": 0.82388562, + "learning_rate": 1.826043991621164e-06, + "loss": 0.8448174, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.41015625, + "step": 8778, + "time_per_iteration": 2.414142608642578 + }, + { + "auxiliary_loss_clip": 0.01063321, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.01670241, + "balance_loss_mlp": 1.02071595, + "epoch": 0.5278220351721028, + "flos": 24059939748480.0, + "grad_norm": 2.0865459178454135, + "language_loss": 0.79342508, + "learning_rate": 1.825667604258755e-06, + "loss": 0.81435877, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42578125, + "step": 8779, + "time_per_iteration": 2.408700466156006 + }, + { + "auxiliary_loss_clip": 0.01057517, + "auxiliary_loss_mlp": 0.01023283, + "balance_loss_clip": 1.01201153, + "balance_loss_mlp": 1.01850986, + "epoch": 0.5278821584247708, + "flos": 24641802334080.0, + "grad_norm": 2.8190620248787277, + "language_loss": 0.7656908, + "learning_rate": 1.82529122311782e-06, + "loss": 0.78649879, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.390625, + "step": 8780, + "time_per_iteration": 2.401724338531494 + }, + { + "auxiliary_loss_clip": 0.01066285, + "auxiliary_loss_mlp": 0.01028369, + "balance_loss_clip": 1.0144093, + "balance_loss_mlp": 1.02271879, + "epoch": 0.5279422816774387, + "flos": 35406781971840.0, + "grad_norm": 1.5526312224737249, + "language_loss": 0.69407308, + "learning_rate": 1.8249148482117925e-06, + "loss": 0.71501964, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.43554688, + "step": 8781, + "time_per_iteration": 2.506904125213623 + }, + { + "auxiliary_loss_clip": 0.01059661, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.01759171, + "balance_loss_mlp": 1.02001607, + "epoch": 0.5280024049301068, + "flos": 22965901384320.0, + "grad_norm": 2.0417669212349607, + "language_loss": 0.72351754, + "learning_rate": 1.8245384795541033e-06, + "loss": 0.74439925, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.39648438, + "step": 8782, + "time_per_iteration": 3.7883944511413574 + }, + { + "auxiliary_loss_clip": 0.01060665, + "auxiliary_loss_mlp": 0.01021555, + "balance_loss_clip": 1.00987852, + "balance_loss_mlp": 1.01980233, + "epoch": 0.5280625281827747, + "flos": 21506531886720.0, + "grad_norm": 1.672826635444315, + "language_loss": 0.68384588, + "learning_rate": 1.8241621171581846e-06, + "loss": 0.70466805, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40820312, + "step": 8783, + "time_per_iteration": 2.3900556564331055 + }, + { + "auxiliary_loss_clip": 0.01066319, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.01329398, + "balance_loss_mlp": 1.02145946, + "epoch": 0.5281226514354427, + "flos": 31206940214400.0, + "grad_norm": 1.7742455993613393, + "language_loss": 0.6915127, + "learning_rate": 1.8237857610374678e-06, + "loss": 0.71244979, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.44726562, + "step": 8784, + "time_per_iteration": 2.464232921600342 + }, + { + "auxiliary_loss_clip": 0.01063886, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01412988, + "balance_loss_mlp": 1.0193305, + "epoch": 0.5281827746881106, + "flos": 25076785363200.0, + "grad_norm": 3.0373162175041877, + "language_loss": 0.75204378, + "learning_rate": 1.8234094112053836e-06, + "loss": 0.77296567, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.4453125, + "step": 8785, + "time_per_iteration": 2.4112942218780518 + }, + { + "auxiliary_loss_clip": 0.01062525, + "auxiliary_loss_mlp": 0.01025459, + "balance_loss_clip": 1.01305509, + "balance_loss_mlp": 1.02062106, + "epoch": 0.5282428979407786, + "flos": 20630211960960.0, + "grad_norm": 2.837911155945939, + "language_loss": 0.83610284, + "learning_rate": 1.8230330676753637e-06, + "loss": 0.85698265, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 8786, + "time_per_iteration": 2.4315810203552246 + }, + { + "auxiliary_loss_clip": 0.01057789, + "auxiliary_loss_mlp": 0.01020828, + "balance_loss_clip": 1.01059961, + "balance_loss_mlp": 1.01931632, + "epoch": 0.5283030211934465, + "flos": 22418288709120.0, + "grad_norm": 2.1707567295241956, + "language_loss": 0.82753348, + "learning_rate": 1.8226567304608383e-06, + "loss": 0.84831959, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.38476562, + "step": 8787, + "time_per_iteration": 2.4102323055267334 + }, + { + "auxiliary_loss_clip": 0.01060296, + "auxiliary_loss_mlp": 0.01022042, + "balance_loss_clip": 1.0110631, + "balance_loss_mlp": 1.01958084, + "epoch": 0.5283631444461145, + "flos": 23614553134080.0, + "grad_norm": 2.2627176146254415, + "language_loss": 0.83274025, + "learning_rate": 1.822280399575238e-06, + "loss": 0.85356367, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.40820312, + "step": 8788, + "time_per_iteration": 2.3854801654815674 + }, + { + "auxiliary_loss_clip": 0.01062884, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.01520061, + "balance_loss_mlp": 1.02202857, + "epoch": 0.5284232676987825, + "flos": 32670603809280.0, + "grad_norm": 1.6203688608460587, + "language_loss": 0.75100219, + "learning_rate": 1.821904075031993e-06, + "loss": 0.77191055, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 8789, + "time_per_iteration": 2.4695162773132324 + }, + { + "auxiliary_loss_clip": 0.01064246, + "auxiliary_loss_mlp": 0.0102467, + "balance_loss_clip": 1.01224267, + "balance_loss_mlp": 1.02131653, + "epoch": 0.5284833909514505, + "flos": 26759703496320.0, + "grad_norm": 1.6560171312067096, + "language_loss": 0.69704634, + "learning_rate": 1.821527756844533e-06, + "loss": 0.71793544, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.4296875, + "step": 8790, + "time_per_iteration": 2.4244682788848877 + }, + { + "auxiliary_loss_clip": 0.0106032, + "auxiliary_loss_mlp": 0.01021538, + "balance_loss_clip": 1.01017749, + "balance_loss_mlp": 1.02041531, + "epoch": 0.5285435142041185, + "flos": 22089616369920.0, + "grad_norm": 1.6322892526521924, + "language_loss": 0.79264396, + "learning_rate": 1.821151445026289e-06, + "loss": 0.8134625, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3984375, + "step": 8791, + "time_per_iteration": 2.383145332336426 + }, + { + "auxiliary_loss_clip": 0.01062217, + "auxiliary_loss_mlp": 0.01021814, + "balance_loss_clip": 1.01096594, + "balance_loss_mlp": 1.02173233, + "epoch": 0.5286036374567864, + "flos": 20374438273920.0, + "grad_norm": 2.1635790348652653, + "language_loss": 0.66238385, + "learning_rate": 1.820775139590689e-06, + "loss": 0.6832242, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.40429688, + "step": 8792, + "time_per_iteration": 2.3696393966674805 + }, + { + "auxiliary_loss_clip": 0.01058739, + "auxiliary_loss_mlp": 0.0102405, + "balance_loss_clip": 1.01277876, + "balance_loss_mlp": 1.01879203, + "epoch": 0.5286637607094544, + "flos": 24351045598080.0, + "grad_norm": 1.936843155215031, + "language_loss": 0.79661965, + "learning_rate": 1.820398840551164e-06, + "loss": 0.81744754, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3984375, + "step": 8793, + "time_per_iteration": 2.39971923828125 + }, + { + "auxiliary_loss_clip": 0.01061738, + "auxiliary_loss_mlp": 0.01027641, + "balance_loss_clip": 1.01450419, + "balance_loss_mlp": 1.0201093, + "epoch": 0.5287238839621223, + "flos": 17270310625920.0, + "grad_norm": 1.9066689177105665, + "language_loss": 0.84816825, + "learning_rate": 1.8200225479211416e-06, + "loss": 0.86906207, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41601562, + "step": 8794, + "time_per_iteration": 3.795783758163452 + }, + { + "auxiliary_loss_clip": 0.0106469, + "auxiliary_loss_mlp": 0.01027501, + "balance_loss_clip": 1.01442981, + "balance_loss_mlp": 1.02000523, + "epoch": 0.5287840072147904, + "flos": 17565920040960.0, + "grad_norm": 3.282035620711645, + "language_loss": 0.6849516, + "learning_rate": 1.8196462617140525e-06, + "loss": 0.70587349, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.44726562, + "step": 8795, + "time_per_iteration": 2.353410005569458 + }, + { + "auxiliary_loss_clip": 0.01062489, + "auxiliary_loss_mlp": 0.01026161, + "balance_loss_clip": 1.01306009, + "balance_loss_mlp": 1.01996541, + "epoch": 0.5288441304674583, + "flos": 18551099185920.0, + "grad_norm": 1.8047857685029989, + "language_loss": 0.80855906, + "learning_rate": 1.8192699819433242e-06, + "loss": 0.8294456, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 8796, + "time_per_iteration": 2.3670268058776855 + }, + { + "auxiliary_loss_clip": 0.0106598, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.01656044, + "balance_loss_mlp": 1.02256548, + "epoch": 0.5289042537201263, + "flos": 20813436109440.0, + "grad_norm": 2.8652083854581534, + "language_loss": 0.68964112, + "learning_rate": 1.8188937086223847e-06, + "loss": 0.71059966, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8797, + "time_per_iteration": 2.3844640254974365 + }, + { + "auxiliary_loss_clip": 0.01058531, + "auxiliary_loss_mlp": 0.01022444, + "balance_loss_clip": 1.01085687, + "balance_loss_mlp": 1.01986527, + "epoch": 0.5289643769727942, + "flos": 15734551340160.0, + "grad_norm": 1.608591331346701, + "language_loss": 0.82306588, + "learning_rate": 1.8185174417646633e-06, + "loss": 0.84387559, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38671875, + "step": 8798, + "time_per_iteration": 3.8301186561584473 + }, + { + "auxiliary_loss_clip": 0.01064107, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.01840639, + "balance_loss_mlp": 1.02000165, + "epoch": 0.5290245002254622, + "flos": 19536278330880.0, + "grad_norm": 2.280428305612497, + "language_loss": 0.70344424, + "learning_rate": 1.8181411813835873e-06, + "loss": 0.72440469, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44140625, + "step": 8799, + "time_per_iteration": 2.3759138584136963 + }, + { + "auxiliary_loss_clip": 0.01060873, + "auxiliary_loss_mlp": 0.01026483, + "balance_loss_clip": 1.01489568, + "balance_loss_mlp": 1.02194202, + "epoch": 0.5290846234781301, + "flos": 15814222796160.0, + "grad_norm": 2.077959398030892, + "language_loss": 0.69576144, + "learning_rate": 1.8177649274925852e-06, + "loss": 0.71663499, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 8800, + "time_per_iteration": 2.362342357635498 + }, + { + "auxiliary_loss_clip": 0.01009112, + "auxiliary_loss_mlp": 0.01003449, + "balance_loss_clip": 1.00250149, + "balance_loss_mlp": 1.00108576, + "epoch": 0.5291447467307981, + "flos": 70054516874880.0, + "grad_norm": 0.9459149223803092, + "language_loss": 0.56994271, + "learning_rate": 1.8173886801050842e-06, + "loss": 0.59006834, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.08007812, + "step": 8801, + "time_per_iteration": 2.9403960704803467 + }, + { + "auxiliary_loss_clip": 0.01061683, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.01791251, + "balance_loss_mlp": 1.02007926, + "epoch": 0.529204869983466, + "flos": 28362985084800.0, + "grad_norm": 1.473546971278229, + "language_loss": 0.63803184, + "learning_rate": 1.8170124392345113e-06, + "loss": 0.65894842, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41601562, + "step": 8802, + "time_per_iteration": 2.4504079818725586 + }, + { + "auxiliary_loss_clip": 0.01059929, + "auxiliary_loss_mlp": 0.01027455, + "balance_loss_clip": 1.01537311, + "balance_loss_mlp": 1.01890612, + "epoch": 0.5292649932361341, + "flos": 33757624990080.0, + "grad_norm": 1.6829573519310124, + "language_loss": 0.68883157, + "learning_rate": 1.8166362048942935e-06, + "loss": 0.70970547, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.41015625, + "step": 8803, + "time_per_iteration": 2.5010857582092285 + }, + { + "auxiliary_loss_clip": 0.01059196, + "auxiliary_loss_mlp": 0.01022547, + "balance_loss_clip": 1.01087022, + "balance_loss_mlp": 1.01974964, + "epoch": 0.5293251164888021, + "flos": 20446673610240.0, + "grad_norm": 1.6920181350088284, + "language_loss": 0.76195347, + "learning_rate": 1.816259977097858e-06, + "loss": 0.78277087, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39453125, + "step": 8804, + "time_per_iteration": 2.375884532928467 + }, + { + "auxiliary_loss_clip": 0.01008761, + "auxiliary_loss_mlp": 0.01000654, + "balance_loss_clip": 0.99962926, + "balance_loss_mlp": 1.00086856, + "epoch": 0.52938523974147, + "flos": 66541554806400.0, + "grad_norm": 0.7601223706568797, + "language_loss": 0.52940923, + "learning_rate": 1.8158837558586313e-06, + "loss": 0.54950339, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.07910156, + "step": 8805, + "time_per_iteration": 3.141249179840088 + }, + { + "auxiliary_loss_clip": 0.01008396, + "auxiliary_loss_mlp": 0.01001219, + "balance_loss_clip": 1.00030148, + "balance_loss_mlp": 1.0007087, + "epoch": 0.529445362994138, + "flos": 67148345969280.0, + "grad_norm": 0.7399635199758576, + "language_loss": 0.63789672, + "learning_rate": 1.8155075411900398e-06, + "loss": 0.6579929, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.07714844, + "step": 8806, + "time_per_iteration": 3.100407838821411 + }, + { + "auxiliary_loss_clip": 0.01064001, + "auxiliary_loss_mlp": 0.01025705, + "balance_loss_clip": 1.01336658, + "balance_loss_mlp": 1.01972127, + "epoch": 0.5295054862468059, + "flos": 17748341228160.0, + "grad_norm": 1.6717265872585991, + "language_loss": 0.72234595, + "learning_rate": 1.8151313331055094e-06, + "loss": 0.74324298, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.44140625, + "step": 8807, + "time_per_iteration": 2.372189521789551 + }, + { + "auxiliary_loss_clip": 0.01059811, + "auxiliary_loss_mlp": 0.01022894, + "balance_loss_clip": 1.01143813, + "balance_loss_mlp": 1.02005458, + "epoch": 0.529565609499474, + "flos": 11396697511680.0, + "grad_norm": 2.150374406840926, + "language_loss": 0.69046313, + "learning_rate": 1.8147551316184661e-06, + "loss": 0.71129012, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3984375, + "step": 8808, + "time_per_iteration": 2.333024501800537 + }, + { + "auxiliary_loss_clip": 0.01059365, + "auxiliary_loss_mlp": 0.01023026, + "balance_loss_clip": 1.01166499, + "balance_loss_mlp": 1.01910233, + "epoch": 0.5296257327521419, + "flos": 17195561671680.0, + "grad_norm": 2.2844061625707432, + "language_loss": 0.86418897, + "learning_rate": 1.8143789367423356e-06, + "loss": 0.88501287, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.40234375, + "step": 8809, + "time_per_iteration": 2.373746871948242 + }, + { + "auxiliary_loss_clip": 0.01064222, + "auxiliary_loss_mlp": 0.01028666, + "balance_loss_clip": 1.01499295, + "balance_loss_mlp": 1.0204438, + "epoch": 0.5296858560048099, + "flos": 39962633529600.0, + "grad_norm": 1.807860763697407, + "language_loss": 0.6417954, + "learning_rate": 1.8140027484905438e-06, + "loss": 0.66272426, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4375, + "step": 8810, + "time_per_iteration": 2.551849126815796 + }, + { + "auxiliary_loss_clip": 0.01060607, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.01480079, + "balance_loss_mlp": 1.02055907, + "epoch": 0.5297459792574778, + "flos": 20960315665920.0, + "grad_norm": 1.4601147221675987, + "language_loss": 0.67882967, + "learning_rate": 1.8136265668765153e-06, + "loss": 0.69970465, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 8811, + "time_per_iteration": 2.4350461959838867 + }, + { + "auxiliary_loss_clip": 0.01010172, + "auxiliary_loss_mlp": 0.01001119, + "balance_loss_clip": 1.00011134, + "balance_loss_mlp": 1.00205529, + "epoch": 0.5298061025101458, + "flos": 66520468344960.0, + "grad_norm": 0.6756767979683355, + "language_loss": 0.53372282, + "learning_rate": 1.813250391913675e-06, + "loss": 0.55383575, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.08105469, + "step": 8812, + "time_per_iteration": 3.1533238887786865 + }, + { + "auxiliary_loss_clip": 0.01009384, + "auxiliary_loss_mlp": 0.01000489, + "balance_loss_clip": 0.99955952, + "balance_loss_mlp": 1.00156021, + "epoch": 0.5298662257628137, + "flos": 67659579141120.0, + "grad_norm": 0.7335155597510381, + "language_loss": 0.56379616, + "learning_rate": 1.8128742236154482e-06, + "loss": 0.58389497, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.078125, + "step": 8813, + "time_per_iteration": 3.096597671508789 + }, + { + "auxiliary_loss_clip": 0.01058589, + "auxiliary_loss_mlp": 0.01025273, + "balance_loss_clip": 1.01376891, + "balance_loss_mlp": 1.0205071, + "epoch": 0.5299263490154817, + "flos": 19645381929600.0, + "grad_norm": 1.8215453504923633, + "language_loss": 0.81347173, + "learning_rate": 1.8124980619952585e-06, + "loss": 0.83431029, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38085938, + "step": 8814, + "time_per_iteration": 2.398347854614258 + }, + { + "auxiliary_loss_clip": 0.0106494, + "auxiliary_loss_mlp": 0.01027269, + "balance_loss_clip": 1.01525903, + "balance_loss_mlp": 1.02242875, + "epoch": 0.5299864722681497, + "flos": 22053900182400.0, + "grad_norm": 1.5645386675862325, + "language_loss": 0.84983695, + "learning_rate": 1.8121219070665312e-06, + "loss": 0.87075907, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.42578125, + "step": 8815, + "time_per_iteration": 2.383754253387451 + }, + { + "auxiliary_loss_clip": 0.01060101, + "auxiliary_loss_mlp": 0.01024117, + "balance_loss_clip": 1.01263666, + "balance_loss_mlp": 1.02052522, + "epoch": 0.5300465955208177, + "flos": 21762584864640.0, + "grad_norm": 2.2443750948753878, + "language_loss": 0.66243219, + "learning_rate": 1.8117457588426893e-06, + "loss": 0.68327445, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39453125, + "step": 8816, + "time_per_iteration": 2.4053852558135986 + }, + { + "auxiliary_loss_clip": 0.01061885, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.01693141, + "balance_loss_mlp": 1.02097619, + "epoch": 0.5301067187734857, + "flos": 42994840043520.0, + "grad_norm": 1.5320110261751638, + "language_loss": 0.7220788, + "learning_rate": 1.8113696173371578e-06, + "loss": 0.74298775, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 8817, + "time_per_iteration": 2.573481559753418 + }, + { + "auxiliary_loss_clip": 0.01061102, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.01358938, + "balance_loss_mlp": 1.01950383, + "epoch": 0.5301668420261536, + "flos": 20553368325120.0, + "grad_norm": 1.7032786793026138, + "language_loss": 0.65028346, + "learning_rate": 1.810993482563359e-06, + "loss": 0.67115068, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41601562, + "step": 8818, + "time_per_iteration": 2.4357340335845947 + }, + { + "auxiliary_loss_clip": 0.01061384, + "auxiliary_loss_mlp": 0.01021946, + "balance_loss_clip": 1.00904167, + "balance_loss_mlp": 1.02015066, + "epoch": 0.5302269652788216, + "flos": 17485899471360.0, + "grad_norm": 2.479170720837474, + "language_loss": 0.74860346, + "learning_rate": 1.8106173545347164e-06, + "loss": 0.76943672, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41210938, + "step": 8819, + "time_per_iteration": 2.359041690826416 + }, + { + "auxiliary_loss_clip": 0.01062576, + "auxiliary_loss_mlp": 0.01026709, + "balance_loss_clip": 1.01445389, + "balance_loss_mlp": 1.01955891, + "epoch": 0.5302870885314895, + "flos": 14573339786880.0, + "grad_norm": 2.1023327945092305, + "language_loss": 0.82185924, + "learning_rate": 1.8102412332646536e-06, + "loss": 0.8427521, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4296875, + "step": 8820, + "time_per_iteration": 2.365088701248169 + }, + { + "auxiliary_loss_clip": 0.01061274, + "auxiliary_loss_mlp": 0.01023552, + "balance_loss_clip": 1.01179218, + "balance_loss_mlp": 1.01972032, + "epoch": 0.5303472117841576, + "flos": 23436984625920.0, + "grad_norm": 1.775486254596039, + "language_loss": 0.72635722, + "learning_rate": 1.8098651187665923e-06, + "loss": 0.74720544, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.4140625, + "step": 8821, + "time_per_iteration": 3.834498405456543 + }, + { + "auxiliary_loss_clip": 0.01058358, + "auxiliary_loss_mlp": 0.01022573, + "balance_loss_clip": 1.01182008, + "balance_loss_mlp": 1.01908696, + "epoch": 0.5304073350368255, + "flos": 22707963192960.0, + "grad_norm": 54.61077258389642, + "language_loss": 0.74090493, + "learning_rate": 1.8094890110539567e-06, + "loss": 0.76171422, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.39257812, + "step": 8822, + "time_per_iteration": 2.4357147216796875 + }, + { + "auxiliary_loss_clip": 0.01060588, + "auxiliary_loss_mlp": 0.0102602, + "balance_loss_clip": 1.01355624, + "balance_loss_mlp": 1.01955068, + "epoch": 0.5304674582894935, + "flos": 27927303828480.0, + "grad_norm": 3.344012288821774, + "language_loss": 0.6739195, + "learning_rate": 1.809112910140168e-06, + "loss": 0.69478559, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41015625, + "step": 8823, + "time_per_iteration": 2.428647756576538 + }, + { + "auxiliary_loss_clip": 0.01064608, + "auxiliary_loss_mlp": 0.01026705, + "balance_loss_clip": 1.012591, + "balance_loss_mlp": 1.02083707, + "epoch": 0.5305275815421614, + "flos": 21249606124800.0, + "grad_norm": 1.886899662229966, + "language_loss": 0.69393623, + "learning_rate": 1.8087368160386483e-06, + "loss": 0.71484935, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4375, + "step": 8824, + "time_per_iteration": 2.4110398292541504 + }, + { + "auxiliary_loss_clip": 0.01059488, + "auxiliary_loss_mlp": 0.01021864, + "balance_loss_clip": 1.00999045, + "balance_loss_mlp": 1.02027488, + "epoch": 0.5305877047948294, + "flos": 17602124987520.0, + "grad_norm": 1.889955946051064, + "language_loss": 0.75996101, + "learning_rate": 1.8083607287628198e-06, + "loss": 0.78077447, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 8825, + "time_per_iteration": 2.3637311458587646 + }, + { + "auxiliary_loss_clip": 0.01061234, + "auxiliary_loss_mlp": 0.01025231, + "balance_loss_clip": 1.01271963, + "balance_loss_mlp": 1.02001417, + "epoch": 0.5306478280474973, + "flos": 15194584252800.0, + "grad_norm": 2.0458755077606643, + "language_loss": 0.85782373, + "learning_rate": 1.8079846483261035e-06, + "loss": 0.87868834, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41210938, + "step": 8826, + "time_per_iteration": 2.407830238342285 + }, + { + "auxiliary_loss_clip": 0.01058966, + "auxiliary_loss_mlp": 0.01023469, + "balance_loss_clip": 1.01194739, + "balance_loss_mlp": 1.01981473, + "epoch": 0.5307079513001653, + "flos": 15340311734400.0, + "grad_norm": 1.5332951768281577, + "language_loss": 0.64356077, + "learning_rate": 1.807608574741922e-06, + "loss": 0.66438514, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 8827, + "time_per_iteration": 2.3632476329803467 + }, + { + "auxiliary_loss_clip": 0.01065754, + "auxiliary_loss_mlp": 0.0102894, + "balance_loss_clip": 1.01626253, + "balance_loss_mlp": 1.02191854, + "epoch": 0.5307680745528333, + "flos": 43542766920960.0, + "grad_norm": 1.898850587250875, + "language_loss": 0.59620404, + "learning_rate": 1.8072325080236951e-06, + "loss": 0.61715102, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.43945312, + "step": 8828, + "time_per_iteration": 2.585857629776001 + }, + { + "auxiliary_loss_clip": 0.01060372, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.01594102, + "balance_loss_mlp": 1.01956928, + "epoch": 0.5308281978055013, + "flos": 20047860616320.0, + "grad_norm": 2.3996223517916255, + "language_loss": 0.81140548, + "learning_rate": 1.806856448184844e-06, + "loss": 0.83229101, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40625, + "step": 8829, + "time_per_iteration": 2.390322208404541 + }, + { + "auxiliary_loss_clip": 0.0105996, + "auxiliary_loss_mlp": 0.01025499, + "balance_loss_clip": 1.01388192, + "balance_loss_mlp": 1.02049863, + "epoch": 0.5308883210581693, + "flos": 20702901144960.0, + "grad_norm": 1.7464248304291068, + "language_loss": 0.78337061, + "learning_rate": 1.80648039523879e-06, + "loss": 0.80422521, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39453125, + "step": 8830, + "time_per_iteration": 2.3949315547943115 + }, + { + "auxiliary_loss_clip": 0.01059728, + "auxiliary_loss_mlp": 0.01026127, + "balance_loss_clip": 1.01411664, + "balance_loss_mlp": 1.02036524, + "epoch": 0.5309484443108372, + "flos": 14354643830400.0, + "grad_norm": 2.033862442037055, + "language_loss": 0.80720532, + "learning_rate": 1.8061043491989523e-06, + "loss": 0.82806391, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 8831, + "time_per_iteration": 2.3497633934020996 + }, + { + "auxiliary_loss_clip": 0.01060277, + "auxiliary_loss_mlp": 0.01023398, + "balance_loss_clip": 1.01096439, + "balance_loss_mlp": 1.01964676, + "epoch": 0.5310085675635052, + "flos": 20009491165440.0, + "grad_norm": 1.873776736464002, + "language_loss": 0.73328078, + "learning_rate": 1.8057283100787524e-06, + "loss": 0.75411749, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40625, + "step": 8832, + "time_per_iteration": 2.370431423187256 + }, + { + "auxiliary_loss_clip": 0.01063347, + "auxiliary_loss_mlp": 0.01023841, + "balance_loss_clip": 1.01138353, + "balance_loss_mlp": 1.02083063, + "epoch": 0.5310686908161731, + "flos": 22126205341440.0, + "grad_norm": 1.8929889089681542, + "language_loss": 0.75744689, + "learning_rate": 1.805352277891609e-06, + "loss": 0.77831876, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 8833, + "time_per_iteration": 3.799762010574341 + }, + { + "auxiliary_loss_clip": 0.01061299, + "auxiliary_loss_mlp": 0.01028015, + "balance_loss_clip": 1.01503921, + "balance_loss_mlp": 1.01909828, + "epoch": 0.5311288140688412, + "flos": 24716725845120.0, + "grad_norm": 2.142975793889799, + "language_loss": 0.73512805, + "learning_rate": 1.8049762526509416e-06, + "loss": 0.75602114, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.421875, + "step": 8834, + "time_per_iteration": 2.4113035202026367 + }, + { + "auxiliary_loss_clip": 0.01065315, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.0175519, + "balance_loss_mlp": 1.0197587, + "epoch": 0.5311889373215091, + "flos": 24096563631360.0, + "grad_norm": 2.305769081457282, + "language_loss": 0.77857029, + "learning_rate": 1.8046002343701708e-06, + "loss": 0.79953611, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.45703125, + "step": 8835, + "time_per_iteration": 2.4194633960723877 + }, + { + "auxiliary_loss_clip": 0.01060259, + "auxiliary_loss_mlp": 0.0102688, + "balance_loss_clip": 1.01385081, + "balance_loss_mlp": 1.01912761, + "epoch": 0.5312490605741771, + "flos": 22015949667840.0, + "grad_norm": 1.6018441376081232, + "language_loss": 0.72454107, + "learning_rate": 1.8042242230627142e-06, + "loss": 0.74541247, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41210938, + "step": 8836, + "time_per_iteration": 2.387211322784424 + }, + { + "auxiliary_loss_clip": 0.01062082, + "auxiliary_loss_mlp": 0.01025642, + "balance_loss_clip": 1.01220131, + "balance_loss_mlp": 1.01945496, + "epoch": 0.531309183826845, + "flos": 19389538419840.0, + "grad_norm": 1.6672493420463883, + "language_loss": 0.80745471, + "learning_rate": 1.8038482187419922e-06, + "loss": 0.82833195, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42578125, + "step": 8837, + "time_per_iteration": 3.8929057121276855 + }, + { + "auxiliary_loss_clip": 0.010603, + "auxiliary_loss_mlp": 0.0102481, + "balance_loss_clip": 1.01175678, + "balance_loss_mlp": 1.02018952, + "epoch": 0.531369307079513, + "flos": 20189119443840.0, + "grad_norm": 1.810465855929521, + "language_loss": 0.85746372, + "learning_rate": 1.8034722214214223e-06, + "loss": 0.87831485, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40039062, + "step": 8838, + "time_per_iteration": 2.375460147857666 + }, + { + "auxiliary_loss_clip": 0.01059568, + "auxiliary_loss_mlp": 0.01023524, + "balance_loss_clip": 1.01203275, + "balance_loss_mlp": 1.0192312, + "epoch": 0.5314294303321809, + "flos": 18879143120640.0, + "grad_norm": 1.6715572635931066, + "language_loss": 0.72847772, + "learning_rate": 1.8030962311144233e-06, + "loss": 0.74930865, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40234375, + "step": 8839, + "time_per_iteration": 2.4110915660858154 + }, + { + "auxiliary_loss_clip": 0.0106141, + "auxiliary_loss_mlp": 0.01024179, + "balance_loss_clip": 1.01203799, + "balance_loss_mlp": 1.01944864, + "epoch": 0.531489553584849, + "flos": 23038904770560.0, + "grad_norm": 1.553107352822617, + "language_loss": 0.76492655, + "learning_rate": 1.8027202478344136e-06, + "loss": 0.78578246, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41992188, + "step": 8840, + "time_per_iteration": 2.3819010257720947 + }, + { + "auxiliary_loss_clip": 0.01063491, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.01392841, + "balance_loss_mlp": 1.02004313, + "epoch": 0.5315496768375169, + "flos": 19789503488640.0, + "grad_norm": 1.90439755248497, + "language_loss": 0.77280664, + "learning_rate": 1.8023442715948105e-06, + "loss": 0.79371309, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43554688, + "step": 8841, + "time_per_iteration": 2.35364031791687 + }, + { + "auxiliary_loss_clip": 0.0100986, + "auxiliary_loss_mlp": 0.01006173, + "balance_loss_clip": 1.00531507, + "balance_loss_mlp": 1.00180781, + "epoch": 0.5316098000901849, + "flos": 71019620121600.0, + "grad_norm": 0.6920199310203501, + "language_loss": 0.56817555, + "learning_rate": 1.8019683024090323e-06, + "loss": 0.58833587, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.00860596, + "router_z_loss_mlp": 0.08056641, + "step": 8842, + "time_per_iteration": 3.164372205734253 + }, + { + "auxiliary_loss_clip": 0.01063898, + "auxiliary_loss_mlp": 0.01023977, + "balance_loss_clip": 1.01068497, + "balance_loss_mlp": 1.02050996, + "epoch": 0.5316699233428529, + "flos": 16434629389440.0, + "grad_norm": 1.865029315725728, + "language_loss": 0.73860168, + "learning_rate": 1.8015923402904952e-06, + "loss": 0.75948048, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8843, + "time_per_iteration": 2.351494550704956 + }, + { + "auxiliary_loss_clip": 0.01009717, + "auxiliary_loss_mlp": 0.01003443, + "balance_loss_clip": 1.00256634, + "balance_loss_mlp": 1.00146866, + "epoch": 0.5317300465955208, + "flos": 67417036859520.0, + "grad_norm": 0.9295319140216696, + "language_loss": 0.61092532, + "learning_rate": 1.8012163852526179e-06, + "loss": 0.6310569, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.08251953, + "step": 8844, + "time_per_iteration": 3.0973105430603027 + }, + { + "auxiliary_loss_clip": 0.01009054, + "auxiliary_loss_mlp": 0.01001667, + "balance_loss_clip": 1.00071895, + "balance_loss_mlp": 1.00090265, + "epoch": 0.5317901698481888, + "flos": 59505405373440.0, + "grad_norm": 0.8360233435311406, + "language_loss": 0.62161565, + "learning_rate": 1.8008404373088164e-06, + "loss": 0.64172292, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.08203125, + "step": 8845, + "time_per_iteration": 3.0914716720581055 + }, + { + "auxiliary_loss_clip": 0.01062012, + "auxiliary_loss_mlp": 0.01024691, + "balance_loss_clip": 1.01162612, + "balance_loss_mlp": 1.01957703, + "epoch": 0.5318502931008567, + "flos": 19128388383360.0, + "grad_norm": 1.9419499889644423, + "language_loss": 0.75480723, + "learning_rate": 1.8004644964725069e-06, + "loss": 0.77567428, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 8846, + "time_per_iteration": 2.3797669410705566 + }, + { + "auxiliary_loss_clip": 0.01061641, + "auxiliary_loss_mlp": 0.01025265, + "balance_loss_clip": 1.01295626, + "balance_loss_mlp": 1.020365, + "epoch": 0.5319104163535248, + "flos": 24679892494080.0, + "grad_norm": 1.6585899542039426, + "language_loss": 0.75422055, + "learning_rate": 1.8000885627571072e-06, + "loss": 0.77508962, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4140625, + "step": 8847, + "time_per_iteration": 2.41469407081604 + }, + { + "auxiliary_loss_clip": 0.01060011, + "auxiliary_loss_mlp": 0.01024819, + "balance_loss_clip": 1.01301754, + "balance_loss_mlp": 1.01846004, + "epoch": 0.5319705396061927, + "flos": 19384650829440.0, + "grad_norm": 1.531099850228735, + "language_loss": 0.87598741, + "learning_rate": 1.7997126361760314e-06, + "loss": 0.89683568, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41601562, + "step": 8848, + "time_per_iteration": 2.3629024028778076 + }, + { + "auxiliary_loss_clip": 0.01060982, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.01468349, + "balance_loss_mlp": 1.01897597, + "epoch": 0.5320306628588607, + "flos": 18258352502400.0, + "grad_norm": 2.323726546736762, + "language_loss": 0.74567676, + "learning_rate": 1.7993367167426972e-06, + "loss": 0.76657462, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.41992188, + "step": 8849, + "time_per_iteration": 2.3646538257598877 + }, + { + "auxiliary_loss_clip": 0.01063176, + "auxiliary_loss_mlp": 0.01023106, + "balance_loss_clip": 1.01022506, + "balance_loss_mlp": 1.02008235, + "epoch": 0.5320907861115286, + "flos": 23731197586560.0, + "grad_norm": 1.6038665066108493, + "language_loss": 0.79771912, + "learning_rate": 1.7989608044705194e-06, + "loss": 0.81858194, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4296875, + "step": 8850, + "time_per_iteration": 2.3853251934051514 + }, + { + "auxiliary_loss_clip": 0.01062148, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.0142808, + "balance_loss_mlp": 1.02022672, + "epoch": 0.5321509093641966, + "flos": 34493838163200.0, + "grad_norm": 1.3856032640675269, + "language_loss": 0.55169761, + "learning_rate": 1.7985848993729124e-06, + "loss": 0.57258356, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41992188, + "step": 8851, + "time_per_iteration": 2.5024945735931396 + }, + { + "auxiliary_loss_clip": 0.01063965, + "auxiliary_loss_mlp": 0.01025565, + "balance_loss_clip": 1.01295853, + "balance_loss_mlp": 1.02132535, + "epoch": 0.5322110326168645, + "flos": 20009910101760.0, + "grad_norm": 1.6715407913719367, + "language_loss": 0.6816659, + "learning_rate": 1.798209001463293e-06, + "loss": 0.70256114, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42578125, + "step": 8852, + "time_per_iteration": 2.3726062774658203 + }, + { + "auxiliary_loss_clip": 0.01009365, + "auxiliary_loss_mlp": 0.01008025, + "balance_loss_clip": 1.00699961, + "balance_loss_mlp": 1.00119781, + "epoch": 0.5322711558695326, + "flos": 64627931208960.0, + "grad_norm": 0.8017295250620273, + "language_loss": 0.62705457, + "learning_rate": 1.797833110755075e-06, + "loss": 0.64722836, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.08154297, + "step": 8853, + "time_per_iteration": 2.9817845821380615 + }, + { + "auxiliary_loss_clip": 0.01062723, + "auxiliary_loss_mlp": 0.01024895, + "balance_loss_clip": 1.01231885, + "balance_loss_mlp": 1.01987481, + "epoch": 0.5323312791222005, + "flos": 14938461452160.0, + "grad_norm": 3.209470425105457, + "language_loss": 0.79346144, + "learning_rate": 1.7974572272616736e-06, + "loss": 0.81433761, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4296875, + "step": 8854, + "time_per_iteration": 2.3550596237182617 + }, + { + "auxiliary_loss_clip": 0.01059658, + "auxiliary_loss_mlp": 0.01022552, + "balance_loss_clip": 1.01101816, + "balance_loss_mlp": 1.02039742, + "epoch": 0.5323914023748685, + "flos": 23439707712000.0, + "grad_norm": 1.66330387126314, + "language_loss": 0.7376256, + "learning_rate": 1.7970813509965025e-06, + "loss": 0.75844777, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 8855, + "time_per_iteration": 2.4184305667877197 + }, + { + "auxiliary_loss_clip": 0.01059877, + "auxiliary_loss_mlp": 0.01022686, + "balance_loss_clip": 1.01127779, + "balance_loss_mlp": 1.01947594, + "epoch": 0.5324515256275365, + "flos": 26284989473280.0, + "grad_norm": 1.930388905498676, + "language_loss": 0.64408815, + "learning_rate": 1.796705481972976e-06, + "loss": 0.66491377, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.40429688, + "step": 8856, + "time_per_iteration": 2.4192702770233154 + }, + { + "auxiliary_loss_clip": 0.01064972, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.01875174, + "balance_loss_mlp": 1.02130985, + "epoch": 0.5325116488802044, + "flos": 26869679879040.0, + "grad_norm": 2.150892566619501, + "language_loss": 0.77505046, + "learning_rate": 1.796329620204508e-06, + "loss": 0.79602146, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4375, + "step": 8857, + "time_per_iteration": 2.4300637245178223 + }, + { + "auxiliary_loss_clip": 0.01061841, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.0183394, + "balance_loss_mlp": 1.02049994, + "epoch": 0.5325717721328724, + "flos": 25883558127360.0, + "grad_norm": 4.044624745892549, + "language_loss": 0.7097311, + "learning_rate": 1.7959537657045115e-06, + "loss": 0.73065901, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4140625, + "step": 8858, + "time_per_iteration": 2.4105913639068604 + }, + { + "auxiliary_loss_clip": 0.01063538, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.01567221, + "balance_loss_mlp": 1.02148807, + "epoch": 0.5326318953855403, + "flos": 21798231229440.0, + "grad_norm": 1.7500950823302879, + "language_loss": 0.69692874, + "learning_rate": 1.795577918486401e-06, + "loss": 0.71785331, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41992188, + "step": 8859, + "time_per_iteration": 2.44215989112854 + }, + { + "auxiliary_loss_clip": 0.01057521, + "auxiliary_loss_mlp": 0.01024664, + "balance_loss_clip": 1.01385164, + "balance_loss_mlp": 1.0197289, + "epoch": 0.5326920186382084, + "flos": 20921876392320.0, + "grad_norm": 1.5540230958233836, + "language_loss": 0.69288909, + "learning_rate": 1.795202078563588e-06, + "loss": 0.7137109, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37890625, + "step": 8860, + "time_per_iteration": 3.7947676181793213 + }, + { + "auxiliary_loss_clip": 0.01062091, + "auxiliary_loss_mlp": 0.01022681, + "balance_loss_clip": 1.01101625, + "balance_loss_mlp": 1.02132392, + "epoch": 0.5327521418908763, + "flos": 21432376425600.0, + "grad_norm": 2.1082870403213305, + "language_loss": 0.78074843, + "learning_rate": 1.7948262459494866e-06, + "loss": 0.80159616, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40820312, + "step": 8861, + "time_per_iteration": 2.395508289337158 + }, + { + "auxiliary_loss_clip": 0.01061278, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.01449335, + "balance_loss_mlp": 1.01947856, + "epoch": 0.5328122651435443, + "flos": 21759233374080.0, + "grad_norm": 1.67625714762894, + "language_loss": 0.85591662, + "learning_rate": 1.794450420657509e-06, + "loss": 0.87680995, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41796875, + "step": 8862, + "time_per_iteration": 2.4154932498931885 + }, + { + "auxiliary_loss_clip": 0.01061815, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_clip": 1.01580119, + "balance_loss_mlp": 1.01908612, + "epoch": 0.5328723883962122, + "flos": 18295500055680.0, + "grad_norm": 1.8341916053973086, + "language_loss": 0.61728793, + "learning_rate": 1.7940746027010664e-06, + "loss": 0.63820779, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.42773438, + "step": 8863, + "time_per_iteration": 2.369800329208374 + }, + { + "auxiliary_loss_clip": 0.01062443, + "auxiliary_loss_mlp": 0.01021938, + "balance_loss_clip": 1.01074469, + "balance_loss_mlp": 1.02193999, + "epoch": 0.5329325116488802, + "flos": 25373721409920.0, + "grad_norm": 1.9815482970831042, + "language_loss": 0.73403776, + "learning_rate": 1.793698792093572e-06, + "loss": 0.7548815, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.40625, + "step": 8864, + "time_per_iteration": 2.4254887104034424 + }, + { + "auxiliary_loss_clip": 0.01060585, + "auxiliary_loss_mlp": 0.01025087, + "balance_loss_clip": 1.01295793, + "balance_loss_mlp": 1.01995254, + "epoch": 0.5329926349015481, + "flos": 25590951089280.0, + "grad_norm": 1.6600783666556822, + "language_loss": 0.64140743, + "learning_rate": 1.7933229888484367e-06, + "loss": 0.66226411, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40625, + "step": 8865, + "time_per_iteration": 2.451512575149536 + }, + { + "auxiliary_loss_clip": 0.01061438, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.01303577, + "balance_loss_mlp": 1.02005625, + "epoch": 0.5330527581542162, + "flos": 22888603900800.0, + "grad_norm": 1.8211855388900628, + "language_loss": 0.78545272, + "learning_rate": 1.7929471929790726e-06, + "loss": 0.80631757, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 8866, + "time_per_iteration": 2.398538589477539 + }, + { + "auxiliary_loss_clip": 0.01060614, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.01623189, + "balance_loss_mlp": 1.01997089, + "epoch": 0.5331128814068841, + "flos": 16026041214720.0, + "grad_norm": 2.224141450445589, + "language_loss": 0.73112082, + "learning_rate": 1.7925714044988904e-06, + "loss": 0.75200361, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40625, + "step": 8867, + "time_per_iteration": 2.3716471195220947 + }, + { + "auxiliary_loss_clip": 0.01064848, + "auxiliary_loss_mlp": 0.0102708, + "balance_loss_clip": 1.01410985, + "balance_loss_mlp": 1.02158952, + "epoch": 0.5331730046595521, + "flos": 39343239365760.0, + "grad_norm": 1.8099934754808034, + "language_loss": 0.72046912, + "learning_rate": 1.7921956234213011e-06, + "loss": 0.74138844, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43359375, + "step": 8868, + "time_per_iteration": 2.5389633178710938 + }, + { + "auxiliary_loss_clip": 0.01060886, + "auxiliary_loss_mlp": 0.01020768, + "balance_loss_clip": 1.00924039, + "balance_loss_mlp": 1.01949656, + "epoch": 0.5332331279122201, + "flos": 24023246042880.0, + "grad_norm": 3.496463241324383, + "language_loss": 0.7847147, + "learning_rate": 1.7918198497597158e-06, + "loss": 0.80553126, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.4140625, + "step": 8869, + "time_per_iteration": 2.4070255756378174 + }, + { + "auxiliary_loss_clip": 0.01063327, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.0192287, + "balance_loss_mlp": 1.01990318, + "epoch": 0.533293251164888, + "flos": 17128353571200.0, + "grad_norm": 1.7319902275389893, + "language_loss": 0.75796914, + "learning_rate": 1.791444083527544e-06, + "loss": 0.77892369, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 8870, + "time_per_iteration": 2.347634792327881 + }, + { + "auxiliary_loss_clip": 0.01065051, + "auxiliary_loss_mlp": 0.0102713, + "balance_loss_clip": 1.01352227, + "balance_loss_mlp": 1.02174306, + "epoch": 0.533353374417556, + "flos": 22125297646080.0, + "grad_norm": 1.9545054724449056, + "language_loss": 0.75812811, + "learning_rate": 1.7910683247381968e-06, + "loss": 0.77904987, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43359375, + "step": 8871, + "time_per_iteration": 2.4311039447784424 + }, + { + "auxiliary_loss_clip": 0.0106279, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.0126617, + "balance_loss_mlp": 1.02137208, + "epoch": 0.533413497670224, + "flos": 15010242940800.0, + "grad_norm": 1.6659100489924559, + "language_loss": 0.71938252, + "learning_rate": 1.7906925734050837e-06, + "loss": 0.74025875, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4140625, + "step": 8872, + "time_per_iteration": 2.342756986618042 + }, + { + "auxiliary_loss_clip": 0.01064171, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.01739264, + "balance_loss_mlp": 1.02230048, + "epoch": 0.533473620922892, + "flos": 19608932603520.0, + "grad_norm": 1.7603116151592078, + "language_loss": 0.64985853, + "learning_rate": 1.7903168295416138e-06, + "loss": 0.67080188, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41796875, + "step": 8873, + "time_per_iteration": 3.7657196521759033 + }, + { + "auxiliary_loss_clip": 0.01062486, + "auxiliary_loss_mlp": 0.01027016, + "balance_loss_clip": 1.01454067, + "balance_loss_mlp": 1.02048266, + "epoch": 0.5335337441755599, + "flos": 14281780089600.0, + "grad_norm": 2.235376943202847, + "language_loss": 0.7990036, + "learning_rate": 1.7899410931611972e-06, + "loss": 0.81989866, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41992188, + "step": 8874, + "time_per_iteration": 2.336662769317627 + }, + { + "auxiliary_loss_clip": 0.01061522, + "auxiliary_loss_mlp": 0.01023966, + "balance_loss_clip": 1.01178932, + "balance_loss_mlp": 1.0201149, + "epoch": 0.5335938674282279, + "flos": 20813750311680.0, + "grad_norm": 1.605948948503345, + "language_loss": 0.69469446, + "learning_rate": 1.7895653642772425e-06, + "loss": 0.71554935, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4140625, + "step": 8875, + "time_per_iteration": 2.3740789890289307 + }, + { + "auxiliary_loss_clip": 0.01009596, + "auxiliary_loss_mlp": 0.01002723, + "balance_loss_clip": 1.00175178, + "balance_loss_mlp": 1.00148606, + "epoch": 0.5336539906808958, + "flos": 71392596842880.0, + "grad_norm": 0.7259129807475545, + "language_loss": 0.59678495, + "learning_rate": 1.789189642903159e-06, + "loss": 0.61690807, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.08105469, + "step": 8876, + "time_per_iteration": 3.129502296447754 + }, + { + "auxiliary_loss_clip": 0.01059909, + "auxiliary_loss_mlp": 0.01025143, + "balance_loss_clip": 1.01394367, + "balance_loss_mlp": 1.01915288, + "epoch": 0.5337141139335638, + "flos": 20152076624640.0, + "grad_norm": 1.8252733991685641, + "language_loss": 0.74897051, + "learning_rate": 1.7888139290523555e-06, + "loss": 0.76982105, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.40820312, + "step": 8877, + "time_per_iteration": 5.224806785583496 + }, + { + "auxiliary_loss_clip": 0.01009337, + "auxiliary_loss_mlp": 0.01001769, + "balance_loss_clip": 1.00078511, + "balance_loss_mlp": 1.00127673, + "epoch": 0.5337742371862317, + "flos": 67725181319040.0, + "grad_norm": 0.750739209319621, + "language_loss": 0.57188237, + "learning_rate": 1.7884382227382384e-06, + "loss": 0.59199345, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.08056641, + "step": 8878, + "time_per_iteration": 2.949732780456543 + }, + { + "auxiliary_loss_clip": 0.01063126, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.01574373, + "balance_loss_mlp": 1.02002764, + "epoch": 0.5338343604388998, + "flos": 25007761872000.0, + "grad_norm": 1.6437164910070454, + "language_loss": 0.7225033, + "learning_rate": 1.7880625239742175e-06, + "loss": 0.74342096, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43164062, + "step": 8879, + "time_per_iteration": 2.4237558841705322 + }, + { + "auxiliary_loss_clip": 0.01062069, + "auxiliary_loss_mlp": 0.01026068, + "balance_loss_clip": 1.01358676, + "balance_loss_mlp": 1.01986456, + "epoch": 0.5338944836915677, + "flos": 17600344508160.0, + "grad_norm": 1.9625291298865253, + "language_loss": 0.83699721, + "learning_rate": 1.7876868327736995e-06, + "loss": 0.85787857, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.421875, + "step": 8880, + "time_per_iteration": 2.3313798904418945 + }, + { + "auxiliary_loss_clip": 0.01058336, + "auxiliary_loss_mlp": 0.01024174, + "balance_loss_clip": 1.01247358, + "balance_loss_mlp": 1.01831591, + "epoch": 0.5339546069442357, + "flos": 21723098250240.0, + "grad_norm": 1.6930994217293749, + "language_loss": 0.79268146, + "learning_rate": 1.7873111491500927e-06, + "loss": 0.81350648, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40039062, + "step": 8881, + "time_per_iteration": 2.4011073112487793 + }, + { + "auxiliary_loss_clip": 0.01008411, + "auxiliary_loss_mlp": 0.01001478, + "balance_loss_clip": 1.00053608, + "balance_loss_mlp": 1.00053477, + "epoch": 0.5340147301969036, + "flos": 69720642743040.0, + "grad_norm": 0.7892786909710429, + "language_loss": 0.59285355, + "learning_rate": 1.7869354731168035e-06, + "loss": 0.61295247, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.07861328, + "step": 8882, + "time_per_iteration": 2.9793803691864014 + }, + { + "auxiliary_loss_clip": 0.01008503, + "auxiliary_loss_mlp": 0.01000889, + "balance_loss_clip": 0.99992925, + "balance_loss_mlp": 1.00076675, + "epoch": 0.5340748534495716, + "flos": 63878067694080.0, + "grad_norm": 0.8732674467075816, + "language_loss": 0.636994, + "learning_rate": 1.7865598046872396e-06, + "loss": 0.65708792, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07763672, + "step": 8883, + "time_per_iteration": 2.8909807205200195 + }, + { + "auxiliary_loss_clip": 0.01064125, + "auxiliary_loss_mlp": 0.01026792, + "balance_loss_clip": 1.013762, + "balance_loss_mlp": 1.02075148, + "epoch": 0.5341349767022396, + "flos": 28693053878400.0, + "grad_norm": 2.236928189238738, + "language_loss": 0.82463771, + "learning_rate": 1.7861841438748073e-06, + "loss": 0.84554684, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43359375, + "step": 8884, + "time_per_iteration": 2.4246394634246826 + }, + { + "auxiliary_loss_clip": 0.01060173, + "auxiliary_loss_mlp": 0.01022979, + "balance_loss_clip": 1.01106966, + "balance_loss_mlp": 1.02060032, + "epoch": 0.5341950999549075, + "flos": 16288762262400.0, + "grad_norm": 2.7003477542139276, + "language_loss": 0.72333014, + "learning_rate": 1.7858084906929126e-06, + "loss": 0.74416173, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 8885, + "time_per_iteration": 2.3743820190429688 + }, + { + "auxiliary_loss_clip": 0.01064116, + "auxiliary_loss_mlp": 0.01026988, + "balance_loss_clip": 1.01336861, + "balance_loss_mlp": 1.01967812, + "epoch": 0.5342552232075756, + "flos": 14354783475840.0, + "grad_norm": 2.776116801208582, + "language_loss": 0.79152751, + "learning_rate": 1.785432845154962e-06, + "loss": 0.81243861, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4453125, + "step": 8886, + "time_per_iteration": 2.345134735107422 + }, + { + "auxiliary_loss_clip": 0.01061473, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.01510215, + "balance_loss_mlp": 1.01994562, + "epoch": 0.5343153464602435, + "flos": 30296719491840.0, + "grad_norm": 1.4653043679950861, + "language_loss": 0.75023973, + "learning_rate": 1.7850572072743611e-06, + "loss": 0.77113575, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4140625, + "step": 8887, + "time_per_iteration": 2.4875755310058594 + }, + { + "auxiliary_loss_clip": 0.01058644, + "auxiliary_loss_mlp": 0.01021898, + "balance_loss_clip": 1.00992942, + "balance_loss_mlp": 1.0186193, + "epoch": 0.5343754697129115, + "flos": 15595387194240.0, + "grad_norm": 2.1275981136648636, + "language_loss": 0.68948281, + "learning_rate": 1.7846815770645158e-06, + "loss": 0.71028823, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40039062, + "step": 8888, + "time_per_iteration": 2.36183762550354 + }, + { + "auxiliary_loss_clip": 0.010675, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01463079, + "balance_loss_mlp": 1.02148986, + "epoch": 0.5344355929655794, + "flos": 16908680096640.0, + "grad_norm": 1.9423294439890515, + "language_loss": 0.77369177, + "learning_rate": 1.7843059545388313e-06, + "loss": 0.79464978, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4609375, + "step": 8889, + "time_per_iteration": 2.37422776222229 + }, + { + "auxiliary_loss_clip": 0.010623, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.0184778, + "balance_loss_mlp": 1.02081847, + "epoch": 0.5344957162182474, + "flos": 16797307259520.0, + "grad_norm": 1.9681835212293248, + "language_loss": 0.69167411, + "learning_rate": 1.783930339710712e-06, + "loss": 0.71260911, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4140625, + "step": 8890, + "time_per_iteration": 2.349303722381592 + }, + { + "auxiliary_loss_clip": 0.01065244, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.01527739, + "balance_loss_mlp": 1.02098584, + "epoch": 0.5345558394709153, + "flos": 12704998089600.0, + "grad_norm": 6.292030402354697, + "language_loss": 0.76880747, + "learning_rate": 1.7835547325935633e-06, + "loss": 0.78975534, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.44335938, + "step": 8891, + "time_per_iteration": 2.3549323081970215 + }, + { + "auxiliary_loss_clip": 0.01058974, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.01373136, + "balance_loss_mlp": 1.0199964, + "epoch": 0.5346159627235834, + "flos": 22453969985280.0, + "grad_norm": 1.5245453289445032, + "language_loss": 0.74327767, + "learning_rate": 1.7831791332007897e-06, + "loss": 0.76411867, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.390625, + "step": 8892, + "time_per_iteration": 2.3908095359802246 + }, + { + "auxiliary_loss_clip": 0.01059426, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.0168575, + "balance_loss_mlp": 1.01903093, + "epoch": 0.5346760859762513, + "flos": 22414762661760.0, + "grad_norm": 1.4872621588582806, + "language_loss": 0.87674189, + "learning_rate": 1.782803541545795e-06, + "loss": 0.89762372, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40429688, + "step": 8893, + "time_per_iteration": 2.390073537826538 + }, + { + "auxiliary_loss_clip": 0.01059124, + "auxiliary_loss_mlp": 0.01025892, + "balance_loss_clip": 1.01410186, + "balance_loss_mlp": 1.01931155, + "epoch": 0.5347362092289193, + "flos": 22815146666880.0, + "grad_norm": 1.5468308505301267, + "language_loss": 0.72433114, + "learning_rate": 1.7824279576419832e-06, + "loss": 0.74518132, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 8894, + "time_per_iteration": 2.3903160095214844 + }, + { + "auxiliary_loss_clip": 0.01061317, + "auxiliary_loss_mlp": 0.01025518, + "balance_loss_clip": 1.01294708, + "balance_loss_mlp": 1.02107048, + "epoch": 0.5347963324815872, + "flos": 23218428314880.0, + "grad_norm": 1.5884289294755884, + "language_loss": 0.74743426, + "learning_rate": 1.7820523815027575e-06, + "loss": 0.76830256, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 8895, + "time_per_iteration": 2.4074625968933105 + }, + { + "auxiliary_loss_clip": 0.01060246, + "auxiliary_loss_mlp": 0.01023471, + "balance_loss_clip": 1.01109099, + "balance_loss_mlp": 1.01937079, + "epoch": 0.5348564557342552, + "flos": 22600256048640.0, + "grad_norm": 1.634035611958319, + "language_loss": 0.7678405, + "learning_rate": 1.7816768131415221e-06, + "loss": 0.78867769, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40820312, + "step": 8896, + "time_per_iteration": 2.4050405025482178 + }, + { + "auxiliary_loss_clip": 0.01058647, + "auxiliary_loss_mlp": 0.01022034, + "balance_loss_clip": 1.00991678, + "balance_loss_mlp": 1.01945281, + "epoch": 0.5349165789869232, + "flos": 18001461651840.0, + "grad_norm": 1.655277027414963, + "language_loss": 0.7869696, + "learning_rate": 1.7813012525716794e-06, + "loss": 0.80777639, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 8897, + "time_per_iteration": 2.3819570541381836 + }, + { + "auxiliary_loss_clip": 0.01063395, + "auxiliary_loss_mlp": 0.01023693, + "balance_loss_clip": 1.01136088, + "balance_loss_mlp": 1.02061558, + "epoch": 0.5349767022395912, + "flos": 17158972700160.0, + "grad_norm": 1.8677331160936603, + "language_loss": 0.76250494, + "learning_rate": 1.7809256998066323e-06, + "loss": 0.78337574, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4296875, + "step": 8898, + "time_per_iteration": 2.361583709716797 + }, + { + "auxiliary_loss_clip": 0.0100892, + "auxiliary_loss_mlp": 0.0100103, + "balance_loss_clip": 1.00002861, + "balance_loss_mlp": 1.00112438, + "epoch": 0.5350368254922592, + "flos": 70988302765440.0, + "grad_norm": 0.8298227163707418, + "language_loss": 0.57947242, + "learning_rate": 1.7805501548597842e-06, + "loss": 0.59957194, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.078125, + "step": 8899, + "time_per_iteration": 3.2002017498016357 + }, + { + "auxiliary_loss_clip": 0.0105822, + "auxiliary_loss_mlp": 0.01024807, + "balance_loss_clip": 1.01268339, + "balance_loss_mlp": 1.01849699, + "epoch": 0.5350969487449271, + "flos": 27416594327040.0, + "grad_norm": 1.6159092972510944, + "language_loss": 0.69981754, + "learning_rate": 1.7801746177445357e-06, + "loss": 0.72064775, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 8900, + "time_per_iteration": 3.8127384185791016 + }, + { + "auxiliary_loss_clip": 0.01061247, + "auxiliary_loss_mlp": 0.01024873, + "balance_loss_clip": 1.01241601, + "balance_loss_mlp": 1.01947963, + "epoch": 0.5351570719975951, + "flos": 19315173490560.0, + "grad_norm": 1.5941381118278757, + "language_loss": 0.71655238, + "learning_rate": 1.7797990884742901e-06, + "loss": 0.73741353, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41796875, + "step": 8901, + "time_per_iteration": 2.3946444988250732 + }, + { + "auxiliary_loss_clip": 0.01060706, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.01417041, + "balance_loss_mlp": 1.01880336, + "epoch": 0.535217195250263, + "flos": 19827558737280.0, + "grad_norm": 1.960595835913831, + "language_loss": 0.77506608, + "learning_rate": 1.7794235670624482e-06, + "loss": 0.79594046, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41992188, + "step": 8902, + "time_per_iteration": 2.364778995513916 + }, + { + "auxiliary_loss_clip": 0.01057144, + "auxiliary_loss_mlp": 0.01023178, + "balance_loss_clip": 1.01249075, + "balance_loss_mlp": 1.01834393, + "epoch": 0.535277318502931, + "flos": 22126763923200.0, + "grad_norm": 1.637980644657394, + "language_loss": 0.69892061, + "learning_rate": 1.7790480535224122e-06, + "loss": 0.71972382, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.38671875, + "step": 8903, + "time_per_iteration": 2.442460060119629 + }, + { + "auxiliary_loss_clip": 0.0106094, + "auxiliary_loss_mlp": 0.01027246, + "balance_loss_clip": 1.0145328, + "balance_loss_mlp": 1.01869464, + "epoch": 0.5353374417555989, + "flos": 25044734868480.0, + "grad_norm": 1.6197888279220738, + "language_loss": 0.72376823, + "learning_rate": 1.7786725478675827e-06, + "loss": 0.74465007, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 8904, + "time_per_iteration": 2.4251656532287598 + }, + { + "auxiliary_loss_clip": 0.01062369, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01496196, + "balance_loss_mlp": 1.02062035, + "epoch": 0.535397565008267, + "flos": 19387757940480.0, + "grad_norm": 1.716941856161677, + "language_loss": 0.73323631, + "learning_rate": 1.7782970501113606e-06, + "loss": 0.75414109, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41796875, + "step": 8905, + "time_per_iteration": 2.393233060836792 + }, + { + "auxiliary_loss_clip": 0.01057334, + "auxiliary_loss_mlp": 0.01027266, + "balance_loss_clip": 1.01656067, + "balance_loss_mlp": 1.01936412, + "epoch": 0.5354576882609349, + "flos": 21470117472000.0, + "grad_norm": 1.390120953407688, + "language_loss": 0.83523166, + "learning_rate": 1.7779215602671466e-06, + "loss": 0.85607761, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.37890625, + "step": 8906, + "time_per_iteration": 2.3880538940429688 + }, + { + "auxiliary_loss_clip": 0.01061742, + "auxiliary_loss_mlp": 0.01030495, + "balance_loss_clip": 1.01703036, + "balance_loss_mlp": 1.01945996, + "epoch": 0.5355178115136029, + "flos": 20776463112960.0, + "grad_norm": 2.178263431155002, + "language_loss": 0.73899829, + "learning_rate": 1.7775460783483412e-06, + "loss": 0.7599206, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.421875, + "step": 8907, + "time_per_iteration": 2.376307725906372 + }, + { + "auxiliary_loss_clip": 0.01059242, + "auxiliary_loss_mlp": 0.01022339, + "balance_loss_clip": 1.01054907, + "balance_loss_mlp": 1.01893401, + "epoch": 0.5355779347662708, + "flos": 23512885655040.0, + "grad_norm": 1.7073565873624923, + "language_loss": 0.76640314, + "learning_rate": 1.7771706043683437e-06, + "loss": 0.78721905, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40429688, + "step": 8908, + "time_per_iteration": 2.3960721492767334 + }, + { + "auxiliary_loss_clip": 0.01062047, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01362467, + "balance_loss_mlp": 1.01975608, + "epoch": 0.5356380580189388, + "flos": 20302168026240.0, + "grad_norm": 2.0204936924085923, + "language_loss": 0.74135721, + "learning_rate": 1.7767951383405539e-06, + "loss": 0.76224095, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 8909, + "time_per_iteration": 2.3885040283203125 + }, + { + "auxiliary_loss_clip": 0.01061379, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.01708794, + "balance_loss_mlp": 1.02039814, + "epoch": 0.5356981812716068, + "flos": 21360560025600.0, + "grad_norm": 2.8265712017426856, + "language_loss": 0.8433181, + "learning_rate": 1.7764196802783717e-06, + "loss": 0.86421758, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.41015625, + "step": 8910, + "time_per_iteration": 2.3716936111450195 + }, + { + "auxiliary_loss_clip": 0.0106071, + "auxiliary_loss_mlp": 0.01022834, + "balance_loss_clip": 1.00936341, + "balance_loss_mlp": 1.01868272, + "epoch": 0.5357583045242748, + "flos": 23110162588800.0, + "grad_norm": 1.435259136485983, + "language_loss": 0.81506908, + "learning_rate": 1.7760442301951962e-06, + "loss": 0.83590448, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41992188, + "step": 8911, + "time_per_iteration": 2.4377198219299316 + }, + { + "auxiliary_loss_clip": 0.01060754, + "auxiliary_loss_mlp": 0.01027385, + "balance_loss_clip": 1.01563692, + "balance_loss_mlp": 1.02037513, + "epoch": 0.5358184277769428, + "flos": 21140711994240.0, + "grad_norm": 1.869581866370383, + "language_loss": 0.68873829, + "learning_rate": 1.7756687881044255e-06, + "loss": 0.70961964, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.40429688, + "step": 8912, + "time_per_iteration": 3.892829179763794 + }, + { + "auxiliary_loss_clip": 0.01060287, + "auxiliary_loss_mlp": 0.01028021, + "balance_loss_clip": 1.01607001, + "balance_loss_mlp": 1.01952255, + "epoch": 0.5358785510296107, + "flos": 16281675256320.0, + "grad_norm": 1.8948785328268931, + "language_loss": 0.7115072, + "learning_rate": 1.7752933540194593e-06, + "loss": 0.73239028, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40625, + "step": 8913, + "time_per_iteration": 2.3619515895843506 + }, + { + "auxiliary_loss_clip": 0.01064779, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.01745725, + "balance_loss_mlp": 1.02115202, + "epoch": 0.5359386742822787, + "flos": 16976097665280.0, + "grad_norm": 1.6484823014115937, + "language_loss": 0.7209903, + "learning_rate": 1.7749179279536946e-06, + "loss": 0.74194437, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4375, + "step": 8914, + "time_per_iteration": 2.357787847518921 + }, + { + "auxiliary_loss_clip": 0.01065658, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01495767, + "balance_loss_mlp": 1.02141619, + "epoch": 0.5359987975349466, + "flos": 20811900009600.0, + "grad_norm": 1.7885307617529733, + "language_loss": 0.70942688, + "learning_rate": 1.7745425099205305e-06, + "loss": 0.73036647, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44335938, + "step": 8915, + "time_per_iteration": 2.4103500843048096 + }, + { + "auxiliary_loss_clip": 0.01060768, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.01745057, + "balance_loss_mlp": 1.02002919, + "epoch": 0.5360589207876146, + "flos": 22198859614080.0, + "grad_norm": 1.6156767328685975, + "language_loss": 0.75228274, + "learning_rate": 1.7741670999333645e-06, + "loss": 0.77319384, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 8916, + "time_per_iteration": 3.8840746879577637 + }, + { + "auxiliary_loss_clip": 0.01062707, + "auxiliary_loss_mlp": 0.01024909, + "balance_loss_clip": 1.01244569, + "balance_loss_mlp": 1.02035606, + "epoch": 0.5361190440402825, + "flos": 31393027094400.0, + "grad_norm": 1.9619223458186872, + "language_loss": 0.72201872, + "learning_rate": 1.7737916980055932e-06, + "loss": 0.74289489, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42382812, + "step": 8917, + "time_per_iteration": 3.874760866165161 + }, + { + "auxiliary_loss_clip": 0.01060294, + "auxiliary_loss_mlp": 0.01025537, + "balance_loss_clip": 1.01294231, + "balance_loss_mlp": 1.01991189, + "epoch": 0.5361791672929506, + "flos": 16068984053760.0, + "grad_norm": 2.317686315819532, + "language_loss": 0.71194732, + "learning_rate": 1.7734163041506146e-06, + "loss": 0.73280561, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 8918, + "time_per_iteration": 2.378718137741089 + }, + { + "auxiliary_loss_clip": 0.01062302, + "auxiliary_loss_mlp": 0.01027535, + "balance_loss_clip": 1.01508379, + "balance_loss_mlp": 1.0206995, + "epoch": 0.5362392905456185, + "flos": 20739874141440.0, + "grad_norm": 1.5470974424723225, + "language_loss": 0.74884224, + "learning_rate": 1.773040918381825e-06, + "loss": 0.76974064, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41601562, + "step": 8919, + "time_per_iteration": 2.3864099979400635 + }, + { + "auxiliary_loss_clip": 0.01060995, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.01688099, + "balance_loss_mlp": 1.01946211, + "epoch": 0.5362994137982865, + "flos": 17339334117120.0, + "grad_norm": 2.5869745736837966, + "language_loss": 0.71541232, + "learning_rate": 1.7726655407126219e-06, + "loss": 0.73631358, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.4140625, + "step": 8920, + "time_per_iteration": 2.348038673400879 + }, + { + "auxiliary_loss_clip": 0.01059325, + "auxiliary_loss_mlp": 0.0102592, + "balance_loss_clip": 1.01368856, + "balance_loss_mlp": 1.01876473, + "epoch": 0.5363595370509544, + "flos": 42812314122240.0, + "grad_norm": 1.4521084283195689, + "language_loss": 0.68722314, + "learning_rate": 1.7722901711564006e-06, + "loss": 0.70807558, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 8921, + "time_per_iteration": 2.5647685527801514 + }, + { + "auxiliary_loss_clip": 0.01063003, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.0172298, + "balance_loss_mlp": 1.02182281, + "epoch": 0.5364196603036224, + "flos": 19170947197440.0, + "grad_norm": 1.8428246561568418, + "language_loss": 0.81888127, + "learning_rate": 1.7719148097265575e-06, + "loss": 0.83980739, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41210938, + "step": 8922, + "time_per_iteration": 2.395998477935791 + }, + { + "auxiliary_loss_clip": 0.01061804, + "auxiliary_loss_mlp": 0.01023942, + "balance_loss_clip": 1.01190221, + "balance_loss_mlp": 1.01967454, + "epoch": 0.5364797835562904, + "flos": 17930099099520.0, + "grad_norm": 2.496256955070938, + "language_loss": 0.76351643, + "learning_rate": 1.771539456436488e-06, + "loss": 0.78437388, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.421875, + "step": 8923, + "time_per_iteration": 2.415630340576172 + }, + { + "auxiliary_loss_clip": 0.01063768, + "auxiliary_loss_mlp": 0.01031221, + "balance_loss_clip": 1.01779246, + "balance_loss_mlp": 1.02107596, + "epoch": 0.5365399068089584, + "flos": 30226718482560.0, + "grad_norm": 1.3828624857211362, + "language_loss": 0.7059142, + "learning_rate": 1.771164111299587e-06, + "loss": 0.7268641, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42578125, + "step": 8924, + "time_per_iteration": 2.468163251876831 + }, + { + "auxiliary_loss_clip": 0.01063434, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.01927984, + "balance_loss_mlp": 1.02030337, + "epoch": 0.5366000300616264, + "flos": 24890768305920.0, + "grad_norm": 1.869058978293957, + "language_loss": 0.87034076, + "learning_rate": 1.770788774329251e-06, + "loss": 0.89130485, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43164062, + "step": 8925, + "time_per_iteration": 2.3984522819519043 + }, + { + "auxiliary_loss_clip": 0.0106176, + "auxiliary_loss_mlp": 0.01023336, + "balance_loss_clip": 1.0106169, + "balance_loss_mlp": 1.01938319, + "epoch": 0.5366601533142943, + "flos": 29825322048000.0, + "grad_norm": 1.573615841424531, + "language_loss": 0.77712446, + "learning_rate": 1.7704134455388737e-06, + "loss": 0.79797542, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42382812, + "step": 8926, + "time_per_iteration": 2.454423427581787 + }, + { + "auxiliary_loss_clip": 0.01060569, + "auxiliary_loss_mlp": 0.01022843, + "balance_loss_clip": 1.0109694, + "balance_loss_mlp": 1.0204823, + "epoch": 0.5367202765669623, + "flos": 27198107838720.0, + "grad_norm": 1.5066166975933941, + "language_loss": 0.66876328, + "learning_rate": 1.77003812494185e-06, + "loss": 0.68959743, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40039062, + "step": 8927, + "time_per_iteration": 2.422757625579834 + }, + { + "auxiliary_loss_clip": 0.01060973, + "auxiliary_loss_mlp": 0.01023524, + "balance_loss_clip": 1.01074433, + "balance_loss_mlp": 1.01884151, + "epoch": 0.5367803998196302, + "flos": 20228920260480.0, + "grad_norm": 2.0070725154474074, + "language_loss": 0.73791581, + "learning_rate": 1.7696628125515745e-06, + "loss": 0.75876081, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41992188, + "step": 8928, + "time_per_iteration": 2.3709118366241455 + }, + { + "auxiliary_loss_clip": 0.01061067, + "auxiliary_loss_mlp": 0.01024015, + "balance_loss_clip": 1.0122968, + "balance_loss_mlp": 1.01972771, + "epoch": 0.5368405230722982, + "flos": 32153435706240.0, + "grad_norm": 2.5037120920267153, + "language_loss": 0.62871099, + "learning_rate": 1.76928750838144e-06, + "loss": 0.64956176, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.4140625, + "step": 8929, + "time_per_iteration": 2.459855318069458 + }, + { + "auxiliary_loss_clip": 0.01062163, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.01472187, + "balance_loss_mlp": 1.02044344, + "epoch": 0.5369006463249661, + "flos": 26246794579200.0, + "grad_norm": 1.7603383103120824, + "language_loss": 0.66181201, + "learning_rate": 1.7689122124448415e-06, + "loss": 0.68270254, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41796875, + "step": 8930, + "time_per_iteration": 2.4323694705963135 + }, + { + "auxiliary_loss_clip": 0.01060156, + "auxiliary_loss_mlp": 0.01022259, + "balance_loss_clip": 1.01015878, + "balance_loss_mlp": 1.01957774, + "epoch": 0.5369607695776342, + "flos": 26210170696320.0, + "grad_norm": 1.5915099979206906, + "language_loss": 0.67182368, + "learning_rate": 1.7685369247551712e-06, + "loss": 0.69264781, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 8931, + "time_per_iteration": 2.4012179374694824 + }, + { + "auxiliary_loss_clip": 0.01063028, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.01282692, + "balance_loss_mlp": 1.02169609, + "epoch": 0.5370208928303021, + "flos": 25007866606080.0, + "grad_norm": 1.8648826690708815, + "language_loss": 0.75633061, + "learning_rate": 1.768161645325823e-06, + "loss": 0.77720845, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.4140625, + "step": 8932, + "time_per_iteration": 2.4214792251586914 + }, + { + "auxiliary_loss_clip": 0.01062185, + "auxiliary_loss_mlp": 0.01024459, + "balance_loss_clip": 1.01246023, + "balance_loss_mlp": 1.02078295, + "epoch": 0.5370810160829701, + "flos": 31790897481600.0, + "grad_norm": 1.8144961026791744, + "language_loss": 0.66038156, + "learning_rate": 1.7677863741701892e-06, + "loss": 0.68124795, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 8933, + "time_per_iteration": 2.4408481121063232 + }, + { + "auxiliary_loss_clip": 0.01058041, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.01493979, + "balance_loss_mlp": 1.01886594, + "epoch": 0.537141139335638, + "flos": 23841453260160.0, + "grad_norm": 1.5257017773175565, + "language_loss": 0.8268609, + "learning_rate": 1.767411111301662e-06, + "loss": 0.84770703, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 8934, + "time_per_iteration": 2.3958044052124023 + }, + { + "auxiliary_loss_clip": 0.01058916, + "auxiliary_loss_mlp": 0.01025191, + "balance_loss_clip": 1.01271605, + "balance_loss_mlp": 1.0186255, + "epoch": 0.537201262588306, + "flos": 18508016701440.0, + "grad_norm": 1.6080400092354552, + "language_loss": 0.79832768, + "learning_rate": 1.7670358567336347e-06, + "loss": 0.81916869, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 8935, + "time_per_iteration": 2.336564779281616 + }, + { + "auxiliary_loss_clip": 0.01061016, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01783013, + "balance_loss_mlp": 1.01903963, + "epoch": 0.537261385840974, + "flos": 25661859793920.0, + "grad_norm": 1.620789540166802, + "language_loss": 0.83444488, + "learning_rate": 1.766660610479498e-06, + "loss": 0.85536146, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41992188, + "step": 8936, + "time_per_iteration": 2.4122021198272705 + }, + { + "auxiliary_loss_clip": 0.01059355, + "auxiliary_loss_mlp": 0.01025436, + "balance_loss_clip": 1.01302648, + "balance_loss_mlp": 1.01961803, + "epoch": 0.537321509093642, + "flos": 40733410815360.0, + "grad_norm": 1.4139697699194937, + "language_loss": 0.72711587, + "learning_rate": 1.7662853725526443e-06, + "loss": 0.74796379, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 8937, + "time_per_iteration": 2.571127414703369 + }, + { + "auxiliary_loss_clip": 0.0106085, + "auxiliary_loss_mlp": 0.01023591, + "balance_loss_clip": 1.01109171, + "balance_loss_mlp": 1.01784372, + "epoch": 0.53738163234631, + "flos": 17237526992640.0, + "grad_norm": 2.70584490643321, + "language_loss": 0.81763685, + "learning_rate": 1.7659101429664642e-06, + "loss": 0.83848119, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4296875, + "step": 8938, + "time_per_iteration": 2.3917675018310547 + }, + { + "auxiliary_loss_clip": 0.01064384, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.01990521, + "balance_loss_mlp": 1.02091527, + "epoch": 0.5374417555989779, + "flos": 12821188694400.0, + "grad_norm": 3.05503273425232, + "language_loss": 0.80589783, + "learning_rate": 1.7655349217343488e-06, + "loss": 0.82687402, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 8939, + "time_per_iteration": 2.356818675994873 + }, + { + "auxiliary_loss_clip": 0.0100959, + "auxiliary_loss_mlp": 0.01001596, + "balance_loss_clip": 1.00063014, + "balance_loss_mlp": 1.00162935, + "epoch": 0.5375018788516459, + "flos": 67252771445760.0, + "grad_norm": 0.7022094883344824, + "language_loss": 0.52436054, + "learning_rate": 1.765159708869689e-06, + "loss": 0.5444724, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.08007812, + "step": 8940, + "time_per_iteration": 4.4304893016815186 + }, + { + "auxiliary_loss_clip": 0.0105898, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.01647592, + "balance_loss_mlp": 1.01886392, + "epoch": 0.5375620021043138, + "flos": 18113183602560.0, + "grad_norm": 2.4059913891159557, + "language_loss": 0.71666622, + "learning_rate": 1.764784504385875e-06, + "loss": 0.73754334, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40039062, + "step": 8941, + "time_per_iteration": 2.362462043762207 + }, + { + "auxiliary_loss_clip": 0.01059962, + "auxiliary_loss_mlp": 0.01023983, + "balance_loss_clip": 1.01207352, + "balance_loss_mlp": 1.01945066, + "epoch": 0.5376221253569818, + "flos": 23148252748800.0, + "grad_norm": 11.91285054199481, + "language_loss": 0.79565758, + "learning_rate": 1.7644093082962969e-06, + "loss": 0.81649697, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40429688, + "step": 8942, + "time_per_iteration": 2.4044370651245117 + }, + { + "auxiliary_loss_clip": 0.01063863, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01622653, + "balance_loss_mlp": 1.02059495, + "epoch": 0.5376822486096497, + "flos": 29970979706880.0, + "grad_norm": 1.4938431632788212, + "language_loss": 0.70160007, + "learning_rate": 1.7640341206143452e-06, + "loss": 0.72252989, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43359375, + "step": 8943, + "time_per_iteration": 2.4910881519317627 + }, + { + "auxiliary_loss_clip": 0.01008629, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_clip": 1.00129735, + "balance_loss_mlp": 1.00067139, + "epoch": 0.5377423718623178, + "flos": 54165752726400.0, + "grad_norm": 0.8018954404245223, + "language_loss": 0.59580183, + "learning_rate": 1.763658941353408e-06, + "loss": 0.61591059, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.07958984, + "step": 8944, + "time_per_iteration": 3.0702390670776367 + }, + { + "auxiliary_loss_clip": 0.01066251, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.01359093, + "balance_loss_mlp": 1.02101362, + "epoch": 0.5378024951149857, + "flos": 23255994804480.0, + "grad_norm": 3.2097493417440472, + "language_loss": 0.77703476, + "learning_rate": 1.7632837705268758e-06, + "loss": 0.79797643, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.453125, + "step": 8945, + "time_per_iteration": 2.4081943035125732 + }, + { + "auxiliary_loss_clip": 0.01063258, + "auxiliary_loss_mlp": 0.01025857, + "balance_loss_clip": 1.01335788, + "balance_loss_mlp": 1.02054048, + "epoch": 0.5378626183676537, + "flos": 24022966752000.0, + "grad_norm": 1.647216703848471, + "language_loss": 0.68332601, + "learning_rate": 1.7629086081481363e-06, + "loss": 0.7042172, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42773438, + "step": 8946, + "time_per_iteration": 2.4275293350219727 + }, + { + "auxiliary_loss_clip": 0.01060318, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.01597714, + "balance_loss_mlp": 1.01961184, + "epoch": 0.5379227416203216, + "flos": 27160576260480.0, + "grad_norm": 1.6345353483648288, + "language_loss": 0.76839876, + "learning_rate": 1.7625334542305792e-06, + "loss": 0.78928035, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40820312, + "step": 8947, + "time_per_iteration": 2.4363083839416504 + }, + { + "auxiliary_loss_clip": 0.01061473, + "auxiliary_loss_mlp": 0.01025011, + "balance_loss_clip": 1.01281571, + "balance_loss_mlp": 1.02002549, + "epoch": 0.5379828648729896, + "flos": 24680451075840.0, + "grad_norm": 1.7473490529299822, + "language_loss": 0.60054624, + "learning_rate": 1.762158308787592e-06, + "loss": 0.62141109, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 8948, + "time_per_iteration": 2.4467451572418213 + }, + { + "auxiliary_loss_clip": 0.01063539, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.01696765, + "balance_loss_mlp": 1.02135122, + "epoch": 0.5380429881256577, + "flos": 22522330160640.0, + "grad_norm": 1.8251275645251641, + "language_loss": 0.79237914, + "learning_rate": 1.7617831718325634e-06, + "loss": 0.81330502, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.421875, + "step": 8949, + "time_per_iteration": 2.4145607948303223 + }, + { + "auxiliary_loss_clip": 0.0105831, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.01545548, + "balance_loss_mlp": 1.01899958, + "epoch": 0.5381031113783256, + "flos": 26978329630080.0, + "grad_norm": 2.1535079177627887, + "language_loss": 0.78158844, + "learning_rate": 1.7614080433788802e-06, + "loss": 0.80244708, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 8950, + "time_per_iteration": 2.4557244777679443 + }, + { + "auxiliary_loss_clip": 0.01060988, + "auxiliary_loss_mlp": 0.01025704, + "balance_loss_clip": 1.01428962, + "balance_loss_mlp": 1.02093267, + "epoch": 0.5381632346309936, + "flos": 24752930791680.0, + "grad_norm": 1.548470953380121, + "language_loss": 0.72658682, + "learning_rate": 1.7610329234399301e-06, + "loss": 0.74745375, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40039062, + "step": 8951, + "time_per_iteration": 3.9649159908294678 + }, + { + "auxiliary_loss_clip": 0.01059488, + "auxiliary_loss_mlp": 0.0102043, + "balance_loss_clip": 1.00843203, + "balance_loss_mlp": 1.01861, + "epoch": 0.5382233578836615, + "flos": 15559147336320.0, + "grad_norm": 1.6469634104466004, + "language_loss": 0.75260329, + "learning_rate": 1.7606578120291013e-06, + "loss": 0.77340251, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41015625, + "step": 8952, + "time_per_iteration": 2.3955750465393066 + }, + { + "auxiliary_loss_clip": 0.01061363, + "auxiliary_loss_mlp": 0.01023221, + "balance_loss_clip": 1.01015592, + "balance_loss_mlp": 1.02009273, + "epoch": 0.5382834811363295, + "flos": 25083278876160.0, + "grad_norm": 1.38145777737592, + "language_loss": 0.79834843, + "learning_rate": 1.7602827091597785e-06, + "loss": 0.8191942, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41210938, + "step": 8953, + "time_per_iteration": 2.4685330390930176 + }, + { + "auxiliary_loss_clip": 0.01060152, + "auxiliary_loss_mlp": 0.01022576, + "balance_loss_clip": 1.01049411, + "balance_loss_mlp": 1.02052104, + "epoch": 0.5383436043889974, + "flos": 13297054792320.0, + "grad_norm": 1.741637281980072, + "language_loss": 0.8085891, + "learning_rate": 1.7599076148453496e-06, + "loss": 0.82941645, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 8954, + "time_per_iteration": 2.383639335632324 + }, + { + "auxiliary_loss_clip": 0.01061544, + "auxiliary_loss_mlp": 0.01026337, + "balance_loss_clip": 1.01392162, + "balance_loss_mlp": 1.01979542, + "epoch": 0.5384037276416654, + "flos": 23038276366080.0, + "grad_norm": 2.0832509852793932, + "language_loss": 0.75853103, + "learning_rate": 1.7595325290992003e-06, + "loss": 0.77940989, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 8955, + "time_per_iteration": 2.424792766571045 + }, + { + "auxiliary_loss_clip": 0.01060812, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.01703048, + "balance_loss_mlp": 1.01995754, + "epoch": 0.5384638508943334, + "flos": 20630107226880.0, + "grad_norm": 1.6174258331338185, + "language_loss": 0.77962708, + "learning_rate": 1.759157451934716e-06, + "loss": 0.80053049, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 8956, + "time_per_iteration": 3.8175208568573 + }, + { + "auxiliary_loss_clip": 0.01009109, + "auxiliary_loss_mlp": 0.01004081, + "balance_loss_clip": 1.00316954, + "balance_loss_mlp": 1.00100696, + "epoch": 0.5385239741470014, + "flos": 66734660736000.0, + "grad_norm": 0.8509507742100785, + "language_loss": 0.6343624, + "learning_rate": 1.7587823833652833e-06, + "loss": 0.65449429, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.08105469, + "step": 8957, + "time_per_iteration": 4.308598279953003 + }, + { + "auxiliary_loss_clip": 0.01009159, + "auxiliary_loss_mlp": 0.0100341, + "balance_loss_clip": 1.00243211, + "balance_loss_mlp": 1.00101447, + "epoch": 0.5385840973996693, + "flos": 64712490255360.0, + "grad_norm": 0.7102066685757649, + "language_loss": 0.5165596, + "learning_rate": 1.7584073234042865e-06, + "loss": 0.53668529, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.08154297, + "step": 8958, + "time_per_iteration": 3.08372163772583 + }, + { + "auxiliary_loss_clip": 0.01064512, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.0126276, + "balance_loss_mlp": 1.02167451, + "epoch": 0.5386442206523373, + "flos": 26140553712000.0, + "grad_norm": 2.5835714711663775, + "language_loss": 0.77370441, + "learning_rate": 1.7580322720651111e-06, + "loss": 0.79460818, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42773438, + "step": 8959, + "time_per_iteration": 2.4273433685302734 + }, + { + "auxiliary_loss_clip": 0.01061218, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.01373744, + "balance_loss_mlp": 1.01914299, + "epoch": 0.5387043439050052, + "flos": 18251090939520.0, + "grad_norm": 2.1355707319022654, + "language_loss": 0.83871752, + "learning_rate": 1.7576572293611413e-06, + "loss": 0.85959315, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41992188, + "step": 8960, + "time_per_iteration": 2.393908739089966 + }, + { + "auxiliary_loss_clip": 0.0106245, + "auxiliary_loss_mlp": 0.01024979, + "balance_loss_clip": 1.01262307, + "balance_loss_mlp": 1.02070093, + "epoch": 0.5387644671576732, + "flos": 29787022419840.0, + "grad_norm": 1.8001696480048022, + "language_loss": 0.78815365, + "learning_rate": 1.7572821953057615e-06, + "loss": 0.80902797, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 8961, + "time_per_iteration": 2.465399980545044 + }, + { + "auxiliary_loss_clip": 0.01063247, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.01663744, + "balance_loss_mlp": 1.0211165, + "epoch": 0.5388245904103413, + "flos": 22373600302080.0, + "grad_norm": 1.976279044614501, + "language_loss": 0.846802, + "learning_rate": 1.7569071699123563e-06, + "loss": 0.86773074, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41992188, + "step": 8962, + "time_per_iteration": 2.382307767868042 + }, + { + "auxiliary_loss_clip": 0.01009692, + "auxiliary_loss_mlp": 0.01001557, + "balance_loss_clip": 1.00050759, + "balance_loss_mlp": 1.0016309, + "epoch": 0.5388847136630092, + "flos": 69804538473600.0, + "grad_norm": 0.7428834052587658, + "language_loss": 0.54167211, + "learning_rate": 1.7565321531943082e-06, + "loss": 0.56178457, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.08056641, + "step": 8963, + "time_per_iteration": 2.9656429290771484 + }, + { + "auxiliary_loss_clip": 0.01008973, + "auxiliary_loss_mlp": 0.01002046, + "balance_loss_clip": 1.00102115, + "balance_loss_mlp": 1.00101221, + "epoch": 0.5389448369156772, + "flos": 69818642663040.0, + "grad_norm": 0.8010752834917424, + "language_loss": 0.63314342, + "learning_rate": 1.756157145165002e-06, + "loss": 0.65325356, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.07958984, + "step": 8964, + "time_per_iteration": 3.1298179626464844 + }, + { + "auxiliary_loss_clip": 0.01062067, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.01660573, + "balance_loss_mlp": 1.01949692, + "epoch": 0.5390049601683451, + "flos": 31721105940480.0, + "grad_norm": 1.5696306281881949, + "language_loss": 0.69829094, + "learning_rate": 1.7557821458378197e-06, + "loss": 0.71920598, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42578125, + "step": 8965, + "time_per_iteration": 2.481285333633423 + }, + { + "auxiliary_loss_clip": 0.0106476, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.01891041, + "balance_loss_mlp": 1.02165329, + "epoch": 0.5390650834210131, + "flos": 18112520286720.0, + "grad_norm": 2.8494567900484493, + "language_loss": 0.81849086, + "learning_rate": 1.7554071552261442e-06, + "loss": 0.83946663, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.43164062, + "step": 8966, + "time_per_iteration": 2.393794298171997 + }, + { + "auxiliary_loss_clip": 0.01062379, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.01556289, + "balance_loss_mlp": 1.02120352, + "epoch": 0.539125206673681, + "flos": 17415863550720.0, + "grad_norm": 3.869495356053994, + "language_loss": 0.72468793, + "learning_rate": 1.755032173343359e-06, + "loss": 0.74559104, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41210938, + "step": 8967, + "time_per_iteration": 2.3402717113494873 + }, + { + "auxiliary_loss_clip": 0.01061985, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.01988316, + "balance_loss_mlp": 1.02067721, + "epoch": 0.539185329926349, + "flos": 22197882096000.0, + "grad_norm": 1.56041784808983, + "language_loss": 0.82028413, + "learning_rate": 1.7546572002028446e-06, + "loss": 0.84122413, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4140625, + "step": 8968, + "time_per_iteration": 2.4133455753326416 + }, + { + "auxiliary_loss_clip": 0.0106198, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.0178442, + "balance_loss_mlp": 1.02021301, + "epoch": 0.539245453179017, + "flos": 21433319032320.0, + "grad_norm": 6.911515596020926, + "language_loss": 0.73883379, + "learning_rate": 1.754282235817984e-06, + "loss": 0.75976729, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41796875, + "step": 8969, + "time_per_iteration": 2.403881072998047 + }, + { + "auxiliary_loss_clip": 0.01065072, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.01938856, + "balance_loss_mlp": 1.02051878, + "epoch": 0.539305576431685, + "flos": 20734113767040.0, + "grad_norm": 1.9856867802187954, + "language_loss": 0.82318044, + "learning_rate": 1.753907280202158e-06, + "loss": 0.84416014, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 8970, + "time_per_iteration": 2.380018472671509 + }, + { + "auxiliary_loss_clip": 0.01062349, + "auxiliary_loss_mlp": 0.01027081, + "balance_loss_clip": 1.01511848, + "balance_loss_mlp": 1.02115321, + "epoch": 0.5393656996843529, + "flos": 30919116032640.0, + "grad_norm": 1.318436659837457, + "language_loss": 0.75271052, + "learning_rate": 1.7535323333687485e-06, + "loss": 0.77360487, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41210938, + "step": 8971, + "time_per_iteration": 2.500344753265381 + }, + { + "auxiliary_loss_clip": 0.0100966, + "auxiliary_loss_mlp": 0.01005228, + "balance_loss_clip": 1.00431585, + "balance_loss_mlp": 1.00148082, + "epoch": 0.5394258229370209, + "flos": 50315252699520.0, + "grad_norm": 0.8769032052123117, + "language_loss": 0.60343987, + "learning_rate": 1.7531573953311358e-06, + "loss": 0.62358874, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.08203125, + "step": 8972, + "time_per_iteration": 3.055309772491455 + }, + { + "auxiliary_loss_clip": 0.01062516, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.02208114, + "balance_loss_mlp": 1.02057695, + "epoch": 0.5394859461896888, + "flos": 25410729317760.0, + "grad_norm": 1.4188885206518398, + "language_loss": 0.78203523, + "learning_rate": 1.7527824661027007e-06, + "loss": 0.80301082, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41992188, + "step": 8973, + "time_per_iteration": 2.4792518615722656 + }, + { + "auxiliary_loss_clip": 0.01066553, + "auxiliary_loss_mlp": 0.01028591, + "balance_loss_clip": 1.01416719, + "balance_loss_mlp": 1.02050424, + "epoch": 0.5395460694423568, + "flos": 25477448659200.0, + "grad_norm": 2.2641183799284463, + "language_loss": 0.73946506, + "learning_rate": 1.7524075456968234e-06, + "loss": 0.76041651, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.4609375, + "step": 8974, + "time_per_iteration": 2.4675536155700684 + }, + { + "auxiliary_loss_clip": 0.01010281, + "auxiliary_loss_mlp": 0.01008333, + "balance_loss_clip": 1.00730813, + "balance_loss_mlp": 1.00194502, + "epoch": 0.5396061926950249, + "flos": 53246524872960.0, + "grad_norm": 0.7315973416411174, + "language_loss": 0.51015013, + "learning_rate": 1.7520326341268838e-06, + "loss": 0.53033626, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.08349609, + "step": 8975, + "time_per_iteration": 3.060709238052368 + }, + { + "auxiliary_loss_clip": 0.01061824, + "auxiliary_loss_mlp": 0.01032252, + "balance_loss_clip": 1.01963413, + "balance_loss_mlp": 1.02085686, + "epoch": 0.5396663159476928, + "flos": 26723847663360.0, + "grad_norm": 1.4296239257810799, + "language_loss": 0.72392148, + "learning_rate": 1.7516577314062622e-06, + "loss": 0.74486226, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 8976, + "time_per_iteration": 2.4485011100769043 + }, + { + "auxiliary_loss_clip": 0.01063028, + "auxiliary_loss_mlp": 0.0102654, + "balance_loss_clip": 1.01424944, + "balance_loss_mlp": 1.02083457, + "epoch": 0.5397264392003608, + "flos": 23252398934400.0, + "grad_norm": 2.0916621515073195, + "language_loss": 0.70122796, + "learning_rate": 1.7512828375483371e-06, + "loss": 0.72212368, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.421875, + "step": 8977, + "time_per_iteration": 2.421504020690918 + }, + { + "auxiliary_loss_clip": 0.01062771, + "auxiliary_loss_mlp": 0.0102358, + "balance_loss_clip": 1.0105983, + "balance_loss_mlp": 1.02059531, + "epoch": 0.5397865624530287, + "flos": 18293265728640.0, + "grad_norm": 1.9781863621647366, + "language_loss": 0.72391307, + "learning_rate": 1.7509079525664875e-06, + "loss": 0.74477655, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.421875, + "step": 8978, + "time_per_iteration": 2.3694064617156982 + }, + { + "auxiliary_loss_clip": 0.01068737, + "auxiliary_loss_mlp": 0.01027044, + "balance_loss_clip": 1.01224971, + "balance_loss_mlp": 1.02275276, + "epoch": 0.5398466857056967, + "flos": 15296810313600.0, + "grad_norm": 3.173926535714201, + "language_loss": 0.74236178, + "learning_rate": 1.7505330764740927e-06, + "loss": 0.76331955, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.45898438, + "step": 8979, + "time_per_iteration": 3.783588409423828 + }, + { + "auxiliary_loss_clip": 0.01067716, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.02486956, + "balance_loss_mlp": 1.02237034, + "epoch": 0.5399068089583646, + "flos": 17820786032640.0, + "grad_norm": 2.3657222111475407, + "language_loss": 0.76840001, + "learning_rate": 1.75015820928453e-06, + "loss": 0.78947079, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.453125, + "step": 8980, + "time_per_iteration": 2.371093273162842 + }, + { + "auxiliary_loss_clip": 0.01063212, + "auxiliary_loss_mlp": 0.01026523, + "balance_loss_clip": 1.01430988, + "balance_loss_mlp": 1.02083933, + "epoch": 0.5399669322110326, + "flos": 27380389380480.0, + "grad_norm": 1.968730778641231, + "language_loss": 0.82254839, + "learning_rate": 1.7497833510111787e-06, + "loss": 0.84344572, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.42382812, + "step": 8981, + "time_per_iteration": 2.48915433883667 + }, + { + "auxiliary_loss_clip": 0.0106611, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.01487756, + "balance_loss_mlp": 1.0219543, + "epoch": 0.5400270554637006, + "flos": 20448070064640.0, + "grad_norm": 1.8592748383033357, + "language_loss": 0.67321938, + "learning_rate": 1.7494085016674162e-06, + "loss": 0.69416207, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44140625, + "step": 8982, + "time_per_iteration": 2.393150568008423 + }, + { + "auxiliary_loss_clip": 0.01064489, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.01608109, + "balance_loss_mlp": 1.02185082, + "epoch": 0.5400871787163686, + "flos": 21688499226240.0, + "grad_norm": 1.6200884260395272, + "language_loss": 0.82651168, + "learning_rate": 1.7490336612666196e-06, + "loss": 0.84744501, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42578125, + "step": 8983, + "time_per_iteration": 2.468750238418579 + }, + { + "auxiliary_loss_clip": 0.01062518, + "auxiliary_loss_mlp": 0.01027698, + "balance_loss_clip": 1.01443577, + "balance_loss_mlp": 1.02003837, + "epoch": 0.5401473019690365, + "flos": 19203835564800.0, + "grad_norm": 1.6992288293477693, + "language_loss": 0.70656729, + "learning_rate": 1.748658829822166e-06, + "loss": 0.7274695, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42382812, + "step": 8984, + "time_per_iteration": 2.430941104888916 + }, + { + "auxiliary_loss_clip": 0.01065331, + "auxiliary_loss_mlp": 0.01032151, + "balance_loss_clip": 1.01793551, + "balance_loss_mlp": 1.02128935, + "epoch": 0.5402074252217045, + "flos": 20626441534080.0, + "grad_norm": 1.6750537761681144, + "language_loss": 0.80043674, + "learning_rate": 1.748284007347432e-06, + "loss": 0.82141161, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.44140625, + "step": 8985, + "time_per_iteration": 2.430713653564453 + }, + { + "auxiliary_loss_clip": 0.01060291, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.01443827, + "balance_loss_mlp": 1.02125037, + "epoch": 0.5402675484743724, + "flos": 24972290064000.0, + "grad_norm": 4.6106826713682745, + "language_loss": 0.78831482, + "learning_rate": 1.7479091938557945e-06, + "loss": 0.80918294, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.390625, + "step": 8986, + "time_per_iteration": 2.4710071086883545 + }, + { + "auxiliary_loss_clip": 0.01063268, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.01958978, + "balance_loss_mlp": 1.02088475, + "epoch": 0.5403276717270404, + "flos": 19458142974720.0, + "grad_norm": 1.5862721661408845, + "language_loss": 0.76909208, + "learning_rate": 1.7475343893606293e-06, + "loss": 0.79004103, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.42578125, + "step": 8987, + "time_per_iteration": 2.423546552658081 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_clip": 1.01344621, + "balance_loss_mlp": 1.01971078, + "epoch": 0.5403877949797083, + "flos": 18441157714560.0, + "grad_norm": 1.7669716389601104, + "language_loss": 0.71353239, + "learning_rate": 1.747159593875312e-06, + "loss": 0.73441362, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.421875, + "step": 8988, + "time_per_iteration": 2.4179372787475586 + }, + { + "auxiliary_loss_clip": 0.01063775, + "auxiliary_loss_mlp": 0.01027547, + "balance_loss_clip": 1.01450562, + "balance_loss_mlp": 1.02046466, + "epoch": 0.5404479182323764, + "flos": 28291622532480.0, + "grad_norm": 2.9187870773844398, + "language_loss": 0.66955429, + "learning_rate": 1.746784807413219e-06, + "loss": 0.69046748, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43359375, + "step": 8989, + "time_per_iteration": 2.457038164138794 + }, + { + "auxiliary_loss_clip": 0.01063781, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.02186346, + "balance_loss_mlp": 1.02049565, + "epoch": 0.5405080414850444, + "flos": 23366215566720.0, + "grad_norm": 1.7038029020775374, + "language_loss": 0.77439094, + "learning_rate": 1.7464100299877243e-06, + "loss": 0.79538238, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43164062, + "step": 8990, + "time_per_iteration": 2.4090499877929688 + }, + { + "auxiliary_loss_clip": 0.0106008, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.01620281, + "balance_loss_mlp": 1.01924908, + "epoch": 0.5405681647377123, + "flos": 21105344920320.0, + "grad_norm": 1.7078933181127272, + "language_loss": 0.69439131, + "learning_rate": 1.7460352616122039e-06, + "loss": 0.71527237, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40820312, + "step": 8991, + "time_per_iteration": 3.850456476211548 + }, + { + "auxiliary_loss_clip": 0.01062218, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.01363504, + "balance_loss_mlp": 1.01926756, + "epoch": 0.5406282879903803, + "flos": 20448139887360.0, + "grad_norm": 1.8245309033337997, + "language_loss": 0.76287282, + "learning_rate": 1.745660502300031e-06, + "loss": 0.78376734, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4296875, + "step": 8992, + "time_per_iteration": 2.3990936279296875 + }, + { + "auxiliary_loss_clip": 0.01063068, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.0120852, + "balance_loss_mlp": 1.01956141, + "epoch": 0.5406884112430482, + "flos": 14208637057920.0, + "grad_norm": 7.54589980939357, + "language_loss": 0.75717789, + "learning_rate": 1.7452857520645812e-06, + "loss": 0.7780596, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43554688, + "step": 8993, + "time_per_iteration": 2.368481159210205 + }, + { + "auxiliary_loss_clip": 0.01058757, + "auxiliary_loss_mlp": 0.01024786, + "balance_loss_clip": 1.01244771, + "balance_loss_mlp": 1.01905954, + "epoch": 0.5407485344957162, + "flos": 23874516184320.0, + "grad_norm": 1.5453361763233768, + "language_loss": 0.70504618, + "learning_rate": 1.7449110109192278e-06, + "loss": 0.72588158, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39648438, + "step": 8994, + "time_per_iteration": 2.40769100189209 + }, + { + "auxiliary_loss_clip": 0.01063978, + "auxiliary_loss_mlp": 0.01028014, + "balance_loss_clip": 1.01345801, + "balance_loss_mlp": 1.02025485, + "epoch": 0.5408086577483842, + "flos": 23147379964800.0, + "grad_norm": 1.649679060295355, + "language_loss": 0.75156826, + "learning_rate": 1.7445362788773435e-06, + "loss": 0.77248818, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.4375, + "step": 8995, + "time_per_iteration": 3.8453516960144043 + }, + { + "auxiliary_loss_clip": 0.01059534, + "auxiliary_loss_mlp": 0.01026784, + "balance_loss_clip": 1.0144875, + "balance_loss_mlp": 1.01900721, + "epoch": 0.5408687810010522, + "flos": 18770039521920.0, + "grad_norm": 4.425285676933778, + "language_loss": 0.73918724, + "learning_rate": 1.7441615559523028e-06, + "loss": 0.76005042, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 8996, + "time_per_iteration": 3.8680903911590576 + }, + { + "auxiliary_loss_clip": 0.01060383, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.02108288, + "balance_loss_mlp": 1.01997185, + "epoch": 0.5409289042537201, + "flos": 13880697857280.0, + "grad_norm": 1.6484468068291793, + "language_loss": 0.73323464, + "learning_rate": 1.7437868421574783e-06, + "loss": 0.75416589, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40429688, + "step": 8997, + "time_per_iteration": 2.3734638690948486 + }, + { + "auxiliary_loss_clip": 0.01062358, + "auxiliary_loss_mlp": 0.01027039, + "balance_loss_clip": 1.01426601, + "balance_loss_mlp": 1.02035558, + "epoch": 0.5409890275063881, + "flos": 14464480567680.0, + "grad_norm": 2.138541208915134, + "language_loss": 0.71446109, + "learning_rate": 1.7434121375062424e-06, + "loss": 0.73535502, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41992188, + "step": 8998, + "time_per_iteration": 2.3750157356262207 + }, + { + "auxiliary_loss_clip": 0.01060094, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.01571786, + "balance_loss_mlp": 1.01912975, + "epoch": 0.541049150759056, + "flos": 48975706454400.0, + "grad_norm": 1.601667077813483, + "language_loss": 0.70387256, + "learning_rate": 1.7430374420119668e-06, + "loss": 0.72475761, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 8999, + "time_per_iteration": 2.6230623722076416 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.0102507, + "balance_loss_clip": 1.01241624, + "balance_loss_mlp": 1.01916862, + "epoch": 0.541109274011724, + "flos": 18146700374400.0, + "grad_norm": 1.9268066129001258, + "language_loss": 0.80570573, + "learning_rate": 1.7426627556880238e-06, + "loss": 0.82654613, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 9000, + "time_per_iteration": 2.3623993396759033 + }, + { + "auxiliary_loss_clip": 0.01061513, + "auxiliary_loss_mlp": 0.01026649, + "balance_loss_clip": 1.01335144, + "balance_loss_mlp": 1.02102828, + "epoch": 0.541169397264392, + "flos": 20521492387200.0, + "grad_norm": 1.6616062951075306, + "language_loss": 0.73066723, + "learning_rate": 1.7422880785477855e-06, + "loss": 0.75154883, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40625, + "step": 9001, + "time_per_iteration": 2.3954193592071533 + }, + { + "auxiliary_loss_clip": 0.01065387, + "auxiliary_loss_mlp": 0.01027263, + "balance_loss_clip": 1.01391768, + "balance_loss_mlp": 1.02128208, + "epoch": 0.54122952051706, + "flos": 20043112671360.0, + "grad_norm": 2.367464590984208, + "language_loss": 0.85387003, + "learning_rate": 1.7419134106046224e-06, + "loss": 0.87479651, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44140625, + "step": 9002, + "time_per_iteration": 2.366626262664795 + }, + { + "auxiliary_loss_clip": 0.01059293, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.01548719, + "balance_loss_mlp": 1.01974297, + "epoch": 0.541289643769728, + "flos": 19061250105600.0, + "grad_norm": 1.7503678928225772, + "language_loss": 0.77751309, + "learning_rate": 1.7415387518719063e-06, + "loss": 0.79837, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.39453125, + "step": 9003, + "time_per_iteration": 2.3775718212127686 + }, + { + "auxiliary_loss_clip": 0.01060503, + "auxiliary_loss_mlp": 0.01026124, + "balance_loss_clip": 1.01342225, + "balance_loss_mlp": 1.01955771, + "epoch": 0.5413497670223959, + "flos": 22381210978560.0, + "grad_norm": 1.907884390420618, + "language_loss": 0.69211495, + "learning_rate": 1.741164102363007e-06, + "loss": 0.71298116, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 9004, + "time_per_iteration": 2.4641165733337402 + }, + { + "auxiliary_loss_clip": 0.01009001, + "auxiliary_loss_mlp": 0.01016261, + "balance_loss_clip": 1.01517582, + "balance_loss_mlp": 1.00125229, + "epoch": 0.5414098902750639, + "flos": 70028331488640.0, + "grad_norm": 0.9662711287937278, + "language_loss": 0.59098196, + "learning_rate": 1.740789462091295e-06, + "loss": 0.61123455, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.07763672, + "step": 9005, + "time_per_iteration": 3.1341357231140137 + }, + { + "auxiliary_loss_clip": 0.01063687, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.01618218, + "balance_loss_mlp": 1.02148533, + "epoch": 0.5414700135277318, + "flos": 21797882115840.0, + "grad_norm": 1.7005052851458342, + "language_loss": 0.78228188, + "learning_rate": 1.7404148310701405e-06, + "loss": 0.80320472, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.421875, + "step": 9006, + "time_per_iteration": 2.460325002670288 + }, + { + "auxiliary_loss_clip": 0.01059596, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.01802015, + "balance_loss_mlp": 1.01929522, + "epoch": 0.5415301367803999, + "flos": 16907039262720.0, + "grad_norm": 1.6396597033169815, + "language_loss": 0.83385468, + "learning_rate": 1.7400402093129125e-06, + "loss": 0.85474122, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.40234375, + "step": 9007, + "time_per_iteration": 2.3878493309020996 + }, + { + "auxiliary_loss_clip": 0.01063612, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.01764643, + "balance_loss_mlp": 1.0220437, + "epoch": 0.5415902600330678, + "flos": 25702952330880.0, + "grad_norm": 1.7898024543330922, + "language_loss": 0.67122269, + "learning_rate": 1.7396655968329813e-06, + "loss": 0.69215846, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41601562, + "step": 9008, + "time_per_iteration": 2.4754297733306885 + }, + { + "auxiliary_loss_clip": 0.01064691, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.01453304, + "balance_loss_mlp": 1.02036798, + "epoch": 0.5416503832857358, + "flos": 19207152144000.0, + "grad_norm": 2.375187158327401, + "language_loss": 0.7406584, + "learning_rate": 1.7392909936437152e-06, + "loss": 0.76158571, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.44335938, + "step": 9009, + "time_per_iteration": 2.372058868408203 + }, + { + "auxiliary_loss_clip": 0.01063293, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01740611, + "balance_loss_mlp": 1.02041221, + "epoch": 0.5417105065384037, + "flos": 12712888056960.0, + "grad_norm": 5.120227679058226, + "language_loss": 0.88226014, + "learning_rate": 1.7389163997584825e-06, + "loss": 0.90321052, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.4296875, + "step": 9010, + "time_per_iteration": 2.431602954864502 + }, + { + "auxiliary_loss_clip": 0.01061821, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.01940846, + "balance_loss_mlp": 1.02022648, + "epoch": 0.5417706297910717, + "flos": 30334635095040.0, + "grad_norm": 1.9750528790118007, + "language_loss": 0.65064186, + "learning_rate": 1.7385418151906524e-06, + "loss": 0.67157578, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41601562, + "step": 9011, + "time_per_iteration": 2.465805768966675 + }, + { + "auxiliary_loss_clip": 0.01062697, + "auxiliary_loss_mlp": 0.01026645, + "balance_loss_clip": 1.01402724, + "balance_loss_mlp": 1.02095783, + "epoch": 0.5418307530437396, + "flos": 29019771181440.0, + "grad_norm": 2.2010485832610884, + "language_loss": 0.792018, + "learning_rate": 1.7381672399535918e-06, + "loss": 0.81291139, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41796875, + "step": 9012, + "time_per_iteration": 2.467667579650879 + }, + { + "auxiliary_loss_clip": 0.01060849, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.01392543, + "balance_loss_mlp": 1.01921916, + "epoch": 0.5418908762964076, + "flos": 16872510061440.0, + "grad_norm": 2.7473247388589095, + "language_loss": 0.74363148, + "learning_rate": 1.73779267406067e-06, + "loss": 0.76450354, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41601562, + "step": 9013, + "time_per_iteration": 2.382643938064575 + }, + { + "auxiliary_loss_clip": 0.01063373, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01431203, + "balance_loss_mlp": 1.01988816, + "epoch": 0.5419509995490756, + "flos": 18948795016320.0, + "grad_norm": 1.6731739102977263, + "language_loss": 0.72295356, + "learning_rate": 1.7374181175252522e-06, + "loss": 0.74386454, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43554688, + "step": 9014, + "time_per_iteration": 2.3874008655548096 + }, + { + "auxiliary_loss_clip": 0.01063712, + "auxiliary_loss_mlp": 0.01025671, + "balance_loss_clip": 1.01285625, + "balance_loss_mlp": 1.0215627, + "epoch": 0.5420111228017436, + "flos": 18076734276480.0, + "grad_norm": 1.4968950760823931, + "language_loss": 0.80307537, + "learning_rate": 1.7370435703607068e-06, + "loss": 0.82396924, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.421875, + "step": 9015, + "time_per_iteration": 2.3743369579315186 + }, + { + "auxiliary_loss_clip": 0.0106645, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.01784372, + "balance_loss_mlp": 1.02191734, + "epoch": 0.5420712460544116, + "flos": 19060796257920.0, + "grad_norm": 2.322839657437037, + "language_loss": 0.81732231, + "learning_rate": 1.7366690325803998e-06, + "loss": 0.83830404, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4453125, + "step": 9016, + "time_per_iteration": 2.3786728382110596 + }, + { + "auxiliary_loss_clip": 0.01062883, + "auxiliary_loss_mlp": 0.01027099, + "balance_loss_clip": 1.01448631, + "balance_loss_mlp": 1.02131343, + "epoch": 0.5421313693070795, + "flos": 18186117166080.0, + "grad_norm": 1.6457368910970296, + "language_loss": 0.81345177, + "learning_rate": 1.7362945041976972e-06, + "loss": 0.83435154, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41601562, + "step": 9017, + "time_per_iteration": 2.4811620712280273 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01030892, + "balance_loss_clip": 1.01866066, + "balance_loss_mlp": 1.01919222, + "epoch": 0.5421914925597475, + "flos": 13005111070080.0, + "grad_norm": 1.5406379375098076, + "language_loss": 0.7747128, + "learning_rate": 1.7359199852259663e-06, + "loss": 0.79561102, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39648438, + "step": 9018, + "time_per_iteration": 2.449995756149292 + }, + { + "auxiliary_loss_clip": 0.01062748, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.01983929, + "balance_loss_mlp": 1.0190115, + "epoch": 0.5422516158124154, + "flos": 46756591660800.0, + "grad_norm": 1.4882621803588318, + "language_loss": 0.66553378, + "learning_rate": 1.735545475678571e-06, + "loss": 0.68650866, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.4375, + "step": 9019, + "time_per_iteration": 4.061171770095825 + }, + { + "auxiliary_loss_clip": 0.01058813, + "auxiliary_loss_mlp": 0.01021786, + "balance_loss_clip": 1.01055086, + "balance_loss_mlp": 1.01908088, + "epoch": 0.5423117390650835, + "flos": 31757310887040.0, + "grad_norm": 1.6143093094701304, + "language_loss": 0.6962201, + "learning_rate": 1.735170975568878e-06, + "loss": 0.71702611, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3984375, + "step": 9020, + "time_per_iteration": 2.4746360778808594 + }, + { + "auxiliary_loss_clip": 0.01057825, + "auxiliary_loss_mlp": 0.01026741, + "balance_loss_clip": 1.01483226, + "balance_loss_mlp": 1.01925039, + "epoch": 0.5423718623177514, + "flos": 27200656368000.0, + "grad_norm": 1.4818056006755898, + "language_loss": 0.80675429, + "learning_rate": 1.7347964849102517e-06, + "loss": 0.82759988, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 9021, + "time_per_iteration": 2.4742648601531982 + }, + { + "auxiliary_loss_clip": 0.0106379, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.01845098, + "balance_loss_mlp": 1.02248573, + "epoch": 0.5424319855704194, + "flos": 23545424908800.0, + "grad_norm": 1.532289387030436, + "language_loss": 0.78792906, + "learning_rate": 1.734422003716056e-06, + "loss": 0.80887431, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4140625, + "step": 9022, + "time_per_iteration": 2.427483081817627 + }, + { + "auxiliary_loss_clip": 0.01060468, + "auxiliary_loss_mlp": 0.01024385, + "balance_loss_clip": 1.01196289, + "balance_loss_mlp": 1.02023113, + "epoch": 0.5424921088230873, + "flos": 26614394951040.0, + "grad_norm": 1.582428170741226, + "language_loss": 0.81991702, + "learning_rate": 1.7340475319996564e-06, + "loss": 0.8407656, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 9023, + "time_per_iteration": 2.4688899517059326 + }, + { + "auxiliary_loss_clip": 0.01060226, + "auxiliary_loss_mlp": 0.01023513, + "balance_loss_clip": 1.0114851, + "balance_loss_mlp": 1.01965296, + "epoch": 0.5425522320757553, + "flos": 23585679573120.0, + "grad_norm": 1.516342855552023, + "language_loss": 0.7164529, + "learning_rate": 1.733673069774416e-06, + "loss": 0.73729026, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40625, + "step": 9024, + "time_per_iteration": 2.4169116020202637 + }, + { + "auxiliary_loss_clip": 0.0106174, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.01678216, + "balance_loss_mlp": 1.01882005, + "epoch": 0.5426123553284232, + "flos": 30590932452480.0, + "grad_norm": 1.657260455799981, + "language_loss": 0.65051365, + "learning_rate": 1.7332986170536987e-06, + "loss": 0.67142189, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4296875, + "step": 9025, + "time_per_iteration": 2.478243589401245 + }, + { + "auxiliary_loss_clip": 0.01063559, + "auxiliary_loss_mlp": 0.01027032, + "balance_loss_clip": 1.01437771, + "balance_loss_mlp": 1.02114677, + "epoch": 0.5426724785810912, + "flos": 12494296834560.0, + "grad_norm": 2.8226341642326447, + "language_loss": 0.73365533, + "learning_rate": 1.732924173850868e-06, + "loss": 0.75456125, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42382812, + "step": 9026, + "time_per_iteration": 2.3593599796295166 + }, + { + "auxiliary_loss_clip": 0.01062207, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.01468289, + "balance_loss_mlp": 1.01914811, + "epoch": 0.5427326018337592, + "flos": 26063500608000.0, + "grad_norm": 1.628217010342728, + "language_loss": 0.78506851, + "learning_rate": 1.7325497401792861e-06, + "loss": 0.8059864, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.43164062, + "step": 9027, + "time_per_iteration": 2.428318977355957 + }, + { + "auxiliary_loss_clip": 0.01009076, + "auxiliary_loss_mlp": 0.01003792, + "balance_loss_clip": 1.00287402, + "balance_loss_mlp": 1.00162673, + "epoch": 0.5427927250864272, + "flos": 65981374041600.0, + "grad_norm": 0.743662384502935, + "language_loss": 0.5651921, + "learning_rate": 1.732175316052317e-06, + "loss": 0.58532071, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.07421875, + "step": 9028, + "time_per_iteration": 3.090791702270508 + }, + { + "auxiliary_loss_clip": 0.01061586, + "auxiliary_loss_mlp": 0.01025854, + "balance_loss_clip": 1.0123713, + "balance_loss_mlp": 1.01951575, + "epoch": 0.5428528483390952, + "flos": 19974333559680.0, + "grad_norm": 2.4179561807072885, + "language_loss": 0.78201222, + "learning_rate": 1.7318009014833209e-06, + "loss": 0.80288661, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41992188, + "step": 9029, + "time_per_iteration": 2.3970658779144287 + }, + { + "auxiliary_loss_clip": 0.01060553, + "auxiliary_loss_mlp": 0.01023473, + "balance_loss_clip": 1.01061654, + "balance_loss_mlp": 1.01956415, + "epoch": 0.5429129715917631, + "flos": 21831329064960.0, + "grad_norm": 1.4833366898727487, + "language_loss": 0.82973981, + "learning_rate": 1.731426496485661e-06, + "loss": 0.85057998, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 9030, + "time_per_iteration": 2.436702013015747 + }, + { + "auxiliary_loss_clip": 0.01061504, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01689327, + "balance_loss_mlp": 1.02126122, + "epoch": 0.5429730948444311, + "flos": 27781436701440.0, + "grad_norm": 2.1468625996053694, + "language_loss": 0.74743879, + "learning_rate": 1.7310521010726988e-06, + "loss": 0.76834661, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40234375, + "step": 9031, + "time_per_iteration": 3.8631088733673096 + }, + { + "auxiliary_loss_clip": 0.01059374, + "auxiliary_loss_mlp": 0.01025678, + "balance_loss_clip": 1.01466334, + "balance_loss_mlp": 1.0200758, + "epoch": 0.543033218097099, + "flos": 26759249648640.0, + "grad_norm": 1.7227376273837223, + "language_loss": 0.85559088, + "learning_rate": 1.7306777152577949e-06, + "loss": 0.87644142, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.39257812, + "step": 9032, + "time_per_iteration": 2.415037155151367 + }, + { + "auxiliary_loss_clip": 0.0106411, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.0149405, + "balance_loss_mlp": 1.02073336, + "epoch": 0.5430933413497671, + "flos": 22674132218880.0, + "grad_norm": 1.6236777527657937, + "language_loss": 0.74168962, + "learning_rate": 1.7303033390543108e-06, + "loss": 0.76261139, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.43359375, + "step": 9033, + "time_per_iteration": 2.4017279148101807 + }, + { + "auxiliary_loss_clip": 0.01060419, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.01380467, + "balance_loss_mlp": 1.01951218, + "epoch": 0.543153464602435, + "flos": 24606365437440.0, + "grad_norm": 1.6480261021657485, + "language_loss": 0.75837421, + "learning_rate": 1.7299289724756065e-06, + "loss": 0.77924949, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41015625, + "step": 9034, + "time_per_iteration": 3.858736276626587 + }, + { + "auxiliary_loss_clip": 0.0106023, + "auxiliary_loss_mlp": 0.01025602, + "balance_loss_clip": 1.01374054, + "balance_loss_mlp": 1.02020085, + "epoch": 0.543213587855103, + "flos": 19024730956800.0, + "grad_norm": 1.5983567375969991, + "language_loss": 0.83991152, + "learning_rate": 1.7295546155350431e-06, + "loss": 0.86076981, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40039062, + "step": 9035, + "time_per_iteration": 3.9300549030303955 + }, + { + "auxiliary_loss_clip": 0.01009151, + "auxiliary_loss_mlp": 0.01005892, + "balance_loss_clip": 1.00493276, + "balance_loss_mlp": 1.00153422, + "epoch": 0.5432737111077709, + "flos": 65683251008640.0, + "grad_norm": 0.7133646366472381, + "language_loss": 0.55866468, + "learning_rate": 1.729180268245979e-06, + "loss": 0.57881516, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07617188, + "step": 9036, + "time_per_iteration": 3.027024269104004 + }, + { + "auxiliary_loss_clip": 0.01063673, + "auxiliary_loss_mlp": 0.01028276, + "balance_loss_clip": 1.01499581, + "balance_loss_mlp": 1.02077603, + "epoch": 0.5433338343604389, + "flos": 22090558976640.0, + "grad_norm": 1.537491361432893, + "language_loss": 0.7478438, + "learning_rate": 1.7288059306217751e-06, + "loss": 0.76876336, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4296875, + "step": 9037, + "time_per_iteration": 2.4207398891448975 + }, + { + "auxiliary_loss_clip": 0.01060779, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.01610637, + "balance_loss_mlp": 1.01916099, + "epoch": 0.5433939576131068, + "flos": 34671371760000.0, + "grad_norm": 1.5892040437371275, + "language_loss": 0.66556001, + "learning_rate": 1.72843160267579e-06, + "loss": 0.68645787, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41601562, + "step": 9038, + "time_per_iteration": 2.537379741668701 + }, + { + "auxiliary_loss_clip": 0.01058279, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.0150007, + "balance_loss_mlp": 1.01848924, + "epoch": 0.5434540808657748, + "flos": 20302307671680.0, + "grad_norm": 2.0611766538310556, + "language_loss": 0.77962798, + "learning_rate": 1.7280572844213818e-06, + "loss": 0.80047756, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3984375, + "step": 9039, + "time_per_iteration": 2.408341884613037 + }, + { + "auxiliary_loss_clip": 0.01062833, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.01539719, + "balance_loss_mlp": 1.0211854, + "epoch": 0.5435142041184428, + "flos": 23111663777280.0, + "grad_norm": 1.9769982853615153, + "language_loss": 0.7655378, + "learning_rate": 1.7276829758719103e-06, + "loss": 0.7864427, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41601562, + "step": 9040, + "time_per_iteration": 2.4382593631744385 + }, + { + "auxiliary_loss_clip": 0.01009506, + "auxiliary_loss_mlp": 0.01006323, + "balance_loss_clip": 1.00547099, + "balance_loss_mlp": 1.00164938, + "epoch": 0.5435743273711108, + "flos": 64009479651840.0, + "grad_norm": 0.6638071728596421, + "language_loss": 0.52487427, + "learning_rate": 1.7273086770407323e-06, + "loss": 0.54503256, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.07861328, + "step": 9041, + "time_per_iteration": 3.1107773780822754 + }, + { + "auxiliary_loss_clip": 0.01059956, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.01437557, + "balance_loss_mlp": 1.0193193, + "epoch": 0.5436344506237788, + "flos": 25117738254720.0, + "grad_norm": 6.202704714574029, + "language_loss": 0.76523846, + "learning_rate": 1.7269343879412065e-06, + "loss": 0.78611225, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 9042, + "time_per_iteration": 2.453413486480713 + }, + { + "auxiliary_loss_clip": 0.01058799, + "auxiliary_loss_mlp": 0.01023312, + "balance_loss_clip": 1.01140893, + "balance_loss_mlp": 1.01828682, + "epoch": 0.5436945738764467, + "flos": 19571959607040.0, + "grad_norm": 1.4755234824889913, + "language_loss": 0.67450416, + "learning_rate": 1.7265601085866909e-06, + "loss": 0.69532531, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 9043, + "time_per_iteration": 2.396726369857788 + }, + { + "auxiliary_loss_clip": 0.01056976, + "auxiliary_loss_mlp": 0.01022793, + "balance_loss_clip": 1.01091373, + "balance_loss_mlp": 1.01847887, + "epoch": 0.5437546971291147, + "flos": 21141445132800.0, + "grad_norm": 1.5930546529126077, + "language_loss": 0.84280372, + "learning_rate": 1.7261858389905402e-06, + "loss": 0.86360133, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38476562, + "step": 9044, + "time_per_iteration": 2.3882153034210205 + }, + { + "auxiliary_loss_clip": 0.01064562, + "auxiliary_loss_mlp": 0.01025316, + "balance_loss_clip": 1.01130927, + "balance_loss_mlp": 1.02086306, + "epoch": 0.5438148203817826, + "flos": 25117528786560.0, + "grad_norm": 1.859056085876024, + "language_loss": 0.61513662, + "learning_rate": 1.7258115791661134e-06, + "loss": 0.63603544, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4375, + "step": 9045, + "time_per_iteration": 2.4105403423309326 + }, + { + "auxiliary_loss_clip": 0.01062635, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.01568055, + "balance_loss_mlp": 1.02069259, + "epoch": 0.5438749436344507, + "flos": 23001827040000.0, + "grad_norm": 5.513842782664582, + "language_loss": 0.70196444, + "learning_rate": 1.7254373291267655e-06, + "loss": 0.72287476, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41992188, + "step": 9046, + "time_per_iteration": 2.4084014892578125 + }, + { + "auxiliary_loss_clip": 0.01060322, + "auxiliary_loss_mlp": 0.010247, + "balance_loss_clip": 1.01253486, + "balance_loss_mlp": 1.0203476, + "epoch": 0.5439350668871186, + "flos": 15486109038720.0, + "grad_norm": 1.6515057340444534, + "language_loss": 0.7720989, + "learning_rate": 1.7250630888858533e-06, + "loss": 0.79294908, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3984375, + "step": 9047, + "time_per_iteration": 2.362386703491211 + }, + { + "auxiliary_loss_clip": 0.01062535, + "auxiliary_loss_mlp": 0.01026078, + "balance_loss_clip": 1.013484, + "balance_loss_mlp": 1.02073801, + "epoch": 0.5439951901397866, + "flos": 17237457169920.0, + "grad_norm": 1.7268453018611416, + "language_loss": 0.72741449, + "learning_rate": 1.7246888584567325e-06, + "loss": 0.74830067, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41796875, + "step": 9048, + "time_per_iteration": 2.3623456954956055 + }, + { + "auxiliary_loss_clip": 0.01063752, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.01555502, + "balance_loss_mlp": 1.02200007, + "epoch": 0.5440553133924545, + "flos": 18660028227840.0, + "grad_norm": 2.0380188337836262, + "language_loss": 0.93797731, + "learning_rate": 1.7243146378527576e-06, + "loss": 0.95890617, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41796875, + "step": 9049, + "time_per_iteration": 2.357942581176758 + }, + { + "auxiliary_loss_clip": 0.01060933, + "auxiliary_loss_mlp": 0.01025372, + "balance_loss_clip": 1.01343942, + "balance_loss_mlp": 1.02037144, + "epoch": 0.5441154366451225, + "flos": 27121787873280.0, + "grad_norm": 1.7777070009704838, + "language_loss": 0.74350274, + "learning_rate": 1.7239404270872846e-06, + "loss": 0.76436579, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 9050, + "time_per_iteration": 2.4180543422698975 + }, + { + "auxiliary_loss_clip": 0.01064311, + "auxiliary_loss_mlp": 0.01022331, + "balance_loss_clip": 1.00943267, + "balance_loss_mlp": 1.02182484, + "epoch": 0.5441755598977904, + "flos": 25992696637440.0, + "grad_norm": 1.6840216806448631, + "language_loss": 0.67300576, + "learning_rate": 1.7235662261736672e-06, + "loss": 0.69387221, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 9051, + "time_per_iteration": 2.4100255966186523 + }, + { + "auxiliary_loss_clip": 0.01060364, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.01380563, + "balance_loss_mlp": 1.01939821, + "epoch": 0.5442356831504584, + "flos": 32378660087040.0, + "grad_norm": 1.7792429562765788, + "language_loss": 0.67625248, + "learning_rate": 1.7231920351252604e-06, + "loss": 0.69711381, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.41015625, + "step": 9052, + "time_per_iteration": 2.4588232040405273 + }, + { + "auxiliary_loss_clip": 0.01062963, + "auxiliary_loss_mlp": 0.01023902, + "balance_loss_clip": 1.01081252, + "balance_loss_mlp": 1.02017951, + "epoch": 0.5442958064031264, + "flos": 24163317884160.0, + "grad_norm": 1.711273410172251, + "language_loss": 0.92829138, + "learning_rate": 1.7228178539554181e-06, + "loss": 0.94916004, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42773438, + "step": 9053, + "time_per_iteration": 2.395336627960205 + }, + { + "auxiliary_loss_clip": 0.01063284, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.01480865, + "balance_loss_mlp": 1.02094674, + "epoch": 0.5443559296557944, + "flos": 18763860211200.0, + "grad_norm": 1.8807795398185674, + "language_loss": 0.83464497, + "learning_rate": 1.722443682677493e-06, + "loss": 0.85555869, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.421875, + "step": 9054, + "time_per_iteration": 2.376723527908325 + }, + { + "auxiliary_loss_clip": 0.01062407, + "auxiliary_loss_mlp": 0.01027407, + "balance_loss_clip": 1.01418686, + "balance_loss_mlp": 1.0201875, + "epoch": 0.5444160529084624, + "flos": 22631608316160.0, + "grad_norm": 1.9594471659038064, + "language_loss": 0.68595451, + "learning_rate": 1.7220695213048396e-06, + "loss": 0.70685261, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.421875, + "step": 9055, + "time_per_iteration": 2.4047930240631104 + }, + { + "auxiliary_loss_clip": 0.01010275, + "auxiliary_loss_mlp": 0.01001925, + "balance_loss_clip": 1.00107908, + "balance_loss_mlp": 1.00269413, + "epoch": 0.5444761761611303, + "flos": 69668376704640.0, + "grad_norm": 0.7429322010595235, + "language_loss": 0.5773561, + "learning_rate": 1.7216953698508092e-06, + "loss": 0.59747809, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.00848389, + "router_z_loss_mlp": 0.07568359, + "step": 9056, + "time_per_iteration": 3.0348312854766846 + }, + { + "auxiliary_loss_clip": 0.01061679, + "auxiliary_loss_mlp": 0.010283, + "balance_loss_clip": 1.01448345, + "balance_loss_mlp": 1.01882768, + "epoch": 0.5445362994137983, + "flos": 14277695460480.0, + "grad_norm": 1.7069313013235692, + "language_loss": 0.69432247, + "learning_rate": 1.721321228328756e-06, + "loss": 0.71522224, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.42773438, + "step": 9057, + "time_per_iteration": 2.3655381202697754 + }, + { + "auxiliary_loss_clip": 0.01060208, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.01977086, + "balance_loss_mlp": 1.0201416, + "epoch": 0.5445964226664662, + "flos": 28984927777920.0, + "grad_norm": 4.05028112431894, + "language_loss": 0.75605351, + "learning_rate": 1.720947096752031e-06, + "loss": 0.77698338, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40039062, + "step": 9058, + "time_per_iteration": 3.8185980319976807 + }, + { + "auxiliary_loss_clip": 0.01009849, + "auxiliary_loss_mlp": 0.01001369, + "balance_loss_clip": 1.00040936, + "balance_loss_mlp": 1.0019815, + "epoch": 0.5446565459191343, + "flos": 68616548040960.0, + "grad_norm": 0.8112898963094866, + "language_loss": 0.62658566, + "learning_rate": 1.7205729751339864e-06, + "loss": 0.64669782, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07910156, + "step": 9059, + "time_per_iteration": 3.1385762691497803 + }, + { + "auxiliary_loss_clip": 0.01059314, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.01721263, + "balance_loss_mlp": 1.0187465, + "epoch": 0.5447166691718022, + "flos": 16215549408000.0, + "grad_norm": 2.3344359248701583, + "language_loss": 0.78912377, + "learning_rate": 1.7201988634879736e-06, + "loss": 0.81001645, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9060, + "time_per_iteration": 2.382307529449463 + }, + { + "auxiliary_loss_clip": 0.01062479, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.01580656, + "balance_loss_mlp": 1.02060425, + "epoch": 0.5447767924244702, + "flos": 25847841939840.0, + "grad_norm": 1.61264407303333, + "language_loss": 0.79616868, + "learning_rate": 1.7198247618273432e-06, + "loss": 0.81708121, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 9061, + "time_per_iteration": 2.498650074005127 + }, + { + "auxiliary_loss_clip": 0.01061074, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.01440132, + "balance_loss_mlp": 1.02042472, + "epoch": 0.5448369156771381, + "flos": 19676838931200.0, + "grad_norm": 1.7223206766352073, + "language_loss": 0.86760712, + "learning_rate": 1.7194506701654467e-06, + "loss": 0.88848829, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40625, + "step": 9062, + "time_per_iteration": 2.367253065109253 + }, + { + "auxiliary_loss_clip": 0.01064039, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.01775122, + "balance_loss_mlp": 1.02003264, + "epoch": 0.5448970389298061, + "flos": 19280783934720.0, + "grad_norm": 1.9233847072910748, + "language_loss": 0.82383907, + "learning_rate": 1.7190765885156338e-06, + "loss": 0.84479463, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43945312, + "step": 9063, + "time_per_iteration": 2.3757402896881104 + }, + { + "auxiliary_loss_clip": 0.0106038, + "auxiliary_loss_mlp": 0.01026581, + "balance_loss_clip": 1.01321197, + "balance_loss_mlp": 1.01852751, + "epoch": 0.544957162182474, + "flos": 20990760238080.0, + "grad_norm": 2.0613219167673456, + "language_loss": 0.63740981, + "learning_rate": 1.718702516891255e-06, + "loss": 0.65827936, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41796875, + "step": 9064, + "time_per_iteration": 2.38446307182312 + }, + { + "auxiliary_loss_clip": 0.01062436, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01234078, + "balance_loss_mlp": 1.02006698, + "epoch": 0.545017285435142, + "flos": 25373407207680.0, + "grad_norm": 1.5394280658761197, + "language_loss": 0.80004722, + "learning_rate": 1.71832845530566e-06, + "loss": 0.82092869, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42382812, + "step": 9065, + "time_per_iteration": 2.450850009918213 + }, + { + "auxiliary_loss_clip": 0.01059108, + "auxiliary_loss_mlp": 0.01023788, + "balance_loss_clip": 1.01243973, + "balance_loss_mlp": 1.0198195, + "epoch": 0.54507740868781, + "flos": 19133764732800.0, + "grad_norm": 1.8131348890156453, + "language_loss": 0.78282154, + "learning_rate": 1.7179544037721976e-06, + "loss": 0.8036505, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39257812, + "step": 9066, + "time_per_iteration": 2.367149591445923 + }, + { + "auxiliary_loss_clip": 0.0106328, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.01655853, + "balance_loss_mlp": 1.01990545, + "epoch": 0.545137531940478, + "flos": 26248609969920.0, + "grad_norm": 2.274888199374942, + "language_loss": 0.75911975, + "learning_rate": 1.7175803623042174e-06, + "loss": 0.78005195, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43359375, + "step": 9067, + "time_per_iteration": 2.4560792446136475 + }, + { + "auxiliary_loss_clip": 0.01067634, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.01749611, + "balance_loss_mlp": 1.02288735, + "epoch": 0.545197655193146, + "flos": 37554254922240.0, + "grad_norm": 2.663164768830577, + "language_loss": 0.63787419, + "learning_rate": 1.7172063309150668e-06, + "loss": 0.65888274, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.44726562, + "step": 9068, + "time_per_iteration": 2.5607573986053467 + }, + { + "auxiliary_loss_clip": 0.01061021, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02042317, + "balance_loss_mlp": 1.0205611, + "epoch": 0.5452577784458139, + "flos": 26030053658880.0, + "grad_norm": 1.5762935027611462, + "language_loss": 0.72338301, + "learning_rate": 1.7168323096180956e-06, + "loss": 0.74432647, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40429688, + "step": 9069, + "time_per_iteration": 2.444047689437866 + }, + { + "auxiliary_loss_clip": 0.0106107, + "auxiliary_loss_mlp": 0.01024793, + "balance_loss_clip": 1.01290166, + "balance_loss_mlp": 1.02226555, + "epoch": 0.5453179016984819, + "flos": 17638085554560.0, + "grad_norm": 1.8091671726768541, + "language_loss": 0.73443675, + "learning_rate": 1.7164582984266508e-06, + "loss": 0.7552954, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38867188, + "step": 9070, + "time_per_iteration": 2.3598930835723877 + }, + { + "auxiliary_loss_clip": 0.01061948, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.01580548, + "balance_loss_mlp": 1.02084589, + "epoch": 0.5453780249511498, + "flos": 23215705228800.0, + "grad_norm": 1.9893582465180943, + "language_loss": 0.60545361, + "learning_rate": 1.7160842973540798e-06, + "loss": 0.62635392, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41015625, + "step": 9071, + "time_per_iteration": 3.926656723022461 + }, + { + "auxiliary_loss_clip": 0.01008988, + "auxiliary_loss_mlp": 0.01013206, + "balance_loss_clip": 1.01234734, + "balance_loss_mlp": 1.00117135, + "epoch": 0.5454381482038179, + "flos": 68692728360960.0, + "grad_norm": 0.710510976818834, + "language_loss": 0.57057887, + "learning_rate": 1.71571030641373e-06, + "loss": 0.59080076, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.00860596, + "router_z_loss_mlp": 0.078125, + "step": 9072, + "time_per_iteration": 3.160154104232788 + }, + { + "auxiliary_loss_clip": 0.01059883, + "auxiliary_loss_mlp": 0.01026394, + "balance_loss_clip": 1.01435947, + "balance_loss_mlp": 1.02013636, + "epoch": 0.5454982714564858, + "flos": 13259802504960.0, + "grad_norm": 1.5945588846785637, + "language_loss": 0.69142181, + "learning_rate": 1.715336325618948e-06, + "loss": 0.71228456, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39648438, + "step": 9073, + "time_per_iteration": 3.787013292312622 + }, + { + "auxiliary_loss_clip": 0.01060319, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.01406217, + "balance_loss_mlp": 1.02055955, + "epoch": 0.5455583947091538, + "flos": 21834785289600.0, + "grad_norm": 2.754752922311159, + "language_loss": 0.71370625, + "learning_rate": 1.7149623549830805e-06, + "loss": 0.73457193, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39648438, + "step": 9074, + "time_per_iteration": 2.4436802864074707 + }, + { + "auxiliary_loss_clip": 0.01059086, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.01676154, + "balance_loss_mlp": 1.01855016, + "epoch": 0.5456185179618217, + "flos": 17816596669440.0, + "grad_norm": 1.929963625957634, + "language_loss": 0.68585706, + "learning_rate": 1.7145883945194731e-06, + "loss": 0.70673698, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 9075, + "time_per_iteration": 3.8482577800750732 + }, + { + "auxiliary_loss_clip": 0.01059323, + "auxiliary_loss_mlp": 0.01025872, + "balance_loss_clip": 1.01410031, + "balance_loss_mlp": 1.02057326, + "epoch": 0.5456786412144897, + "flos": 21068337012480.0, + "grad_norm": 1.792942015041787, + "language_loss": 0.80694085, + "learning_rate": 1.7142144442414716e-06, + "loss": 0.82779282, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 9076, + "time_per_iteration": 2.3913118839263916 + }, + { + "auxiliary_loss_clip": 0.01059388, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.01536226, + "balance_loss_mlp": 1.01894116, + "epoch": 0.5457387644671576, + "flos": 23293840584960.0, + "grad_norm": 1.5089359543691494, + "language_loss": 0.80208862, + "learning_rate": 1.713840504162422e-06, + "loss": 0.82295835, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40429688, + "step": 9077, + "time_per_iteration": 2.393432140350342 + }, + { + "auxiliary_loss_clip": 0.01059405, + "auxiliary_loss_mlp": 0.01026459, + "balance_loss_clip": 1.01456785, + "balance_loss_mlp": 1.01843071, + "epoch": 0.5457988877198257, + "flos": 21615949687680.0, + "grad_norm": 1.9677729215340451, + "language_loss": 0.67377293, + "learning_rate": 1.713466574295668e-06, + "loss": 0.69463158, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41015625, + "step": 9078, + "time_per_iteration": 2.4399781227111816 + }, + { + "auxiliary_loss_clip": 0.01063205, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.0180366, + "balance_loss_mlp": 1.02082562, + "epoch": 0.5458590109724936, + "flos": 23761537424640.0, + "grad_norm": 1.8176875284055072, + "language_loss": 0.81113064, + "learning_rate": 1.7130926546545555e-06, + "loss": 0.83207566, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42382812, + "step": 9079, + "time_per_iteration": 2.4067330360412598 + }, + { + "auxiliary_loss_clip": 0.0106428, + "auxiliary_loss_mlp": 0.0102785, + "balance_loss_clip": 1.01429582, + "balance_loss_mlp": 1.02050591, + "epoch": 0.5459191342251616, + "flos": 24423176200320.0, + "grad_norm": 1.5448484424503466, + "language_loss": 0.75574541, + "learning_rate": 1.7127187452524275e-06, + "loss": 0.77666664, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4375, + "step": 9080, + "time_per_iteration": 2.417630910873413 + }, + { + "auxiliary_loss_clip": 0.01061682, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.01407623, + "balance_loss_mlp": 1.01966667, + "epoch": 0.5459792574778296, + "flos": 23621884519680.0, + "grad_norm": 2.2171603298855524, + "language_loss": 0.83324057, + "learning_rate": 1.712344846102629e-06, + "loss": 0.85413736, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.41992188, + "step": 9081, + "time_per_iteration": 2.3915798664093018 + }, + { + "auxiliary_loss_clip": 0.01063486, + "auxiliary_loss_mlp": 0.01025351, + "balance_loss_clip": 1.01268458, + "balance_loss_mlp": 1.02055979, + "epoch": 0.5460393807304975, + "flos": 19134532782720.0, + "grad_norm": 1.5837946561360603, + "language_loss": 0.69881129, + "learning_rate": 1.7119709572185032e-06, + "loss": 0.71969962, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4296875, + "step": 9082, + "time_per_iteration": 2.3961384296417236 + }, + { + "auxiliary_loss_clip": 0.01058732, + "auxiliary_loss_mlp": 0.01022345, + "balance_loss_clip": 1.01023912, + "balance_loss_mlp": 1.01869762, + "epoch": 0.5460995039831655, + "flos": 35917072536960.0, + "grad_norm": 1.5890916961150217, + "language_loss": 0.66496646, + "learning_rate": 1.7115970786133925e-06, + "loss": 0.68577725, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 9083, + "time_per_iteration": 2.4979727268218994 + }, + { + "auxiliary_loss_clip": 0.0106093, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.015751, + "balance_loss_mlp": 1.01980877, + "epoch": 0.5461596272358334, + "flos": 26758062662400.0, + "grad_norm": 1.7506901497149843, + "language_loss": 0.78921801, + "learning_rate": 1.7112232103006405e-06, + "loss": 0.81010729, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41015625, + "step": 9084, + "time_per_iteration": 2.413357973098755 + }, + { + "auxiliary_loss_clip": 0.01061632, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01564515, + "balance_loss_mlp": 1.0198431, + "epoch": 0.5462197504885015, + "flos": 20885531800320.0, + "grad_norm": 1.6071650472596413, + "language_loss": 0.7488246, + "learning_rate": 1.710849352293589e-06, + "loss": 0.76972675, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 9085, + "time_per_iteration": 2.3707759380340576 + }, + { + "auxiliary_loss_clip": 0.01061175, + "auxiliary_loss_mlp": 0.01026751, + "balance_loss_clip": 1.0140729, + "balance_loss_mlp": 1.01976013, + "epoch": 0.5462798737411694, + "flos": 25803991405440.0, + "grad_norm": 1.9620854099295326, + "language_loss": 0.75014347, + "learning_rate": 1.7104755046055808e-06, + "loss": 0.77102274, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4140625, + "step": 9086, + "time_per_iteration": 2.428372383117676 + }, + { + "auxiliary_loss_clip": 0.01060415, + "auxiliary_loss_mlp": 0.01026034, + "balance_loss_clip": 1.01367843, + "balance_loss_mlp": 1.02007461, + "epoch": 0.5463399969938374, + "flos": 25773861035520.0, + "grad_norm": 1.4563961460456825, + "language_loss": 0.77101016, + "learning_rate": 1.710101667249957e-06, + "loss": 0.79187471, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40234375, + "step": 9087, + "time_per_iteration": 2.417628049850464 + }, + { + "auxiliary_loss_clip": 0.01063734, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.0186851, + "balance_loss_mlp": 1.02124357, + "epoch": 0.5464001202465053, + "flos": 18842309769600.0, + "grad_norm": 1.7257618276099957, + "language_loss": 0.77305472, + "learning_rate": 1.7097278402400592e-06, + "loss": 0.79401004, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 9088, + "time_per_iteration": 2.413696765899658 + }, + { + "auxiliary_loss_clip": 0.01009212, + "auxiliary_loss_mlp": 0.0100221, + "balance_loss_clip": 1.00128591, + "balance_loss_mlp": 1.00155306, + "epoch": 0.5464602434991733, + "flos": 69046084897920.0, + "grad_norm": 0.7263878918984971, + "language_loss": 0.57733798, + "learning_rate": 1.709354023589228e-06, + "loss": 0.59745228, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.07666016, + "step": 9089, + "time_per_iteration": 3.064657688140869 + }, + { + "auxiliary_loss_clip": 0.01061513, + "auxiliary_loss_mlp": 0.01026761, + "balance_loss_clip": 1.01420259, + "balance_loss_mlp": 1.01965928, + "epoch": 0.5465203667518412, + "flos": 27558900495360.0, + "grad_norm": 1.769283570699173, + "language_loss": 0.76719177, + "learning_rate": 1.7089802173108035e-06, + "loss": 0.78807455, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41796875, + "step": 9090, + "time_per_iteration": 2.4254908561706543 + }, + { + "auxiliary_loss_clip": 0.01064358, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.01526093, + "balance_loss_mlp": 1.02021694, + "epoch": 0.5465804900045093, + "flos": 21209281637760.0, + "grad_norm": 1.7430468082596455, + "language_loss": 0.73586059, + "learning_rate": 1.7086064214181267e-06, + "loss": 0.75679815, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.44140625, + "step": 9091, + "time_per_iteration": 2.4001235961914062 + }, + { + "auxiliary_loss_clip": 0.01062183, + "auxiliary_loss_mlp": 0.01025534, + "balance_loss_clip": 1.01299286, + "balance_loss_mlp": 1.02039433, + "epoch": 0.5466406132571772, + "flos": 22487940604800.0, + "grad_norm": 1.705287257674208, + "language_loss": 0.78128612, + "learning_rate": 1.7082326359245376e-06, + "loss": 0.8021633, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41796875, + "step": 9092, + "time_per_iteration": 2.4013686180114746 + }, + { + "auxiliary_loss_clip": 0.01063398, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.01364839, + "balance_loss_mlp": 1.02119517, + "epoch": 0.5467007365098452, + "flos": 17674883994240.0, + "grad_norm": 1.8033479896460107, + "language_loss": 0.7449019, + "learning_rate": 1.7078588608433747e-06, + "loss": 0.76580751, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.421875, + "step": 9093, + "time_per_iteration": 2.3543050289154053 + }, + { + "auxiliary_loss_clip": 0.01062817, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.01702738, + "balance_loss_mlp": 1.01882493, + "epoch": 0.5467608597625132, + "flos": 15698136925440.0, + "grad_norm": 2.8908246357570464, + "language_loss": 0.6908828, + "learning_rate": 1.7074850961879779e-06, + "loss": 0.71182561, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.43945312, + "step": 9094, + "time_per_iteration": 2.3722481727600098 + }, + { + "auxiliary_loss_clip": 0.01063251, + "auxiliary_loss_mlp": 0.01027468, + "balance_loss_clip": 1.01527286, + "balance_loss_mlp": 1.02196288, + "epoch": 0.5468209830151811, + "flos": 24311768451840.0, + "grad_norm": 3.088056400950744, + "language_loss": 0.68423218, + "learning_rate": 1.7071113419716852e-06, + "loss": 0.7051394, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41210938, + "step": 9095, + "time_per_iteration": 2.387317419052124 + }, + { + "auxiliary_loss_clip": 0.01062078, + "auxiliary_loss_mlp": 0.01026435, + "balance_loss_clip": 1.01453841, + "balance_loss_mlp": 1.02222121, + "epoch": 0.5468811062678491, + "flos": 29165114638080.0, + "grad_norm": 1.4271497789389964, + "language_loss": 0.66456616, + "learning_rate": 1.7067375982078355e-06, + "loss": 0.68545127, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3984375, + "step": 9096, + "time_per_iteration": 2.466686964035034 + }, + { + "auxiliary_loss_clip": 0.01008844, + "auxiliary_loss_mlp": 0.01002161, + "balance_loss_clip": 1.00121319, + "balance_loss_mlp": 1.0010618, + "epoch": 0.546941229520517, + "flos": 67864031262720.0, + "grad_norm": 0.9209416927862969, + "language_loss": 0.57500553, + "learning_rate": 1.7063638649097668e-06, + "loss": 0.59511554, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.078125, + "step": 9097, + "time_per_iteration": 4.461362361907959 + }, + { + "auxiliary_loss_clip": 0.01008702, + "auxiliary_loss_mlp": 0.01000086, + "balance_loss_clip": 0.99930573, + "balance_loss_mlp": 1.00107789, + "epoch": 0.5470013527731851, + "flos": 58267594563840.0, + "grad_norm": 0.9134873049497998, + "language_loss": 0.64806229, + "learning_rate": 1.705990142090816e-06, + "loss": 0.66815019, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.0078125, + "router_z_loss_mlp": 0.07617188, + "step": 9098, + "time_per_iteration": 3.00323224067688 + }, + { + "auxiliary_loss_clip": 0.01062535, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.0141983, + "balance_loss_mlp": 1.02055788, + "epoch": 0.547061476025853, + "flos": 22964819132160.0, + "grad_norm": 1.5070102133480539, + "language_loss": 0.73216283, + "learning_rate": 1.7056164297643213e-06, + "loss": 0.75306165, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41992188, + "step": 9099, + "time_per_iteration": 2.43925404548645 + }, + { + "auxiliary_loss_clip": 0.01059832, + "auxiliary_loss_mlp": 0.0102108, + "balance_loss_clip": 1.00969529, + "balance_loss_mlp": 1.01926446, + "epoch": 0.547121599278521, + "flos": 29967034723200.0, + "grad_norm": 3.3336838657099803, + "language_loss": 0.7262587, + "learning_rate": 1.7052427279436183e-06, + "loss": 0.74706781, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.40625, + "step": 9100, + "time_per_iteration": 2.44386887550354 + }, + { + "auxiliary_loss_clip": 0.01060051, + "auxiliary_loss_mlp": 0.01026427, + "balance_loss_clip": 1.01331973, + "balance_loss_mlp": 1.02013588, + "epoch": 0.5471817225311889, + "flos": 36534057816960.0, + "grad_norm": 1.6493294767086026, + "language_loss": 0.6822288, + "learning_rate": 1.7048690366420447e-06, + "loss": 0.70309353, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.3984375, + "step": 9101, + "time_per_iteration": 2.52878999710083 + }, + { + "auxiliary_loss_clip": 0.01060329, + "auxiliary_loss_mlp": 0.01025865, + "balance_loss_clip": 1.01330614, + "balance_loss_mlp": 1.0192976, + "epoch": 0.5472418457838569, + "flos": 25774070503680.0, + "grad_norm": 1.7500818407126892, + "language_loss": 0.78136718, + "learning_rate": 1.7044953558729356e-06, + "loss": 0.80222911, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41210938, + "step": 9102, + "time_per_iteration": 2.4079837799072266 + }, + { + "auxiliary_loss_clip": 0.01061887, + "auxiliary_loss_mlp": 0.01029212, + "balance_loss_clip": 1.01637888, + "balance_loss_mlp": 1.02132869, + "epoch": 0.5473019690365248, + "flos": 27886560405120.0, + "grad_norm": 1.3957371071212614, + "language_loss": 0.72032952, + "learning_rate": 1.7041216856496278e-06, + "loss": 0.7412405, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40625, + "step": 9103, + "time_per_iteration": 2.448651075363159 + }, + { + "auxiliary_loss_clip": 0.01060241, + "auxiliary_loss_mlp": 0.01025709, + "balance_loss_clip": 1.01362145, + "balance_loss_mlp": 1.01937485, + "epoch": 0.5473620922891929, + "flos": 57629313354240.0, + "grad_norm": 1.4251047658381006, + "language_loss": 0.6534369, + "learning_rate": 1.7037480259854558e-06, + "loss": 0.67429644, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 9104, + "time_per_iteration": 2.7402193546295166 + }, + { + "auxiliary_loss_clip": 0.01064015, + "auxiliary_loss_mlp": 0.01024859, + "balance_loss_clip": 1.01145971, + "balance_loss_mlp": 1.02094471, + "epoch": 0.5474222155418608, + "flos": 19353054182400.0, + "grad_norm": 2.3888195397291523, + "language_loss": 0.81451827, + "learning_rate": 1.703374376893754e-06, + "loss": 0.83540696, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4296875, + "step": 9105, + "time_per_iteration": 2.374507188796997 + }, + { + "auxiliary_loss_clip": 0.01058736, + "auxiliary_loss_mlp": 0.01021498, + "balance_loss_clip": 1.00959504, + "balance_loss_mlp": 1.01914048, + "epoch": 0.5474823387945288, + "flos": 25119239443200.0, + "grad_norm": 1.47952487211539, + "language_loss": 0.80396318, + "learning_rate": 1.7030007383878583e-06, + "loss": 0.82476544, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 9106, + "time_per_iteration": 2.413823127746582 + }, + { + "auxiliary_loss_clip": 0.01060751, + "auxiliary_loss_mlp": 0.01025403, + "balance_loss_clip": 1.01283288, + "balance_loss_mlp": 1.02035999, + "epoch": 0.5475424620471967, + "flos": 18003207219840.0, + "grad_norm": 1.86547509061609, + "language_loss": 0.69397736, + "learning_rate": 1.7026271104811017e-06, + "loss": 0.71483892, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 9107, + "time_per_iteration": 2.368698835372925 + }, + { + "auxiliary_loss_clip": 0.01061236, + "auxiliary_loss_mlp": 0.01022574, + "balance_loss_clip": 1.01043832, + "balance_loss_mlp": 1.01975894, + "epoch": 0.5476025852998647, + "flos": 22308242503680.0, + "grad_norm": 1.7454341542187601, + "language_loss": 0.63256657, + "learning_rate": 1.702253493186819e-06, + "loss": 0.65340459, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4140625, + "step": 9108, + "time_per_iteration": 2.3894593715667725 + }, + { + "auxiliary_loss_clip": 0.01058548, + "auxiliary_loss_mlp": 0.01022003, + "balance_loss_clip": 1.01030827, + "balance_loss_mlp": 1.01859212, + "epoch": 0.5476627085525327, + "flos": 20119467548160.0, + "grad_norm": 8.765463210265814, + "language_loss": 0.75205338, + "learning_rate": 1.7018798865183436e-06, + "loss": 0.77285892, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40039062, + "step": 9109, + "time_per_iteration": 2.3897671699523926 + }, + { + "auxiliary_loss_clip": 0.0105913, + "auxiliary_loss_mlp": 0.01023529, + "balance_loss_clip": 1.01072025, + "balance_loss_mlp": 1.01973534, + "epoch": 0.5477228318052006, + "flos": 17711612611200.0, + "grad_norm": 1.7676947351381542, + "language_loss": 0.77186561, + "learning_rate": 1.7015062904890072e-06, + "loss": 0.79269218, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 9110, + "time_per_iteration": 3.8957180976867676 + }, + { + "auxiliary_loss_clip": 0.010579, + "auxiliary_loss_mlp": 0.01024249, + "balance_loss_clip": 1.0132103, + "balance_loss_mlp": 1.019099, + "epoch": 0.5477829550578687, + "flos": 25847702294400.0, + "grad_norm": 1.43664714802647, + "language_loss": 0.71238106, + "learning_rate": 1.7011327051121443e-06, + "loss": 0.73320258, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38867188, + "step": 9111, + "time_per_iteration": 2.448770046234131 + }, + { + "auxiliary_loss_clip": 0.01061763, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.01414347, + "balance_loss_mlp": 1.02051544, + "epoch": 0.5478430783105366, + "flos": 23038555656960.0, + "grad_norm": 2.9492515307939398, + "language_loss": 0.70988983, + "learning_rate": 1.7007591304010858e-06, + "loss": 0.73076582, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.41210938, + "step": 9112, + "time_per_iteration": 2.3921380043029785 + }, + { + "auxiliary_loss_clip": 0.01060802, + "auxiliary_loss_mlp": 0.01024096, + "balance_loss_clip": 1.0118351, + "balance_loss_mlp": 1.0190134, + "epoch": 0.5479032015632046, + "flos": 16070275774080.0, + "grad_norm": 2.2128073122510674, + "language_loss": 0.87817216, + "learning_rate": 1.7003855663691647e-06, + "loss": 0.89902115, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41796875, + "step": 9113, + "time_per_iteration": 3.738016128540039 + }, + { + "auxiliary_loss_clip": 0.01062202, + "auxiliary_loss_mlp": 0.01024763, + "balance_loss_clip": 1.0123117, + "balance_loss_mlp": 1.02090406, + "epoch": 0.5479633248158725, + "flos": 24277588364160.0, + "grad_norm": 1.4329923308300647, + "language_loss": 0.7300837, + "learning_rate": 1.7000120130297119e-06, + "loss": 0.75095332, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.4140625, + "step": 9114, + "time_per_iteration": 3.8144326210021973 + }, + { + "auxiliary_loss_clip": 0.01058765, + "auxiliary_loss_mlp": 0.0102504, + "balance_loss_clip": 1.01342857, + "balance_loss_mlp": 1.01913118, + "epoch": 0.5480234480685405, + "flos": 26357050252800.0, + "grad_norm": 1.5572872032336873, + "language_loss": 0.72215617, + "learning_rate": 1.6996384703960584e-06, + "loss": 0.74299419, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39648438, + "step": 9115, + "time_per_iteration": 2.406704902648926 + }, + { + "auxiliary_loss_clip": 0.0106061, + "auxiliary_loss_mlp": 0.01025027, + "balance_loss_clip": 1.01228988, + "balance_loss_mlp": 1.0188266, + "epoch": 0.5480835713212084, + "flos": 22053970005120.0, + "grad_norm": 1.7363698584548657, + "language_loss": 0.72102922, + "learning_rate": 1.6992649384815355e-06, + "loss": 0.7418856, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 9116, + "time_per_iteration": 2.392216682434082 + }, + { + "auxiliary_loss_clip": 0.01061147, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01741302, + "balance_loss_mlp": 1.02107346, + "epoch": 0.5481436945738765, + "flos": 25299880151040.0, + "grad_norm": 1.8920408329286915, + "language_loss": 0.76726735, + "learning_rate": 1.6988914172994732e-06, + "loss": 0.78817427, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 9117, + "time_per_iteration": 2.4309780597686768 + }, + { + "auxiliary_loss_clip": 0.01056058, + "auxiliary_loss_mlp": 0.01022714, + "balance_loss_clip": 1.01175869, + "balance_loss_mlp": 1.01808488, + "epoch": 0.5482038178265444, + "flos": 33579532811520.0, + "grad_norm": 1.6235680363474412, + "language_loss": 0.62835467, + "learning_rate": 1.6985179068632025e-06, + "loss": 0.64914232, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38085938, + "step": 9118, + "time_per_iteration": 2.4811758995056152 + }, + { + "auxiliary_loss_clip": 0.01061124, + "auxiliary_loss_mlp": 0.01024988, + "balance_loss_clip": 1.0119164, + "balance_loss_mlp": 1.02015519, + "epoch": 0.5482639410792124, + "flos": 19025184804480.0, + "grad_norm": 1.719451529556697, + "language_loss": 0.78938949, + "learning_rate": 1.6981444071860518e-06, + "loss": 0.81025052, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 9119, + "time_per_iteration": 2.363877058029175 + }, + { + "auxiliary_loss_clip": 0.01058213, + "auxiliary_loss_mlp": 0.01023622, + "balance_loss_clip": 1.01290512, + "balance_loss_mlp": 1.01933646, + "epoch": 0.5483240643318803, + "flos": 25409158306560.0, + "grad_norm": 1.7741155983379373, + "language_loss": 0.82372129, + "learning_rate": 1.6977709182813503e-06, + "loss": 0.84453964, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.38867188, + "step": 9120, + "time_per_iteration": 2.4159610271453857 + }, + { + "auxiliary_loss_clip": 0.01061974, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.01523542, + "balance_loss_mlp": 1.0200361, + "epoch": 0.5483841875845483, + "flos": 21465928108800.0, + "grad_norm": 1.9906104907311148, + "language_loss": 0.80314118, + "learning_rate": 1.6973974401624273e-06, + "loss": 0.82404184, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41992188, + "step": 9121, + "time_per_iteration": 2.374544620513916 + }, + { + "auxiliary_loss_clip": 0.01060409, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.01349878, + "balance_loss_mlp": 1.01821518, + "epoch": 0.5484443108372163, + "flos": 24746297633280.0, + "grad_norm": 2.3085313201409567, + "language_loss": 0.61233401, + "learning_rate": 1.6970239728426114e-06, + "loss": 0.63320529, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 9122, + "time_per_iteration": 2.440157413482666 + }, + { + "auxiliary_loss_clip": 0.01060109, + "auxiliary_loss_mlp": 0.01024009, + "balance_loss_clip": 1.01315475, + "balance_loss_mlp": 1.01989448, + "epoch": 0.5485044340898843, + "flos": 25374175257600.0, + "grad_norm": 2.0938160663611196, + "language_loss": 0.71617699, + "learning_rate": 1.6966505163352307e-06, + "loss": 0.73701817, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.40234375, + "step": 9123, + "time_per_iteration": 2.421902656555176 + }, + { + "auxiliary_loss_clip": 0.0105857, + "auxiliary_loss_mlp": 0.01021739, + "balance_loss_clip": 1.01064062, + "balance_loss_mlp": 1.01889253, + "epoch": 0.5485645573425523, + "flos": 12640338518400.0, + "grad_norm": 1.8414175475629078, + "language_loss": 0.83551306, + "learning_rate": 1.6962770706536126e-06, + "loss": 0.85631615, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.39648438, + "step": 9124, + "time_per_iteration": 2.3690977096557617 + }, + { + "auxiliary_loss_clip": 0.01058719, + "auxiliary_loss_mlp": 0.01022551, + "balance_loss_clip": 1.01086891, + "balance_loss_mlp": 1.01949704, + "epoch": 0.5486246805952202, + "flos": 28328176592640.0, + "grad_norm": 7.620737408024527, + "language_loss": 0.80208886, + "learning_rate": 1.6959036358110845e-06, + "loss": 0.82290161, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39257812, + "step": 9125, + "time_per_iteration": 2.439379930496216 + }, + { + "auxiliary_loss_clip": 0.01060366, + "auxiliary_loss_mlp": 0.01024078, + "balance_loss_clip": 1.01165676, + "balance_loss_mlp": 1.01936746, + "epoch": 0.5486848038478882, + "flos": 16799087738880.0, + "grad_norm": 2.5714264353485423, + "language_loss": 0.74256921, + "learning_rate": 1.6955302118209737e-06, + "loss": 0.76341367, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 9126, + "time_per_iteration": 2.3550949096679688 + }, + { + "auxiliary_loss_clip": 0.01061858, + "auxiliary_loss_mlp": 0.01024958, + "balance_loss_clip": 1.01182711, + "balance_loss_mlp": 1.01933002, + "epoch": 0.5487449271005561, + "flos": 17235327576960.0, + "grad_norm": 2.5514851681347137, + "language_loss": 0.66955209, + "learning_rate": 1.6951567986966061e-06, + "loss": 0.69042021, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 9127, + "time_per_iteration": 2.3454411029815674 + }, + { + "auxiliary_loss_clip": 0.01059794, + "auxiliary_loss_mlp": 0.01025566, + "balance_loss_clip": 1.01315045, + "balance_loss_mlp": 1.01926386, + "epoch": 0.5488050503532241, + "flos": 17340102167040.0, + "grad_norm": 1.6528514589782746, + "language_loss": 0.83687049, + "learning_rate": 1.6947833964513087e-06, + "loss": 0.85772413, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40625, + "step": 9128, + "time_per_iteration": 2.3974545001983643 + }, + { + "auxiliary_loss_clip": 0.0106016, + "auxiliary_loss_mlp": 0.01028122, + "balance_loss_clip": 1.01540232, + "balance_loss_mlp": 1.01943874, + "epoch": 0.548865173605892, + "flos": 17238190308480.0, + "grad_norm": 1.9735187942793315, + "language_loss": 0.73905206, + "learning_rate": 1.6944100050984062e-06, + "loss": 0.75993484, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 9129, + "time_per_iteration": 2.3705942630767822 + }, + { + "auxiliary_loss_clip": 0.01063395, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.0120821, + "balance_loss_mlp": 1.0213635, + "epoch": 0.5489252968585601, + "flos": 17455769101440.0, + "grad_norm": 2.707600664238006, + "language_loss": 0.6300354, + "learning_rate": 1.694036624651225e-06, + "loss": 0.65092576, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41992188, + "step": 9130, + "time_per_iteration": 2.3700313568115234 + }, + { + "auxiliary_loss_clip": 0.0106246, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.01920199, + "balance_loss_mlp": 1.02093279, + "epoch": 0.548985420111228, + "flos": 26322171937920.0, + "grad_norm": 1.757110098241165, + "language_loss": 0.63095069, + "learning_rate": 1.6936632551230895e-06, + "loss": 0.65189242, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4140625, + "step": 9131, + "time_per_iteration": 2.424190044403076 + }, + { + "auxiliary_loss_clip": 0.01061724, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.01589346, + "balance_loss_mlp": 1.01964831, + "epoch": 0.549045543363896, + "flos": 18692846772480.0, + "grad_norm": 1.762797492019322, + "language_loss": 0.74653721, + "learning_rate": 1.6932898965273243e-06, + "loss": 0.76744133, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.421875, + "step": 9132, + "time_per_iteration": 2.372720718383789 + }, + { + "auxiliary_loss_clip": 0.01058173, + "auxiliary_loss_mlp": 0.01023998, + "balance_loss_clip": 1.01195836, + "balance_loss_mlp": 1.01761699, + "epoch": 0.5491056666165639, + "flos": 24716237086080.0, + "grad_norm": 1.5195367580210974, + "language_loss": 0.81314445, + "learning_rate": 1.6929165488772545e-06, + "loss": 0.8339662, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40625, + "step": 9133, + "time_per_iteration": 2.426116943359375 + }, + { + "auxiliary_loss_clip": 0.01058662, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.01383317, + "balance_loss_mlp": 1.01857972, + "epoch": 0.5491657898692319, + "flos": 21575939402880.0, + "grad_norm": 1.704241944561849, + "language_loss": 0.70532191, + "learning_rate": 1.6925432121862021e-06, + "loss": 0.72617161, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 9134, + "time_per_iteration": 2.409421920776367 + }, + { + "auxiliary_loss_clip": 0.01061247, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.01386428, + "balance_loss_mlp": 1.01883066, + "epoch": 0.5492259131219, + "flos": 50474562566400.0, + "grad_norm": 2.0634681474352776, + "language_loss": 0.61081421, + "learning_rate": 1.6921698864674922e-06, + "loss": 0.63168669, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.42382812, + "step": 9135, + "time_per_iteration": 2.6460580825805664 + }, + { + "auxiliary_loss_clip": 0.01063151, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.01552677, + "balance_loss_mlp": 1.02035356, + "epoch": 0.5492860363745679, + "flos": 25118087368320.0, + "grad_norm": 2.079935241575968, + "language_loss": 0.72093141, + "learning_rate": 1.691796571734447e-06, + "loss": 0.74185395, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42773438, + "step": 9136, + "time_per_iteration": 2.4369332790374756 + }, + { + "auxiliary_loss_clip": 0.01064466, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.01393485, + "balance_loss_mlp": 1.01964295, + "epoch": 0.5493461596272359, + "flos": 22632795302400.0, + "grad_norm": 2.0773062759291125, + "language_loss": 0.67885679, + "learning_rate": 1.6914232680003894e-06, + "loss": 0.69978571, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.44726562, + "step": 9137, + "time_per_iteration": 3.806732654571533 + }, + { + "auxiliary_loss_clip": 0.01008753, + "auxiliary_loss_mlp": 0.01000708, + "balance_loss_clip": 0.99985564, + "balance_loss_mlp": 1.00122869, + "epoch": 0.5494062828799038, + "flos": 66148853166720.0, + "grad_norm": 0.7369397080333059, + "language_loss": 0.65322882, + "learning_rate": 1.6910499752786416e-06, + "loss": 0.67332345, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.07519531, + "step": 9138, + "time_per_iteration": 3.0687544345855713 + }, + { + "auxiliary_loss_clip": 0.01060121, + "auxiliary_loss_mlp": 0.01025712, + "balance_loss_clip": 1.01306438, + "balance_loss_mlp": 1.01937664, + "epoch": 0.5494664061325718, + "flos": 21104891072640.0, + "grad_norm": 1.6955627218016747, + "language_loss": 0.75173366, + "learning_rate": 1.6906766935825251e-06, + "loss": 0.77259201, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 9139, + "time_per_iteration": 2.379244804382324 + }, + { + "auxiliary_loss_clip": 0.01059814, + "auxiliary_loss_mlp": 0.01025044, + "balance_loss_clip": 1.01293874, + "balance_loss_mlp": 1.0195322, + "epoch": 0.5495265293852397, + "flos": 14391686649600.0, + "grad_norm": 1.6566310666145987, + "language_loss": 0.71238196, + "learning_rate": 1.6903034229253624e-06, + "loss": 0.73323059, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 9140, + "time_per_iteration": 2.368189811706543 + }, + { + "auxiliary_loss_clip": 0.01059752, + "auxiliary_loss_mlp": 0.01023568, + "balance_loss_clip": 1.01093221, + "balance_loss_mlp": 1.01841211, + "epoch": 0.5495866526379077, + "flos": 25548182807040.0, + "grad_norm": 1.6114467364388236, + "language_loss": 0.76474565, + "learning_rate": 1.6899301633204736e-06, + "loss": 0.78557885, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4140625, + "step": 9141, + "time_per_iteration": 2.419553279876709 + }, + { + "auxiliary_loss_clip": 0.01058771, + "auxiliary_loss_mlp": 0.0102724, + "balance_loss_clip": 1.01534295, + "balance_loss_mlp": 1.01966941, + "epoch": 0.5496467758905756, + "flos": 21316395288960.0, + "grad_norm": 3.1747816422467507, + "language_loss": 0.74896777, + "learning_rate": 1.6895569147811794e-06, + "loss": 0.7698279, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 9142, + "time_per_iteration": 2.3891093730926514 + }, + { + "auxiliary_loss_clip": 0.01061944, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.01574004, + "balance_loss_mlp": 1.01970446, + "epoch": 0.5497068991432437, + "flos": 22232097095040.0, + "grad_norm": 2.0024084504394364, + "language_loss": 0.76708311, + "learning_rate": 1.6891836773208009e-06, + "loss": 0.78799343, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.421875, + "step": 9143, + "time_per_iteration": 2.3883166313171387 + }, + { + "auxiliary_loss_clip": 0.01058804, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.01217675, + "balance_loss_mlp": 1.01878881, + "epoch": 0.5497670223959116, + "flos": 18478095799680.0, + "grad_norm": 2.78779169579275, + "language_loss": 0.79777747, + "learning_rate": 1.688810450952657e-06, + "loss": 0.81860697, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40039062, + "step": 9144, + "time_per_iteration": 2.3718817234039307 + }, + { + "auxiliary_loss_clip": 0.01061666, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.01264822, + "balance_loss_mlp": 1.02030039, + "epoch": 0.5498271456485796, + "flos": 29203833202560.0, + "grad_norm": 1.8765500543465914, + "language_loss": 0.71374488, + "learning_rate": 1.6884372356900679e-06, + "loss": 0.73460639, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.4140625, + "step": 9145, + "time_per_iteration": 2.4317266941070557 + }, + { + "auxiliary_loss_clip": 0.0106053, + "auxiliary_loss_mlp": 0.0102297, + "balance_loss_clip": 1.0111742, + "balance_loss_mlp": 1.02043366, + "epoch": 0.5498872689012475, + "flos": 34822929438720.0, + "grad_norm": 2.353605636625022, + "language_loss": 0.70897877, + "learning_rate": 1.688064031546352e-06, + "loss": 0.72981381, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.40039062, + "step": 9146, + "time_per_iteration": 2.5140390396118164 + }, + { + "auxiliary_loss_clip": 0.01056387, + "auxiliary_loss_mlp": 0.01021952, + "balance_loss_clip": 1.01103878, + "balance_loss_mlp": 1.01839757, + "epoch": 0.5499473921539155, + "flos": 25920740592000.0, + "grad_norm": 3.755065692843361, + "language_loss": 0.63851929, + "learning_rate": 1.6876908385348288e-06, + "loss": 0.65930271, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37890625, + "step": 9147, + "time_per_iteration": 2.4252755641937256 + }, + { + "auxiliary_loss_clip": 0.01061655, + "auxiliary_loss_mlp": 0.01025843, + "balance_loss_clip": 1.01389802, + "balance_loss_mlp": 1.02083933, + "epoch": 0.5500075154065835, + "flos": 22272596138880.0, + "grad_norm": 2.036625742653253, + "language_loss": 0.80669099, + "learning_rate": 1.6873176566688168e-06, + "loss": 0.82756597, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40820312, + "step": 9148, + "time_per_iteration": 2.3980965614318848 + }, + { + "auxiliary_loss_clip": 0.01060427, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.01577461, + "balance_loss_mlp": 1.01901245, + "epoch": 0.5500676386592515, + "flos": 28036267781760.0, + "grad_norm": 2.003916981357456, + "language_loss": 0.79229426, + "learning_rate": 1.6869444859616323e-06, + "loss": 0.81317616, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 9149, + "time_per_iteration": 3.9170100688934326 + }, + { + "auxiliary_loss_clip": 0.0105975, + "auxiliary_loss_mlp": 0.01022417, + "balance_loss_clip": 1.00989962, + "balance_loss_mlp": 1.01897955, + "epoch": 0.5501277619119195, + "flos": 23913688596480.0, + "grad_norm": 2.698727296591227, + "language_loss": 0.71754366, + "learning_rate": 1.6865713264265944e-06, + "loss": 0.73836529, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 9150, + "time_per_iteration": 2.396052837371826 + }, + { + "auxiliary_loss_clip": 0.01061507, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.01213098, + "balance_loss_mlp": 1.01841307, + "epoch": 0.5501878851645874, + "flos": 20922714264960.0, + "grad_norm": 1.9779036409976576, + "language_loss": 0.74860501, + "learning_rate": 1.686198178077019e-06, + "loss": 0.76947778, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4296875, + "step": 9151, + "time_per_iteration": 2.396024227142334 + }, + { + "auxiliary_loss_clip": 0.01060614, + "auxiliary_loss_mlp": 0.01021356, + "balance_loss_clip": 1.00945926, + "balance_loss_mlp": 1.01994681, + "epoch": 0.5502480084172554, + "flos": 20664322225920.0, + "grad_norm": 2.2999447057299456, + "language_loss": 0.76223493, + "learning_rate": 1.685825040926224e-06, + "loss": 0.78305465, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 9152, + "time_per_iteration": 3.8346457481384277 + }, + { + "auxiliary_loss_clip": 0.01058807, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.0139389, + "balance_loss_mlp": 1.01948047, + "epoch": 0.5503081316699233, + "flos": 26431345359360.0, + "grad_norm": 1.852706272634034, + "language_loss": 0.82909632, + "learning_rate": 1.6854519149875253e-06, + "loss": 0.84994167, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 9153, + "time_per_iteration": 3.9073727130889893 + }, + { + "auxiliary_loss_clip": 0.01060976, + "auxiliary_loss_mlp": 0.01024155, + "balance_loss_clip": 1.01142967, + "balance_loss_mlp": 1.01968741, + "epoch": 0.5503682549225913, + "flos": 30627800714880.0, + "grad_norm": 1.7004809685294255, + "language_loss": 0.74201483, + "learning_rate": 1.6850788002742379e-06, + "loss": 0.76286614, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41210938, + "step": 9154, + "time_per_iteration": 2.487049102783203 + }, + { + "auxiliary_loss_clip": 0.01063417, + "auxiliary_loss_mlp": 0.01026768, + "balance_loss_clip": 1.01375651, + "balance_loss_mlp": 1.02024531, + "epoch": 0.5504283781752592, + "flos": 22564330392960.0, + "grad_norm": 1.534020548740588, + "language_loss": 0.72753191, + "learning_rate": 1.6847056967996786e-06, + "loss": 0.74843377, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4296875, + "step": 9155, + "time_per_iteration": 2.401137351989746 + }, + { + "auxiliary_loss_clip": 0.01008158, + "auxiliary_loss_mlp": 0.01001641, + "balance_loss_clip": 1.00072873, + "balance_loss_mlp": 1.00058722, + "epoch": 0.5504885014279273, + "flos": 67898071704960.0, + "grad_norm": 0.7628294933454851, + "language_loss": 0.55394322, + "learning_rate": 1.6843326045771615e-06, + "loss": 0.57404125, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.07568359, + "step": 9156, + "time_per_iteration": 3.1847825050354004 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.01022648, + "balance_loss_clip": 1.01060152, + "balance_loss_mlp": 1.01824677, + "epoch": 0.5505486246805952, + "flos": 22449117306240.0, + "grad_norm": 1.586995793066854, + "language_loss": 0.82336366, + "learning_rate": 1.6839595236200022e-06, + "loss": 0.84417796, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40625, + "step": 9157, + "time_per_iteration": 2.4476990699768066 + }, + { + "auxiliary_loss_clip": 0.01063524, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01328206, + "balance_loss_mlp": 1.02064216, + "epoch": 0.5506087479332632, + "flos": 26905675357440.0, + "grad_norm": 6.490742881056061, + "language_loss": 0.76254296, + "learning_rate": 1.6835864539415145e-06, + "loss": 0.78344953, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4296875, + "step": 9158, + "time_per_iteration": 2.4746878147125244 + }, + { + "auxiliary_loss_clip": 0.01058937, + "auxiliary_loss_mlp": 0.01024375, + "balance_loss_clip": 1.0128293, + "balance_loss_mlp": 1.01975071, + "epoch": 0.5506688711859311, + "flos": 22929137856000.0, + "grad_norm": 2.1622691418600537, + "language_loss": 0.69611114, + "learning_rate": 1.683213395555012e-06, + "loss": 0.71694422, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 9159, + "time_per_iteration": 2.4481661319732666 + }, + { + "auxiliary_loss_clip": 0.01060117, + "auxiliary_loss_mlp": 0.01023513, + "balance_loss_clip": 1.01103139, + "balance_loss_mlp": 1.0191015, + "epoch": 0.5507289944385991, + "flos": 29605124903040.0, + "grad_norm": 1.5600469705212814, + "language_loss": 0.68223912, + "learning_rate": 1.6828403484738089e-06, + "loss": 0.70307541, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 9160, + "time_per_iteration": 2.4710428714752197 + }, + { + "auxiliary_loss_clip": 0.01057503, + "auxiliary_loss_mlp": 0.01023119, + "balance_loss_clip": 1.01193142, + "balance_loss_mlp": 1.01932549, + "epoch": 0.5507891176912671, + "flos": 15333713487360.0, + "grad_norm": 2.6306067657119687, + "language_loss": 0.71189415, + "learning_rate": 1.6824673127112178e-06, + "loss": 0.73270035, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.3828125, + "step": 9161, + "time_per_iteration": 2.3918869495391846 + }, + { + "auxiliary_loss_clip": 0.0106097, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01677108, + "balance_loss_mlp": 1.01985252, + "epoch": 0.5508492409439351, + "flos": 26577107752320.0, + "grad_norm": 2.307498467039448, + "language_loss": 0.6916396, + "learning_rate": 1.6820942882805515e-06, + "loss": 0.71254528, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 9162, + "time_per_iteration": 2.458268404006958 + }, + { + "auxiliary_loss_clip": 0.0105978, + "auxiliary_loss_mlp": 0.01025373, + "balance_loss_clip": 1.01273727, + "balance_loss_mlp": 1.01930594, + "epoch": 0.5509093641966031, + "flos": 25442360876160.0, + "grad_norm": 1.6282387878704516, + "language_loss": 0.86007392, + "learning_rate": 1.681721275195123e-06, + "loss": 0.88092542, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 9163, + "time_per_iteration": 2.455307722091675 + }, + { + "auxiliary_loss_clip": 0.01059715, + "auxiliary_loss_mlp": 0.01025646, + "balance_loss_clip": 1.01436257, + "balance_loss_mlp": 1.02007198, + "epoch": 0.550969487449271, + "flos": 18697524894720.0, + "grad_norm": 1.5851453209073836, + "language_loss": 0.80682856, + "learning_rate": 1.6813482734682426e-06, + "loss": 0.8276822, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.39648438, + "step": 9164, + "time_per_iteration": 2.392909526824951 + }, + { + "auxiliary_loss_clip": 0.01063707, + "auxiliary_loss_mlp": 0.01022107, + "balance_loss_clip": 1.00923848, + "balance_loss_mlp": 1.02196002, + "epoch": 0.551029610701939, + "flos": 22707683902080.0, + "grad_norm": 2.0340801105583557, + "language_loss": 0.73130935, + "learning_rate": 1.680975283113223e-06, + "loss": 0.75216752, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 9165, + "time_per_iteration": 2.4271161556243896 + }, + { + "auxiliary_loss_clip": 0.01057514, + "auxiliary_loss_mlp": 0.01022166, + "balance_loss_clip": 1.01009607, + "balance_loss_mlp": 1.01790285, + "epoch": 0.5510897339546069, + "flos": 12419722437120.0, + "grad_norm": 2.7854252905292753, + "language_loss": 0.78776968, + "learning_rate": 1.6806023041433745e-06, + "loss": 0.80856645, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 9166, + "time_per_iteration": 2.3575210571289062 + }, + { + "auxiliary_loss_clip": 0.01062583, + "auxiliary_loss_mlp": 0.01021907, + "balance_loss_clip": 1.01011705, + "balance_loss_mlp": 1.0204519, + "epoch": 0.5511498572072749, + "flos": 18769585674240.0, + "grad_norm": 2.029791057520658, + "language_loss": 0.76134121, + "learning_rate": 1.6802293365720087e-06, + "loss": 0.78218615, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.421875, + "step": 9167, + "time_per_iteration": 2.389617681503296 + }, + { + "auxiliary_loss_clip": 0.0105798, + "auxiliary_loss_mlp": 0.01021201, + "balance_loss_clip": 1.0089047, + "balance_loss_mlp": 1.01826441, + "epoch": 0.5512099804599428, + "flos": 19572308720640.0, + "grad_norm": 2.1900156587855126, + "language_loss": 0.80383825, + "learning_rate": 1.679856380412435e-06, + "loss": 0.82463002, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 9168, + "time_per_iteration": 2.3541224002838135 + }, + { + "auxiliary_loss_clip": 0.01061521, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01785493, + "balance_loss_mlp": 1.02023959, + "epoch": 0.5512701037126109, + "flos": 26244525340800.0, + "grad_norm": 1.5706477454358068, + "language_loss": 0.68883586, + "learning_rate": 1.6794834356779634e-06, + "loss": 0.70974809, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41210938, + "step": 9169, + "time_per_iteration": 2.479041337966919 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01020022, + "balance_loss_clip": 1.009269, + "balance_loss_mlp": 1.01783538, + "epoch": 0.5513302269652788, + "flos": 21944307824640.0, + "grad_norm": 2.0832719431050974, + "language_loss": 0.84987307, + "learning_rate": 1.6791105023819042e-06, + "loss": 0.87063587, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.3828125, + "step": 9170, + "time_per_iteration": 2.4072978496551514 + }, + { + "auxiliary_loss_clip": 0.01008234, + "auxiliary_loss_mlp": 0.01002676, + "balance_loss_clip": 1.00171626, + "balance_loss_mlp": 1.00080323, + "epoch": 0.5513903502179468, + "flos": 68232818620800.0, + "grad_norm": 0.7797433067757821, + "language_loss": 0.59901154, + "learning_rate": 1.678737580537565e-06, + "loss": 0.6191206, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07421875, + "step": 9171, + "time_per_iteration": 3.0909347534179688 + }, + { + "auxiliary_loss_clip": 0.01057838, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.01310444, + "balance_loss_mlp": 1.01851881, + "epoch": 0.5514504734706147, + "flos": 18733241082240.0, + "grad_norm": 1.4520745610776515, + "language_loss": 0.70501757, + "learning_rate": 1.6783646701582557e-06, + "loss": 0.72584069, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.39257812, + "step": 9172, + "time_per_iteration": 2.4155521392822266 + }, + { + "auxiliary_loss_clip": 0.01058238, + "auxiliary_loss_mlp": 0.01024111, + "balance_loss_clip": 1.01238036, + "balance_loss_mlp": 1.01874876, + "epoch": 0.5515105967232827, + "flos": 22269942875520.0, + "grad_norm": 1.8743267713348524, + "language_loss": 0.75929976, + "learning_rate": 1.6779917712572833e-06, + "loss": 0.78012323, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 9173, + "time_per_iteration": 2.46028733253479 + }, + { + "auxiliary_loss_clip": 0.01057282, + "auxiliary_loss_mlp": 0.01022474, + "balance_loss_clip": 1.0110234, + "balance_loss_mlp": 1.01881146, + "epoch": 0.5515707199759508, + "flos": 22556789539200.0, + "grad_norm": 1.7422881291011891, + "language_loss": 0.75250304, + "learning_rate": 1.677618883847957e-06, + "loss": 0.77330053, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38476562, + "step": 9174, + "time_per_iteration": 2.5777571201324463 + }, + { + "auxiliary_loss_clip": 0.01060424, + "auxiliary_loss_mlp": 0.01022443, + "balance_loss_clip": 1.01032567, + "balance_loss_mlp": 1.02003455, + "epoch": 0.5516308432286187, + "flos": 28289876964480.0, + "grad_norm": 3.8223660074021106, + "language_loss": 0.72519183, + "learning_rate": 1.6772460079435832e-06, + "loss": 0.74602044, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40429688, + "step": 9175, + "time_per_iteration": 2.4470465183258057 + }, + { + "auxiliary_loss_clip": 0.01058906, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01343083, + "balance_loss_mlp": 1.01835394, + "epoch": 0.5516909664812867, + "flos": 18763650743040.0, + "grad_norm": 2.0997093893997287, + "language_loss": 0.69764233, + "learning_rate": 1.676873143557469e-06, + "loss": 0.71848202, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40625, + "step": 9176, + "time_per_iteration": 3.8213655948638916 + }, + { + "auxiliary_loss_clip": 0.01058424, + "auxiliary_loss_mlp": 0.01023257, + "balance_loss_clip": 1.01190829, + "balance_loss_mlp": 1.0190655, + "epoch": 0.5517510897339546, + "flos": 27739261912320.0, + "grad_norm": 1.4567562674102992, + "language_loss": 0.7093724, + "learning_rate": 1.6765002907029215e-06, + "loss": 0.7301892, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39453125, + "step": 9177, + "time_per_iteration": 2.615405321121216 + }, + { + "auxiliary_loss_clip": 0.01060149, + "auxiliary_loss_mlp": 0.0102109, + "balance_loss_clip": 1.00924063, + "balance_loss_mlp": 1.01996326, + "epoch": 0.5518112129866226, + "flos": 18403521402240.0, + "grad_norm": 1.5100212053017663, + "language_loss": 0.801965, + "learning_rate": 1.6761274493932466e-06, + "loss": 0.82277739, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40234375, + "step": 9178, + "time_per_iteration": 2.394702434539795 + }, + { + "auxiliary_loss_clip": 0.01061126, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0221113, + "balance_loss_mlp": 1.0194931, + "epoch": 0.5518713362392905, + "flos": 25081498396800.0, + "grad_norm": 1.5350719424687234, + "language_loss": 0.74748254, + "learning_rate": 1.6757546196417496e-06, + "loss": 0.76843882, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 9179, + "time_per_iteration": 2.4049973487854004 + }, + { + "auxiliary_loss_clip": 0.01060535, + "auxiliary_loss_mlp": 0.01022877, + "balance_loss_clip": 1.01167727, + "balance_loss_mlp": 1.0207566, + "epoch": 0.5519314594919585, + "flos": 36537514041600.0, + "grad_norm": 1.759559327523541, + "language_loss": 0.69704676, + "learning_rate": 1.6753818014617363e-06, + "loss": 0.71788085, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3984375, + "step": 9180, + "time_per_iteration": 2.518559694290161 + }, + { + "auxiliary_loss_clip": 0.0106032, + "auxiliary_loss_mlp": 0.01029133, + "balance_loss_clip": 1.01700354, + "balance_loss_mlp": 1.02013302, + "epoch": 0.5519915827446265, + "flos": 20447581305600.0, + "grad_norm": 4.542025061451799, + "language_loss": 0.66885769, + "learning_rate": 1.6750089948665112e-06, + "loss": 0.68975222, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40234375, + "step": 9181, + "time_per_iteration": 2.4015140533447266 + }, + { + "auxiliary_loss_clip": 0.01060314, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.01395345, + "balance_loss_mlp": 1.01895356, + "epoch": 0.5520517059972945, + "flos": 23766948685440.0, + "grad_norm": 1.8241267831122463, + "language_loss": 0.79184651, + "learning_rate": 1.6746361998693793e-06, + "loss": 0.81271708, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 9182, + "time_per_iteration": 2.53857421875 + }, + { + "auxiliary_loss_clip": 0.01059837, + "auxiliary_loss_mlp": 0.01022163, + "balance_loss_clip": 1.01003969, + "balance_loss_mlp": 1.01958704, + "epoch": 0.5521118292499624, + "flos": 22195473212160.0, + "grad_norm": 2.3540126364972194, + "language_loss": 0.83193809, + "learning_rate": 1.6742634164836442e-06, + "loss": 0.85275811, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 9183, + "time_per_iteration": 2.398482084274292 + }, + { + "auxiliary_loss_clip": 0.01058245, + "auxiliary_loss_mlp": 0.01022487, + "balance_loss_clip": 1.0105536, + "balance_loss_mlp": 1.0185256, + "epoch": 0.5521719525026304, + "flos": 23582258259840.0, + "grad_norm": 1.301722561561747, + "language_loss": 0.67274851, + "learning_rate": 1.6738906447226103e-06, + "loss": 0.69355583, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 9184, + "time_per_iteration": 2.424783945083618 + }, + { + "auxiliary_loss_clip": 0.01060586, + "auxiliary_loss_mlp": 0.01021592, + "balance_loss_clip": 1.00929511, + "balance_loss_mlp": 1.01997066, + "epoch": 0.5522320757552983, + "flos": 26136503994240.0, + "grad_norm": 1.5510886768422618, + "language_loss": 0.76369739, + "learning_rate": 1.6735178845995803e-06, + "loss": 0.7845192, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 9185, + "time_per_iteration": 2.438058853149414 + }, + { + "auxiliary_loss_clip": 0.01060027, + "auxiliary_loss_mlp": 0.01026379, + "balance_loss_clip": 1.01393366, + "balance_loss_mlp": 1.01951241, + "epoch": 0.5522921990079663, + "flos": 24675144549120.0, + "grad_norm": 1.654083574044849, + "language_loss": 0.73804802, + "learning_rate": 1.673145136127857e-06, + "loss": 0.75891209, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40625, + "step": 9186, + "time_per_iteration": 2.454637289047241 + }, + { + "auxiliary_loss_clip": 0.01062348, + "auxiliary_loss_mlp": 0.01026291, + "balance_loss_clip": 1.0146029, + "balance_loss_mlp": 1.0209372, + "epoch": 0.5523523222606344, + "flos": 22747030871040.0, + "grad_norm": 1.9466578579264253, + "language_loss": 0.67049116, + "learning_rate": 1.6727723993207432e-06, + "loss": 0.69137752, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.4140625, + "step": 9187, + "time_per_iteration": 2.4345500469207764 + }, + { + "auxiliary_loss_clip": 0.01060176, + "auxiliary_loss_mlp": 0.01024972, + "balance_loss_clip": 1.01356411, + "balance_loss_mlp": 1.0203954, + "epoch": 0.5524124455133023, + "flos": 19754799730560.0, + "grad_norm": 1.4862334753651385, + "language_loss": 0.77398026, + "learning_rate": 1.6723996741915406e-06, + "loss": 0.79483175, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39648438, + "step": 9188, + "time_per_iteration": 2.4476118087768555 + }, + { + "auxiliary_loss_clip": 0.01058137, + "auxiliary_loss_mlp": 0.01023382, + "balance_loss_clip": 1.01236677, + "balance_loss_mlp": 1.0184809, + "epoch": 0.5524725687659703, + "flos": 23293700939520.0, + "grad_norm": 1.66925520942364, + "language_loss": 0.81169426, + "learning_rate": 1.672026960753551e-06, + "loss": 0.8325094, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3984375, + "step": 9189, + "time_per_iteration": 3.8981242179870605 + }, + { + "auxiliary_loss_clip": 0.01058608, + "auxiliary_loss_mlp": 0.01020835, + "balance_loss_clip": 1.00892591, + "balance_loss_mlp": 1.01951754, + "epoch": 0.5525326920186382, + "flos": 24861056872320.0, + "grad_norm": 1.3350071450230618, + "language_loss": 0.76406908, + "learning_rate": 1.6716542590200753e-06, + "loss": 0.78486353, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 9190, + "time_per_iteration": 2.475370407104492 + }, + { + "auxiliary_loss_clip": 0.01061769, + "auxiliary_loss_mlp": 0.01025424, + "balance_loss_clip": 1.01186967, + "balance_loss_mlp": 1.01901031, + "epoch": 0.5525928152713062, + "flos": 13734725996160.0, + "grad_norm": 2.6631961168173017, + "language_loss": 0.78780347, + "learning_rate": 1.671281569004415e-06, + "loss": 0.80867541, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42773438, + "step": 9191, + "time_per_iteration": 2.4060275554656982 + }, + { + "auxiliary_loss_clip": 0.01058259, + "auxiliary_loss_mlp": 0.0102112, + "balance_loss_clip": 1.01021791, + "balance_loss_mlp": 1.01934659, + "epoch": 0.5526529385239741, + "flos": 13070957627520.0, + "grad_norm": 1.949551898050896, + "language_loss": 0.68681169, + "learning_rate": 1.6709088907198698e-06, + "loss": 0.70760548, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38867188, + "step": 9192, + "time_per_iteration": 3.7587640285491943 + }, + { + "auxiliary_loss_clip": 0.01056985, + "auxiliary_loss_mlp": 0.01018954, + "balance_loss_clip": 1.00909543, + "balance_loss_mlp": 1.01935363, + "epoch": 0.5527130617766421, + "flos": 23147275230720.0, + "grad_norm": 1.4509086305437524, + "language_loss": 0.77825117, + "learning_rate": 1.6705362241797398e-06, + "loss": 0.79901057, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.37695312, + "step": 9193, + "time_per_iteration": 3.8872132301330566 + }, + { + "auxiliary_loss_clip": 0.01059507, + "auxiliary_loss_mlp": 0.01024208, + "balance_loss_clip": 1.01276386, + "balance_loss_mlp": 1.01947832, + "epoch": 0.55277318502931, + "flos": 21284554262400.0, + "grad_norm": 1.4554803950966924, + "language_loss": 0.78564674, + "learning_rate": 1.6701635693973245e-06, + "loss": 0.80648392, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40039062, + "step": 9194, + "time_per_iteration": 2.480173110961914 + }, + { + "auxiliary_loss_clip": 0.0106232, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.01101398, + "balance_loss_mlp": 1.01896572, + "epoch": 0.5528333082819781, + "flos": 38323077171840.0, + "grad_norm": 1.678989791933088, + "language_loss": 0.6963799, + "learning_rate": 1.6697909263859226e-06, + "loss": 0.71723169, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.43359375, + "step": 9195, + "time_per_iteration": 2.5689306259155273 + }, + { + "auxiliary_loss_clip": 0.01064583, + "auxiliary_loss_mlp": 0.01023896, + "balance_loss_clip": 1.01138508, + "balance_loss_mlp": 1.02079964, + "epoch": 0.552893431534646, + "flos": 13552758656640.0, + "grad_norm": 3.002475981232837, + "language_loss": 0.74253953, + "learning_rate": 1.6694182951588335e-06, + "loss": 0.76342428, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4375, + "step": 9196, + "time_per_iteration": 2.444375991821289 + }, + { + "auxiliary_loss_clip": 0.01060025, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.01475096, + "balance_loss_mlp": 1.0204283, + "epoch": 0.552953554787314, + "flos": 21938477627520.0, + "grad_norm": 1.5453619427573195, + "language_loss": 0.77287108, + "learning_rate": 1.669045675729355e-06, + "loss": 0.79373437, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39453125, + "step": 9197, + "time_per_iteration": 2.4145395755767822 + }, + { + "auxiliary_loss_clip": 0.01057692, + "auxiliary_loss_mlp": 0.01021659, + "balance_loss_clip": 1.01070416, + "balance_loss_mlp": 1.01859093, + "epoch": 0.5530136780399819, + "flos": 43656199528320.0, + "grad_norm": 1.4243516894263302, + "language_loss": 0.76398069, + "learning_rate": 1.6686730681107849e-06, + "loss": 0.78477418, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.390625, + "step": 9198, + "time_per_iteration": 2.6724085807800293 + }, + { + "auxiliary_loss_clip": 0.01057724, + "auxiliary_loss_mlp": 0.01020204, + "balance_loss_clip": 1.00929022, + "balance_loss_mlp": 1.01804066, + "epoch": 0.5530738012926499, + "flos": 25044350843520.0, + "grad_norm": 1.5611702789487698, + "language_loss": 0.79132622, + "learning_rate": 1.6683004723164208e-06, + "loss": 0.81210548, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.3984375, + "step": 9199, + "time_per_iteration": 2.4309799671173096 + }, + { + "auxiliary_loss_clip": 0.01059145, + "auxiliary_loss_mlp": 0.01023191, + "balance_loss_clip": 1.01178288, + "balance_loss_mlp": 1.0189867, + "epoch": 0.553133924545318, + "flos": 16471148538240.0, + "grad_norm": 1.702975889101064, + "language_loss": 0.63863534, + "learning_rate": 1.6679278883595592e-06, + "loss": 0.6594587, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40234375, + "step": 9200, + "time_per_iteration": 2.3720486164093018 + }, + { + "auxiliary_loss_clip": 0.01061432, + "auxiliary_loss_mlp": 0.01026077, + "balance_loss_clip": 1.01306558, + "balance_loss_mlp": 1.01977515, + "epoch": 0.5531940477979859, + "flos": 24605108628480.0, + "grad_norm": 1.513261330880206, + "language_loss": 0.66370988, + "learning_rate": 1.6675553162534977e-06, + "loss": 0.68458498, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 9201, + "time_per_iteration": 2.4252126216888428 + }, + { + "auxiliary_loss_clip": 0.01061153, + "auxiliary_loss_mlp": 0.01022882, + "balance_loss_clip": 1.0105083, + "balance_loss_mlp": 1.0206219, + "epoch": 0.5532541710506539, + "flos": 22158604949760.0, + "grad_norm": 1.8991825749164182, + "language_loss": 0.6091156, + "learning_rate": 1.667182756011532e-06, + "loss": 0.62995601, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40625, + "step": 9202, + "time_per_iteration": 2.3941972255706787 + }, + { + "auxiliary_loss_clip": 0.01058995, + "auxiliary_loss_mlp": 0.01022818, + "balance_loss_clip": 1.01186323, + "balance_loss_mlp": 1.02015495, + "epoch": 0.5533142943033218, + "flos": 21396206390400.0, + "grad_norm": 2.7434020623964415, + "language_loss": 0.64448953, + "learning_rate": 1.6668102076469567e-06, + "loss": 0.66530764, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38671875, + "step": 9203, + "time_per_iteration": 2.403965473175049 + }, + { + "auxiliary_loss_clip": 0.01059453, + "auxiliary_loss_mlp": 0.01022752, + "balance_loss_clip": 1.01101589, + "balance_loss_mlp": 1.01884651, + "epoch": 0.5533744175559898, + "flos": 23549404803840.0, + "grad_norm": 1.6235596513877546, + "language_loss": 0.63723838, + "learning_rate": 1.6664376711730687e-06, + "loss": 0.65806049, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.40625, + "step": 9204, + "time_per_iteration": 2.396962881088257 + }, + { + "auxiliary_loss_clip": 0.01057046, + "auxiliary_loss_mlp": 0.010228, + "balance_loss_clip": 1.01192164, + "balance_loss_mlp": 1.01829302, + "epoch": 0.5534345408086577, + "flos": 24060358684800.0, + "grad_norm": 2.096455828126633, + "language_loss": 0.7709406, + "learning_rate": 1.6660651466031616e-06, + "loss": 0.79173899, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.38671875, + "step": 9205, + "time_per_iteration": 2.449313163757324 + }, + { + "auxiliary_loss_clip": 0.01059714, + "auxiliary_loss_mlp": 0.01024907, + "balance_loss_clip": 1.01342106, + "balance_loss_mlp": 1.02011669, + "epoch": 0.5534946640613257, + "flos": 33770262902400.0, + "grad_norm": 1.330582309378557, + "language_loss": 0.7258662, + "learning_rate": 1.6656926339505311e-06, + "loss": 0.74671245, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39453125, + "step": 9206, + "time_per_iteration": 2.577876329421997 + }, + { + "auxiliary_loss_clip": 0.0105973, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.01730645, + "balance_loss_mlp": 1.0196228, + "epoch": 0.5535547873139937, + "flos": 15158309483520.0, + "grad_norm": 1.7932026689067269, + "language_loss": 0.75443161, + "learning_rate": 1.6653201332284705e-06, + "loss": 0.77531779, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.40234375, + "step": 9207, + "time_per_iteration": 2.3840765953063965 + }, + { + "auxiliary_loss_clip": 0.01061821, + "auxiliary_loss_mlp": 0.01025167, + "balance_loss_clip": 1.01251924, + "balance_loss_mlp": 1.01983356, + "epoch": 0.5536149105666617, + "flos": 16979972826240.0, + "grad_norm": 2.322224464186771, + "language_loss": 0.61953336, + "learning_rate": 1.6649476444502734e-06, + "loss": 0.64040321, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41992188, + "step": 9208, + "time_per_iteration": 2.3531670570373535 + }, + { + "auxiliary_loss_clip": 0.01060238, + "auxiliary_loss_mlp": 0.01021597, + "balance_loss_clip": 1.01029563, + "balance_loss_mlp": 1.01929379, + "epoch": 0.5536750338193296, + "flos": 18148969612800.0, + "grad_norm": 2.4563983461125916, + "language_loss": 0.64147282, + "learning_rate": 1.664575167629233e-06, + "loss": 0.66229117, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.41015625, + "step": 9209, + "time_per_iteration": 2.383112668991089 + }, + { + "auxiliary_loss_clip": 0.01060606, + "auxiliary_loss_mlp": 0.01026202, + "balance_loss_clip": 1.01342869, + "balance_loss_mlp": 1.01984549, + "epoch": 0.5537351570719976, + "flos": 22746681757440.0, + "grad_norm": 2.061499919593469, + "language_loss": 0.82190585, + "learning_rate": 1.6642027027786415e-06, + "loss": 0.84277385, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40820312, + "step": 9210, + "time_per_iteration": 2.402233123779297 + }, + { + "auxiliary_loss_clip": 0.01056814, + "auxiliary_loss_mlp": 0.01023903, + "balance_loss_clip": 1.0128051, + "balance_loss_mlp": 1.01783323, + "epoch": 0.5537952803246655, + "flos": 26354920659840.0, + "grad_norm": 1.8358297172429165, + "language_loss": 0.72993374, + "learning_rate": 1.6638302499117924e-06, + "loss": 0.75074089, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.390625, + "step": 9211, + "time_per_iteration": 2.5177791118621826 + }, + { + "auxiliary_loss_clip": 0.01061115, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01417565, + "balance_loss_mlp": 1.01901627, + "epoch": 0.5538554035773335, + "flos": 18036549434880.0, + "grad_norm": 2.175345732094041, + "language_loss": 0.66861945, + "learning_rate": 1.6634578090419766e-06, + "loss": 0.68951297, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.421875, + "step": 9212, + "time_per_iteration": 2.4863383769989014 + }, + { + "auxiliary_loss_clip": 0.010603, + "auxiliary_loss_mlp": 0.01023897, + "balance_loss_clip": 1.01017606, + "balance_loss_mlp": 1.01779079, + "epoch": 0.5539155268300014, + "flos": 31684900993920.0, + "grad_norm": 2.138649418439393, + "language_loss": 0.51562548, + "learning_rate": 1.663085380182486e-06, + "loss": 0.53646743, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42578125, + "step": 9213, + "time_per_iteration": 2.494345188140869 + }, + { + "auxiliary_loss_clip": 0.01061625, + "auxiliary_loss_mlp": 0.01025751, + "balance_loss_clip": 1.01376414, + "balance_loss_mlp": 1.0200913, + "epoch": 0.5539756500826695, + "flos": 15192908507520.0, + "grad_norm": 1.7835131706408158, + "language_loss": 0.85993409, + "learning_rate": 1.6627129633466117e-06, + "loss": 0.88080788, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 9214, + "time_per_iteration": 2.3762340545654297 + }, + { + "auxiliary_loss_clip": 0.01059995, + "auxiliary_loss_mlp": 0.0102138, + "balance_loss_clip": 1.0097506, + "balance_loss_mlp": 1.02078474, + "epoch": 0.5540357733353375, + "flos": 26352092839680.0, + "grad_norm": 1.7543005833244114, + "language_loss": 0.69867086, + "learning_rate": 1.6623405585476438e-06, + "loss": 0.71948463, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 9215, + "time_per_iteration": 2.457019805908203 + }, + { + "auxiliary_loss_clip": 0.01061694, + "auxiliary_loss_mlp": 0.01023867, + "balance_loss_clip": 1.01129651, + "balance_loss_mlp": 1.01997793, + "epoch": 0.5540958965880054, + "flos": 21322644422400.0, + "grad_norm": 1.734215469424625, + "language_loss": 0.73463607, + "learning_rate": 1.6619681657988732e-06, + "loss": 0.75549167, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41601562, + "step": 9216, + "time_per_iteration": 3.8703482151031494 + }, + { + "auxiliary_loss_clip": 0.01057811, + "auxiliary_loss_mlp": 0.01022691, + "balance_loss_clip": 1.01115751, + "balance_loss_mlp": 1.01932728, + "epoch": 0.5541560198406734, + "flos": 25665630220800.0, + "grad_norm": 2.102571865512665, + "language_loss": 0.71805269, + "learning_rate": 1.661595785113589e-06, + "loss": 0.73885769, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38476562, + "step": 9217, + "time_per_iteration": 2.478914499282837 + }, + { + "auxiliary_loss_clip": 0.01057454, + "auxiliary_loss_mlp": 0.01021395, + "balance_loss_clip": 1.00996888, + "balance_loss_mlp": 1.01854229, + "epoch": 0.5542161430933413, + "flos": 21938687095680.0, + "grad_norm": 1.7437713056846165, + "language_loss": 0.78337801, + "learning_rate": 1.6612234165050808e-06, + "loss": 0.8041665, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 9218, + "time_per_iteration": 2.411794900894165 + }, + { + "auxiliary_loss_clip": 0.01060281, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.01206541, + "balance_loss_mlp": 1.01773906, + "epoch": 0.5542762663460093, + "flos": 19570493329920.0, + "grad_norm": 1.8299918259088301, + "language_loss": 0.72232348, + "learning_rate": 1.6608510599866374e-06, + "loss": 0.74318445, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.42578125, + "step": 9219, + "time_per_iteration": 2.3751182556152344 + }, + { + "auxiliary_loss_clip": 0.01063192, + "auxiliary_loss_mlp": 0.01027717, + "balance_loss_clip": 1.0149678, + "balance_loss_mlp": 1.02105701, + "epoch": 0.5543363895986773, + "flos": 19498083436800.0, + "grad_norm": 1.7406572169220327, + "language_loss": 0.71391952, + "learning_rate": 1.6604787155715471e-06, + "loss": 0.73482859, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 9220, + "time_per_iteration": 2.388030767440796 + }, + { + "auxiliary_loss_clip": 0.01058778, + "auxiliary_loss_mlp": 0.01021981, + "balance_loss_clip": 1.01057887, + "balance_loss_mlp": 1.01942956, + "epoch": 0.5543965128513453, + "flos": 22634575781760.0, + "grad_norm": 1.6590877098963068, + "language_loss": 0.79038906, + "learning_rate": 1.6601063832730984e-06, + "loss": 0.81119663, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39453125, + "step": 9221, + "time_per_iteration": 2.372298002243042 + }, + { + "auxiliary_loss_clip": 0.01059467, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.01615727, + "balance_loss_mlp": 1.01886749, + "epoch": 0.5544566361040132, + "flos": 25988891299200.0, + "grad_norm": 1.6653284084703057, + "language_loss": 0.83392799, + "learning_rate": 1.6597340631045783e-06, + "loss": 0.85481548, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 9222, + "time_per_iteration": 2.399965524673462 + }, + { + "auxiliary_loss_clip": 0.01062289, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.0138129, + "balance_loss_mlp": 1.01923883, + "epoch": 0.5545167593566812, + "flos": 28256290369920.0, + "grad_norm": 1.7628423526340684, + "language_loss": 0.75594008, + "learning_rate": 1.6593617550792749e-06, + "loss": 0.77683353, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43164062, + "step": 9223, + "time_per_iteration": 2.460369825363159 + }, + { + "auxiliary_loss_clip": 0.01059479, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.01493597, + "balance_loss_mlp": 1.01942444, + "epoch": 0.5545768826093491, + "flos": 28475265617280.0, + "grad_norm": 1.722771668276774, + "language_loss": 0.8194108, + "learning_rate": 1.6589894592104738e-06, + "loss": 0.84028161, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 9224, + "time_per_iteration": 2.482306718826294 + }, + { + "auxiliary_loss_clip": 0.01062005, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.01884711, + "balance_loss_mlp": 1.01988077, + "epoch": 0.5546370058620171, + "flos": 18477083370240.0, + "grad_norm": 1.7406152385805198, + "language_loss": 0.57268882, + "learning_rate": 1.6586171755114614e-06, + "loss": 0.59362531, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.421875, + "step": 9225, + "time_per_iteration": 2.386018753051758 + }, + { + "auxiliary_loss_clip": 0.01061602, + "auxiliary_loss_mlp": 0.01024939, + "balance_loss_clip": 1.01259458, + "balance_loss_mlp": 1.01894987, + "epoch": 0.554697129114685, + "flos": 22929382235520.0, + "grad_norm": 1.9132779793829142, + "language_loss": 0.67713964, + "learning_rate": 1.6582449039955242e-06, + "loss": 0.69800508, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.42578125, + "step": 9226, + "time_per_iteration": 2.373483180999756 + }, + { + "auxiliary_loss_clip": 0.01059482, + "auxiliary_loss_mlp": 0.01025302, + "balance_loss_clip": 1.01332736, + "balance_loss_mlp": 1.01883972, + "epoch": 0.5547572523673531, + "flos": 21796136547840.0, + "grad_norm": 1.3847433417869583, + "language_loss": 0.74340117, + "learning_rate": 1.657872644675947e-06, + "loss": 0.76424903, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40625, + "step": 9227, + "time_per_iteration": 2.401776075363159 + }, + { + "auxiliary_loss_clip": 0.01063507, + "auxiliary_loss_mlp": 0.01023882, + "balance_loss_clip": 1.01080513, + "balance_loss_mlp": 1.02139521, + "epoch": 0.5548173756200211, + "flos": 22341829098240.0, + "grad_norm": 1.5666101204549079, + "language_loss": 0.73587132, + "learning_rate": 1.6575003975660154e-06, + "loss": 0.75674516, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.421875, + "step": 9228, + "time_per_iteration": 2.378403425216675 + }, + { + "auxiliary_loss_clip": 0.01060629, + "auxiliary_loss_mlp": 0.01023688, + "balance_loss_clip": 1.01151705, + "balance_loss_mlp": 1.01881731, + "epoch": 0.554877498872689, + "flos": 17857759029120.0, + "grad_norm": 1.6848225159036558, + "language_loss": 0.75559586, + "learning_rate": 1.657128162679013e-06, + "loss": 0.77643907, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41796875, + "step": 9229, + "time_per_iteration": 3.7427563667297363 + }, + { + "auxiliary_loss_clip": 0.01060772, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.0194906, + "balance_loss_mlp": 1.02035999, + "epoch": 0.554937622125357, + "flos": 17237387347200.0, + "grad_norm": 1.3709359346679761, + "language_loss": 0.68944097, + "learning_rate": 1.6567559400282248e-06, + "loss": 0.71037388, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40234375, + "step": 9230, + "time_per_iteration": 2.428732395172119 + }, + { + "auxiliary_loss_clip": 0.01061098, + "auxiliary_loss_mlp": 0.0102643, + "balance_loss_clip": 1.01370442, + "balance_loss_mlp": 1.01883197, + "epoch": 0.5549977453780249, + "flos": 25367088251520.0, + "grad_norm": 2.251964770790021, + "language_loss": 0.68934411, + "learning_rate": 1.6563837296269347e-06, + "loss": 0.71021938, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 9231, + "time_per_iteration": 3.775827407836914 + }, + { + "auxiliary_loss_clip": 0.01062298, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.01193523, + "balance_loss_mlp": 1.02029312, + "epoch": 0.555057868630693, + "flos": 25078042172160.0, + "grad_norm": 2.1736941652189596, + "language_loss": 0.78101289, + "learning_rate": 1.6560115314884247e-06, + "loss": 0.80189103, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41992188, + "step": 9232, + "time_per_iteration": 2.414865493774414 + }, + { + "auxiliary_loss_clip": 0.01058006, + "auxiliary_loss_mlp": 0.01025525, + "balance_loss_clip": 1.01417041, + "balance_loss_mlp": 1.01822329, + "epoch": 0.5551179918833609, + "flos": 26103022133760.0, + "grad_norm": 1.5149576118497479, + "language_loss": 0.82797211, + "learning_rate": 1.6556393456259787e-06, + "loss": 0.84880739, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3984375, + "step": 9233, + "time_per_iteration": 3.845447063446045 + }, + { + "auxiliary_loss_clip": 0.01063157, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.01322746, + "balance_loss_mlp": 1.02062726, + "epoch": 0.5551781151360289, + "flos": 19383917690880.0, + "grad_norm": 2.605943346263569, + "language_loss": 0.74981135, + "learning_rate": 1.6552671720528783e-06, + "loss": 0.7707057, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 9234, + "time_per_iteration": 2.373871088027954 + }, + { + "auxiliary_loss_clip": 0.01057287, + "auxiliary_loss_mlp": 0.01018757, + "balance_loss_clip": 1.00742042, + "balance_loss_mlp": 1.01794577, + "epoch": 0.5552382383886968, + "flos": 21724878729600.0, + "grad_norm": 1.9046110084189745, + "language_loss": 0.64750004, + "learning_rate": 1.6548950107824062e-06, + "loss": 0.66826046, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39453125, + "step": 9235, + "time_per_iteration": 2.3745598793029785 + }, + { + "auxiliary_loss_clip": 0.01059886, + "auxiliary_loss_mlp": 0.01023946, + "balance_loss_clip": 1.01084483, + "balance_loss_mlp": 1.01822686, + "epoch": 0.5552983616413648, + "flos": 14355307146240.0, + "grad_norm": 1.7482302519639998, + "language_loss": 0.68503368, + "learning_rate": 1.6545228618278434e-06, + "loss": 0.70587206, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 9236, + "time_per_iteration": 2.3592004776000977 + }, + { + "auxiliary_loss_clip": 0.01056075, + "auxiliary_loss_mlp": 0.01021887, + "balance_loss_clip": 1.00961494, + "balance_loss_mlp": 1.01770306, + "epoch": 0.5553584848940327, + "flos": 25477518481920.0, + "grad_norm": 1.6284388019771425, + "language_loss": 0.86360854, + "learning_rate": 1.6541507252024706e-06, + "loss": 0.88438821, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38476562, + "step": 9237, + "time_per_iteration": 2.448861598968506 + }, + { + "auxiliary_loss_clip": 0.01058017, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.01554847, + "balance_loss_mlp": 1.01826215, + "epoch": 0.5554186081467007, + "flos": 22162759401600.0, + "grad_norm": 2.323343410235395, + "language_loss": 0.73330045, + "learning_rate": 1.6537786009195695e-06, + "loss": 0.75415909, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39648438, + "step": 9238, + "time_per_iteration": 2.3861095905303955 + }, + { + "auxiliary_loss_clip": 0.01060954, + "auxiliary_loss_mlp": 0.01025533, + "balance_loss_clip": 1.01358867, + "balance_loss_mlp": 1.01905727, + "epoch": 0.5554787313993687, + "flos": 49744807994880.0, + "grad_norm": 2.722673998832447, + "language_loss": 0.7839455, + "learning_rate": 1.653406488992419e-06, + "loss": 0.80481035, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41796875, + "step": 9239, + "time_per_iteration": 2.646998405456543 + }, + { + "auxiliary_loss_clip": 0.01062341, + "auxiliary_loss_mlp": 0.01024858, + "balance_loss_clip": 1.01272225, + "balance_loss_mlp": 1.02079093, + "epoch": 0.5555388546520367, + "flos": 22126275164160.0, + "grad_norm": 1.4766248062616991, + "language_loss": 0.79718482, + "learning_rate": 1.6530343894342994e-06, + "loss": 0.81805682, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41601562, + "step": 9240, + "time_per_iteration": 2.3815364837646484 + }, + { + "auxiliary_loss_clip": 0.01059803, + "auxiliary_loss_mlp": 0.01022558, + "balance_loss_clip": 1.01014256, + "balance_loss_mlp": 1.01891172, + "epoch": 0.5555989779047047, + "flos": 24680939834880.0, + "grad_norm": 1.6452465613520855, + "language_loss": 0.65206659, + "learning_rate": 1.6526623022584902e-06, + "loss": 0.67289019, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40820312, + "step": 9241, + "time_per_iteration": 2.447591543197632 + }, + { + "auxiliary_loss_clip": 0.01061132, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01311612, + "balance_loss_mlp": 1.01911473, + "epoch": 0.5556591011573726, + "flos": 16105607936640.0, + "grad_norm": 2.0163287963843515, + "language_loss": 0.73001879, + "learning_rate": 1.6522902274782696e-06, + "loss": 0.75089282, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41992188, + "step": 9242, + "time_per_iteration": 2.34329891204834 + }, + { + "auxiliary_loss_clip": 0.01061432, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.01427889, + "balance_loss_mlp": 1.01966572, + "epoch": 0.5557192244100406, + "flos": 12932840822400.0, + "grad_norm": 3.2157849914470127, + "language_loss": 0.63549232, + "learning_rate": 1.6519181651069167e-06, + "loss": 0.65638357, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41796875, + "step": 9243, + "time_per_iteration": 2.375028133392334 + }, + { + "auxiliary_loss_clip": 0.01061175, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.01724768, + "balance_loss_mlp": 1.02045083, + "epoch": 0.5557793476627085, + "flos": 23110616436480.0, + "grad_norm": 2.0299707367212525, + "language_loss": 0.7518239, + "learning_rate": 1.6515461151577085e-06, + "loss": 0.7727344, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9244, + "time_per_iteration": 2.3977789878845215 + }, + { + "auxiliary_loss_clip": 0.01057166, + "auxiliary_loss_mlp": 0.01025226, + "balance_loss_clip": 1.01410973, + "balance_loss_mlp": 1.01870406, + "epoch": 0.5558394709153766, + "flos": 21427139721600.0, + "grad_norm": 2.0774636767812233, + "language_loss": 0.75031078, + "learning_rate": 1.6511740776439238e-06, + "loss": 0.77113473, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38476562, + "step": 9245, + "time_per_iteration": 2.3967881202697754 + }, + { + "auxiliary_loss_clip": 0.01062796, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.01370323, + "balance_loss_mlp": 1.01974535, + "epoch": 0.5558995941680445, + "flos": 25077274122240.0, + "grad_norm": 1.936509344281451, + "language_loss": 0.79986823, + "learning_rate": 1.6508020525788388e-06, + "loss": 0.82077032, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4296875, + "step": 9246, + "time_per_iteration": 2.394207239151001 + }, + { + "auxiliary_loss_clip": 0.0106104, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.01706648, + "balance_loss_mlp": 1.02019906, + "epoch": 0.5559597174207125, + "flos": 20010119569920.0, + "grad_norm": 2.0090482746319074, + "language_loss": 0.73936027, + "learning_rate": 1.65043003997573e-06, + "loss": 0.76026326, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40820312, + "step": 9247, + "time_per_iteration": 2.377729892730713 + }, + { + "auxiliary_loss_clip": 0.01061804, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.0188632, + "balance_loss_mlp": 1.0197202, + "epoch": 0.5560198406733804, + "flos": 16834769015040.0, + "grad_norm": 2.7371630546786005, + "language_loss": 0.75345623, + "learning_rate": 1.6500580398478743e-06, + "loss": 0.77439433, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41992188, + "step": 9248, + "time_per_iteration": 2.373836040496826 + }, + { + "auxiliary_loss_clip": 0.01009618, + "auxiliary_loss_mlp": 0.01001199, + "balance_loss_clip": 1.00019753, + "balance_loss_mlp": 1.00193512, + "epoch": 0.5560799639260484, + "flos": 70693391577600.0, + "grad_norm": 0.8488014187598858, + "language_loss": 0.59521413, + "learning_rate": 1.6496860522085466e-06, + "loss": 0.61532235, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.07714844, + "step": 9249, + "time_per_iteration": 3.150162935256958 + }, + { + "auxiliary_loss_clip": 0.01058477, + "auxiliary_loss_mlp": 0.01024741, + "balance_loss_clip": 1.01250398, + "balance_loss_mlp": 1.01844835, + "epoch": 0.5561400871787163, + "flos": 23147484698880.0, + "grad_norm": 1.9252157800206435, + "language_loss": 0.79439223, + "learning_rate": 1.6493140770710228e-06, + "loss": 0.81522441, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40039062, + "step": 9250, + "time_per_iteration": 2.4105958938598633 + }, + { + "auxiliary_loss_clip": 0.01061029, + "auxiliary_loss_mlp": 0.01025936, + "balance_loss_clip": 1.01234007, + "balance_loss_mlp": 1.01843178, + "epoch": 0.5562002104313843, + "flos": 17565466193280.0, + "grad_norm": 2.3434353658874474, + "language_loss": 0.68731368, + "learning_rate": 1.6489421144485773e-06, + "loss": 0.70818329, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42578125, + "step": 9251, + "time_per_iteration": 2.351121187210083 + }, + { + "auxiliary_loss_clip": 0.01060842, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.0173583, + "balance_loss_mlp": 1.01977551, + "epoch": 0.5562603336840523, + "flos": 25044281020800.0, + "grad_norm": 1.5907183420415048, + "language_loss": 0.75321883, + "learning_rate": 1.6485701643544852e-06, + "loss": 0.77413076, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 9252, + "time_per_iteration": 2.458021640777588 + }, + { + "auxiliary_loss_clip": 0.01063248, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.01634073, + "balance_loss_mlp": 1.02075601, + "epoch": 0.5563204569367203, + "flos": 29057756607360.0, + "grad_norm": 1.4229154546717209, + "language_loss": 0.72703892, + "learning_rate": 1.6481982268020196e-06, + "loss": 0.74796271, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42578125, + "step": 9253, + "time_per_iteration": 2.4455878734588623 + }, + { + "auxiliary_loss_clip": 0.01060383, + "auxiliary_loss_mlp": 0.01023858, + "balance_loss_clip": 1.01130533, + "balance_loss_mlp": 1.01982534, + "epoch": 0.5563805801893883, + "flos": 22089371990400.0, + "grad_norm": 1.6751275673812536, + "language_loss": 0.82602441, + "learning_rate": 1.6478263018044546e-06, + "loss": 0.84686679, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40625, + "step": 9254, + "time_per_iteration": 2.422128915786743 + }, + { + "auxiliary_loss_clip": 0.01008692, + "auxiliary_loss_mlp": 0.01005343, + "balance_loss_clip": 1.00448442, + "balance_loss_mlp": 1.00105965, + "epoch": 0.5564407034420562, + "flos": 58633379544960.0, + "grad_norm": 0.8668132370509535, + "language_loss": 0.64847434, + "learning_rate": 1.647454389375063e-06, + "loss": 0.66861469, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.00860596, + "router_z_loss_mlp": 0.07617188, + "step": 9255, + "time_per_iteration": 2.882516622543335 + }, + { + "auxiliary_loss_clip": 0.01060924, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01557434, + "balance_loss_mlp": 1.02100146, + "epoch": 0.5565008266947242, + "flos": 23111209929600.0, + "grad_norm": 2.3929262606908384, + "language_loss": 0.69565475, + "learning_rate": 1.6470824895271168e-06, + "loss": 0.71653759, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3984375, + "step": 9256, + "time_per_iteration": 3.8745758533477783 + }, + { + "auxiliary_loss_clip": 0.01059614, + "auxiliary_loss_mlp": 0.01026825, + "balance_loss_clip": 1.01474357, + "balance_loss_mlp": 1.01974487, + "epoch": 0.5565609499473921, + "flos": 21577370768640.0, + "grad_norm": 1.5015346009132562, + "language_loss": 0.76043493, + "learning_rate": 1.6467106022738896e-06, + "loss": 0.78129935, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 9257, + "time_per_iteration": 2.3835387229919434 + }, + { + "auxiliary_loss_clip": 0.01059905, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01717067, + "balance_loss_mlp": 1.01845169, + "epoch": 0.5566210732000602, + "flos": 18368643087360.0, + "grad_norm": 3.500306692316156, + "language_loss": 0.60738671, + "learning_rate": 1.6463387276286518e-06, + "loss": 0.62828279, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4140625, + "step": 9258, + "time_per_iteration": 2.34240984916687 + }, + { + "auxiliary_loss_clip": 0.01062276, + "auxiliary_loss_mlp": 0.01028513, + "balance_loss_clip": 1.01428509, + "balance_loss_mlp": 1.0202682, + "epoch": 0.5566811964527281, + "flos": 25702149369600.0, + "grad_norm": 1.6240614212835764, + "language_loss": 0.790824, + "learning_rate": 1.6459668656046746e-06, + "loss": 0.81173182, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.41992188, + "step": 9259, + "time_per_iteration": 2.41882061958313 + }, + { + "auxiliary_loss_clip": 0.01061224, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.01493001, + "balance_loss_mlp": 1.01933682, + "epoch": 0.5567413197053961, + "flos": 26942753088000.0, + "grad_norm": 1.9834489259558301, + "language_loss": 0.81678712, + "learning_rate": 1.64559501621523e-06, + "loss": 0.8376767, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41796875, + "step": 9260, + "time_per_iteration": 2.4017224311828613 + }, + { + "auxiliary_loss_clip": 0.01060861, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01685226, + "balance_loss_mlp": 1.01987135, + "epoch": 0.556801442958064, + "flos": 20849536321920.0, + "grad_norm": 1.5525051648225294, + "language_loss": 0.68676323, + "learning_rate": 1.6452231794735872e-06, + "loss": 0.70767319, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41015625, + "step": 9261, + "time_per_iteration": 2.3890368938446045 + }, + { + "auxiliary_loss_clip": 0.01009759, + "auxiliary_loss_mlp": 0.01005827, + "balance_loss_clip": 1.00505221, + "balance_loss_mlp": 1.00191009, + "epoch": 0.556861566210732, + "flos": 70495015898880.0, + "grad_norm": 0.7270686228510271, + "language_loss": 0.61123085, + "learning_rate": 1.6448513553930167e-06, + "loss": 0.6313867, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.00775146, + "router_z_loss_mlp": 0.07861328, + "step": 9262, + "time_per_iteration": 3.040602445602417 + }, + { + "auxiliary_loss_clip": 0.01063085, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.01451445, + "balance_loss_mlp": 1.02072525, + "epoch": 0.5569216894633999, + "flos": 25336120008960.0, + "grad_norm": 1.5140957070886627, + "language_loss": 0.64589399, + "learning_rate": 1.644479543986788e-06, + "loss": 0.66679215, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.42382812, + "step": 9263, + "time_per_iteration": 2.4458858966827393 + }, + { + "auxiliary_loss_clip": 0.01061519, + "auxiliary_loss_mlp": 0.01025222, + "balance_loss_clip": 1.01280069, + "balance_loss_mlp": 1.0203861, + "epoch": 0.556981812716068, + "flos": 22637613070080.0, + "grad_norm": 1.8353548330625171, + "language_loss": 0.73114872, + "learning_rate": 1.6441077452681693e-06, + "loss": 0.75201613, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 9264, + "time_per_iteration": 2.3890230655670166 + }, + { + "auxiliary_loss_clip": 0.01063909, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.01425397, + "balance_loss_mlp": 1.02147245, + "epoch": 0.5570419359687359, + "flos": 11035066982400.0, + "grad_norm": 1.9681249805797134, + "language_loss": 0.76069355, + "learning_rate": 1.64373595925043e-06, + "loss": 0.7815994, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42382812, + "step": 9265, + "time_per_iteration": 2.388174057006836 + }, + { + "auxiliary_loss_clip": 0.01060297, + "auxiliary_loss_mlp": 0.0102738, + "balance_loss_clip": 1.01571512, + "balance_loss_mlp": 1.02119863, + "epoch": 0.5571020592214039, + "flos": 22821954382080.0, + "grad_norm": 1.5886702108066535, + "language_loss": 0.70672524, + "learning_rate": 1.643364185946838e-06, + "loss": 0.72760201, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 9266, + "time_per_iteration": 2.404606580734253 + }, + { + "auxiliary_loss_clip": 0.01061146, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.01791143, + "balance_loss_mlp": 1.02024496, + "epoch": 0.5571621824740719, + "flos": 22926728972160.0, + "grad_norm": 1.5795501927675535, + "language_loss": 0.6818192, + "learning_rate": 1.642992425370661e-06, + "loss": 0.70273471, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 9267, + "time_per_iteration": 2.3882250785827637 + }, + { + "auxiliary_loss_clip": 0.0105987, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01800323, + "balance_loss_mlp": 1.02007651, + "epoch": 0.5572223057267398, + "flos": 22965587182080.0, + "grad_norm": 1.7476861071553125, + "language_loss": 0.74063969, + "learning_rate": 1.6426206775351657e-06, + "loss": 0.76153451, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3984375, + "step": 9268, + "time_per_iteration": 2.389503240585327 + }, + { + "auxiliary_loss_clip": 0.01061883, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.0138545, + "balance_loss_mlp": 1.02079654, + "epoch": 0.5572824289794078, + "flos": 20958989034240.0, + "grad_norm": 1.8287469885086374, + "language_loss": 0.74708146, + "learning_rate": 1.6422489424536192e-06, + "loss": 0.76796031, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41015625, + "step": 9269, + "time_per_iteration": 3.752659320831299 + }, + { + "auxiliary_loss_clip": 0.01062258, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.01470947, + "balance_loss_mlp": 1.02061343, + "epoch": 0.5573425522320757, + "flos": 25041348466560.0, + "grad_norm": 1.5552645546965955, + "language_loss": 0.82350254, + "learning_rate": 1.6418772201392879e-06, + "loss": 0.84438956, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.41796875, + "step": 9270, + "time_per_iteration": 2.4222261905670166 + }, + { + "auxiliary_loss_clip": 0.01062643, + "auxiliary_loss_mlp": 0.01026354, + "balance_loss_clip": 1.01333058, + "balance_loss_mlp": 1.0206871, + "epoch": 0.5574026754847438, + "flos": 23658508402560.0, + "grad_norm": 2.720419018264233, + "language_loss": 0.79851282, + "learning_rate": 1.6415055106054369e-06, + "loss": 0.81940281, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41992188, + "step": 9271, + "time_per_iteration": 3.774890661239624 + }, + { + "auxiliary_loss_clip": 0.01062285, + "auxiliary_loss_mlp": 0.01025889, + "balance_loss_clip": 1.01266885, + "balance_loss_mlp": 1.02044487, + "epoch": 0.5574627987374117, + "flos": 24781315593600.0, + "grad_norm": 1.6883719112520013, + "language_loss": 0.90108526, + "learning_rate": 1.6411338138653327e-06, + "loss": 0.92196691, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41796875, + "step": 9272, + "time_per_iteration": 3.7904937267303467 + }, + { + "auxiliary_loss_clip": 0.0106024, + "auxiliary_loss_mlp": 0.01024269, + "balance_loss_clip": 1.01198435, + "balance_loss_mlp": 1.02035379, + "epoch": 0.5575229219900797, + "flos": 21833877594240.0, + "grad_norm": 1.777876843665932, + "language_loss": 0.83891279, + "learning_rate": 1.6407621299322387e-06, + "loss": 0.8597579, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 9273, + "time_per_iteration": 2.4108967781066895 + }, + { + "auxiliary_loss_clip": 0.01063743, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.0177176, + "balance_loss_mlp": 1.02124214, + "epoch": 0.5575830452427476, + "flos": 27814010866560.0, + "grad_norm": 2.0293321083772278, + "language_loss": 0.74015051, + "learning_rate": 1.640390458819421e-06, + "loss": 0.76109451, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42578125, + "step": 9274, + "time_per_iteration": 2.4385781288146973 + }, + { + "auxiliary_loss_clip": 0.01067766, + "auxiliary_loss_mlp": 0.01026579, + "balance_loss_clip": 1.01207757, + "balance_loss_mlp": 1.02201819, + "epoch": 0.5576431684954156, + "flos": 17812093104000.0, + "grad_norm": 3.2336379725760795, + "language_loss": 0.7813853, + "learning_rate": 1.6400188005401427e-06, + "loss": 0.80232877, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.45703125, + "step": 9275, + "time_per_iteration": 2.3475875854492188 + }, + { + "auxiliary_loss_clip": 0.01060584, + "auxiliary_loss_mlp": 0.01020585, + "balance_loss_clip": 1.00809789, + "balance_loss_mlp": 1.02008104, + "epoch": 0.5577032917480835, + "flos": 15485969393280.0, + "grad_norm": 1.7995029822655615, + "language_loss": 0.74909514, + "learning_rate": 1.6396471551076672e-06, + "loss": 0.76990682, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 9276, + "time_per_iteration": 2.3730411529541016 + }, + { + "auxiliary_loss_clip": 0.01057568, + "auxiliary_loss_mlp": 0.01023927, + "balance_loss_clip": 1.01229823, + "balance_loss_mlp": 1.0180769, + "epoch": 0.5577634150007516, + "flos": 21578697400320.0, + "grad_norm": 2.1297189357008315, + "language_loss": 0.75288486, + "learning_rate": 1.639275522535258e-06, + "loss": 0.77369988, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39453125, + "step": 9277, + "time_per_iteration": 2.3978803157806396 + }, + { + "auxiliary_loss_clip": 0.01058165, + "auxiliary_loss_mlp": 0.01024135, + "balance_loss_clip": 1.01213026, + "balance_loss_mlp": 1.0184691, + "epoch": 0.5578235382534195, + "flos": 21138756958080.0, + "grad_norm": 1.7130941909148951, + "language_loss": 0.71599889, + "learning_rate": 1.638903902836177e-06, + "loss": 0.73682189, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39648438, + "step": 9278, + "time_per_iteration": 2.423283338546753 + }, + { + "auxiliary_loss_clip": 0.01062433, + "auxiliary_loss_mlp": 0.01027606, + "balance_loss_clip": 1.01387894, + "balance_loss_mlp": 1.02022803, + "epoch": 0.5578836615060875, + "flos": 26503999632000.0, + "grad_norm": 1.4284679498123585, + "language_loss": 0.75669479, + "learning_rate": 1.6385322960236874e-06, + "loss": 0.77759516, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.421875, + "step": 9279, + "time_per_iteration": 2.429734706878662 + }, + { + "auxiliary_loss_clip": 0.01059683, + "auxiliary_loss_mlp": 0.01022056, + "balance_loss_clip": 1.00965202, + "balance_loss_mlp": 1.01875317, + "epoch": 0.5579437847587555, + "flos": 20152844674560.0, + "grad_norm": 1.5195623532821083, + "language_loss": 0.67413199, + "learning_rate": 1.6381607021110505e-06, + "loss": 0.69494939, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 9280, + "time_per_iteration": 2.3773040771484375 + }, + { + "auxiliary_loss_clip": 0.01062376, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.01613951, + "balance_loss_mlp": 1.01991487, + "epoch": 0.5580039080114234, + "flos": 26101136920320.0, + "grad_norm": 1.5905459508512843, + "language_loss": 0.73257959, + "learning_rate": 1.6377891211115268e-06, + "loss": 0.75349021, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 9281, + "time_per_iteration": 2.430905342102051 + }, + { + "auxiliary_loss_clip": 0.01058773, + "auxiliary_loss_mlp": 0.01024077, + "balance_loss_clip": 1.01110053, + "balance_loss_mlp": 1.01923776, + "epoch": 0.5580640312640914, + "flos": 13770826208640.0, + "grad_norm": 2.5586432729168282, + "language_loss": 0.75103259, + "learning_rate": 1.6374175530383778e-06, + "loss": 0.77186108, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.39453125, + "step": 9282, + "time_per_iteration": 2.3509674072265625 + }, + { + "auxiliary_loss_clip": 0.0106051, + "auxiliary_loss_mlp": 0.01024673, + "balance_loss_clip": 1.012514, + "balance_loss_mlp": 1.02045488, + "epoch": 0.5581241545167593, + "flos": 17675023639680.0, + "grad_norm": 1.8586443219124462, + "language_loss": 0.66261601, + "learning_rate": 1.6370459979048642e-06, + "loss": 0.68346786, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40039062, + "step": 9283, + "time_per_iteration": 2.349226713180542 + }, + { + "auxiliary_loss_clip": 0.01060955, + "auxiliary_loss_mlp": 0.01021811, + "balance_loss_clip": 1.00978315, + "balance_loss_mlp": 1.01995277, + "epoch": 0.5581842777694274, + "flos": 19568259002880.0, + "grad_norm": 1.683807349226179, + "language_loss": 0.82917249, + "learning_rate": 1.6366744557242448e-06, + "loss": 0.85000014, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41015625, + "step": 9284, + "time_per_iteration": 2.4074342250823975 + }, + { + "auxiliary_loss_clip": 0.01062449, + "auxiliary_loss_mlp": 0.01025133, + "balance_loss_clip": 1.01272345, + "balance_loss_mlp": 1.02007949, + "epoch": 0.5582444010220953, + "flos": 20594111748480.0, + "grad_norm": 2.734705799085521, + "language_loss": 0.85352683, + "learning_rate": 1.63630292650978e-06, + "loss": 0.87440264, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.42382812, + "step": 9285, + "time_per_iteration": 2.3701305389404297 + }, + { + "auxiliary_loss_clip": 0.01061845, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.0155127, + "balance_loss_mlp": 1.0196116, + "epoch": 0.5583045242747633, + "flos": 19134497871360.0, + "grad_norm": 2.2855383888081935, + "language_loss": 0.73819923, + "learning_rate": 1.6359314102747272e-06, + "loss": 0.75909871, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.421875, + "step": 9286, + "time_per_iteration": 2.3949880599975586 + }, + { + "auxiliary_loss_clip": 0.0106067, + "auxiliary_loss_mlp": 0.01026868, + "balance_loss_clip": 1.01361799, + "balance_loss_mlp": 1.01883364, + "epoch": 0.5583646475274312, + "flos": 27453322944000.0, + "grad_norm": 1.6384886669975105, + "language_loss": 0.7447207, + "learning_rate": 1.6355599070323467e-06, + "loss": 0.76559603, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41796875, + "step": 9287, + "time_per_iteration": 2.422165632247925 + }, + { + "auxiliary_loss_clip": 0.01063325, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.01195121, + "balance_loss_mlp": 1.02016401, + "epoch": 0.5584247707800992, + "flos": 23652817850880.0, + "grad_norm": 1.5654574422662522, + "language_loss": 0.75053144, + "learning_rate": 1.6351884167958952e-06, + "loss": 0.77142107, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43164062, + "step": 9288, + "time_per_iteration": 2.4889376163482666 + }, + { + "auxiliary_loss_clip": 0.01063017, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.01953435, + "balance_loss_mlp": 1.02109802, + "epoch": 0.5584848940327671, + "flos": 13698032290560.0, + "grad_norm": 2.01225607389239, + "language_loss": 0.77911639, + "learning_rate": 1.634816939578631e-06, + "loss": 0.80007368, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41992188, + "step": 9289, + "time_per_iteration": 2.476210355758667 + }, + { + "auxiliary_loss_clip": 0.01064662, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.01398945, + "balance_loss_mlp": 1.02086616, + "epoch": 0.5585450172854352, + "flos": 27014988424320.0, + "grad_norm": 1.6934074483595283, + "language_loss": 0.61161149, + "learning_rate": 1.634445475393811e-06, + "loss": 0.63253725, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4375, + "step": 9290, + "time_per_iteration": 2.4673502445220947 + }, + { + "auxiliary_loss_clip": 0.01061303, + "auxiliary_loss_mlp": 0.01027794, + "balance_loss_clip": 1.01477647, + "balance_loss_mlp": 1.01975703, + "epoch": 0.5586051405381031, + "flos": 23184527518080.0, + "grad_norm": 2.0932867023877195, + "language_loss": 0.68119752, + "learning_rate": 1.6340740242546911e-06, + "loss": 0.70208853, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41601562, + "step": 9291, + "time_per_iteration": 2.4113998413085938 + }, + { + "auxiliary_loss_clip": 0.01061912, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.01595402, + "balance_loss_mlp": 1.01901543, + "epoch": 0.5586652637907711, + "flos": 20774542988160.0, + "grad_norm": 1.9900405404396162, + "language_loss": 0.8426134, + "learning_rate": 1.6337025861745286e-06, + "loss": 0.86351997, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4296875, + "step": 9292, + "time_per_iteration": 2.4570093154907227 + }, + { + "auxiliary_loss_clip": 0.01059931, + "auxiliary_loss_mlp": 0.01026888, + "balance_loss_clip": 1.01463366, + "balance_loss_mlp": 1.01991427, + "epoch": 0.5587253870434391, + "flos": 28218654057600.0, + "grad_norm": 1.8649966035690166, + "language_loss": 0.73687285, + "learning_rate": 1.6333311611665779e-06, + "loss": 0.75774103, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40039062, + "step": 9293, + "time_per_iteration": 2.430229902267456 + }, + { + "auxiliary_loss_clip": 0.01058487, + "auxiliary_loss_mlp": 0.01025362, + "balance_loss_clip": 1.0143888, + "balance_loss_mlp": 1.01915574, + "epoch": 0.558785510296107, + "flos": 26614499685120.0, + "grad_norm": 1.7361947857650781, + "language_loss": 0.72353166, + "learning_rate": 1.6329597492440957e-06, + "loss": 0.74437016, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.39453125, + "step": 9294, + "time_per_iteration": 2.476670503616333 + }, + { + "auxiliary_loss_clip": 0.01061436, + "auxiliary_loss_mlp": 0.0102659, + "balance_loss_clip": 1.01422763, + "balance_loss_mlp": 1.02069104, + "epoch": 0.558845633548775, + "flos": 20155742317440.0, + "grad_norm": 5.801826207205387, + "language_loss": 0.68660963, + "learning_rate": 1.632588350420335e-06, + "loss": 0.70748985, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40820312, + "step": 9295, + "time_per_iteration": 3.851543426513672 + }, + { + "auxiliary_loss_clip": 0.01060946, + "auxiliary_loss_mlp": 0.01024013, + "balance_loss_clip": 1.01132321, + "balance_loss_mlp": 1.02033925, + "epoch": 0.5589057568014429, + "flos": 24349684055040.0, + "grad_norm": 1.4277329487547465, + "language_loss": 0.80695027, + "learning_rate": 1.6322169647085517e-06, + "loss": 0.82779992, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9296, + "time_per_iteration": 2.520890474319458 + }, + { + "auxiliary_loss_clip": 0.0106057, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01437879, + "balance_loss_mlp": 1.01998734, + "epoch": 0.558965880054111, + "flos": 21104123022720.0, + "grad_norm": 1.6006569045294508, + "language_loss": 0.65281475, + "learning_rate": 1.6318455921219988e-06, + "loss": 0.67368484, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 9297, + "time_per_iteration": 2.453413963317871 + }, + { + "auxiliary_loss_clip": 0.01062435, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.01571131, + "balance_loss_mlp": 1.01971149, + "epoch": 0.5590260033067789, + "flos": 18435257694720.0, + "grad_norm": 1.616790567351722, + "language_loss": 0.75122744, + "learning_rate": 1.6314742326739291e-06, + "loss": 0.77214658, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.42773438, + "step": 9298, + "time_per_iteration": 2.412104845046997 + }, + { + "auxiliary_loss_clip": 0.01008221, + "auxiliary_loss_mlp": 0.01001666, + "balance_loss_clip": 1.00074816, + "balance_loss_mlp": 1.00072658, + "epoch": 0.5590861265594469, + "flos": 70574058950400.0, + "grad_norm": 0.6820658127018074, + "language_loss": 0.52443933, + "learning_rate": 1.6311028863775974e-06, + "loss": 0.5445382, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.07519531, + "step": 9299, + "time_per_iteration": 3.0909793376922607 + }, + { + "auxiliary_loss_clip": 0.01060102, + "auxiliary_loss_mlp": 0.01025657, + "balance_loss_clip": 1.01388502, + "balance_loss_mlp": 1.02039087, + "epoch": 0.5591462498121148, + "flos": 30663097966080.0, + "grad_norm": 1.927170076244289, + "language_loss": 0.68617237, + "learning_rate": 1.6307315532462536e-06, + "loss": 0.70703, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.39648438, + "step": 9300, + "time_per_iteration": 2.568181276321411 + }, + { + "auxiliary_loss_clip": 0.01061988, + "auxiliary_loss_mlp": 0.01032165, + "balance_loss_clip": 1.01874232, + "balance_loss_mlp": 1.01977253, + "epoch": 0.5592063730647828, + "flos": 18149458371840.0, + "grad_norm": 1.7305504337336288, + "language_loss": 0.81958735, + "learning_rate": 1.6303602332931513e-06, + "loss": 0.84052891, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.421875, + "step": 9301, + "time_per_iteration": 2.4789047241210938 + }, + { + "auxiliary_loss_clip": 0.01059568, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.01585436, + "balance_loss_mlp": 1.01868892, + "epoch": 0.5592664963174507, + "flos": 24059276432640.0, + "grad_norm": 1.941373800398792, + "language_loss": 0.6761719, + "learning_rate": 1.6299889265315415e-06, + "loss": 0.69705212, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 9302, + "time_per_iteration": 2.5166685581207275 + }, + { + "auxiliary_loss_clip": 0.01062013, + "auxiliary_loss_mlp": 0.01021033, + "balance_loss_clip": 1.00835538, + "balance_loss_mlp": 1.01937318, + "epoch": 0.5593266195701188, + "flos": 19826895421440.0, + "grad_norm": 1.7214420502060157, + "language_loss": 0.7096895, + "learning_rate": 1.6296176329746745e-06, + "loss": 0.73052001, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42578125, + "step": 9303, + "time_per_iteration": 2.5063247680664062 + }, + { + "auxiliary_loss_clip": 0.01057752, + "auxiliary_loss_mlp": 0.01023834, + "balance_loss_clip": 1.01290798, + "balance_loss_mlp": 1.01937735, + "epoch": 0.5593867428227867, + "flos": 25299600860160.0, + "grad_norm": 1.6948996845282758, + "language_loss": 0.72748345, + "learning_rate": 1.629246352635802e-06, + "loss": 0.7482993, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38476562, + "step": 9304, + "time_per_iteration": 2.539860725402832 + }, + { + "auxiliary_loss_clip": 0.0105984, + "auxiliary_loss_mlp": 0.01023377, + "balance_loss_clip": 1.0110091, + "balance_loss_mlp": 1.01876736, + "epoch": 0.5594468660754547, + "flos": 12932177506560.0, + "grad_norm": 2.21096366132003, + "language_loss": 0.77832317, + "learning_rate": 1.628875085528173e-06, + "loss": 0.79915535, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 9305, + "time_per_iteration": 2.456509590148926 + }, + { + "auxiliary_loss_clip": 0.01059225, + "auxiliary_loss_mlp": 0.01023119, + "balance_loss_clip": 1.01185369, + "balance_loss_mlp": 1.02004075, + "epoch": 0.5595069893281227, + "flos": 19061703953280.0, + "grad_norm": 1.5604372573664764, + "language_loss": 0.69549155, + "learning_rate": 1.628503831665038e-06, + "loss": 0.71631503, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.390625, + "step": 9306, + "time_per_iteration": 2.5181527137756348 + }, + { + "auxiliary_loss_clip": 0.01008821, + "auxiliary_loss_mlp": 0.01002384, + "balance_loss_clip": 1.00156116, + "balance_loss_mlp": 1.00119042, + "epoch": 0.5595671125807906, + "flos": 70270350099840.0, + "grad_norm": 0.9149931869188864, + "language_loss": 0.60241747, + "learning_rate": 1.6281325910596456e-06, + "loss": 0.62252951, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.00823975, + "router_z_loss_mlp": 0.07617188, + "step": 9307, + "time_per_iteration": 4.713507652282715 + }, + { + "auxiliary_loss_clip": 0.01060139, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.01899457, + "balance_loss_mlp": 1.01965022, + "epoch": 0.5596272358334586, + "flos": 20664531694080.0, + "grad_norm": 1.5193279484233828, + "language_loss": 0.73843426, + "learning_rate": 1.627761363725244e-06, + "loss": 0.75933933, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.40429688, + "step": 9308, + "time_per_iteration": 2.5326714515686035 + }, + { + "auxiliary_loss_clip": 0.01061793, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.0148772, + "balance_loss_mlp": 1.01895988, + "epoch": 0.5596873590861265, + "flos": 25039986923520.0, + "grad_norm": 1.8018139243019824, + "language_loss": 0.69497371, + "learning_rate": 1.6273901496750823e-06, + "loss": 0.71587342, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4296875, + "step": 9309, + "time_per_iteration": 2.539271354675293 + }, + { + "auxiliary_loss_clip": 0.01061698, + "auxiliary_loss_mlp": 0.01027792, + "balance_loss_clip": 1.01389861, + "balance_loss_mlp": 1.01991224, + "epoch": 0.5597474823387946, + "flos": 25957189918080.0, + "grad_norm": 1.8024127184561283, + "language_loss": 0.74860358, + "learning_rate": 1.6270189489224074e-06, + "loss": 0.76949859, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.41796875, + "step": 9310, + "time_per_iteration": 4.087930917739868 + }, + { + "auxiliary_loss_clip": 0.01062207, + "auxiliary_loss_mlp": 0.01028952, + "balance_loss_clip": 1.01520085, + "balance_loss_mlp": 1.01984394, + "epoch": 0.5598076055914625, + "flos": 26176234988160.0, + "grad_norm": 2.2531210812924254, + "language_loss": 0.69548815, + "learning_rate": 1.6266477614804673e-06, + "loss": 0.71639967, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.42382812, + "step": 9311, + "time_per_iteration": 4.104650020599365 + }, + { + "auxiliary_loss_clip": 0.01059795, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.01423097, + "balance_loss_mlp": 1.01887822, + "epoch": 0.5598677288441305, + "flos": 11654984816640.0, + "grad_norm": 2.4098901493300806, + "language_loss": 0.72032851, + "learning_rate": 1.626276587362508e-06, + "loss": 0.74119711, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40820312, + "step": 9312, + "time_per_iteration": 2.5359716415405273 + }, + { + "auxiliary_loss_clip": 0.01059263, + "auxiliary_loss_mlp": 0.01026325, + "balance_loss_clip": 1.01502967, + "balance_loss_mlp": 1.01963091, + "epoch": 0.5599278520967984, + "flos": 22965482448000.0, + "grad_norm": 1.915182049947712, + "language_loss": 0.89319408, + "learning_rate": 1.6259054265817756e-06, + "loss": 0.91404998, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39648438, + "step": 9313, + "time_per_iteration": 2.5306224822998047 + }, + { + "auxiliary_loss_clip": 0.01061808, + "auxiliary_loss_mlp": 0.01028069, + "balance_loss_clip": 1.01576054, + "balance_loss_mlp": 1.02102947, + "epoch": 0.5599879753494664, + "flos": 21214483430400.0, + "grad_norm": 1.498542729542319, + "language_loss": 0.77558005, + "learning_rate": 1.625534279151517e-06, + "loss": 0.79647881, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 9314, + "time_per_iteration": 2.5554540157318115 + }, + { + "auxiliary_loss_clip": 0.01058486, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.01192021, + "balance_loss_mlp": 1.01833963, + "epoch": 0.5600480986021343, + "flos": 31901921205120.0, + "grad_norm": 1.5122265716919363, + "language_loss": 0.69986242, + "learning_rate": 1.6251631450849758e-06, + "loss": 0.72068763, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 9315, + "time_per_iteration": 2.642331600189209 + }, + { + "auxiliary_loss_clip": 0.01062672, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.01713848, + "balance_loss_mlp": 1.01995957, + "epoch": 0.5601082218548024, + "flos": 28474776858240.0, + "grad_norm": 1.7054255324167387, + "language_loss": 0.67184746, + "learning_rate": 1.6247920243953983e-06, + "loss": 0.69277585, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42773438, + "step": 9316, + "time_per_iteration": 2.6143815517425537 + }, + { + "auxiliary_loss_clip": 0.01059773, + "auxiliary_loss_mlp": 0.010254, + "balance_loss_clip": 1.0127579, + "balance_loss_mlp": 1.01952481, + "epoch": 0.5601683451074703, + "flos": 24096039960960.0, + "grad_norm": 1.8410817087877605, + "language_loss": 0.86509675, + "learning_rate": 1.6244209170960282e-06, + "loss": 0.88594848, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 9317, + "time_per_iteration": 2.6539230346679688 + }, + { + "auxiliary_loss_clip": 0.01061757, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.01727355, + "balance_loss_mlp": 1.0188365, + "epoch": 0.5602284683601383, + "flos": 26355095216640.0, + "grad_norm": 1.805025285384696, + "language_loss": 0.77346247, + "learning_rate": 1.6240498232001094e-06, + "loss": 0.79439151, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4296875, + "step": 9318, + "time_per_iteration": 2.6084818840026855 + }, + { + "auxiliary_loss_clip": 0.01059216, + "auxiliary_loss_mlp": 0.01029135, + "balance_loss_clip": 1.01749969, + "balance_loss_mlp": 1.01840496, + "epoch": 0.5602885916128063, + "flos": 24495306802560.0, + "grad_norm": 1.498199142242004, + "language_loss": 0.76530504, + "learning_rate": 1.6236787427208856e-06, + "loss": 0.78618854, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40820312, + "step": 9319, + "time_per_iteration": 2.6607768535614014 + }, + { + "auxiliary_loss_clip": 0.01058843, + "auxiliary_loss_mlp": 0.01025496, + "balance_loss_clip": 1.01395082, + "balance_loss_mlp": 1.01918674, + "epoch": 0.5603487148654742, + "flos": 27343765497600.0, + "grad_norm": 1.329632978483798, + "language_loss": 0.85274732, + "learning_rate": 1.623307675671599e-06, + "loss": 0.87359071, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39648438, + "step": 9320, + "time_per_iteration": 2.6399118900299072 + }, + { + "auxiliary_loss_clip": 0.01008802, + "auxiliary_loss_mlp": 0.0100253, + "balance_loss_clip": 1.00162411, + "balance_loss_mlp": 1.00133193, + "epoch": 0.5604088381181422, + "flos": 54084789550080.0, + "grad_norm": 0.7502138164372116, + "language_loss": 0.5260219, + "learning_rate": 1.622936622065493e-06, + "loss": 0.54613525, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.07470703, + "step": 9321, + "time_per_iteration": 2.924217700958252 + }, + { + "auxiliary_loss_clip": 0.01059161, + "auxiliary_loss_mlp": 0.01025607, + "balance_loss_clip": 1.01399601, + "balance_loss_mlp": 1.01926339, + "epoch": 0.5604689613708101, + "flos": 22235308940160.0, + "grad_norm": 1.5243051324987593, + "language_loss": 0.74300808, + "learning_rate": 1.6225655819158083e-06, + "loss": 0.76385576, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3984375, + "step": 9322, + "time_per_iteration": 2.532407283782959 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.01225853, + "balance_loss_mlp": 1.01889765, + "epoch": 0.5605290846234782, + "flos": 35296351741440.0, + "grad_norm": 2.368495517420381, + "language_loss": 0.73501992, + "learning_rate": 1.6221945552357879e-06, + "loss": 0.75586438, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9323, + "time_per_iteration": 2.6856131553649902 + }, + { + "auxiliary_loss_clip": 0.01061882, + "auxiliary_loss_mlp": 0.01025386, + "balance_loss_clip": 1.01289296, + "balance_loss_mlp": 1.02021623, + "epoch": 0.5605892078761461, + "flos": 20262367209600.0, + "grad_norm": 1.5972114967869606, + "language_loss": 0.61378318, + "learning_rate": 1.6218235420386716e-06, + "loss": 0.63465583, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41796875, + "step": 9324, + "time_per_iteration": 2.4859886169433594 + }, + { + "auxiliary_loss_clip": 0.01060315, + "auxiliary_loss_mlp": 0.01026141, + "balance_loss_clip": 1.01311183, + "balance_loss_mlp": 1.01880944, + "epoch": 0.5606493311288141, + "flos": 17307458179200.0, + "grad_norm": 1.7618344357558762, + "language_loss": 0.88089544, + "learning_rate": 1.6214525423377e-06, + "loss": 0.90175998, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 9325, + "time_per_iteration": 2.4877216815948486 + }, + { + "auxiliary_loss_clip": 0.01061585, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.01201057, + "balance_loss_mlp": 1.01918268, + "epoch": 0.560709454381482, + "flos": 21651910254720.0, + "grad_norm": 1.6226597550461597, + "language_loss": 0.73499107, + "learning_rate": 1.6210815561461143e-06, + "loss": 0.75586092, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42382812, + "step": 9326, + "time_per_iteration": 2.466944694519043 + }, + { + "auxiliary_loss_clip": 0.01063234, + "auxiliary_loss_mlp": 0.01024417, + "balance_loss_clip": 1.01006436, + "balance_loss_mlp": 1.01968575, + "epoch": 0.56076957763415, + "flos": 20302307671680.0, + "grad_norm": 1.72676029549513, + "language_loss": 0.78037488, + "learning_rate": 1.6207105834771523e-06, + "loss": 0.80125141, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.43554688, + "step": 9327, + "time_per_iteration": 2.4164934158325195 + }, + { + "auxiliary_loss_clip": 0.01063605, + "auxiliary_loss_mlp": 0.01023657, + "balance_loss_clip": 1.01116347, + "balance_loss_mlp": 1.02156544, + "epoch": 0.5608297008868179, + "flos": 25044734868480.0, + "grad_norm": 1.5664230520971558, + "language_loss": 0.78529161, + "learning_rate": 1.6203396243440543e-06, + "loss": 0.80616426, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41992188, + "step": 9328, + "time_per_iteration": 2.497744560241699 + }, + { + "auxiliary_loss_clip": 0.01059313, + "auxiliary_loss_mlp": 0.01022416, + "balance_loss_clip": 1.01055419, + "balance_loss_mlp": 1.01785922, + "epoch": 0.560889824139486, + "flos": 19865753631360.0, + "grad_norm": 1.5241933813260393, + "language_loss": 0.73641831, + "learning_rate": 1.6199686787600592e-06, + "loss": 0.75723565, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.4140625, + "step": 9329, + "time_per_iteration": 2.479416847229004 + }, + { + "auxiliary_loss_clip": 0.01062821, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.01382673, + "balance_loss_mlp": 1.02031374, + "epoch": 0.5609499473921539, + "flos": 22928299983360.0, + "grad_norm": 1.505240132429724, + "language_loss": 0.83579767, + "learning_rate": 1.6195977467384035e-06, + "loss": 0.8567009, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42578125, + "step": 9330, + "time_per_iteration": 2.4327893257141113 + }, + { + "auxiliary_loss_clip": 0.01009016, + "auxiliary_loss_mlp": 0.01001141, + "balance_loss_clip": 1.00018108, + "balance_loss_mlp": 1.00152338, + "epoch": 0.5610100706448219, + "flos": 53032716506880.0, + "grad_norm": 0.7212091697844606, + "language_loss": 0.57897455, + "learning_rate": 1.6192268282923261e-06, + "loss": 0.59907615, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07519531, + "step": 9331, + "time_per_iteration": 3.1112189292907715 + }, + { + "auxiliary_loss_clip": 0.01061362, + "auxiliary_loss_mlp": 0.01026172, + "balance_loss_clip": 1.01341653, + "balance_loss_mlp": 1.02011538, + "epoch": 0.5610701938974898, + "flos": 21833877594240.0, + "grad_norm": 1.7359368203100705, + "language_loss": 0.81522018, + "learning_rate": 1.6188559234350632e-06, + "loss": 0.83609557, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 9332, + "time_per_iteration": 2.5141942501068115 + }, + { + "auxiliary_loss_clip": 0.01065395, + "auxiliary_loss_mlp": 0.01025803, + "balance_loss_clip": 1.01201642, + "balance_loss_mlp": 1.02110159, + "epoch": 0.5611303171501578, + "flos": 17456222949120.0, + "grad_norm": 1.71173279498305, + "language_loss": 0.81710428, + "learning_rate": 1.6184850321798524e-06, + "loss": 0.83801627, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.44335938, + "step": 9333, + "time_per_iteration": 2.42177152633667 + }, + { + "auxiliary_loss_clip": 0.01061001, + "auxiliary_loss_mlp": 0.01024677, + "balance_loss_clip": 1.01224351, + "balance_loss_mlp": 1.01961589, + "epoch": 0.5611904404028258, + "flos": 22636705374720.0, + "grad_norm": 1.5866263215467893, + "language_loss": 0.72829723, + "learning_rate": 1.6181141545399294e-06, + "loss": 0.74915403, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4140625, + "step": 9334, + "time_per_iteration": 3.8695623874664307 + }, + { + "auxiliary_loss_clip": 0.01062369, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.01614988, + "balance_loss_mlp": 1.02042079, + "epoch": 0.5612505636554938, + "flos": 14315541240960.0, + "grad_norm": 3.6592724601306865, + "language_loss": 0.71364337, + "learning_rate": 1.6177432905285296e-06, + "loss": 0.73456109, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41992188, + "step": 9335, + "time_per_iteration": 2.4029414653778076 + }, + { + "auxiliary_loss_clip": 0.01062042, + "auxiliary_loss_mlp": 0.01024301, + "balance_loss_clip": 1.01189709, + "balance_loss_mlp": 1.02063406, + "epoch": 0.5613106869081618, + "flos": 16507353484800.0, + "grad_norm": 1.767748814492976, + "language_loss": 0.76351506, + "learning_rate": 1.617372440158889e-06, + "loss": 0.78437841, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4140625, + "step": 9336, + "time_per_iteration": 2.437260150909424 + }, + { + "auxiliary_loss_clip": 0.01060447, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01288247, + "balance_loss_mlp": 1.01962638, + "epoch": 0.5613708101608297, + "flos": 24057495953280.0, + "grad_norm": 2.639936843976639, + "language_loss": 0.64587831, + "learning_rate": 1.6170016034442412e-06, + "loss": 0.66673338, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 9337, + "time_per_iteration": 2.4266927242279053 + }, + { + "auxiliary_loss_clip": 0.01009003, + "auxiliary_loss_mlp": 0.01000749, + "balance_loss_clip": 0.99970025, + "balance_loss_mlp": 1.00150275, + "epoch": 0.5614309334134977, + "flos": 64902977832960.0, + "grad_norm": 0.9006390461471315, + "language_loss": 0.62682271, + "learning_rate": 1.6166307803978213e-06, + "loss": 0.6469202, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.07519531, + "step": 9338, + "time_per_iteration": 3.0277273654937744 + }, + { + "auxiliary_loss_clip": 0.01062304, + "auxiliary_loss_mlp": 0.01024151, + "balance_loss_clip": 1.01120532, + "balance_loss_mlp": 1.01940584, + "epoch": 0.5614910566661656, + "flos": 32918662085760.0, + "grad_norm": 1.8863851027775318, + "language_loss": 0.65467298, + "learning_rate": 1.6162599710328624e-06, + "loss": 0.67553759, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4296875, + "step": 9339, + "time_per_iteration": 2.5171706676483154 + }, + { + "auxiliary_loss_clip": 0.01060613, + "auxiliary_loss_mlp": 0.01023282, + "balance_loss_clip": 1.01033044, + "balance_loss_mlp": 1.01975346, + "epoch": 0.5615511799188336, + "flos": 18587862714240.0, + "grad_norm": 1.6202458464540739, + "language_loss": 0.6990245, + "learning_rate": 1.6158891753625986e-06, + "loss": 0.71986341, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40820312, + "step": 9340, + "time_per_iteration": 2.423816442489624 + }, + { + "auxiliary_loss_clip": 0.01060055, + "auxiliary_loss_mlp": 0.01025443, + "balance_loss_clip": 1.01339138, + "balance_loss_mlp": 1.02077532, + "epoch": 0.5616113031715015, + "flos": 22818917093760.0, + "grad_norm": 2.056694310533803, + "language_loss": 0.77460086, + "learning_rate": 1.6155183934002618e-06, + "loss": 0.79545581, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 9341, + "time_per_iteration": 2.4610812664031982 + }, + { + "auxiliary_loss_clip": 0.01060166, + "auxiliary_loss_mlp": 0.01022967, + "balance_loss_clip": 1.0101043, + "balance_loss_mlp": 1.01854205, + "epoch": 0.5616714264241696, + "flos": 22344622007040.0, + "grad_norm": 1.5748597194809755, + "language_loss": 0.79500747, + "learning_rate": 1.6151476251590843e-06, + "loss": 0.81583881, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 9342, + "time_per_iteration": 2.421229362487793 + }, + { + "auxiliary_loss_clip": 0.01063253, + "auxiliary_loss_mlp": 0.01022413, + "balance_loss_clip": 1.00967526, + "balance_loss_mlp": 1.02137268, + "epoch": 0.5617315496768375, + "flos": 18806768138880.0, + "grad_norm": 2.184229374955021, + "language_loss": 0.76039004, + "learning_rate": 1.6147768706522983e-06, + "loss": 0.78124666, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 9343, + "time_per_iteration": 2.4146597385406494 + }, + { + "auxiliary_loss_clip": 0.0106295, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.01993608, + "balance_loss_mlp": 1.02142572, + "epoch": 0.5617916729295055, + "flos": 18368328885120.0, + "grad_norm": 1.6939990150156647, + "language_loss": 0.68796074, + "learning_rate": 1.614406129893135e-06, + "loss": 0.70892018, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 9344, + "time_per_iteration": 2.4470555782318115 + }, + { + "auxiliary_loss_clip": 0.01061111, + "auxiliary_loss_mlp": 0.01024775, + "balance_loss_clip": 1.0118885, + "balance_loss_mlp": 1.0201993, + "epoch": 0.5618517961821734, + "flos": 28178818329600.0, + "grad_norm": 1.7965185391532088, + "language_loss": 0.66341531, + "learning_rate": 1.6140354028948253e-06, + "loss": 0.68427414, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 9345, + "time_per_iteration": 2.4779539108276367 + }, + { + "auxiliary_loss_clip": 0.01062689, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.01499438, + "balance_loss_mlp": 1.02057278, + "epoch": 0.5619119194348414, + "flos": 15485969393280.0, + "grad_norm": 1.9583235939646804, + "language_loss": 0.8738575, + "learning_rate": 1.613664689670599e-06, + "loss": 0.89475977, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.421875, + "step": 9346, + "time_per_iteration": 3.9443159103393555 + }, + { + "auxiliary_loss_clip": 0.01062471, + "auxiliary_loss_mlp": 0.01028524, + "balance_loss_clip": 1.01468349, + "balance_loss_mlp": 1.02043033, + "epoch": 0.5619720426875094, + "flos": 29127478325760.0, + "grad_norm": 2.0198485139352353, + "language_loss": 0.59716177, + "learning_rate": 1.6132939902336857e-06, + "loss": 0.61807173, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.421875, + "step": 9347, + "time_per_iteration": 2.500040292739868 + }, + { + "auxiliary_loss_clip": 0.01062434, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.01146412, + "balance_loss_mlp": 1.02009344, + "epoch": 0.5620321659401774, + "flos": 18002788283520.0, + "grad_norm": 2.3308966249304857, + "language_loss": 0.69454128, + "learning_rate": 1.6129233045973159e-06, + "loss": 0.71541882, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.42382812, + "step": 9348, + "time_per_iteration": 2.591062307357788 + }, + { + "auxiliary_loss_clip": 0.01064794, + "auxiliary_loss_mlp": 0.01023117, + "balance_loss_clip": 1.0091635, + "balance_loss_mlp": 1.01952755, + "epoch": 0.5620922891928454, + "flos": 20593483344000.0, + "grad_norm": 1.9963929019723865, + "language_loss": 0.78895199, + "learning_rate": 1.612552632774717e-06, + "loss": 0.80983114, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.453125, + "step": 9349, + "time_per_iteration": 2.434753894805908 + }, + { + "auxiliary_loss_clip": 0.0106285, + "auxiliary_loss_mlp": 0.01027872, + "balance_loss_clip": 1.01365066, + "balance_loss_mlp": 1.02040815, + "epoch": 0.5621524124455133, + "flos": 26285792434560.0, + "grad_norm": 2.0657848143070767, + "language_loss": 0.67305869, + "learning_rate": 1.6121819747791183e-06, + "loss": 0.69396597, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.42382812, + "step": 9350, + "time_per_iteration": 3.974392890930176 + }, + { + "auxiliary_loss_clip": 0.01064092, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01675034, + "balance_loss_mlp": 1.02099681, + "epoch": 0.5622125356981813, + "flos": 12749477028480.0, + "grad_norm": 2.451964305608808, + "language_loss": 0.78843045, + "learning_rate": 1.6118113306237474e-06, + "loss": 0.8093707, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43164062, + "step": 9351, + "time_per_iteration": 2.4469783306121826 + }, + { + "auxiliary_loss_clip": 0.0106479, + "auxiliary_loss_mlp": 0.01026268, + "balance_loss_clip": 1.01244557, + "balance_loss_mlp": 1.02151322, + "epoch": 0.5622726589508492, + "flos": 23804200972800.0, + "grad_norm": 1.8027043946636974, + "language_loss": 0.58968759, + "learning_rate": 1.6114407003218314e-06, + "loss": 0.61059815, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.43359375, + "step": 9352, + "time_per_iteration": 3.8676207065582275 + }, + { + "auxiliary_loss_clip": 0.01059203, + "auxiliary_loss_mlp": 0.01025363, + "balance_loss_clip": 1.01336432, + "balance_loss_mlp": 1.01963794, + "epoch": 0.5623327822035172, + "flos": 24717040047360.0, + "grad_norm": 1.3236141394066079, + "language_loss": 0.71972442, + "learning_rate": 1.6110700838865976e-06, + "loss": 0.74057013, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 9353, + "time_per_iteration": 2.4719815254211426 + }, + { + "auxiliary_loss_clip": 0.01062519, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.01317823, + "balance_loss_mlp": 1.02016592, + "epoch": 0.5623929054561851, + "flos": 14018640105600.0, + "grad_norm": 3.414753963046977, + "language_loss": 0.76152349, + "learning_rate": 1.6106994813312716e-06, + "loss": 0.78241646, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42382812, + "step": 9354, + "time_per_iteration": 2.402418375015259 + }, + { + "auxiliary_loss_clip": 0.01062853, + "auxiliary_loss_mlp": 0.01025681, + "balance_loss_clip": 1.01305676, + "balance_loss_mlp": 1.01949477, + "epoch": 0.5624530287088532, + "flos": 20703354992640.0, + "grad_norm": 1.9180678725238363, + "language_loss": 0.72800183, + "learning_rate": 1.61032889266908e-06, + "loss": 0.74888718, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43359375, + "step": 9355, + "time_per_iteration": 2.4866714477539062 + }, + { + "auxiliary_loss_clip": 0.01061171, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.01432419, + "balance_loss_mlp": 1.01941311, + "epoch": 0.5625131519615211, + "flos": 21469838181120.0, + "grad_norm": 4.463021266596698, + "language_loss": 0.73635566, + "learning_rate": 1.6099583179132482e-06, + "loss": 0.75723737, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 9356, + "time_per_iteration": 2.4044642448425293 + }, + { + "auxiliary_loss_clip": 0.01060292, + "auxiliary_loss_mlp": 0.01026882, + "balance_loss_clip": 1.01413858, + "balance_loss_mlp": 1.02020526, + "epoch": 0.5625732752141891, + "flos": 18697001224320.0, + "grad_norm": 2.0564828728841817, + "language_loss": 0.83213329, + "learning_rate": 1.609587757077e-06, + "loss": 0.85300505, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 9357, + "time_per_iteration": 2.383101463317871 + }, + { + "auxiliary_loss_clip": 0.01060082, + "auxiliary_loss_mlp": 0.01025172, + "balance_loss_clip": 1.01291084, + "balance_loss_mlp": 1.01906538, + "epoch": 0.562633398466857, + "flos": 16215968344320.0, + "grad_norm": 2.1243360204976325, + "language_loss": 0.72223502, + "learning_rate": 1.609217210173561e-06, + "loss": 0.74308753, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41015625, + "step": 9358, + "time_per_iteration": 2.3657307624816895 + }, + { + "auxiliary_loss_clip": 0.01060196, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.01689649, + "balance_loss_mlp": 1.01928723, + "epoch": 0.562693521719525, + "flos": 22490838247680.0, + "grad_norm": 1.6719498891098292, + "language_loss": 0.77549297, + "learning_rate": 1.6088466772161547e-06, + "loss": 0.79639411, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40820312, + "step": 9359, + "time_per_iteration": 2.3996593952178955 + }, + { + "auxiliary_loss_clip": 0.01059277, + "auxiliary_loss_mlp": 0.01023919, + "balance_loss_clip": 1.01123548, + "balance_loss_mlp": 1.01827526, + "epoch": 0.562753644972193, + "flos": 25330185077760.0, + "grad_norm": 1.8187036668754777, + "language_loss": 0.70267069, + "learning_rate": 1.6084761582180039e-06, + "loss": 0.7235027, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 9360, + "time_per_iteration": 2.429276466369629 + }, + { + "auxiliary_loss_clip": 0.01008897, + "auxiliary_loss_mlp": 0.01001242, + "balance_loss_clip": 1.00028276, + "balance_loss_mlp": 1.00132608, + "epoch": 0.562813768224861, + "flos": 67389631441920.0, + "grad_norm": 0.7756698615845128, + "language_loss": 0.55564243, + "learning_rate": 1.6081056531923321e-06, + "loss": 0.57574385, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07568359, + "step": 9361, + "time_per_iteration": 2.989426851272583 + }, + { + "auxiliary_loss_clip": 0.01060833, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.01571286, + "balance_loss_mlp": 1.02108645, + "epoch": 0.562873891477529, + "flos": 23330045531520.0, + "grad_norm": 1.4381778705830282, + "language_loss": 0.81968129, + "learning_rate": 1.6077351621523615e-06, + "loss": 0.84056246, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39648438, + "step": 9362, + "time_per_iteration": 2.409508466720581 + }, + { + "auxiliary_loss_clip": 0.01060209, + "auxiliary_loss_mlp": 0.01027626, + "balance_loss_clip": 1.01476359, + "balance_loss_mlp": 1.01933932, + "epoch": 0.5629340147301969, + "flos": 38471283360000.0, + "grad_norm": 1.5295904316794424, + "language_loss": 0.74411619, + "learning_rate": 1.6073646851113139e-06, + "loss": 0.7649945, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 9363, + "time_per_iteration": 2.548039197921753 + }, + { + "auxiliary_loss_clip": 0.01062726, + "auxiliary_loss_mlp": 0.01028566, + "balance_loss_clip": 1.01523197, + "balance_loss_mlp": 1.02028525, + "epoch": 0.5629941379828649, + "flos": 29240736376320.0, + "grad_norm": 2.026355467301781, + "language_loss": 0.82642281, + "learning_rate": 1.6069942220824104e-06, + "loss": 0.8473357, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42382812, + "step": 9364, + "time_per_iteration": 2.4882969856262207 + }, + { + "auxiliary_loss_clip": 0.01060003, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_clip": 1.01296711, + "balance_loss_mlp": 1.01930547, + "epoch": 0.5630542612355328, + "flos": 19420052814720.0, + "grad_norm": 1.96179010533912, + "language_loss": 0.64729035, + "learning_rate": 1.6066237730788725e-06, + "loss": 0.6681416, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40625, + "step": 9365, + "time_per_iteration": 2.4187545776367188 + }, + { + "auxiliary_loss_clip": 0.01064414, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.01263952, + "balance_loss_mlp": 1.02015042, + "epoch": 0.5631143844882008, + "flos": 22265404398720.0, + "grad_norm": 13.819695248439455, + "language_loss": 0.8086282, + "learning_rate": 1.6062533381139201e-06, + "loss": 0.82953745, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44335938, + "step": 9366, + "time_per_iteration": 2.4086148738861084 + }, + { + "auxiliary_loss_clip": 0.01060835, + "auxiliary_loss_mlp": 0.01024593, + "balance_loss_clip": 1.01289296, + "balance_loss_mlp": 1.02026987, + "epoch": 0.5631745077408687, + "flos": 22964225639040.0, + "grad_norm": 1.4847695692716016, + "language_loss": 0.73507643, + "learning_rate": 1.6058829172007732e-06, + "loss": 0.75593078, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40625, + "step": 9367, + "time_per_iteration": 2.4942803382873535 + }, + { + "auxiliary_loss_clip": 0.01008586, + "auxiliary_loss_mlp": 0.01003829, + "balance_loss_clip": 1.00286913, + "balance_loss_mlp": 1.0009681, + "epoch": 0.5632346309935368, + "flos": 65076948604800.0, + "grad_norm": 0.6225217209739566, + "language_loss": 0.53513527, + "learning_rate": 1.6055125103526518e-06, + "loss": 0.55525947, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07617188, + "step": 9368, + "time_per_iteration": 3.1808199882507324 + }, + { + "auxiliary_loss_clip": 0.01061494, + "auxiliary_loss_mlp": 0.01026037, + "balance_loss_clip": 1.01368713, + "balance_loss_mlp": 1.02044499, + "epoch": 0.5632947542462047, + "flos": 23001792128640.0, + "grad_norm": 1.8620463132474907, + "language_loss": 0.69394809, + "learning_rate": 1.6051421175827734e-06, + "loss": 0.71482342, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.41015625, + "step": 9369, + "time_per_iteration": 2.4471635818481445 + }, + { + "auxiliary_loss_clip": 0.01059943, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.01781774, + "balance_loss_mlp": 1.01904941, + "epoch": 0.5633548774988727, + "flos": 30481270272000.0, + "grad_norm": 1.9666637257665318, + "language_loss": 0.75975287, + "learning_rate": 1.6047717389043574e-06, + "loss": 0.78066051, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 9370, + "time_per_iteration": 2.485236167907715 + }, + { + "auxiliary_loss_clip": 0.01062725, + "auxiliary_loss_mlp": 0.01027978, + "balance_loss_clip": 1.01393557, + "balance_loss_mlp": 1.01923311, + "epoch": 0.5634150007515406, + "flos": 18514056366720.0, + "grad_norm": 2.0162825927442607, + "language_loss": 0.75636005, + "learning_rate": 1.604401374330621e-06, + "loss": 0.77726704, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.43554688, + "step": 9371, + "time_per_iteration": 2.3599672317504883 + }, + { + "auxiliary_loss_clip": 0.01059813, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01867247, + "balance_loss_mlp": 1.01931429, + "epoch": 0.5634751240042086, + "flos": 19243671292800.0, + "grad_norm": 1.7412819029845232, + "language_loss": 0.76462901, + "learning_rate": 1.6040310238747826e-06, + "loss": 0.78553545, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40625, + "step": 9372, + "time_per_iteration": 2.3970510959625244 + }, + { + "auxiliary_loss_clip": 0.0105964, + "auxiliary_loss_mlp": 0.01028628, + "balance_loss_clip": 1.01636136, + "balance_loss_mlp": 1.02010751, + "epoch": 0.5635352472568766, + "flos": 12019827191040.0, + "grad_norm": 1.8899228483845274, + "language_loss": 0.71011794, + "learning_rate": 1.6036606875500583e-06, + "loss": 0.73100054, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39453125, + "step": 9373, + "time_per_iteration": 3.8764121532440186 + }, + { + "auxiliary_loss_clip": 0.01062718, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.01647305, + "balance_loss_mlp": 1.02016795, + "epoch": 0.5635953705095446, + "flos": 21870571299840.0, + "grad_norm": 1.8865937413916414, + "language_loss": 0.83141685, + "learning_rate": 1.6032903653696645e-06, + "loss": 0.85234272, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42578125, + "step": 9374, + "time_per_iteration": 2.397357702255249 + }, + { + "auxiliary_loss_clip": 0.01058257, + "auxiliary_loss_mlp": 0.01021454, + "balance_loss_clip": 1.00961602, + "balance_loss_mlp": 1.01789165, + "epoch": 0.5636554937622126, + "flos": 27124929895680.0, + "grad_norm": 1.8938119846700219, + "language_loss": 0.66073287, + "learning_rate": 1.6029200573468172e-06, + "loss": 0.68153, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40429688, + "step": 9375, + "time_per_iteration": 2.443230152130127 + }, + { + "auxiliary_loss_clip": 0.01060878, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01386833, + "balance_loss_mlp": 1.01903296, + "epoch": 0.5637156170148805, + "flos": 12925753816320.0, + "grad_norm": 5.095749969994379, + "language_loss": 0.81173253, + "learning_rate": 1.602549763494731e-06, + "loss": 0.83260518, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41992188, + "step": 9376, + "time_per_iteration": 2.3900370597839355 + }, + { + "auxiliary_loss_clip": 0.01061689, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.01434553, + "balance_loss_mlp": 1.01883042, + "epoch": 0.5637757402675485, + "flos": 45549295246080.0, + "grad_norm": 1.350505581596707, + "language_loss": 0.69051939, + "learning_rate": 1.6021794838266223e-06, + "loss": 0.71141207, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4296875, + "step": 9377, + "time_per_iteration": 2.5968198776245117 + }, + { + "auxiliary_loss_clip": 0.01058489, + "auxiliary_loss_mlp": 0.01023313, + "balance_loss_clip": 1.01098037, + "balance_loss_mlp": 1.02011001, + "epoch": 0.5638358635202164, + "flos": 20885008129920.0, + "grad_norm": 2.800298543122573, + "language_loss": 0.74675977, + "learning_rate": 1.601809218355704e-06, + "loss": 0.76757783, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3828125, + "step": 9378, + "time_per_iteration": 2.403878688812256 + }, + { + "auxiliary_loss_clip": 0.01065156, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.01616383, + "balance_loss_mlp": 1.0201782, + "epoch": 0.5638959867728844, + "flos": 18805581152640.0, + "grad_norm": 2.248254144980931, + "language_loss": 0.8217628, + "learning_rate": 1.6014389670951902e-06, + "loss": 0.84271646, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.44921875, + "step": 9379, + "time_per_iteration": 2.4163901805877686 + }, + { + "auxiliary_loss_clip": 0.01061619, + "auxiliary_loss_mlp": 0.01025591, + "balance_loss_clip": 1.0128355, + "balance_loss_mlp": 1.01916599, + "epoch": 0.5639561100255523, + "flos": 27489108954240.0, + "grad_norm": 1.9910340402997595, + "language_loss": 0.71475589, + "learning_rate": 1.6010687300582948e-06, + "loss": 0.73562795, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42578125, + "step": 9380, + "time_per_iteration": 2.4454805850982666 + }, + { + "auxiliary_loss_clip": 0.01063537, + "auxiliary_loss_mlp": 0.01034452, + "balance_loss_clip": 1.01978922, + "balance_loss_mlp": 1.02084279, + "epoch": 0.5640162332782204, + "flos": 18075617112960.0, + "grad_norm": 2.2132086290360022, + "language_loss": 0.74451298, + "learning_rate": 1.60069850725823e-06, + "loss": 0.76549286, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.42578125, + "step": 9381, + "time_per_iteration": 2.4064009189605713 + }, + { + "auxiliary_loss_clip": 0.01061436, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.01534069, + "balance_loss_mlp": 1.01913369, + "epoch": 0.5640763565308883, + "flos": 20883856055040.0, + "grad_norm": 2.0706071279778975, + "language_loss": 0.72285503, + "learning_rate": 1.6003282987082086e-06, + "loss": 0.74374896, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 9382, + "time_per_iteration": 2.416933059692383 + }, + { + "auxiliary_loss_clip": 0.01008561, + "auxiliary_loss_mlp": 0.01000604, + "balance_loss_clip": 0.99960816, + "balance_loss_mlp": 1.0009644, + "epoch": 0.5641364797835563, + "flos": 64444707060480.0, + "grad_norm": 0.7243456252354803, + "language_loss": 0.54416496, + "learning_rate": 1.5999581044214417e-06, + "loss": 0.56425655, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.07617188, + "step": 9383, + "time_per_iteration": 3.222151279449463 + }, + { + "auxiliary_loss_clip": 0.01062489, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.01128829, + "balance_loss_mlp": 1.01930737, + "epoch": 0.5641966030362242, + "flos": 18659958405120.0, + "grad_norm": 1.9449465010651412, + "language_loss": 0.7052297, + "learning_rate": 1.5995879244111417e-06, + "loss": 0.72610563, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43164062, + "step": 9384, + "time_per_iteration": 2.4816601276397705 + }, + { + "auxiliary_loss_clip": 0.01061367, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.01292217, + "balance_loss_mlp": 1.01869214, + "epoch": 0.5642567262888922, + "flos": 22491222272640.0, + "grad_norm": 1.78439105941312, + "language_loss": 0.75708961, + "learning_rate": 1.5992177586905185e-06, + "loss": 0.77796632, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42578125, + "step": 9385, + "time_per_iteration": 3.869999408721924 + }, + { + "auxiliary_loss_clip": 0.01059294, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.01532149, + "balance_loss_mlp": 1.01880229, + "epoch": 0.5643168495415603, + "flos": 13003190945280.0, + "grad_norm": 2.461012093480193, + "language_loss": 0.82688218, + "learning_rate": 1.598847607272782e-06, + "loss": 0.84775734, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 9386, + "time_per_iteration": 2.384225606918335 + }, + { + "auxiliary_loss_clip": 0.01065031, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.01733017, + "balance_loss_mlp": 1.02124655, + "epoch": 0.5643769727942282, + "flos": 18587304132480.0, + "grad_norm": 1.9171685746168656, + "language_loss": 0.70080316, + "learning_rate": 1.5984774701711433e-06, + "loss": 0.72176194, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 9387, + "time_per_iteration": 2.4977023601531982 + }, + { + "auxiliary_loss_clip": 0.01063053, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01531172, + "balance_loss_mlp": 1.0202806, + "epoch": 0.5644370960468962, + "flos": 33804757192320.0, + "grad_norm": 1.4242966375303159, + "language_loss": 0.74534094, + "learning_rate": 1.59810734739881e-06, + "loss": 0.76625437, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42773438, + "step": 9388, + "time_per_iteration": 2.5942490100860596 + }, + { + "auxiliary_loss_clip": 0.0105937, + "auxiliary_loss_mlp": 0.01022002, + "balance_loss_clip": 1.010921, + "balance_loss_mlp": 1.02029622, + "epoch": 0.5644972192995641, + "flos": 21213855025920.0, + "grad_norm": 1.5501468383396901, + "language_loss": 0.82100445, + "learning_rate": 1.5977372389689927e-06, + "loss": 0.84181821, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.390625, + "step": 9389, + "time_per_iteration": 2.4078116416931152 + }, + { + "auxiliary_loss_clip": 0.01062862, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.01453066, + "balance_loss_mlp": 1.01952243, + "epoch": 0.5645573425522321, + "flos": 18586745550720.0, + "grad_norm": 3.163105645118918, + "language_loss": 0.64310056, + "learning_rate": 1.5973671448948981e-06, + "loss": 0.66401076, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43359375, + "step": 9390, + "time_per_iteration": 3.9263272285461426 + }, + { + "auxiliary_loss_clip": 0.01061107, + "auxiliary_loss_mlp": 0.01023829, + "balance_loss_clip": 1.0119853, + "balance_loss_mlp": 1.02064586, + "epoch": 0.5646174658049, + "flos": 18112834488960.0, + "grad_norm": 2.989423463926984, + "language_loss": 0.92103839, + "learning_rate": 1.5969970651897343e-06, + "loss": 0.94188774, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40429688, + "step": 9391, + "time_per_iteration": 2.3957929611206055 + }, + { + "auxiliary_loss_clip": 0.01066022, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.01469338, + "balance_loss_mlp": 1.02087748, + "epoch": 0.564677589057568, + "flos": 28328700263040.0, + "grad_norm": 1.9106689538916133, + "language_loss": 0.60208786, + "learning_rate": 1.5966269998667088e-06, + "loss": 0.62304473, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.453125, + "step": 9392, + "time_per_iteration": 3.8821635246276855 + }, + { + "auxiliary_loss_clip": 0.01060027, + "auxiliary_loss_mlp": 0.01024182, + "balance_loss_clip": 1.01130104, + "balance_loss_mlp": 1.01862407, + "epoch": 0.564737712310236, + "flos": 22162654667520.0, + "grad_norm": 3.1985651893355094, + "language_loss": 0.81649959, + "learning_rate": 1.5962569489390277e-06, + "loss": 0.83734167, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 9393, + "time_per_iteration": 2.3858447074890137 + }, + { + "auxiliary_loss_clip": 0.01061894, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.01626301, + "balance_loss_mlp": 1.02060616, + "epoch": 0.564797835562904, + "flos": 20957976604800.0, + "grad_norm": 1.874183060710923, + "language_loss": 0.87392467, + "learning_rate": 1.595886912419898e-06, + "loss": 0.89482701, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4140625, + "step": 9394, + "time_per_iteration": 2.4018394947052 + }, + { + "auxiliary_loss_clip": 0.01062697, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.01565492, + "balance_loss_mlp": 1.02235138, + "epoch": 0.5648579588155719, + "flos": 17419354686720.0, + "grad_norm": 1.8936605499750108, + "language_loss": 0.84830219, + "learning_rate": 1.5955168903225246e-06, + "loss": 0.86920834, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 9395, + "time_per_iteration": 2.36329984664917 + }, + { + "auxiliary_loss_clip": 0.01008566, + "auxiliary_loss_mlp": 0.01002094, + "balance_loss_clip": 1.00121748, + "balance_loss_mlp": 1.00089359, + "epoch": 0.5649180820682399, + "flos": 69925965782400.0, + "grad_norm": 0.8034152977630151, + "language_loss": 0.58270419, + "learning_rate": 1.5951468826601127e-06, + "loss": 0.60281074, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.07666016, + "step": 9396, + "time_per_iteration": 3.028729200363159 + }, + { + "auxiliary_loss_clip": 0.01062578, + "auxiliary_loss_mlp": 0.01022978, + "balance_loss_clip": 1.01031196, + "balance_loss_mlp": 1.01939344, + "epoch": 0.5649782053209078, + "flos": 24971906039040.0, + "grad_norm": 2.2602658053556297, + "language_loss": 0.70230997, + "learning_rate": 1.5947768894458674e-06, + "loss": 0.72316557, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43164062, + "step": 9397, + "time_per_iteration": 2.4265944957733154 + }, + { + "auxiliary_loss_clip": 0.01064166, + "auxiliary_loss_mlp": 0.01026812, + "balance_loss_clip": 1.01435447, + "balance_loss_mlp": 1.02078247, + "epoch": 0.5650383285735758, + "flos": 21725507134080.0, + "grad_norm": 1.7286568763131385, + "language_loss": 0.67214549, + "learning_rate": 1.5944069106929924e-06, + "loss": 0.69305527, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.43359375, + "step": 9398, + "time_per_iteration": 2.403465747833252 + }, + { + "auxiliary_loss_clip": 0.01008812, + "auxiliary_loss_mlp": 0.01000435, + "balance_loss_clip": 0.99942183, + "balance_loss_mlp": 1.00123405, + "epoch": 0.5650984518262439, + "flos": 65901318560640.0, + "grad_norm": 0.7934877082178501, + "language_loss": 0.58216715, + "learning_rate": 1.594036946414692e-06, + "loss": 0.60225964, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.07617188, + "step": 9399, + "time_per_iteration": 2.996180772781372 + }, + { + "auxiliary_loss_clip": 0.01064898, + "auxiliary_loss_mlp": 0.0102789, + "balance_loss_clip": 1.01335883, + "balance_loss_mlp": 1.02062917, + "epoch": 0.5651585750789118, + "flos": 21031538572800.0, + "grad_norm": 1.8738519856066178, + "language_loss": 0.7383225, + "learning_rate": 1.5936669966241684e-06, + "loss": 0.7592504, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.44335938, + "step": 9400, + "time_per_iteration": 2.428723096847534 + }, + { + "auxiliary_loss_clip": 0.01065304, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.01250505, + "balance_loss_mlp": 1.02113843, + "epoch": 0.5652186983315798, + "flos": 18550924629120.0, + "grad_norm": 1.6672635150608583, + "language_loss": 0.60700464, + "learning_rate": 1.593297061334624e-06, + "loss": 0.6279254, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.44140625, + "step": 9401, + "time_per_iteration": 2.3796558380126953 + }, + { + "auxiliary_loss_clip": 0.01059913, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.01312578, + "balance_loss_mlp": 1.01960957, + "epoch": 0.5652788215842477, + "flos": 18477676863360.0, + "grad_norm": 2.113450442348843, + "language_loss": 0.81169659, + "learning_rate": 1.5929271405592622e-06, + "loss": 0.8325516, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 9402, + "time_per_iteration": 2.369173526763916 + }, + { + "auxiliary_loss_clip": 0.01060175, + "auxiliary_loss_mlp": 0.01024659, + "balance_loss_clip": 1.01248169, + "balance_loss_mlp": 1.01967382, + "epoch": 0.5653389448369157, + "flos": 30042761195520.0, + "grad_norm": 1.5987540394973105, + "language_loss": 0.77863109, + "learning_rate": 1.592557234311283e-06, + "loss": 0.79947948, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40429688, + "step": 9403, + "time_per_iteration": 2.463070869445801 + }, + { + "auxiliary_loss_clip": 0.01062172, + "auxiliary_loss_mlp": 0.01022563, + "balance_loss_clip": 1.00959301, + "balance_loss_mlp": 1.0197928, + "epoch": 0.5653990680895836, + "flos": 16726608023040.0, + "grad_norm": 1.6293775500205723, + "language_loss": 0.68885517, + "learning_rate": 1.5921873426038888e-06, + "loss": 0.70970249, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42382812, + "step": 9404, + "time_per_iteration": 2.404663324356079 + }, + { + "auxiliary_loss_clip": 0.01057838, + "auxiliary_loss_mlp": 0.0102328, + "balance_loss_clip": 1.01181221, + "balance_loss_mlp": 1.01858759, + "epoch": 0.5654591913422516, + "flos": 14537379219840.0, + "grad_norm": 1.782471810769345, + "language_loss": 0.67464888, + "learning_rate": 1.5918174654502784e-06, + "loss": 0.69546008, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39257812, + "step": 9405, + "time_per_iteration": 2.364969491958618 + }, + { + "auxiliary_loss_clip": 0.0106003, + "auxiliary_loss_mlp": 0.01023782, + "balance_loss_clip": 1.01215291, + "balance_loss_mlp": 1.0209837, + "epoch": 0.5655193145949196, + "flos": 26208809153280.0, + "grad_norm": 1.6453265803427877, + "language_loss": 0.74162751, + "learning_rate": 1.5914476028636532e-06, + "loss": 0.7624656, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 9406, + "time_per_iteration": 2.4602973461151123 + }, + { + "auxiliary_loss_clip": 0.01063333, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.01427722, + "balance_loss_mlp": 1.01981688, + "epoch": 0.5655794378475876, + "flos": 25045398184320.0, + "grad_norm": 2.024813901633247, + "language_loss": 0.795506, + "learning_rate": 1.591077754857212e-06, + "loss": 0.81641763, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43554688, + "step": 9407, + "time_per_iteration": 2.449805736541748 + }, + { + "auxiliary_loss_clip": 0.01059936, + "auxiliary_loss_mlp": 0.01025946, + "balance_loss_clip": 1.0135839, + "balance_loss_mlp": 1.02008247, + "epoch": 0.5656395611002555, + "flos": 31031431476480.0, + "grad_norm": 1.2243412623608456, + "language_loss": 0.70912147, + "learning_rate": 1.5907079214441537e-06, + "loss": 0.72998035, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 9408, + "time_per_iteration": 2.5182995796203613 + }, + { + "auxiliary_loss_clip": 0.01061408, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.01315188, + "balance_loss_mlp": 1.0202446, + "epoch": 0.5656996843529235, + "flos": 20228501324160.0, + "grad_norm": 1.8156840060007438, + "language_loss": 0.81323332, + "learning_rate": 1.5903381026376769e-06, + "loss": 0.83410645, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41210938, + "step": 9409, + "time_per_iteration": 2.3898491859436035 + }, + { + "auxiliary_loss_clip": 0.01060243, + "auxiliary_loss_mlp": 0.01021966, + "balance_loss_clip": 1.00993752, + "balance_loss_mlp": 1.01925397, + "epoch": 0.5657598076055914, + "flos": 20995193980800.0, + "grad_norm": 1.7552260981401075, + "language_loss": 0.78572428, + "learning_rate": 1.5899682984509794e-06, + "loss": 0.80654639, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41015625, + "step": 9410, + "time_per_iteration": 2.3956034183502197 + }, + { + "auxiliary_loss_clip": 0.01061826, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.0159446, + "balance_loss_mlp": 1.01996899, + "epoch": 0.5658199308582594, + "flos": 11545217902080.0, + "grad_norm": 2.96693040747571, + "language_loss": 0.83323526, + "learning_rate": 1.589598508897259e-06, + "loss": 0.85414493, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41796875, + "step": 9411, + "time_per_iteration": 2.4073972702026367 + }, + { + "auxiliary_loss_clip": 0.01063969, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.01659203, + "balance_loss_mlp": 1.02055216, + "epoch": 0.5658800541109275, + "flos": 14171314947840.0, + "grad_norm": 2.7764510644855007, + "language_loss": 0.67174739, + "learning_rate": 1.589228733989712e-06, + "loss": 0.69269449, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.43359375, + "step": 9412, + "time_per_iteration": 2.391127347946167 + }, + { + "auxiliary_loss_clip": 0.01057646, + "auxiliary_loss_mlp": 0.01023914, + "balance_loss_clip": 1.0126133, + "balance_loss_mlp": 1.01887441, + "epoch": 0.5659401773635954, + "flos": 27303929769600.0, + "grad_norm": 1.6339869799989843, + "language_loss": 0.58395708, + "learning_rate": 1.5888589737415342e-06, + "loss": 0.60477269, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38867188, + "step": 9413, + "time_per_iteration": 3.8691184520721436 + }, + { + "auxiliary_loss_clip": 0.01058533, + "auxiliary_loss_mlp": 0.01024971, + "balance_loss_clip": 1.01280546, + "balance_loss_mlp": 1.01882684, + "epoch": 0.5660003006162634, + "flos": 16727236427520.0, + "grad_norm": 1.4959159722451645, + "language_loss": 0.69002581, + "learning_rate": 1.588489228165923e-06, + "loss": 0.71086085, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39648438, + "step": 9414, + "time_per_iteration": 2.394911766052246 + }, + { + "auxiliary_loss_clip": 0.01058078, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.01477885, + "balance_loss_mlp": 1.01882958, + "epoch": 0.5660604238689313, + "flos": 21652364102400.0, + "grad_norm": 1.4931937451298065, + "language_loss": 0.78322482, + "learning_rate": 1.588119497276072e-06, + "loss": 0.80407202, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 9415, + "time_per_iteration": 2.392521619796753 + }, + { + "auxiliary_loss_clip": 0.01008378, + "auxiliary_loss_mlp": 0.01007628, + "balance_loss_clip": 1.00668621, + "balance_loss_mlp": 1.00096989, + "epoch": 0.5661205471215993, + "flos": 68820755783040.0, + "grad_norm": 0.6924523099685265, + "language_loss": 0.49176306, + "learning_rate": 1.587749781085177e-06, + "loss": 0.51192307, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.07421875, + "step": 9416, + "time_per_iteration": 3.181480646133423 + }, + { + "auxiliary_loss_clip": 0.01061395, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01475155, + "balance_loss_mlp": 1.01950884, + "epoch": 0.5661806703742672, + "flos": 28620504339840.0, + "grad_norm": 3.8574120454246352, + "language_loss": 0.62547278, + "learning_rate": 1.587380079606432e-06, + "loss": 0.64635855, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 9417, + "time_per_iteration": 2.436441659927368 + }, + { + "auxiliary_loss_clip": 0.01062252, + "auxiliary_loss_mlp": 0.010256, + "balance_loss_clip": 1.01237392, + "balance_loss_mlp": 1.01973343, + "epoch": 0.5662407936269352, + "flos": 21396869706240.0, + "grad_norm": 1.8698333204179283, + "language_loss": 0.68641508, + "learning_rate": 1.5870103928530302e-06, + "loss": 0.70729363, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42578125, + "step": 9418, + "time_per_iteration": 2.408456563949585 + }, + { + "auxiliary_loss_clip": 0.01060526, + "auxiliary_loss_mlp": 0.01024909, + "balance_loss_clip": 1.0106039, + "balance_loss_mlp": 1.01886034, + "epoch": 0.5663009168796032, + "flos": 25658997062400.0, + "grad_norm": 1.7123471990066284, + "language_loss": 0.64450014, + "learning_rate": 1.5866407208381659e-06, + "loss": 0.66535449, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.41796875, + "step": 9419, + "time_per_iteration": 2.4274067878723145 + }, + { + "auxiliary_loss_clip": 0.01008773, + "auxiliary_loss_mlp": 0.01002475, + "balance_loss_clip": 1.00139606, + "balance_loss_mlp": 1.00132167, + "epoch": 0.5663610401322712, + "flos": 67926699020160.0, + "grad_norm": 0.7342971375794615, + "language_loss": 0.59692222, + "learning_rate": 1.58627106357503e-06, + "loss": 0.61703467, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.07421875, + "step": 9420, + "time_per_iteration": 3.120048761367798 + }, + { + "auxiliary_loss_clip": 0.0105838, + "auxiliary_loss_mlp": 0.01023564, + "balance_loss_clip": 1.01164877, + "balance_loss_mlp": 1.0185616, + "epoch": 0.5664211633849391, + "flos": 24608180828160.0, + "grad_norm": 1.7017242456473474, + "language_loss": 0.7408824, + "learning_rate": 1.5859014210768163e-06, + "loss": 0.76170188, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 9421, + "time_per_iteration": 2.438894033432007 + }, + { + "auxiliary_loss_clip": 0.01063742, + "auxiliary_loss_mlp": 0.0102614, + "balance_loss_clip": 1.01313972, + "balance_loss_mlp": 1.02058458, + "epoch": 0.5664812866376071, + "flos": 11648212012800.0, + "grad_norm": 2.4123865292308575, + "language_loss": 0.72718507, + "learning_rate": 1.5855317933567156e-06, + "loss": 0.74808395, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.43164062, + "step": 9422, + "time_per_iteration": 2.3925554752349854 + }, + { + "auxiliary_loss_clip": 0.01058552, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.01558948, + "balance_loss_mlp": 1.01993167, + "epoch": 0.566541409890275, + "flos": 24642849674880.0, + "grad_norm": 1.5822920614116676, + "language_loss": 0.81715393, + "learning_rate": 1.5851621804279186e-06, + "loss": 0.83800614, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.38671875, + "step": 9423, + "time_per_iteration": 2.439692735671997 + }, + { + "auxiliary_loss_clip": 0.01058807, + "auxiliary_loss_mlp": 0.01027589, + "balance_loss_clip": 1.01460147, + "balance_loss_mlp": 1.01907098, + "epoch": 0.566601533142943, + "flos": 22269558850560.0, + "grad_norm": 1.5575691099046272, + "language_loss": 0.80870384, + "learning_rate": 1.5847925823036169e-06, + "loss": 0.82956779, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.3984375, + "step": 9424, + "time_per_iteration": 2.4388504028320312 + }, + { + "auxiliary_loss_clip": 0.0106383, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.01503968, + "balance_loss_mlp": 1.02029908, + "epoch": 0.5666616563956111, + "flos": 29970351302400.0, + "grad_norm": 1.870749104412411, + "language_loss": 0.78632331, + "learning_rate": 1.584422998996999e-06, + "loss": 0.80724603, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.43554688, + "step": 9425, + "time_per_iteration": 3.8908638954162598 + }, + { + "auxiliary_loss_clip": 0.01060862, + "auxiliary_loss_mlp": 0.01026735, + "balance_loss_clip": 1.01454592, + "balance_loss_mlp": 1.01986694, + "epoch": 0.566721779648279, + "flos": 17780601191040.0, + "grad_norm": 2.0372999697643728, + "language_loss": 0.59117568, + "learning_rate": 1.584053430521256e-06, + "loss": 0.61205161, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41015625, + "step": 9426, + "time_per_iteration": 2.3901875019073486 + }, + { + "auxiliary_loss_clip": 0.01063695, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.01345897, + "balance_loss_mlp": 1.02007532, + "epoch": 0.566781902900947, + "flos": 21032411356800.0, + "grad_norm": 3.0393183372660015, + "language_loss": 0.81820703, + "learning_rate": 1.5836838768895757e-06, + "loss": 0.83910555, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4375, + "step": 9427, + "time_per_iteration": 2.4561314582824707 + }, + { + "auxiliary_loss_clip": 0.01058966, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.01560152, + "balance_loss_mlp": 1.0193727, + "epoch": 0.5668420261536149, + "flos": 23147484698880.0, + "grad_norm": 2.310331987835464, + "language_loss": 0.86115199, + "learning_rate": 1.5833143381151474e-06, + "loss": 0.88201869, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 9428, + "time_per_iteration": 2.4007062911987305 + }, + { + "auxiliary_loss_clip": 0.01064424, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.01690936, + "balance_loss_mlp": 1.02149904, + "epoch": 0.5669021494062829, + "flos": 22600500428160.0, + "grad_norm": 2.211203911932634, + "language_loss": 0.73971421, + "learning_rate": 1.5829448142111586e-06, + "loss": 0.76065981, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4296875, + "step": 9429, + "time_per_iteration": 3.885897397994995 + }, + { + "auxiliary_loss_clip": 0.01061812, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_clip": 1.0126071, + "balance_loss_mlp": 1.01913071, + "epoch": 0.5669622726589508, + "flos": 17380356831360.0, + "grad_norm": 1.934911514776056, + "language_loss": 0.75312358, + "learning_rate": 1.582575305190796e-06, + "loss": 0.77400243, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42578125, + "step": 9430, + "time_per_iteration": 2.3554370403289795 + }, + { + "auxiliary_loss_clip": 0.01061484, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.01614022, + "balance_loss_mlp": 1.01975274, + "epoch": 0.5670223959116188, + "flos": 18762463756800.0, + "grad_norm": 1.7289261014134238, + "language_loss": 0.72685838, + "learning_rate": 1.5822058110672475e-06, + "loss": 0.7477572, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41796875, + "step": 9431, + "time_per_iteration": 3.9325969219207764 + }, + { + "auxiliary_loss_clip": 0.01056185, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.01492977, + "balance_loss_mlp": 1.01993215, + "epoch": 0.5670825191642868, + "flos": 13552479365760.0, + "grad_norm": 1.6051534075708642, + "language_loss": 0.73447102, + "learning_rate": 1.5818363318536985e-06, + "loss": 0.75528884, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.36328125, + "step": 9432, + "time_per_iteration": 2.3944413661956787 + }, + { + "auxiliary_loss_clip": 0.01061269, + "auxiliary_loss_mlp": 0.01026174, + "balance_loss_clip": 1.01406217, + "balance_loss_mlp": 1.01978242, + "epoch": 0.5671426424169548, + "flos": 22052957575680.0, + "grad_norm": 1.5197893467557375, + "language_loss": 0.7800802, + "learning_rate": 1.5814668675633356e-06, + "loss": 0.80095464, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41601562, + "step": 9433, + "time_per_iteration": 2.403679609298706 + }, + { + "auxiliary_loss_clip": 0.01064654, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.01190603, + "balance_loss_mlp": 1.020432, + "epoch": 0.5672027656696227, + "flos": 21322923713280.0, + "grad_norm": 2.2739179608029914, + "language_loss": 0.63805759, + "learning_rate": 1.581097418209344e-06, + "loss": 0.65897012, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.44140625, + "step": 9434, + "time_per_iteration": 2.3983213901519775 + }, + { + "auxiliary_loss_clip": 0.01058992, + "auxiliary_loss_mlp": 0.01025295, + "balance_loss_clip": 1.01297486, + "balance_loss_mlp": 1.01878905, + "epoch": 0.5672628889222907, + "flos": 23512920566400.0, + "grad_norm": 1.5944337862245836, + "language_loss": 0.66560036, + "learning_rate": 1.580727983804907e-06, + "loss": 0.68644321, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 9435, + "time_per_iteration": 2.397047281265259 + }, + { + "auxiliary_loss_clip": 0.01058665, + "auxiliary_loss_mlp": 0.01022715, + "balance_loss_clip": 1.01112747, + "balance_loss_mlp": 1.01998544, + "epoch": 0.5673230121749586, + "flos": 27120810355200.0, + "grad_norm": 1.3778312359551874, + "language_loss": 0.7148627, + "learning_rate": 1.5803585643632102e-06, + "loss": 0.73567647, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 9436, + "time_per_iteration": 2.443995714187622 + }, + { + "auxiliary_loss_clip": 0.01060669, + "auxiliary_loss_mlp": 0.01024977, + "balance_loss_clip": 1.01155448, + "balance_loss_mlp": 1.01900721, + "epoch": 0.5673831354276266, + "flos": 31140569986560.0, + "grad_norm": 1.5648261017693414, + "language_loss": 0.74353981, + "learning_rate": 1.5799891598974366e-06, + "loss": 0.76439625, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41601562, + "step": 9437, + "time_per_iteration": 2.4840340614318848 + }, + { + "auxiliary_loss_clip": 0.01064842, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.01528597, + "balance_loss_mlp": 1.02238512, + "epoch": 0.5674432586802945, + "flos": 27671949077760.0, + "grad_norm": 2.851142650609121, + "language_loss": 0.61071706, + "learning_rate": 1.5796197704207698e-06, + "loss": 0.6316483, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42578125, + "step": 9438, + "time_per_iteration": 2.452641248703003 + }, + { + "auxiliary_loss_clip": 0.01059263, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01463246, + "balance_loss_mlp": 1.01848447, + "epoch": 0.5675033819329626, + "flos": 26613941103360.0, + "grad_norm": 1.4406979862310076, + "language_loss": 0.74913514, + "learning_rate": 1.579250395946392e-06, + "loss": 0.7700007, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 9439, + "time_per_iteration": 2.456995725631714 + }, + { + "auxiliary_loss_clip": 0.0106149, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.01568854, + "balance_loss_mlp": 1.02076471, + "epoch": 0.5675635051856306, + "flos": 19097385229440.0, + "grad_norm": 2.5291621671882356, + "language_loss": 0.83679599, + "learning_rate": 1.5788810364874849e-06, + "loss": 0.85770577, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.40625, + "step": 9440, + "time_per_iteration": 2.402573347091675 + }, + { + "auxiliary_loss_clip": 0.01063439, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.02022648, + "balance_loss_mlp": 1.01926053, + "epoch": 0.5676236284382985, + "flos": 17565361459200.0, + "grad_norm": 1.9575830711391697, + "language_loss": 0.75436723, + "learning_rate": 1.5785116920572307e-06, + "loss": 0.77534544, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.44140625, + "step": 9441, + "time_per_iteration": 2.3726086616516113 + }, + { + "auxiliary_loss_clip": 0.01061883, + "auxiliary_loss_mlp": 0.01029033, + "balance_loss_clip": 1.01695132, + "balance_loss_mlp": 1.02052498, + "epoch": 0.5676837516909665, + "flos": 15953352030720.0, + "grad_norm": 1.744303620050826, + "language_loss": 0.82719958, + "learning_rate": 1.5781423626688097e-06, + "loss": 0.84810877, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.4140625, + "step": 9442, + "time_per_iteration": 2.3745388984680176 + }, + { + "auxiliary_loss_clip": 0.01058472, + "auxiliary_loss_mlp": 0.0102341, + "balance_loss_clip": 1.01184082, + "balance_loss_mlp": 1.02139902, + "epoch": 0.5677438749436344, + "flos": 18294941473920.0, + "grad_norm": 2.2201276655370203, + "language_loss": 0.74435985, + "learning_rate": 1.5777730483354033e-06, + "loss": 0.76517862, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37109375, + "step": 9443, + "time_per_iteration": 2.401337146759033 + }, + { + "auxiliary_loss_clip": 0.0106144, + "auxiliary_loss_mlp": 0.01024385, + "balance_loss_clip": 1.01226723, + "balance_loss_mlp": 1.02038193, + "epoch": 0.5678039981963025, + "flos": 17930343479040.0, + "grad_norm": 1.9007644273721318, + "language_loss": 0.77683902, + "learning_rate": 1.5774037490701903e-06, + "loss": 0.79769725, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 9444, + "time_per_iteration": 2.3804421424865723 + }, + { + "auxiliary_loss_clip": 0.01063537, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.02022123, + "balance_loss_mlp": 1.02170992, + "epoch": 0.5678641214489704, + "flos": 19315382958720.0, + "grad_norm": 2.0922219365540666, + "language_loss": 0.81050646, + "learning_rate": 1.57703446488635e-06, + "loss": 0.83147585, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41796875, + "step": 9445, + "time_per_iteration": 2.4341964721679688 + }, + { + "auxiliary_loss_clip": 0.01057459, + "auxiliary_loss_mlp": 0.01022693, + "balance_loss_clip": 1.00972319, + "balance_loss_mlp": 1.01726127, + "epoch": 0.5679242447016384, + "flos": 27749700408960.0, + "grad_norm": 1.2562309505325533, + "language_loss": 0.71026182, + "learning_rate": 1.5766651957970624e-06, + "loss": 0.73106337, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40234375, + "step": 9446, + "time_per_iteration": 2.4903981685638428 + }, + { + "auxiliary_loss_clip": 0.01062466, + "auxiliary_loss_mlp": 0.01023831, + "balance_loss_clip": 1.01055074, + "balance_loss_mlp": 1.01935816, + "epoch": 0.5679843679543063, + "flos": 23767961114880.0, + "grad_norm": 3.4773328731908326, + "language_loss": 0.73619843, + "learning_rate": 1.5762959418155043e-06, + "loss": 0.75706136, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43164062, + "step": 9447, + "time_per_iteration": 2.446754217147827 + }, + { + "auxiliary_loss_clip": 0.01061606, + "auxiliary_loss_mlp": 0.01027119, + "balance_loss_clip": 1.01460814, + "balance_loss_mlp": 1.01926768, + "epoch": 0.5680444912069743, + "flos": 25590741621120.0, + "grad_norm": 2.0928593391746477, + "language_loss": 0.75049204, + "learning_rate": 1.5759267029548548e-06, + "loss": 0.77137923, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42382812, + "step": 9448, + "time_per_iteration": 2.4522361755371094 + }, + { + "auxiliary_loss_clip": 0.01065922, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.01543665, + "balance_loss_mlp": 1.02210808, + "epoch": 0.5681046144596422, + "flos": 23694678437760.0, + "grad_norm": 2.0915310522853954, + "language_loss": 0.76325011, + "learning_rate": 1.5755574792282902e-06, + "loss": 0.78419518, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43945312, + "step": 9449, + "time_per_iteration": 2.4514622688293457 + }, + { + "auxiliary_loss_clip": 0.01060517, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.01685762, + "balance_loss_mlp": 1.01978958, + "epoch": 0.5681647377123102, + "flos": 27999539164800.0, + "grad_norm": 1.6417518562858882, + "language_loss": 0.75551277, + "learning_rate": 1.5751882706489875e-06, + "loss": 0.77641326, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9450, + "time_per_iteration": 2.4786040782928467 + }, + { + "auxiliary_loss_clip": 0.0106655, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.0174973, + "balance_loss_mlp": 1.02264285, + "epoch": 0.5682248609649782, + "flos": 22746646846080.0, + "grad_norm": 1.707405853948617, + "language_loss": 0.67030871, + "learning_rate": 1.5748190772301228e-06, + "loss": 0.691275, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.43945312, + "step": 9451, + "time_per_iteration": 2.4240007400512695 + }, + { + "auxiliary_loss_clip": 0.01065454, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.01281285, + "balance_loss_mlp": 1.01974726, + "epoch": 0.5682849842176462, + "flos": 21287521728000.0, + "grad_norm": 1.847012263609353, + "language_loss": 0.71784371, + "learning_rate": 1.574449898984871e-06, + "loss": 0.73877257, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.45703125, + "step": 9452, + "time_per_iteration": 3.8430278301239014 + }, + { + "auxiliary_loss_clip": 0.01058661, + "auxiliary_loss_mlp": 0.0102478, + "balance_loss_clip": 1.01288867, + "balance_loss_mlp": 1.01922131, + "epoch": 0.5683451074703142, + "flos": 21140642171520.0, + "grad_norm": 1.44176779539113, + "language_loss": 0.70219898, + "learning_rate": 1.5740807359264082e-06, + "loss": 0.72303337, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 9453, + "time_per_iteration": 2.414170503616333 + }, + { + "auxiliary_loss_clip": 0.01060166, + "auxiliary_loss_mlp": 0.01027924, + "balance_loss_clip": 1.01610422, + "balance_loss_mlp": 1.01956129, + "epoch": 0.5684052307229821, + "flos": 22343435020800.0, + "grad_norm": 1.518763122495727, + "language_loss": 0.69291425, + "learning_rate": 1.5737115880679074e-06, + "loss": 0.71379519, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40625, + "step": 9454, + "time_per_iteration": 2.4188740253448486 + }, + { + "auxiliary_loss_clip": 0.01056868, + "auxiliary_loss_mlp": 0.0102572, + "balance_loss_clip": 1.01441956, + "balance_loss_mlp": 1.01779246, + "epoch": 0.5684653539756501, + "flos": 21797567913600.0, + "grad_norm": 1.9934491949568964, + "language_loss": 0.60950184, + "learning_rate": 1.5733424554225443e-06, + "loss": 0.6303277, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.390625, + "step": 9455, + "time_per_iteration": 2.398301839828491 + }, + { + "auxiliary_loss_clip": 0.01064524, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.01488876, + "balance_loss_mlp": 1.02126002, + "epoch": 0.568525477228318, + "flos": 22998615194880.0, + "grad_norm": 2.596707398434861, + "language_loss": 0.73019063, + "learning_rate": 1.5729733380034915e-06, + "loss": 0.75112295, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.43164062, + "step": 9456, + "time_per_iteration": 2.4062533378601074 + }, + { + "auxiliary_loss_clip": 0.01059742, + "auxiliary_loss_mlp": 0.01027773, + "balance_loss_clip": 1.0162394, + "balance_loss_mlp": 1.01941812, + "epoch": 0.568585600480986, + "flos": 21391563179520.0, + "grad_norm": 1.6231926244253596, + "language_loss": 0.76732433, + "learning_rate": 1.5726042358239212e-06, + "loss": 0.78819942, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40234375, + "step": 9457, + "time_per_iteration": 2.3860135078430176 + }, + { + "auxiliary_loss_clip": 0.01061517, + "auxiliary_loss_mlp": 0.01027513, + "balance_loss_clip": 1.01426899, + "balance_loss_mlp": 1.01992059, + "epoch": 0.568645723733654, + "flos": 30006067489920.0, + "grad_norm": 1.8241342213829144, + "language_loss": 0.66253042, + "learning_rate": 1.5722351488970072e-06, + "loss": 0.68342072, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41601562, + "step": 9458, + "time_per_iteration": 2.4705617427825928 + }, + { + "auxiliary_loss_clip": 0.01061503, + "auxiliary_loss_mlp": 0.0102458, + "balance_loss_clip": 1.01197362, + "balance_loss_mlp": 1.01920557, + "epoch": 0.568705846986322, + "flos": 20411620738560.0, + "grad_norm": 2.0685600350308, + "language_loss": 0.68090498, + "learning_rate": 1.5718660772359197e-06, + "loss": 0.70176584, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.42382812, + "step": 9459, + "time_per_iteration": 2.391523599624634 + }, + { + "auxiliary_loss_clip": 0.01062586, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.01820254, + "balance_loss_mlp": 1.02029777, + "epoch": 0.5687659702389899, + "flos": 17455804012800.0, + "grad_norm": 5.130584666163703, + "language_loss": 0.76953673, + "learning_rate": 1.571497020853831e-06, + "loss": 0.79047275, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.421875, + "step": 9460, + "time_per_iteration": 2.371680974960327 + }, + { + "auxiliary_loss_clip": 0.01059903, + "auxiliary_loss_mlp": 0.01026588, + "balance_loss_clip": 1.01334953, + "balance_loss_mlp": 1.01841974, + "epoch": 0.5688260934916579, + "flos": 25405038766080.0, + "grad_norm": 1.5991158618117742, + "language_loss": 0.71599275, + "learning_rate": 1.571127979763911e-06, + "loss": 0.73685765, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 9461, + "time_per_iteration": 2.4591526985168457 + }, + { + "auxiliary_loss_clip": 0.01060973, + "auxiliary_loss_mlp": 0.01024728, + "balance_loss_clip": 1.01150179, + "balance_loss_mlp": 1.01888216, + "epoch": 0.5688862167443258, + "flos": 21607186936320.0, + "grad_norm": 1.9930482638700293, + "language_loss": 0.72909224, + "learning_rate": 1.5707589539793305e-06, + "loss": 0.74994922, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.421875, + "step": 9462, + "time_per_iteration": 2.396146535873413 + }, + { + "auxiliary_loss_clip": 0.0105739, + "auxiliary_loss_mlp": 0.01021774, + "balance_loss_clip": 1.01053834, + "balance_loss_mlp": 1.01844513, + "epoch": 0.5689463399969938, + "flos": 22417904684160.0, + "grad_norm": 1.8191067514373986, + "language_loss": 0.72627687, + "learning_rate": 1.5703899435132588e-06, + "loss": 0.74706852, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.390625, + "step": 9463, + "time_per_iteration": 2.4200992584228516 + }, + { + "auxiliary_loss_clip": 0.01059463, + "auxiliary_loss_mlp": 0.01022855, + "balance_loss_clip": 1.0109396, + "balance_loss_mlp": 1.01894069, + "epoch": 0.5690064632496618, + "flos": 18295814257920.0, + "grad_norm": 1.787316726480116, + "language_loss": 0.74156976, + "learning_rate": 1.570020948378865e-06, + "loss": 0.76239294, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 9464, + "time_per_iteration": 3.855839967727661 + }, + { + "auxiliary_loss_clip": 0.01062771, + "auxiliary_loss_mlp": 0.01029696, + "balance_loss_clip": 1.01689839, + "balance_loss_mlp": 1.02009916, + "epoch": 0.5690665865023298, + "flos": 21578208641280.0, + "grad_norm": 1.6605620043678402, + "language_loss": 0.84417665, + "learning_rate": 1.5696519685893175e-06, + "loss": 0.86510134, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42773438, + "step": 9465, + "time_per_iteration": 2.4091506004333496 + }, + { + "auxiliary_loss_clip": 0.01056874, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.01506972, + "balance_loss_mlp": 1.0180645, + "epoch": 0.5691267097549978, + "flos": 24420418202880.0, + "grad_norm": 1.520250444875699, + "language_loss": 0.7228809, + "learning_rate": 1.5692830041577842e-06, + "loss": 0.74372262, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38867188, + "step": 9466, + "time_per_iteration": 2.477630853652954 + }, + { + "auxiliary_loss_clip": 0.01008448, + "auxiliary_loss_mlp": 0.01002598, + "balance_loss_clip": 1.0017333, + "balance_loss_mlp": 1.00094891, + "epoch": 0.5691868330076657, + "flos": 61654238000640.0, + "grad_norm": 0.7194965774071473, + "language_loss": 0.55356914, + "learning_rate": 1.5689140550974323e-06, + "loss": 0.57367957, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.00866699, + "router_z_loss_mlp": 0.07519531, + "step": 9467, + "time_per_iteration": 3.1138436794281006 + }, + { + "auxiliary_loss_clip": 0.01062454, + "auxiliary_loss_mlp": 0.01026224, + "balance_loss_clip": 1.01160884, + "balance_loss_mlp": 1.01903915, + "epoch": 0.5692469562603337, + "flos": 21324110699520.0, + "grad_norm": 3.320310840713623, + "language_loss": 0.6352275, + "learning_rate": 1.5685451214214292e-06, + "loss": 0.65611428, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.43359375, + "step": 9468, + "time_per_iteration": 2.404435634613037 + }, + { + "auxiliary_loss_clip": 0.0106099, + "auxiliary_loss_mlp": 0.010254, + "balance_loss_clip": 1.01297867, + "balance_loss_mlp": 1.01997471, + "epoch": 0.5693070795130016, + "flos": 23366774148480.0, + "grad_norm": 1.9846586154272066, + "language_loss": 0.79119998, + "learning_rate": 1.5681762031429405e-06, + "loss": 0.81206381, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 9469, + "time_per_iteration": 3.9633848667144775 + }, + { + "auxiliary_loss_clip": 0.01063221, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.0207901, + "balance_loss_mlp": 1.01964092, + "epoch": 0.5693672027656697, + "flos": 18696268085760.0, + "grad_norm": 2.5388051569573826, + "language_loss": 0.69406772, + "learning_rate": 1.5678073002751329e-06, + "loss": 0.71504861, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.43554688, + "step": 9470, + "time_per_iteration": 3.866490125656128 + }, + { + "auxiliary_loss_clip": 0.01060255, + "auxiliary_loss_mlp": 0.0102422, + "balance_loss_clip": 1.01147652, + "balance_loss_mlp": 1.01849258, + "epoch": 0.5694273260183376, + "flos": 20448139887360.0, + "grad_norm": 1.8430325087070283, + "language_loss": 0.74412751, + "learning_rate": 1.5674384128311702e-06, + "loss": 0.76497221, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41796875, + "step": 9471, + "time_per_iteration": 2.393246650695801 + }, + { + "auxiliary_loss_clip": 0.01062732, + "auxiliary_loss_mlp": 0.01024586, + "balance_loss_clip": 1.0113833, + "balance_loss_mlp": 1.02148342, + "epoch": 0.5694874492710056, + "flos": 17602229721600.0, + "grad_norm": 1.761497678259465, + "language_loss": 0.57365698, + "learning_rate": 1.5670695408242186e-06, + "loss": 0.59453017, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41210938, + "step": 9472, + "time_per_iteration": 2.4189398288726807 + }, + { + "auxiliary_loss_clip": 0.01058169, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.01294899, + "balance_loss_mlp": 1.01930451, + "epoch": 0.5695475725236735, + "flos": 13369988355840.0, + "grad_norm": 1.9647874228771776, + "language_loss": 0.73972356, + "learning_rate": 1.5667006842674412e-06, + "loss": 0.76056147, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38867188, + "step": 9473, + "time_per_iteration": 2.3852269649505615 + }, + { + "auxiliary_loss_clip": 0.01008373, + "auxiliary_loss_mlp": 0.01002658, + "balance_loss_clip": 1.00162053, + "balance_loss_mlp": 1.00077736, + "epoch": 0.5696076957763415, + "flos": 68170951958400.0, + "grad_norm": 0.645223981934886, + "language_loss": 0.57623541, + "learning_rate": 1.5663318431740017e-06, + "loss": 0.59634578, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.07617188, + "step": 9474, + "time_per_iteration": 3.083498954772949 + }, + { + "auxiliary_loss_clip": 0.0106098, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.01405621, + "balance_loss_mlp": 1.02060533, + "epoch": 0.5696678190290094, + "flos": 33836912421120.0, + "grad_norm": 1.6673364749467454, + "language_loss": 0.53686517, + "learning_rate": 1.5659630175570634e-06, + "loss": 0.55774403, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40429688, + "step": 9475, + "time_per_iteration": 2.517735719680786 + }, + { + "auxiliary_loss_clip": 0.01062447, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01769412, + "balance_loss_mlp": 1.02015567, + "epoch": 0.5697279422816774, + "flos": 26355479241600.0, + "grad_norm": 1.4414492256783564, + "language_loss": 0.79585379, + "learning_rate": 1.565594207429788e-06, + "loss": 0.81678814, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.421875, + "step": 9476, + "time_per_iteration": 2.4355502128601074 + }, + { + "auxiliary_loss_clip": 0.01063547, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.01347029, + "balance_loss_mlp": 1.02054739, + "epoch": 0.5697880655343454, + "flos": 22929382235520.0, + "grad_norm": 1.831446946822396, + "language_loss": 0.66962552, + "learning_rate": 1.5652254128053385e-06, + "loss": 0.69052184, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4296875, + "step": 9477, + "time_per_iteration": 2.423797369003296 + }, + { + "auxiliary_loss_clip": 0.01061402, + "auxiliary_loss_mlp": 0.01025755, + "balance_loss_clip": 1.01268935, + "balance_loss_mlp": 1.01970029, + "epoch": 0.5698481887870134, + "flos": 26760087521280.0, + "grad_norm": 1.6191598455132266, + "language_loss": 0.70622271, + "learning_rate": 1.5648566336968758e-06, + "loss": 0.72709429, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41796875, + "step": 9478, + "time_per_iteration": 2.4679572582244873 + }, + { + "auxiliary_loss_clip": 0.01057491, + "auxiliary_loss_mlp": 0.01025231, + "balance_loss_clip": 1.01378083, + "balance_loss_mlp": 1.01958871, + "epoch": 0.5699083120396814, + "flos": 15741359055360.0, + "grad_norm": 1.8381798064702564, + "language_loss": 0.69682056, + "learning_rate": 1.5644878701175604e-06, + "loss": 0.71764779, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37890625, + "step": 9479, + "time_per_iteration": 2.3870489597320557 + }, + { + "auxiliary_loss_clip": 0.0105834, + "auxiliary_loss_mlp": 0.01020554, + "balance_loss_clip": 1.00905621, + "balance_loss_mlp": 1.01922631, + "epoch": 0.5699684352923493, + "flos": 19536243419520.0, + "grad_norm": 1.4997179936836522, + "language_loss": 0.71588457, + "learning_rate": 1.5641191220805525e-06, + "loss": 0.73667353, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 9480, + "time_per_iteration": 2.41404390335083 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.01235342, + "balance_loss_mlp": 1.02057326, + "epoch": 0.5700285585450173, + "flos": 16252417670400.0, + "grad_norm": 2.1428211971466147, + "language_loss": 0.71702182, + "learning_rate": 1.5637503895990116e-06, + "loss": 0.73788857, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41210938, + "step": 9481, + "time_per_iteration": 2.407465934753418 + }, + { + "auxiliary_loss_clip": 0.01061953, + "auxiliary_loss_mlp": 0.0102646, + "balance_loss_clip": 1.01316798, + "balance_loss_mlp": 1.02076042, + "epoch": 0.5700886817976852, + "flos": 19863973152000.0, + "grad_norm": 1.6288991455708166, + "language_loss": 0.78389817, + "learning_rate": 1.5633816726860975e-06, + "loss": 0.80478227, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41210938, + "step": 9482, + "time_per_iteration": 2.40372896194458 + }, + { + "auxiliary_loss_clip": 0.01060505, + "auxiliary_loss_mlp": 0.01025105, + "balance_loss_clip": 1.01240325, + "balance_loss_mlp": 1.01954031, + "epoch": 0.5701488050503533, + "flos": 23840580476160.0, + "grad_norm": 1.5803560079655012, + "language_loss": 0.77779877, + "learning_rate": 1.5630129713549685e-06, + "loss": 0.79865485, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 9483, + "time_per_iteration": 2.423156499862671 + }, + { + "auxiliary_loss_clip": 0.01058601, + "auxiliary_loss_mlp": 0.01026433, + "balance_loss_clip": 1.01428521, + "balance_loss_mlp": 1.01882637, + "epoch": 0.5702089283030212, + "flos": 23658543313920.0, + "grad_norm": 1.9822666709371142, + "language_loss": 0.80911744, + "learning_rate": 1.562644285618782e-06, + "loss": 0.8299678, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3984375, + "step": 9484, + "time_per_iteration": 2.4258852005004883 + }, + { + "auxiliary_loss_clip": 0.01062068, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01318169, + "balance_loss_mlp": 1.01992178, + "epoch": 0.5702690515556892, + "flos": 27889946807040.0, + "grad_norm": 2.310855333220534, + "language_loss": 0.61002409, + "learning_rate": 1.5622756154906964e-06, + "loss": 0.63091278, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.421875, + "step": 9485, + "time_per_iteration": 2.447519063949585 + }, + { + "auxiliary_loss_clip": 0.01059743, + "auxiliary_loss_mlp": 0.01024341, + "balance_loss_clip": 1.01173496, + "balance_loss_mlp": 1.01919031, + "epoch": 0.5703291748083571, + "flos": 24022827106560.0, + "grad_norm": 1.8625887705083304, + "language_loss": 0.65266126, + "learning_rate": 1.5619069609838676e-06, + "loss": 0.67350209, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 9486, + "time_per_iteration": 2.4241302013397217 + }, + { + "auxiliary_loss_clip": 0.01009033, + "auxiliary_loss_mlp": 0.01000906, + "balance_loss_clip": 0.99992877, + "balance_loss_mlp": 1.00119519, + "epoch": 0.5703892980610251, + "flos": 57019867061760.0, + "grad_norm": 0.664686208436981, + "language_loss": 0.47987413, + "learning_rate": 1.5615383221114531e-06, + "loss": 0.49997354, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.07861328, + "step": 9487, + "time_per_iteration": 3.1125619411468506 + }, + { + "auxiliary_loss_clip": 0.01060845, + "auxiliary_loss_mlp": 0.01024174, + "balance_loss_clip": 1.01129913, + "balance_loss_mlp": 1.01993334, + "epoch": 0.570449421313693, + "flos": 24349928434560.0, + "grad_norm": 1.6602997392289691, + "language_loss": 0.86006123, + "learning_rate": 1.5611696988866076e-06, + "loss": 0.88091147, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40820312, + "step": 9488, + "time_per_iteration": 2.46226167678833 + }, + { + "auxiliary_loss_clip": 0.01062565, + "auxiliary_loss_mlp": 0.01027718, + "balance_loss_clip": 1.01466489, + "balance_loss_mlp": 1.02001953, + "epoch": 0.570509544566361, + "flos": 24827365543680.0, + "grad_norm": 1.3619800961957986, + "language_loss": 0.78913558, + "learning_rate": 1.5608010913224868e-06, + "loss": 0.81003845, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 9489, + "time_per_iteration": 2.4374165534973145 + }, + { + "auxiliary_loss_clip": 0.01060718, + "auxiliary_loss_mlp": 0.01026695, + "balance_loss_clip": 1.01508451, + "balance_loss_mlp": 1.02005172, + "epoch": 0.570569667819029, + "flos": 21396241301760.0, + "grad_norm": 1.8955687623226507, + "language_loss": 0.69883323, + "learning_rate": 1.5604324994322453e-06, + "loss": 0.71970737, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40625, + "step": 9490, + "time_per_iteration": 2.3888514041900635 + }, + { + "auxiliary_loss_clip": 0.01060422, + "auxiliary_loss_mlp": 0.01023008, + "balance_loss_clip": 1.01090217, + "balance_loss_mlp": 1.02007818, + "epoch": 0.570629791071697, + "flos": 23215775051520.0, + "grad_norm": 1.427238492566915, + "language_loss": 0.76353252, + "learning_rate": 1.560063923229037e-06, + "loss": 0.78436685, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40429688, + "step": 9491, + "time_per_iteration": 2.437507390975952 + }, + { + "auxiliary_loss_clip": 0.01008367, + "auxiliary_loss_mlp": 0.01001797, + "balance_loss_clip": 1.00092709, + "balance_loss_mlp": 1.00072002, + "epoch": 0.570689914324365, + "flos": 65281505460480.0, + "grad_norm": 0.795050856522655, + "language_loss": 0.57393366, + "learning_rate": 1.559695362726016e-06, + "loss": 0.59403533, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.00872803, + "router_z_loss_mlp": 0.07666016, + "step": 9492, + "time_per_iteration": 4.47448205947876 + }, + { + "auxiliary_loss_clip": 0.01061816, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.01398194, + "balance_loss_mlp": 1.02062583, + "epoch": 0.5707500375770329, + "flos": 21140851639680.0, + "grad_norm": 1.9551185688272696, + "language_loss": 0.71861899, + "learning_rate": 1.5593268179363346e-06, + "loss": 0.73950356, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41210938, + "step": 9493, + "time_per_iteration": 2.3850409984588623 + }, + { + "auxiliary_loss_clip": 0.01063524, + "auxiliary_loss_mlp": 0.01024335, + "balance_loss_clip": 1.01078069, + "balance_loss_mlp": 1.02030003, + "epoch": 0.5708101608297009, + "flos": 20811725452800.0, + "grad_norm": 1.573371365787169, + "language_loss": 0.78234839, + "learning_rate": 1.5589582888731462e-06, + "loss": 0.80322695, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43359375, + "step": 9494, + "time_per_iteration": 2.4109439849853516 + }, + { + "auxiliary_loss_clip": 0.01064039, + "auxiliary_loss_mlp": 0.01025806, + "balance_loss_clip": 1.0118289, + "balance_loss_mlp": 1.02020562, + "epoch": 0.5708702840823688, + "flos": 25811148234240.0, + "grad_norm": 2.6907359285823786, + "language_loss": 0.80122423, + "learning_rate": 1.5585897755496016e-06, + "loss": 0.82212269, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4375, + "step": 9495, + "time_per_iteration": 2.4283947944641113 + }, + { + "auxiliary_loss_clip": 0.01066561, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.01649761, + "balance_loss_mlp": 1.02128327, + "epoch": 0.5709304073350369, + "flos": 23651002460160.0, + "grad_norm": 2.67289304667543, + "language_loss": 0.66514164, + "learning_rate": 1.558221277978852e-06, + "loss": 0.68612754, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.453125, + "step": 9496, + "time_per_iteration": 2.4321248531341553 + }, + { + "auxiliary_loss_clip": 0.01064982, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.01764774, + "balance_loss_mlp": 1.02092493, + "epoch": 0.5709905305877048, + "flos": 16106620366080.0, + "grad_norm": 3.4582183880563417, + "language_loss": 0.69793689, + "learning_rate": 1.557852796174049e-06, + "loss": 0.71890193, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44140625, + "step": 9497, + "time_per_iteration": 2.3781375885009766 + }, + { + "auxiliary_loss_clip": 0.01063047, + "auxiliary_loss_mlp": 0.01026493, + "balance_loss_clip": 1.01308775, + "balance_loss_mlp": 1.02047241, + "epoch": 0.5710506538403728, + "flos": 24749753857920.0, + "grad_norm": 1.8373289924730942, + "language_loss": 0.81040221, + "learning_rate": 1.5574843301483422e-06, + "loss": 0.83129752, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42578125, + "step": 9498, + "time_per_iteration": 2.4516875743865967 + }, + { + "auxiliary_loss_clip": 0.01062358, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01236475, + "balance_loss_mlp": 1.01969063, + "epoch": 0.5711107770930407, + "flos": 21981141175680.0, + "grad_norm": 2.093830423748108, + "language_loss": 0.66119909, + "learning_rate": 1.5571158799148815e-06, + "loss": 0.68208539, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.42578125, + "step": 9499, + "time_per_iteration": 2.400934934616089 + }, + { + "auxiliary_loss_clip": 0.01061976, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01382458, + "balance_loss_mlp": 1.02050853, + "epoch": 0.5711709003457087, + "flos": 19572972036480.0, + "grad_norm": 1.5311052378842842, + "language_loss": 0.71810329, + "learning_rate": 1.556747445486816e-06, + "loss": 0.73899442, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 9500, + "time_per_iteration": 2.4110214710235596 + }, + { + "auxiliary_loss_clip": 0.01063709, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.01671863, + "balance_loss_mlp": 1.0194298, + "epoch": 0.5712310235983766, + "flos": 24241557974400.0, + "grad_norm": 1.85441124968441, + "language_loss": 0.76210284, + "learning_rate": 1.5563790268772934e-06, + "loss": 0.78305191, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.44140625, + "step": 9501, + "time_per_iteration": 2.4323153495788574 + }, + { + "auxiliary_loss_clip": 0.01062559, + "auxiliary_loss_mlp": 0.01025426, + "balance_loss_clip": 1.01261091, + "balance_loss_mlp": 1.01933134, + "epoch": 0.5712911468510447, + "flos": 20995089246720.0, + "grad_norm": 1.5710509098490617, + "language_loss": 0.74833488, + "learning_rate": 1.5560106240994629e-06, + "loss": 0.76921475, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43359375, + "step": 9502, + "time_per_iteration": 2.405400276184082 + }, + { + "auxiliary_loss_clip": 0.01060103, + "auxiliary_loss_mlp": 0.01024683, + "balance_loss_clip": 1.01202881, + "balance_loss_mlp": 1.0194999, + "epoch": 0.5713512701037126, + "flos": 18915976471680.0, + "grad_norm": 1.6760489059950667, + "language_loss": 0.82107949, + "learning_rate": 1.5556422371664705e-06, + "loss": 0.84192735, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9503, + "time_per_iteration": 2.425752639770508 + }, + { + "auxiliary_loss_clip": 0.01060682, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.01638603, + "balance_loss_mlp": 1.01980734, + "epoch": 0.5714113933563806, + "flos": 17412686616960.0, + "grad_norm": 21.968022560067563, + "language_loss": 0.86054099, + "learning_rate": 1.555273866091464e-06, + "loss": 0.88143748, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 9504, + "time_per_iteration": 3.8209402561187744 + }, + { + "auxiliary_loss_clip": 0.01058971, + "auxiliary_loss_mlp": 0.01023516, + "balance_loss_clip": 1.01155949, + "balance_loss_mlp": 1.01938653, + "epoch": 0.5714715166090486, + "flos": 20192331288960.0, + "grad_norm": 1.739382324535408, + "language_loss": 0.7509436, + "learning_rate": 1.5549055108875895e-06, + "loss": 0.77176845, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 9505, + "time_per_iteration": 2.396498680114746 + }, + { + "auxiliary_loss_clip": 0.01060567, + "auxiliary_loss_mlp": 0.01023727, + "balance_loss_clip": 1.01210451, + "balance_loss_mlp": 1.01873803, + "epoch": 0.5715316398617165, + "flos": 18550680249600.0, + "grad_norm": 1.5206470824768503, + "language_loss": 0.81864721, + "learning_rate": 1.5545371715679919e-06, + "loss": 0.83949012, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.41796875, + "step": 9506, + "time_per_iteration": 2.418351411819458 + }, + { + "auxiliary_loss_clip": 0.01058848, + "auxiliary_loss_mlp": 0.0101983, + "balance_loss_clip": 1.00812387, + "balance_loss_mlp": 1.01885664, + "epoch": 0.5715917631143845, + "flos": 18477223015680.0, + "grad_norm": 2.190033842333699, + "language_loss": 0.77468503, + "learning_rate": 1.5541688481458169e-06, + "loss": 0.79547179, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40039062, + "step": 9507, + "time_per_iteration": 2.3849408626556396 + }, + { + "auxiliary_loss_clip": 0.01061599, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.01304531, + "balance_loss_mlp": 1.01929188, + "epoch": 0.5716518863670524, + "flos": 24019021768320.0, + "grad_norm": 2.656128952711751, + "language_loss": 0.68349874, + "learning_rate": 1.5538005406342088e-06, + "loss": 0.70438176, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.421875, + "step": 9508, + "time_per_iteration": 3.890115261077881 + }, + { + "auxiliary_loss_clip": 0.01062542, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.0153861, + "balance_loss_mlp": 1.0196991, + "epoch": 0.5717120096197205, + "flos": 17818586616960.0, + "grad_norm": 2.193837279931019, + "language_loss": 0.79703039, + "learning_rate": 1.5534322490463124e-06, + "loss": 0.81794024, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42773438, + "step": 9509, + "time_per_iteration": 2.3936150074005127 + }, + { + "auxiliary_loss_clip": 0.01059431, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.01691341, + "balance_loss_mlp": 1.01849699, + "epoch": 0.5717721328723884, + "flos": 21865125127680.0, + "grad_norm": 1.800844611921343, + "language_loss": 0.72058094, + "learning_rate": 1.5530639733952697e-06, + "loss": 0.74146771, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.41015625, + "step": 9510, + "time_per_iteration": 3.934203863143921 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.01022935, + "balance_loss_clip": 1.01034617, + "balance_loss_mlp": 1.01968777, + "epoch": 0.5718322561250564, + "flos": 28436407407360.0, + "grad_norm": 1.426479078282339, + "language_loss": 0.69525248, + "learning_rate": 1.552695713694224e-06, + "loss": 0.71609926, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41992188, + "step": 9511, + "time_per_iteration": 2.4746532440185547 + }, + { + "auxiliary_loss_clip": 0.0106099, + "auxiliary_loss_mlp": 0.01028695, + "balance_loss_clip": 1.01667821, + "balance_loss_mlp": 1.01962256, + "epoch": 0.5718923793777243, + "flos": 13551013088640.0, + "grad_norm": 2.0204689953598067, + "language_loss": 0.61658859, + "learning_rate": 1.552327469956318e-06, + "loss": 0.63748544, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 9512, + "time_per_iteration": 2.3744871616363525 + }, + { + "auxiliary_loss_clip": 0.01060192, + "auxiliary_loss_mlp": 0.0102103, + "balance_loss_clip": 1.00907922, + "balance_loss_mlp": 1.01986468, + "epoch": 0.5719525026303923, + "flos": 17821065323520.0, + "grad_norm": 4.151134150230552, + "language_loss": 0.75384098, + "learning_rate": 1.5519592421946925e-06, + "loss": 0.7746532, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40429688, + "step": 9513, + "time_per_iteration": 2.4153242111206055 + }, + { + "auxiliary_loss_clip": 0.01008868, + "auxiliary_loss_mlp": 0.01000859, + "balance_loss_clip": 0.99992359, + "balance_loss_mlp": 1.00123167, + "epoch": 0.5720126258830602, + "flos": 61295262600960.0, + "grad_norm": 0.8917714503070419, + "language_loss": 0.66857994, + "learning_rate": 1.5515910304224898e-06, + "loss": 0.68867725, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.07617188, + "step": 9514, + "time_per_iteration": 3.016826629638672 + }, + { + "auxiliary_loss_clip": 0.01066091, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.01896644, + "balance_loss_mlp": 1.0216608, + "epoch": 0.5720727491357283, + "flos": 23986901450880.0, + "grad_norm": 2.0849121374063193, + "language_loss": 0.81687784, + "learning_rate": 1.551222834652849e-06, + "loss": 0.83786285, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4453125, + "step": 9515, + "time_per_iteration": 2.416442394256592 + }, + { + "auxiliary_loss_clip": 0.01059205, + "auxiliary_loss_mlp": 0.01022782, + "balance_loss_clip": 1.01070583, + "balance_loss_mlp": 1.01922631, + "epoch": 0.5721328723883962, + "flos": 23404270815360.0, + "grad_norm": 1.6754829531298445, + "language_loss": 0.85897857, + "learning_rate": 1.5508546548989117e-06, + "loss": 0.87979841, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40039062, + "step": 9516, + "time_per_iteration": 2.400451183319092 + }, + { + "auxiliary_loss_clip": 0.01058327, + "auxiliary_loss_mlp": 0.01024737, + "balance_loss_clip": 1.01245832, + "balance_loss_mlp": 1.01816225, + "epoch": 0.5721929956410642, + "flos": 18803975230080.0, + "grad_norm": 1.622811174945259, + "language_loss": 0.87530184, + "learning_rate": 1.5504864911738163e-06, + "loss": 0.89613247, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 9517, + "time_per_iteration": 2.4118571281433105 + }, + { + "auxiliary_loss_clip": 0.01060826, + "auxiliary_loss_mlp": 0.010245, + "balance_loss_clip": 1.01145828, + "balance_loss_mlp": 1.0197401, + "epoch": 0.5722531188937322, + "flos": 27195489486720.0, + "grad_norm": 1.9314442326115622, + "language_loss": 0.85369122, + "learning_rate": 1.5501183434907012e-06, + "loss": 0.8745445, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 9518, + "time_per_iteration": 2.4388647079467773 + }, + { + "auxiliary_loss_clip": 0.01058473, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.0149653, + "balance_loss_mlp": 1.0192008, + "epoch": 0.5723132421464001, + "flos": 15194758809600.0, + "grad_norm": 1.5697952905204295, + "language_loss": 0.78849739, + "learning_rate": 1.5497502118627057e-06, + "loss": 0.80934966, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.39257812, + "step": 9519, + "time_per_iteration": 2.3676533699035645 + }, + { + "auxiliary_loss_clip": 0.010602, + "auxiliary_loss_mlp": 0.01024616, + "balance_loss_clip": 1.01294541, + "balance_loss_mlp": 1.01988804, + "epoch": 0.5723733653990681, + "flos": 27598212552960.0, + "grad_norm": 1.5390429624962607, + "language_loss": 0.82917398, + "learning_rate": 1.5493820963029665e-06, + "loss": 0.85002214, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40234375, + "step": 9520, + "time_per_iteration": 2.4553682804107666 + }, + { + "auxiliary_loss_clip": 0.01061375, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.0167799, + "balance_loss_mlp": 1.01974416, + "epoch": 0.572433488651736, + "flos": 18221903176320.0, + "grad_norm": 1.8131877762435964, + "language_loss": 0.72233212, + "learning_rate": 1.5490139968246214e-06, + "loss": 0.74324453, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 9521, + "time_per_iteration": 2.4155516624450684 + }, + { + "auxiliary_loss_clip": 0.01059978, + "auxiliary_loss_mlp": 0.01025965, + "balance_loss_clip": 1.01416326, + "balance_loss_mlp": 1.01872957, + "epoch": 0.5724936119044041, + "flos": 31247753460480.0, + "grad_norm": 1.5685192007609903, + "language_loss": 0.78185701, + "learning_rate": 1.548645913440807e-06, + "loss": 0.80271643, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41210938, + "step": 9522, + "time_per_iteration": 2.482383966445923 + }, + { + "auxiliary_loss_clip": 0.01057762, + "auxiliary_loss_mlp": 0.01025016, + "balance_loss_clip": 1.01396537, + "balance_loss_mlp": 1.01926374, + "epoch": 0.572553735157072, + "flos": 19201356858240.0, + "grad_norm": 1.5258643387413913, + "language_loss": 0.71832156, + "learning_rate": 1.5482778461646583e-06, + "loss": 0.73914933, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38476562, + "step": 9523, + "time_per_iteration": 2.4673726558685303 + }, + { + "auxiliary_loss_clip": 0.01057984, + "auxiliary_loss_mlp": 0.01023307, + "balance_loss_clip": 1.01139259, + "balance_loss_mlp": 1.0188241, + "epoch": 0.57261385840974, + "flos": 21213855025920.0, + "grad_norm": 2.197564051268722, + "language_loss": 0.69987547, + "learning_rate": 1.5479097950093124e-06, + "loss": 0.72068846, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 9524, + "time_per_iteration": 2.413179874420166 + }, + { + "auxiliary_loss_clip": 0.01061603, + "auxiliary_loss_mlp": 0.01024601, + "balance_loss_clip": 1.01247787, + "balance_loss_mlp": 1.02096212, + "epoch": 0.5726739816624079, + "flos": 33983128661760.0, + "grad_norm": 1.5749846039759325, + "language_loss": 0.65156424, + "learning_rate": 1.5475417599879017e-06, + "loss": 0.67242628, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 9525, + "time_per_iteration": 2.5300910472869873 + }, + { + "auxiliary_loss_clip": 0.01057443, + "auxiliary_loss_mlp": 0.01022077, + "balance_loss_clip": 1.01025176, + "balance_loss_mlp": 1.0178324, + "epoch": 0.5727341049150759, + "flos": 24274935100800.0, + "grad_norm": 1.7609428249978383, + "language_loss": 0.72526318, + "learning_rate": 1.5471737411135623e-06, + "loss": 0.7460584, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39648438, + "step": 9526, + "time_per_iteration": 2.4289941787719727 + }, + { + "auxiliary_loss_clip": 0.01058081, + "auxiliary_loss_mlp": 0.01022841, + "balance_loss_clip": 1.01133108, + "balance_loss_mlp": 1.01890922, + "epoch": 0.5727942281677438, + "flos": 28399364588160.0, + "grad_norm": 1.6175333501962792, + "language_loss": 0.80391049, + "learning_rate": 1.5468057383994275e-06, + "loss": 0.82471967, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 9527, + "time_per_iteration": 2.5049285888671875 + }, + { + "auxiliary_loss_clip": 0.0105838, + "auxiliary_loss_mlp": 0.01024385, + "balance_loss_clip": 1.01156998, + "balance_loss_mlp": 1.01847625, + "epoch": 0.5728543514204119, + "flos": 19535754660480.0, + "grad_norm": 1.7372288137515166, + "language_loss": 0.72639215, + "learning_rate": 1.5464377518586296e-06, + "loss": 0.7472198, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3984375, + "step": 9528, + "time_per_iteration": 2.406308889389038 + }, + { + "auxiliary_loss_clip": 0.01060358, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01537633, + "balance_loss_mlp": 1.01951909, + "epoch": 0.5729144746730798, + "flos": 21505694014080.0, + "grad_norm": 2.58194083978683, + "language_loss": 0.85848081, + "learning_rate": 1.5460697815043021e-06, + "loss": 0.87936091, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40820312, + "step": 9529, + "time_per_iteration": 2.412734270095825 + }, + { + "auxiliary_loss_clip": 0.01008405, + "auxiliary_loss_mlp": 0.01002513, + "balance_loss_clip": 1.00161278, + "balance_loss_mlp": 1.00084615, + "epoch": 0.5729745979257478, + "flos": 58678626533760.0, + "grad_norm": 0.7740080065316932, + "language_loss": 0.56148803, + "learning_rate": 1.5457018273495758e-06, + "loss": 0.58159721, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.07568359, + "step": 9530, + "time_per_iteration": 2.898390531539917 + }, + { + "auxiliary_loss_clip": 0.01063056, + "auxiliary_loss_mlp": 0.01021902, + "balance_loss_clip": 1.00989771, + "balance_loss_mlp": 1.02123415, + "epoch": 0.5730347211784158, + "flos": 18551099185920.0, + "grad_norm": 1.8016430909080838, + "language_loss": 0.66520214, + "learning_rate": 1.5453338894075834e-06, + "loss": 0.68605173, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41796875, + "step": 9531, + "time_per_iteration": 3.921294689178467 + }, + { + "auxiliary_loss_clip": 0.01061349, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.0131917, + "balance_loss_mlp": 1.01989293, + "epoch": 0.5730948444310837, + "flos": 38030051197440.0, + "grad_norm": 2.7269638183020706, + "language_loss": 0.75615883, + "learning_rate": 1.5449659676914547e-06, + "loss": 0.77703273, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 9532, + "time_per_iteration": 2.5360214710235596 + }, + { + "auxiliary_loss_clip": 0.01060624, + "auxiliary_loss_mlp": 0.01024567, + "balance_loss_clip": 1.01204419, + "balance_loss_mlp": 1.0200665, + "epoch": 0.5731549676837517, + "flos": 25225934158080.0, + "grad_norm": 1.3330729426486136, + "language_loss": 0.78941619, + "learning_rate": 1.54459806221432e-06, + "loss": 0.8102681, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 9533, + "time_per_iteration": 2.46632719039917 + }, + { + "auxiliary_loss_clip": 0.01060094, + "auxiliary_loss_mlp": 0.01022429, + "balance_loss_clip": 1.01078272, + "balance_loss_mlp": 1.02055347, + "epoch": 0.5732150909364196, + "flos": 23367088350720.0, + "grad_norm": 1.6368235639506152, + "language_loss": 0.85726857, + "learning_rate": 1.5442301729893092e-06, + "loss": 0.87809378, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39453125, + "step": 9534, + "time_per_iteration": 2.446441173553467 + }, + { + "auxiliary_loss_clip": 0.01060963, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.01668119, + "balance_loss_mlp": 1.0194869, + "epoch": 0.5732752141890877, + "flos": 23078147005440.0, + "grad_norm": 1.4970122848937701, + "language_loss": 0.75135529, + "learning_rate": 1.543862300029551e-06, + "loss": 0.77226496, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 9535, + "time_per_iteration": 2.460066318511963 + }, + { + "auxiliary_loss_clip": 0.01064376, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.01974761, + "balance_loss_mlp": 1.02099013, + "epoch": 0.5733353374417556, + "flos": 24351150332160.0, + "grad_norm": 1.6259670328690408, + "language_loss": 0.7118414, + "learning_rate": 1.543494443348174e-06, + "loss": 0.73280472, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.43359375, + "step": 9536, + "time_per_iteration": 2.47458815574646 + }, + { + "auxiliary_loss_clip": 0.01060143, + "auxiliary_loss_mlp": 0.01026358, + "balance_loss_clip": 1.01400161, + "balance_loss_mlp": 1.01972246, + "epoch": 0.5733954606944236, + "flos": 27197619079680.0, + "grad_norm": 2.068779992977388, + "language_loss": 0.7018429, + "learning_rate": 1.5431266029583058e-06, + "loss": 0.72270793, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40429688, + "step": 9537, + "time_per_iteration": 2.4477131366729736 + }, + { + "auxiliary_loss_clip": 0.01060146, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.01643777, + "balance_loss_mlp": 1.01968813, + "epoch": 0.5734555839470915, + "flos": 28763927671680.0, + "grad_norm": 1.9568852636773018, + "language_loss": 0.63018429, + "learning_rate": 1.5427587788730744e-06, + "loss": 0.65107042, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40429688, + "step": 9538, + "time_per_iteration": 2.4737722873687744 + }, + { + "auxiliary_loss_clip": 0.01058716, + "auxiliary_loss_mlp": 0.010272, + "balance_loss_clip": 1.0146594, + "balance_loss_mlp": 1.01918149, + "epoch": 0.5735157071997595, + "flos": 22965691916160.0, + "grad_norm": 1.6550135735828069, + "language_loss": 0.83198678, + "learning_rate": 1.5423909711056062e-06, + "loss": 0.85284591, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39453125, + "step": 9539, + "time_per_iteration": 2.41890811920166 + }, + { + "auxiliary_loss_clip": 0.01061316, + "auxiliary_loss_mlp": 0.01026335, + "balance_loss_clip": 1.01337099, + "balance_loss_mlp": 1.01976407, + "epoch": 0.5735758304524274, + "flos": 18988456187520.0, + "grad_norm": 1.9152013315505603, + "language_loss": 0.73206127, + "learning_rate": 1.5420231796690268e-06, + "loss": 0.75293779, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4140625, + "step": 9540, + "time_per_iteration": 2.392683506011963 + }, + { + "auxiliary_loss_clip": 0.01056691, + "auxiliary_loss_mlp": 0.01022375, + "balance_loss_clip": 1.01134849, + "balance_loss_mlp": 1.01769602, + "epoch": 0.5736359537050955, + "flos": 28396816058880.0, + "grad_norm": 1.7637242442433667, + "language_loss": 0.79789865, + "learning_rate": 1.5416554045764623e-06, + "loss": 0.81868935, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.390625, + "step": 9541, + "time_per_iteration": 2.4710006713867188 + }, + { + "auxiliary_loss_clip": 0.01062753, + "auxiliary_loss_mlp": 0.0102689, + "balance_loss_clip": 1.01385498, + "balance_loss_mlp": 1.01973248, + "epoch": 0.5736960769577634, + "flos": 15626460170880.0, + "grad_norm": 2.3827345681580234, + "language_loss": 0.64912331, + "learning_rate": 1.541287645841037e-06, + "loss": 0.67001969, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4296875, + "step": 9542, + "time_per_iteration": 2.368363618850708 + }, + { + "auxiliary_loss_clip": 0.01060009, + "auxiliary_loss_mlp": 0.0102368, + "balance_loss_clip": 1.01234317, + "balance_loss_mlp": 1.01824164, + "epoch": 0.5737562002104314, + "flos": 18003032663040.0, + "grad_norm": 2.0316981940862253, + "language_loss": 0.83268088, + "learning_rate": 1.540919903475876e-06, + "loss": 0.85351777, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.41796875, + "step": 9543, + "time_per_iteration": 3.8184828758239746 + }, + { + "auxiliary_loss_clip": 0.01063932, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.01246047, + "balance_loss_mlp": 1.02008939, + "epoch": 0.5738163234630994, + "flos": 20697315327360.0, + "grad_norm": 1.7269084836654882, + "language_loss": 0.73127514, + "learning_rate": 1.5405521774941027e-06, + "loss": 0.75217879, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.43945312, + "step": 9544, + "time_per_iteration": 2.429131031036377 + }, + { + "auxiliary_loss_clip": 0.01061657, + "auxiliary_loss_mlp": 0.01023276, + "balance_loss_clip": 1.01019275, + "balance_loss_mlp": 1.01909137, + "epoch": 0.5738764467157673, + "flos": 23148182926080.0, + "grad_norm": 1.9637194390834316, + "language_loss": 0.75943369, + "learning_rate": 1.5401844679088399e-06, + "loss": 0.78028303, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 9545, + "time_per_iteration": 2.445827007293701 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01028704, + "balance_loss_clip": 1.01604366, + "balance_loss_mlp": 1.01843798, + "epoch": 0.5739365699684353, + "flos": 29491762118400.0, + "grad_norm": 2.1066943438878583, + "language_loss": 0.56188893, + "learning_rate": 1.539816774733211e-06, + "loss": 0.58277118, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 9546, + "time_per_iteration": 2.478156805038452 + }, + { + "auxiliary_loss_clip": 0.01060261, + "auxiliary_loss_mlp": 0.01025361, + "balance_loss_clip": 1.01234365, + "balance_loss_mlp": 1.01882768, + "epoch": 0.5739966932211032, + "flos": 14026390427520.0, + "grad_norm": 2.253847155911352, + "language_loss": 0.76685965, + "learning_rate": 1.5394490979803374e-06, + "loss": 0.78771585, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 9547, + "time_per_iteration": 3.848097085952759 + }, + { + "auxiliary_loss_clip": 0.01060719, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.01349425, + "balance_loss_mlp": 1.01989424, + "epoch": 0.5740568164737713, + "flos": 19061040637440.0, + "grad_norm": 1.8950097818593086, + "language_loss": 0.74267834, + "learning_rate": 1.5390814376633413e-06, + "loss": 0.7635479, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 9548, + "time_per_iteration": 2.3940467834472656 + }, + { + "auxiliary_loss_clip": 0.0106032, + "auxiliary_loss_mlp": 0.01025149, + "balance_loss_clip": 1.01267374, + "balance_loss_mlp": 1.01846552, + "epoch": 0.5741169397264392, + "flos": 22126729011840.0, + "grad_norm": 2.709257308869778, + "language_loss": 0.69366044, + "learning_rate": 1.538713793795343e-06, + "loss": 0.71451521, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41796875, + "step": 9549, + "time_per_iteration": 3.893972873687744 + }, + { + "auxiliary_loss_clip": 0.01062101, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.01416779, + "balance_loss_mlp": 1.02139759, + "epoch": 0.5741770629791072, + "flos": 24935666181120.0, + "grad_norm": 1.502435832181573, + "language_loss": 0.79665655, + "learning_rate": 1.5383461663894623e-06, + "loss": 0.81754005, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40625, + "step": 9550, + "time_per_iteration": 2.429617404937744 + }, + { + "auxiliary_loss_clip": 0.01061715, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.01413977, + "balance_loss_mlp": 1.01934886, + "epoch": 0.5742371862317751, + "flos": 18800623739520.0, + "grad_norm": 2.199011709428841, + "language_loss": 0.67769194, + "learning_rate": 1.53797855545882e-06, + "loss": 0.69857931, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42382812, + "step": 9551, + "time_per_iteration": 2.3876519203186035 + }, + { + "auxiliary_loss_clip": 0.01061759, + "auxiliary_loss_mlp": 0.01024801, + "balance_loss_clip": 1.01211119, + "balance_loss_mlp": 1.01976049, + "epoch": 0.5742973094844431, + "flos": 24459555703680.0, + "grad_norm": 1.74550533099847, + "language_loss": 0.72298044, + "learning_rate": 1.537610961016534e-06, + "loss": 0.74384606, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41992188, + "step": 9552, + "time_per_iteration": 2.4233500957489014 + }, + { + "auxiliary_loss_clip": 0.01061888, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.01593721, + "balance_loss_mlp": 1.01985693, + "epoch": 0.574357432737111, + "flos": 21651700786560.0, + "grad_norm": 1.8314199318422328, + "language_loss": 0.79994619, + "learning_rate": 1.537243383075724e-06, + "loss": 0.82084984, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41992188, + "step": 9553, + "time_per_iteration": 2.4146013259887695 + }, + { + "auxiliary_loss_clip": 0.01063626, + "auxiliary_loss_mlp": 0.01024675, + "balance_loss_clip": 1.01237893, + "balance_loss_mlp": 1.02023661, + "epoch": 0.5744175559897791, + "flos": 16543802810880.0, + "grad_norm": 2.207949528889844, + "language_loss": 0.83412671, + "learning_rate": 1.536875821649507e-06, + "loss": 0.85500968, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.43359375, + "step": 9554, + "time_per_iteration": 2.3630051612854004 + }, + { + "auxiliary_loss_clip": 0.01065574, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.01712894, + "balance_loss_mlp": 1.02096462, + "epoch": 0.574477679242447, + "flos": 24206435280000.0, + "grad_norm": 1.4037120688524998, + "language_loss": 0.70717633, + "learning_rate": 1.536508276751001e-06, + "loss": 0.72813612, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.44726562, + "step": 9555, + "time_per_iteration": 2.4727134704589844 + }, + { + "auxiliary_loss_clip": 0.01058558, + "auxiliary_loss_mlp": 0.01024161, + "balance_loss_clip": 1.01120269, + "balance_loss_mlp": 1.01826572, + "epoch": 0.574537802495115, + "flos": 14902116860160.0, + "grad_norm": 2.1818309854048343, + "language_loss": 0.72613811, + "learning_rate": 1.5361407483933223e-06, + "loss": 0.74696529, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40234375, + "step": 9556, + "time_per_iteration": 2.4388022422790527 + }, + { + "auxiliary_loss_clip": 0.01061864, + "auxiliary_loss_mlp": 0.01025274, + "balance_loss_clip": 1.01303172, + "balance_loss_mlp": 1.02047968, + "epoch": 0.5745979257477829, + "flos": 24933850790400.0, + "grad_norm": 1.579291798881979, + "language_loss": 0.742755, + "learning_rate": 1.5357732365895863e-06, + "loss": 0.7636264, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.4140625, + "step": 9557, + "time_per_iteration": 2.431654214859009 + }, + { + "auxiliary_loss_clip": 0.01060784, + "auxiliary_loss_mlp": 0.01028146, + "balance_loss_clip": 1.01496184, + "balance_loss_mlp": 1.01914179, + "epoch": 0.5746580490004509, + "flos": 17234873729280.0, + "grad_norm": 1.864505735436066, + "language_loss": 0.67749566, + "learning_rate": 1.5354057413529103e-06, + "loss": 0.69838488, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41796875, + "step": 9558, + "time_per_iteration": 2.3760123252868652 + }, + { + "auxiliary_loss_clip": 0.01066559, + "auxiliary_loss_mlp": 0.01033617, + "balance_loss_clip": 1.0189662, + "balance_loss_mlp": 1.02087343, + "epoch": 0.5747181722531189, + "flos": 13187043498240.0, + "grad_norm": 1.9525017506402174, + "language_loss": 0.70790005, + "learning_rate": 1.5350382626964076e-06, + "loss": 0.72890174, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.45703125, + "step": 9559, + "time_per_iteration": 2.3838961124420166 + }, + { + "auxiliary_loss_clip": 0.01060353, + "auxiliary_loss_mlp": 0.01022803, + "balance_loss_clip": 1.01206255, + "balance_loss_mlp": 1.01996934, + "epoch": 0.5747782955057869, + "flos": 22961991312000.0, + "grad_norm": 1.5349266849878083, + "language_loss": 0.78937787, + "learning_rate": 1.5346708006331936e-06, + "loss": 0.81020939, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.40429688, + "step": 9560, + "time_per_iteration": 2.411297082901001 + }, + { + "auxiliary_loss_clip": 0.01062011, + "auxiliary_loss_mlp": 0.01025513, + "balance_loss_clip": 1.01318669, + "balance_loss_mlp": 1.02035534, + "epoch": 0.5748384187584549, + "flos": 23402141222400.0, + "grad_norm": 1.6891595257983574, + "language_loss": 0.64161003, + "learning_rate": 1.534303355176382e-06, + "loss": 0.66248524, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.41601562, + "step": 9561, + "time_per_iteration": 2.4751412868499756 + }, + { + "auxiliary_loss_clip": 0.01066977, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.01383042, + "balance_loss_mlp": 1.02285182, + "epoch": 0.5748985420111228, + "flos": 17784546174720.0, + "grad_norm": 1.5180136247865517, + "language_loss": 0.71439087, + "learning_rate": 1.5339359263390852e-06, + "loss": 0.73533797, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44140625, + "step": 9562, + "time_per_iteration": 2.397484540939331 + }, + { + "auxiliary_loss_clip": 0.01061156, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01865196, + "balance_loss_mlp": 1.02042437, + "epoch": 0.5749586652637908, + "flos": 19865195049600.0, + "grad_norm": 1.6253862720028753, + "language_loss": 0.84046251, + "learning_rate": 1.5335685141344169e-06, + "loss": 0.8613714, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.40820312, + "step": 9563, + "time_per_iteration": 2.3851816654205322 + }, + { + "auxiliary_loss_clip": 0.01063714, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01590526, + "balance_loss_mlp": 1.01985073, + "epoch": 0.5750187885164587, + "flos": 21286195096320.0, + "grad_norm": 1.94477656256362, + "language_loss": 0.5731616, + "learning_rate": 1.5332011185754878e-06, + "loss": 0.59408969, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.43945312, + "step": 9564, + "time_per_iteration": 2.4155657291412354 + }, + { + "auxiliary_loss_clip": 0.01058322, + "auxiliary_loss_mlp": 0.01022156, + "balance_loss_clip": 1.01045537, + "balance_loss_mlp": 1.0192672, + "epoch": 0.5750789117691267, + "flos": 18803730850560.0, + "grad_norm": 3.1382307472604314, + "language_loss": 0.76212943, + "learning_rate": 1.5328337396754108e-06, + "loss": 0.78293425, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 9565, + "time_per_iteration": 2.3723363876342773 + }, + { + "auxiliary_loss_clip": 0.01009591, + "auxiliary_loss_mlp": 0.0100229, + "balance_loss_clip": 1.00149155, + "balance_loss_mlp": 1.00200069, + "epoch": 0.5751390350217946, + "flos": 70659490780800.0, + "grad_norm": 0.7388225413700475, + "language_loss": 0.60757899, + "learning_rate": 1.5324663774472955e-06, + "loss": 0.62769777, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.00799561, + "router_z_loss_mlp": 0.07617188, + "step": 9566, + "time_per_iteration": 3.1416540145874023 + }, + { + "auxiliary_loss_clip": 0.01060672, + "auxiliary_loss_mlp": 0.01026047, + "balance_loss_clip": 1.01401877, + "balance_loss_mlp": 1.02098513, + "epoch": 0.5751991582744627, + "flos": 14245470408960.0, + "grad_norm": 1.8266216044113237, + "language_loss": 0.75926125, + "learning_rate": 1.5320990319042525e-06, + "loss": 0.78012848, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39648438, + "step": 9567, + "time_per_iteration": 2.4155819416046143 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01023051, + "balance_loss_clip": 1.01194096, + "balance_loss_mlp": 1.01967108, + "epoch": 0.5752592815271306, + "flos": 18327306170880.0, + "grad_norm": 10.58605708026799, + "language_loss": 0.74921149, + "learning_rate": 1.5317317030593916e-06, + "loss": 0.77005219, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.4140625, + "step": 9568, + "time_per_iteration": 2.4034597873687744 + }, + { + "auxiliary_loss_clip": 0.01062207, + "auxiliary_loss_mlp": 0.01025803, + "balance_loss_clip": 1.01285708, + "balance_loss_mlp": 1.01976359, + "epoch": 0.5753194047797986, + "flos": 20921701835520.0, + "grad_norm": 1.5630596019829035, + "language_loss": 0.70948386, + "learning_rate": 1.5313643909258217e-06, + "loss": 0.73036402, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42578125, + "step": 9569, + "time_per_iteration": 2.423276901245117 + }, + { + "auxiliary_loss_clip": 0.01062007, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.01411152, + "balance_loss_mlp": 1.01979518, + "epoch": 0.5753795280324665, + "flos": 19280783934720.0, + "grad_norm": 2.8018326189884903, + "language_loss": 0.55420625, + "learning_rate": 1.5309970955166515e-06, + "loss": 0.57510233, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.421875, + "step": 9570, + "time_per_iteration": 2.3963897228240967 + }, + { + "auxiliary_loss_clip": 0.01058601, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01609898, + "balance_loss_mlp": 1.01869357, + "epoch": 0.5754396512851345, + "flos": 21651805520640.0, + "grad_norm": 1.801198350847879, + "language_loss": 0.6433382, + "learning_rate": 1.5306298168449888e-06, + "loss": 0.66420209, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3984375, + "step": 9571, + "time_per_iteration": 3.8654258251190186 + }, + { + "auxiliary_loss_clip": 0.01061053, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.01552594, + "balance_loss_mlp": 1.01930535, + "epoch": 0.5754997745378025, + "flos": 51019871091840.0, + "grad_norm": 2.0372837929949053, + "language_loss": 0.66876072, + "learning_rate": 1.5302625549239396e-06, + "loss": 0.68965465, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41796875, + "step": 9572, + "time_per_iteration": 2.6587109565734863 + }, + { + "auxiliary_loss_clip": 0.01061391, + "auxiliary_loss_mlp": 0.01025602, + "balance_loss_clip": 1.01313269, + "balance_loss_mlp": 1.02027965, + "epoch": 0.5755598977904705, + "flos": 22855785356160.0, + "grad_norm": 1.6681026096498717, + "language_loss": 0.7175104, + "learning_rate": 1.529895309766612e-06, + "loss": 0.73838031, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41210938, + "step": 9573, + "time_per_iteration": 2.496636152267456 + }, + { + "auxiliary_loss_clip": 0.01060368, + "auxiliary_loss_mlp": 0.01023537, + "balance_loss_clip": 1.01332712, + "balance_loss_mlp": 1.02209592, + "epoch": 0.5756200210431385, + "flos": 38471283360000.0, + "grad_norm": 1.659241986956223, + "language_loss": 0.7622034, + "learning_rate": 1.5295280813861111e-06, + "loss": 0.78304243, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.3828125, + "step": 9574, + "time_per_iteration": 2.589470863342285 + }, + { + "auxiliary_loss_clip": 0.01063259, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.01463914, + "balance_loss_mlp": 1.01952457, + "epoch": 0.5756801442958064, + "flos": 23909010474240.0, + "grad_norm": 2.3690740301863333, + "language_loss": 0.66460443, + "learning_rate": 1.5291608697955434e-06, + "loss": 0.68551785, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 9575, + "time_per_iteration": 2.425971031188965 + }, + { + "auxiliary_loss_clip": 0.01059035, + "auxiliary_loss_mlp": 0.01022307, + "balance_loss_clip": 1.01090467, + "balance_loss_mlp": 1.02009773, + "epoch": 0.5757402675484744, + "flos": 21104227756800.0, + "grad_norm": 1.5869801998598094, + "language_loss": 0.79766166, + "learning_rate": 1.528793675008013e-06, + "loss": 0.81847513, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 9576, + "time_per_iteration": 2.4193389415740967 + }, + { + "auxiliary_loss_clip": 0.01059087, + "auxiliary_loss_mlp": 0.01025948, + "balance_loss_clip": 1.01446795, + "balance_loss_mlp": 1.01936889, + "epoch": 0.5758003908011423, + "flos": 20558046447360.0, + "grad_norm": 1.6126675850264873, + "language_loss": 0.695117, + "learning_rate": 1.528426497036624e-06, + "loss": 0.7159673, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39648438, + "step": 9577, + "time_per_iteration": 2.432541847229004 + }, + { + "auxiliary_loss_clip": 0.0106131, + "auxiliary_loss_mlp": 0.01022369, + "balance_loss_clip": 1.01057947, + "balance_loss_mlp": 1.01993978, + "epoch": 0.5758605140538103, + "flos": 16472056233600.0, + "grad_norm": 1.5712154026486305, + "language_loss": 0.66956192, + "learning_rate": 1.5280593358944804e-06, + "loss": 0.69039869, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.4140625, + "step": 9578, + "time_per_iteration": 2.3866868019104004 + }, + { + "auxiliary_loss_clip": 0.01059692, + "auxiliary_loss_mlp": 0.01024948, + "balance_loss_clip": 1.01401699, + "balance_loss_mlp": 1.01987636, + "epoch": 0.5759206373064782, + "flos": 21286509298560.0, + "grad_norm": 1.5873640580057653, + "language_loss": 0.72725141, + "learning_rate": 1.527692191594685e-06, + "loss": 0.74809778, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3984375, + "step": 9579, + "time_per_iteration": 2.4236674308776855 + }, + { + "auxiliary_loss_clip": 0.01064385, + "auxiliary_loss_mlp": 0.01021451, + "balance_loss_clip": 1.00774193, + "balance_loss_mlp": 1.02014613, + "epoch": 0.5759807605591463, + "flos": 26066677541760.0, + "grad_norm": 6.23826201649372, + "language_loss": 0.64679891, + "learning_rate": 1.5273250641503406e-06, + "loss": 0.66765726, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44335938, + "step": 9580, + "time_per_iteration": 2.459584951400757 + }, + { + "auxiliary_loss_clip": 0.01061169, + "auxiliary_loss_mlp": 0.01024777, + "balance_loss_clip": 1.01266515, + "balance_loss_mlp": 1.01936483, + "epoch": 0.5760408838118142, + "flos": 18872265582720.0, + "grad_norm": 3.584916505328038, + "language_loss": 0.85639971, + "learning_rate": 1.5269579535745486e-06, + "loss": 0.87725919, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41796875, + "step": 9581, + "time_per_iteration": 2.4334568977355957 + }, + { + "auxiliary_loss_clip": 0.01066333, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.01488101, + "balance_loss_mlp": 1.02319813, + "epoch": 0.5761010070644822, + "flos": 15377214908160.0, + "grad_norm": 3.082756648038508, + "language_loss": 0.67022502, + "learning_rate": 1.5265908598804104e-06, + "loss": 0.69116533, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43164062, + "step": 9582, + "time_per_iteration": 2.4232571125030518 + }, + { + "auxiliary_loss_clip": 0.01009495, + "auxiliary_loss_mlp": 0.01001565, + "balance_loss_clip": 1.00062335, + "balance_loss_mlp": 1.00203061, + "epoch": 0.5761611303171501, + "flos": 71468009112960.0, + "grad_norm": 0.6387000392283178, + "language_loss": 0.57191992, + "learning_rate": 1.526223783081027e-06, + "loss": 0.59203053, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.07470703, + "step": 9583, + "time_per_iteration": 4.7041051387786865 + }, + { + "auxiliary_loss_clip": 0.01062417, + "auxiliary_loss_mlp": 0.01026179, + "balance_loss_clip": 1.01422238, + "balance_loss_mlp": 1.02032423, + "epoch": 0.5762212535698181, + "flos": 16245435398400.0, + "grad_norm": 1.8746082302204183, + "language_loss": 0.81980872, + "learning_rate": 1.5258567231894977e-06, + "loss": 0.84069467, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.421875, + "step": 9584, + "time_per_iteration": 2.5367884635925293 + }, + { + "auxiliary_loss_clip": 0.01009338, + "auxiliary_loss_mlp": 0.0100292, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00167572, + "epoch": 0.5762813768224861, + "flos": 70181879114880.0, + "grad_norm": 0.6223073163342362, + "language_loss": 0.49541253, + "learning_rate": 1.525489680218923e-06, + "loss": 0.51553512, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.07666016, + "step": 9585, + "time_per_iteration": 3.189716100692749 + }, + { + "auxiliary_loss_clip": 0.01058346, + "auxiliary_loss_mlp": 0.01021756, + "balance_loss_clip": 1.01035404, + "balance_loss_mlp": 1.01934195, + "epoch": 0.5763415000751541, + "flos": 20517093555840.0, + "grad_norm": 1.4960349630561196, + "language_loss": 0.8478874, + "learning_rate": 1.5251226541824003e-06, + "loss": 0.86868834, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 9586, + "time_per_iteration": 2.4454505443573 + }, + { + "auxiliary_loss_clip": 0.01060618, + "auxiliary_loss_mlp": 0.01022185, + "balance_loss_clip": 1.00932217, + "balance_loss_mlp": 1.02033365, + "epoch": 0.5764016233278221, + "flos": 15814606821120.0, + "grad_norm": 1.764282965232711, + "language_loss": 0.80207193, + "learning_rate": 1.5247556450930287e-06, + "loss": 0.8229, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 9587, + "time_per_iteration": 3.853822946548462 + }, + { + "auxiliary_loss_clip": 0.01060129, + "auxiliary_loss_mlp": 0.01022584, + "balance_loss_clip": 1.01012063, + "balance_loss_mlp": 1.01958513, + "epoch": 0.57646174658049, + "flos": 20771400965760.0, + "grad_norm": 1.7615690437450517, + "language_loss": 0.65539634, + "learning_rate": 1.524388652963906e-06, + "loss": 0.6762234, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40429688, + "step": 9588, + "time_per_iteration": 2.414405345916748 + }, + { + "auxiliary_loss_clip": 0.01061487, + "auxiliary_loss_mlp": 0.01027781, + "balance_loss_clip": 1.01395905, + "balance_loss_mlp": 1.01968479, + "epoch": 0.576521869833158, + "flos": 23548811310720.0, + "grad_norm": 1.8004722884439492, + "language_loss": 0.78998518, + "learning_rate": 1.5240216778081282e-06, + "loss": 0.81087792, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.41796875, + "step": 9589, + "time_per_iteration": 3.868642568588257 + }, + { + "auxiliary_loss_clip": 0.01058019, + "auxiliary_loss_mlp": 0.01020669, + "balance_loss_clip": 1.00913596, + "balance_loss_mlp": 1.01872075, + "epoch": 0.5765819930858259, + "flos": 20265544143360.0, + "grad_norm": 1.958498741626051, + "language_loss": 0.79831612, + "learning_rate": 1.523654719638793e-06, + "loss": 0.819103, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39453125, + "step": 9590, + "time_per_iteration": 2.404857635498047 + }, + { + "auxiliary_loss_clip": 0.01059857, + "auxiliary_loss_mlp": 0.01020599, + "balance_loss_clip": 1.00959635, + "balance_loss_mlp": 1.02034688, + "epoch": 0.5766421163384939, + "flos": 23147659255680.0, + "grad_norm": 1.9204098638800546, + "language_loss": 0.6337111, + "learning_rate": 1.5232877784689947e-06, + "loss": 0.65451562, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.39453125, + "step": 9591, + "time_per_iteration": 2.534141778945923 + }, + { + "auxiliary_loss_clip": 0.01060961, + "auxiliary_loss_mlp": 0.01027918, + "balance_loss_clip": 1.01559782, + "balance_loss_mlp": 1.02046359, + "epoch": 0.5767022395911618, + "flos": 25847702294400.0, + "grad_norm": 1.5733088910656943, + "language_loss": 0.77897984, + "learning_rate": 1.5229208543118302e-06, + "loss": 0.79986864, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40429688, + "step": 9592, + "time_per_iteration": 2.433811902999878 + }, + { + "auxiliary_loss_clip": 0.01008844, + "auxiliary_loss_mlp": 0.01004263, + "balance_loss_clip": 1.00333321, + "balance_loss_mlp": 1.0012486, + "epoch": 0.5767623628438299, + "flos": 68289586358400.0, + "grad_norm": 0.7305274258068215, + "language_loss": 0.59611869, + "learning_rate": 1.522553947180393e-06, + "loss": 0.61624968, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.07617188, + "step": 9593, + "time_per_iteration": 3.1741561889648438 + }, + { + "auxiliary_loss_clip": 0.01061784, + "auxiliary_loss_mlp": 0.01025443, + "balance_loss_clip": 1.01248479, + "balance_loss_mlp": 1.02017748, + "epoch": 0.5768224860964978, + "flos": 30187196956800.0, + "grad_norm": 1.62544672958285, + "language_loss": 0.70848858, + "learning_rate": 1.5221870570877771e-06, + "loss": 0.72936088, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41601562, + "step": 9594, + "time_per_iteration": 2.4850573539733887 + }, + { + "auxiliary_loss_clip": 0.01059669, + "auxiliary_loss_mlp": 0.01023654, + "balance_loss_clip": 1.01183403, + "balance_loss_mlp": 1.01886427, + "epoch": 0.5768826093491658, + "flos": 17894068709760.0, + "grad_norm": 2.2797384317079534, + "language_loss": 0.7077105, + "learning_rate": 1.5218201840470761e-06, + "loss": 0.72854376, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40820312, + "step": 9595, + "time_per_iteration": 2.4163315296173096 + }, + { + "auxiliary_loss_clip": 0.01060416, + "auxiliary_loss_mlp": 0.01022553, + "balance_loss_clip": 1.01020348, + "balance_loss_mlp": 1.01968884, + "epoch": 0.5769427326018337, + "flos": 17456222949120.0, + "grad_norm": 1.885482270442243, + "language_loss": 0.69570661, + "learning_rate": 1.5214533280713827e-06, + "loss": 0.71653628, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40820312, + "step": 9596, + "time_per_iteration": 2.375095844268799 + }, + { + "auxiliary_loss_clip": 0.01063183, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.01397896, + "balance_loss_mlp": 1.02035236, + "epoch": 0.5770028558545017, + "flos": 39420152824320.0, + "grad_norm": 1.8269492031566048, + "language_loss": 0.68046516, + "learning_rate": 1.521086489173789e-06, + "loss": 0.70137078, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42773438, + "step": 9597, + "time_per_iteration": 2.5872349739074707 + }, + { + "auxiliary_loss_clip": 0.01061318, + "auxiliary_loss_mlp": 0.0102584, + "balance_loss_clip": 1.01332295, + "balance_loss_mlp": 1.02045369, + "epoch": 0.5770629791071697, + "flos": 21535510181760.0, + "grad_norm": 1.8123421551742787, + "language_loss": 0.81878388, + "learning_rate": 1.520719667367387e-06, + "loss": 0.83965552, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 9598, + "time_per_iteration": 2.4670310020446777 + }, + { + "auxiliary_loss_clip": 0.01061992, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.0132246, + "balance_loss_mlp": 1.01995015, + "epoch": 0.5771231023598377, + "flos": 20885741268480.0, + "grad_norm": 1.4931158427896822, + "language_loss": 0.73016912, + "learning_rate": 1.5203528626652666e-06, + "loss": 0.75105232, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41992188, + "step": 9599, + "time_per_iteration": 2.4625515937805176 + }, + { + "auxiliary_loss_clip": 0.01057403, + "auxiliary_loss_mlp": 0.01024245, + "balance_loss_clip": 1.01275277, + "balance_loss_mlp": 1.01888323, + "epoch": 0.5771832256125057, + "flos": 18076245517440.0, + "grad_norm": 1.7860441616108356, + "language_loss": 0.78264546, + "learning_rate": 1.5199860750805196e-06, + "loss": 0.80346197, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38476562, + "step": 9600, + "time_per_iteration": 2.381479263305664 + }, + { + "auxiliary_loss_clip": 0.01061961, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.01531482, + "balance_loss_mlp": 1.02134776, + "epoch": 0.5772433488651736, + "flos": 26357888125440.0, + "grad_norm": 1.6706032501881223, + "language_loss": 0.72268569, + "learning_rate": 1.519619304626234e-06, + "loss": 0.7435894, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 9601, + "time_per_iteration": 2.502200126647949 + }, + { + "auxiliary_loss_clip": 0.01059079, + "auxiliary_loss_mlp": 0.01025526, + "balance_loss_clip": 1.01376593, + "balance_loss_mlp": 1.0195353, + "epoch": 0.5773034721178416, + "flos": 19680015864960.0, + "grad_norm": 1.5936787406207382, + "language_loss": 0.68671793, + "learning_rate": 1.5192525513155e-06, + "loss": 0.707564, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 9602, + "time_per_iteration": 2.4152004718780518 + }, + { + "auxiliary_loss_clip": 0.01057781, + "auxiliary_loss_mlp": 0.01025612, + "balance_loss_clip": 1.01455557, + "balance_loss_mlp": 1.02015054, + "epoch": 0.5773635953705095, + "flos": 25081707864960.0, + "grad_norm": 1.546206499166946, + "language_loss": 0.73157161, + "learning_rate": 1.5188858151614056e-06, + "loss": 0.75240552, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 9603, + "time_per_iteration": 2.5158936977386475 + }, + { + "auxiliary_loss_clip": 0.01058118, + "auxiliary_loss_mlp": 0.01022936, + "balance_loss_clip": 1.01091361, + "balance_loss_mlp": 1.01899314, + "epoch": 0.5774237186231775, + "flos": 21031922597760.0, + "grad_norm": 1.6323485589178552, + "language_loss": 0.76554257, + "learning_rate": 1.5185190961770394e-06, + "loss": 0.78635311, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 9604, + "time_per_iteration": 2.4243433475494385 + }, + { + "auxiliary_loss_clip": 0.01060271, + "auxiliary_loss_mlp": 0.0102333, + "balance_loss_clip": 1.01032472, + "balance_loss_mlp": 1.0195899, + "epoch": 0.5774838418758454, + "flos": 15230824110720.0, + "grad_norm": 1.6048373024063454, + "language_loss": 0.83486992, + "learning_rate": 1.5181523943754878e-06, + "loss": 0.85570598, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 9605, + "time_per_iteration": 2.4316108226776123 + }, + { + "auxiliary_loss_clip": 0.010633, + "auxiliary_loss_mlp": 0.01025713, + "balance_loss_clip": 1.01092482, + "balance_loss_mlp": 1.02009165, + "epoch": 0.5775439651285135, + "flos": 23581594944000.0, + "grad_norm": 1.6512910346675231, + "language_loss": 0.78150678, + "learning_rate": 1.5177857097698378e-06, + "loss": 0.80239683, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.43164062, + "step": 9606, + "time_per_iteration": 2.421304941177368 + }, + { + "auxiliary_loss_clip": 0.01058743, + "auxiliary_loss_mlp": 0.01025428, + "balance_loss_clip": 1.01400208, + "balance_loss_mlp": 1.01945138, + "epoch": 0.5776040883811814, + "flos": 18039551811840.0, + "grad_norm": 1.6582878817335418, + "language_loss": 0.74089545, + "learning_rate": 1.5174190423731755e-06, + "loss": 0.76173717, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39257812, + "step": 9607, + "time_per_iteration": 2.4236936569213867 + }, + { + "auxiliary_loss_clip": 0.01061596, + "auxiliary_loss_mlp": 0.01022143, + "balance_loss_clip": 1.01041305, + "balance_loss_mlp": 1.0202713, + "epoch": 0.5776642116338494, + "flos": 18623648724480.0, + "grad_norm": 1.5579671664951844, + "language_loss": 0.69287133, + "learning_rate": 1.517052392198586e-06, + "loss": 0.71370876, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.4140625, + "step": 9608, + "time_per_iteration": 2.3807661533355713 + }, + { + "auxiliary_loss_clip": 0.01060426, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.01620579, + "balance_loss_mlp": 1.01952124, + "epoch": 0.5777243348865173, + "flos": 28401284712960.0, + "grad_norm": 2.073253994167638, + "language_loss": 0.76637185, + "learning_rate": 1.5166857592591547e-06, + "loss": 0.7872653, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 9609, + "time_per_iteration": 2.4792561531066895 + }, + { + "auxiliary_loss_clip": 0.01060738, + "auxiliary_loss_mlp": 0.01025709, + "balance_loss_clip": 1.01405025, + "balance_loss_mlp": 1.01991904, + "epoch": 0.5777844581391853, + "flos": 24023560245120.0, + "grad_norm": 1.6344623513524301, + "language_loss": 0.77270806, + "learning_rate": 1.5163191435679651e-06, + "loss": 0.79357255, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40820312, + "step": 9610, + "time_per_iteration": 2.4107604026794434 + }, + { + "auxiliary_loss_clip": 0.01058474, + "auxiliary_loss_mlp": 0.01025777, + "balance_loss_clip": 1.01324248, + "balance_loss_mlp": 1.01904368, + "epoch": 0.5778445813918534, + "flos": 17776621296000.0, + "grad_norm": 1.9064667260196455, + "language_loss": 0.68736982, + "learning_rate": 1.5159525451381012e-06, + "loss": 0.70821232, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 9611, + "time_per_iteration": 3.86418080329895 + }, + { + "auxiliary_loss_clip": 0.01058515, + "auxiliary_loss_mlp": 0.01023638, + "balance_loss_clip": 1.0119133, + "balance_loss_mlp": 1.0187422, + "epoch": 0.5779047046445213, + "flos": 22232201829120.0, + "grad_norm": 2.950715828243936, + "language_loss": 0.83227611, + "learning_rate": 1.515585963982646e-06, + "loss": 0.85309756, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3984375, + "step": 9612, + "time_per_iteration": 2.44356369972229 + }, + { + "auxiliary_loss_clip": 0.01059699, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.01491952, + "balance_loss_mlp": 1.01993632, + "epoch": 0.5779648278971893, + "flos": 21433284120960.0, + "grad_norm": 2.005987079922689, + "language_loss": 0.79544473, + "learning_rate": 1.5152194001146813e-06, + "loss": 0.81631607, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3984375, + "step": 9613, + "time_per_iteration": 2.4318718910217285 + }, + { + "auxiliary_loss_clip": 0.01057527, + "auxiliary_loss_mlp": 0.01023095, + "balance_loss_clip": 1.01217556, + "balance_loss_mlp": 1.01860762, + "epoch": 0.5780249511498572, + "flos": 19025114981760.0, + "grad_norm": 1.8788114502321285, + "language_loss": 0.77427047, + "learning_rate": 1.5148528535472894e-06, + "loss": 0.79507673, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38867188, + "step": 9614, + "time_per_iteration": 2.380732774734497 + }, + { + "auxiliary_loss_clip": 0.01060639, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.0143106, + "balance_loss_mlp": 1.01888323, + "epoch": 0.5780850744025252, + "flos": 12124008288000.0, + "grad_norm": 4.228935815090382, + "language_loss": 0.81045461, + "learning_rate": 1.514486324293552e-06, + "loss": 0.83132648, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41601562, + "step": 9615, + "time_per_iteration": 2.376643419265747 + }, + { + "auxiliary_loss_clip": 0.01058544, + "auxiliary_loss_mlp": 0.01025056, + "balance_loss_clip": 1.01337349, + "balance_loss_mlp": 1.01835585, + "epoch": 0.5781451976551931, + "flos": 25043303502720.0, + "grad_norm": 5.948832425659251, + "language_loss": 0.66896486, + "learning_rate": 1.5141198123665477e-06, + "loss": 0.68980086, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40234375, + "step": 9616, + "time_per_iteration": 2.419785499572754 + }, + { + "auxiliary_loss_clip": 0.01057811, + "auxiliary_loss_mlp": 0.01024402, + "balance_loss_clip": 1.01226091, + "balance_loss_mlp": 1.0184412, + "epoch": 0.5782053209078611, + "flos": 19244578988160.0, + "grad_norm": 1.6251847987790355, + "language_loss": 0.77767479, + "learning_rate": 1.513753317779358e-06, + "loss": 0.7984969, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39453125, + "step": 9617, + "time_per_iteration": 2.3913450241088867 + }, + { + "auxiliary_loss_clip": 0.0105694, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.01271629, + "balance_loss_mlp": 1.01828599, + "epoch": 0.578265444160529, + "flos": 25992661726080.0, + "grad_norm": 1.4275184758598483, + "language_loss": 0.74963152, + "learning_rate": 1.5133868405450611e-06, + "loss": 0.77044964, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38671875, + "step": 9618, + "time_per_iteration": 2.523974657058716 + }, + { + "auxiliary_loss_clip": 0.01057865, + "auxiliary_loss_mlp": 0.01025455, + "balance_loss_clip": 1.01419532, + "balance_loss_mlp": 1.01923537, + "epoch": 0.5783255674131971, + "flos": 21797533002240.0, + "grad_norm": 15.60264858306768, + "language_loss": 0.81247699, + "learning_rate": 1.5130203806767367e-06, + "loss": 0.83331019, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38671875, + "step": 9619, + "time_per_iteration": 2.4109480381011963 + }, + { + "auxiliary_loss_clip": 0.01057671, + "auxiliary_loss_mlp": 0.01025326, + "balance_loss_clip": 1.01402521, + "balance_loss_mlp": 1.01871288, + "epoch": 0.578385690665865, + "flos": 24788612067840.0, + "grad_norm": 1.7478357204798567, + "language_loss": 0.72831774, + "learning_rate": 1.512653938187462e-06, + "loss": 0.74914777, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.390625, + "step": 9620, + "time_per_iteration": 2.4249634742736816 + }, + { + "auxiliary_loss_clip": 0.01058589, + "auxiliary_loss_mlp": 0.01023542, + "balance_loss_clip": 1.01175797, + "balance_loss_mlp": 1.0191803, + "epoch": 0.578445813918533, + "flos": 21211865078400.0, + "grad_norm": 1.5539057121881457, + "language_loss": 0.7684347, + "learning_rate": 1.5122875130903147e-06, + "loss": 0.78925598, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 9621, + "time_per_iteration": 2.4265270233154297 + }, + { + "auxiliary_loss_clip": 0.0105995, + "auxiliary_loss_mlp": 0.01023563, + "balance_loss_clip": 1.0113498, + "balance_loss_mlp": 1.01913202, + "epoch": 0.5785059371712009, + "flos": 25045607652480.0, + "grad_norm": 1.50769119795964, + "language_loss": 0.70212901, + "learning_rate": 1.5119211053983715e-06, + "loss": 0.72296417, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40820312, + "step": 9622, + "time_per_iteration": 3.9062793254852295 + }, + { + "auxiliary_loss_clip": 0.01008611, + "auxiliary_loss_mlp": 0.01001819, + "balance_loss_clip": 1.00086522, + "balance_loss_mlp": 1.00087762, + "epoch": 0.5785660604238689, + "flos": 70839503084160.0, + "grad_norm": 0.7701342361687432, + "language_loss": 0.55953985, + "learning_rate": 1.5115547151247082e-06, + "loss": 0.5796442, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.07714844, + "step": 9623, + "time_per_iteration": 3.0946569442749023 + }, + { + "auxiliary_loss_clip": 0.0106112, + "auxiliary_loss_mlp": 0.01030476, + "balance_loss_clip": 1.01737475, + "balance_loss_mlp": 1.01944351, + "epoch": 0.578626183676537, + "flos": 31648626224640.0, + "grad_norm": 1.7644604724513504, + "language_loss": 0.70909107, + "learning_rate": 1.5111883422824013e-06, + "loss": 0.73000699, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 9624, + "time_per_iteration": 2.492050886154175 + }, + { + "auxiliary_loss_clip": 0.0106009, + "auxiliary_loss_mlp": 0.01028274, + "balance_loss_clip": 1.01642454, + "balance_loss_mlp": 1.02016234, + "epoch": 0.5786863069292049, + "flos": 21864287255040.0, + "grad_norm": 1.8113637129495632, + "language_loss": 0.82058656, + "learning_rate": 1.5108219868845247e-06, + "loss": 0.84147024, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3984375, + "step": 9625, + "time_per_iteration": 2.4253287315368652 + }, + { + "auxiliary_loss_clip": 0.01056425, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.0140022, + "balance_loss_mlp": 1.01816761, + "epoch": 0.5787464301818729, + "flos": 23363911416960.0, + "grad_norm": 1.4382545695121336, + "language_loss": 0.70132452, + "learning_rate": 1.5104556489441534e-06, + "loss": 0.72216094, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.3828125, + "step": 9626, + "time_per_iteration": 3.9100050926208496 + }, + { + "auxiliary_loss_clip": 0.01058073, + "auxiliary_loss_mlp": 0.01022866, + "balance_loss_clip": 1.01165462, + "balance_loss_mlp": 1.01920152, + "epoch": 0.5788065534345408, + "flos": 30002820733440.0, + "grad_norm": 1.4143670693046915, + "language_loss": 0.7136991, + "learning_rate": 1.5100893284743605e-06, + "loss": 0.73450851, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38867188, + "step": 9627, + "time_per_iteration": 2.510380268096924 + }, + { + "auxiliary_loss_clip": 0.01059058, + "auxiliary_loss_mlp": 0.01022754, + "balance_loss_clip": 1.0115186, + "balance_loss_mlp": 1.01946092, + "epoch": 0.5788666766872088, + "flos": 24526903449600.0, + "grad_norm": 1.535798121931199, + "language_loss": 0.83631402, + "learning_rate": 1.509723025488219e-06, + "loss": 0.8571322, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.39648438, + "step": 9628, + "time_per_iteration": 2.4946658611297607 + }, + { + "auxiliary_loss_clip": 0.01058975, + "auxiliary_loss_mlp": 0.01026381, + "balance_loss_clip": 1.01454389, + "balance_loss_mlp": 1.01848316, + "epoch": 0.5789267999398767, + "flos": 23731686345600.0, + "grad_norm": 1.5732291770206288, + "language_loss": 0.72399658, + "learning_rate": 1.5093567399988022e-06, + "loss": 0.74485016, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40429688, + "step": 9629, + "time_per_iteration": 3.8698086738586426 + }, + { + "auxiliary_loss_clip": 0.01060326, + "auxiliary_loss_mlp": 0.01023483, + "balance_loss_clip": 1.0109601, + "balance_loss_mlp": 1.02013326, + "epoch": 0.5789869231925447, + "flos": 21134183569920.0, + "grad_norm": 2.753190455367712, + "language_loss": 0.74330729, + "learning_rate": 1.5089904720191809e-06, + "loss": 0.76414537, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 9630, + "time_per_iteration": 2.3893868923187256 + }, + { + "auxiliary_loss_clip": 0.01056963, + "auxiliary_loss_mlp": 0.01023947, + "balance_loss_clip": 1.01292014, + "balance_loss_mlp": 1.01925254, + "epoch": 0.5790470464452127, + "flos": 21208723056000.0, + "grad_norm": 1.5568652033120045, + "language_loss": 0.78989202, + "learning_rate": 1.5086242215624268e-06, + "loss": 0.81070113, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37695312, + "step": 9631, + "time_per_iteration": 2.4087135791778564 + }, + { + "auxiliary_loss_clip": 0.01058141, + "auxiliary_loss_mlp": 0.010207, + "balance_loss_clip": 1.00976849, + "balance_loss_mlp": 1.0191915, + "epoch": 0.5791071696978807, + "flos": 23403258385920.0, + "grad_norm": 1.5015887662495744, + "language_loss": 0.75262964, + "learning_rate": 1.5082579886416102e-06, + "loss": 0.77341801, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.390625, + "step": 9632, + "time_per_iteration": 2.4045398235321045 + }, + { + "auxiliary_loss_clip": 0.01057586, + "auxiliary_loss_mlp": 0.01022431, + "balance_loss_clip": 1.01025963, + "balance_loss_mlp": 1.01869547, + "epoch": 0.5791672929505486, + "flos": 24205387939200.0, + "grad_norm": 1.8285774448756444, + "language_loss": 0.79459, + "learning_rate": 1.5078917732698009e-06, + "loss": 0.81539011, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38867188, + "step": 9633, + "time_per_iteration": 2.4247758388519287 + }, + { + "auxiliary_loss_clip": 0.01065348, + "auxiliary_loss_mlp": 0.01030584, + "balance_loss_clip": 1.01727438, + "balance_loss_mlp": 1.02198267, + "epoch": 0.5792274162032166, + "flos": 24347833752960.0, + "grad_norm": 1.98032335263081, + "language_loss": 0.78094941, + "learning_rate": 1.5075255754600686e-06, + "loss": 0.80190873, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 9634, + "time_per_iteration": 2.423391342163086 + }, + { + "auxiliary_loss_clip": 0.01061013, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.01896977, + "balance_loss_mlp": 1.01971316, + "epoch": 0.5792875394558845, + "flos": 20448349355520.0, + "grad_norm": 1.8520365393532277, + "language_loss": 0.62980348, + "learning_rate": 1.5071593952254814e-06, + "loss": 0.65072548, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41210938, + "step": 9635, + "time_per_iteration": 2.400853157043457 + }, + { + "auxiliary_loss_clip": 0.01058262, + "auxiliary_loss_mlp": 0.01021057, + "balance_loss_clip": 1.01079917, + "balance_loss_mlp": 1.02055979, + "epoch": 0.5793476627085525, + "flos": 24059206609920.0, + "grad_norm": 1.6025199381385455, + "language_loss": 0.77613401, + "learning_rate": 1.5067932325791077e-06, + "loss": 0.79692721, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.37890625, + "step": 9636, + "time_per_iteration": 2.419712781906128 + }, + { + "auxiliary_loss_clip": 0.01055837, + "auxiliary_loss_mlp": 0.01020742, + "balance_loss_clip": 1.01062727, + "balance_loss_mlp": 1.01841784, + "epoch": 0.5794077859612206, + "flos": 22053201955200.0, + "grad_norm": 1.6960950999038231, + "language_loss": 0.68073273, + "learning_rate": 1.5064270875340153e-06, + "loss": 0.70149851, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.375, + "step": 9637, + "time_per_iteration": 2.4093899726867676 + }, + { + "auxiliary_loss_clip": 0.01058869, + "auxiliary_loss_mlp": 0.01025136, + "balance_loss_clip": 1.01251209, + "balance_loss_mlp": 1.01797748, + "epoch": 0.5794679092138885, + "flos": 11434124355840.0, + "grad_norm": 2.582147554863157, + "language_loss": 0.72375715, + "learning_rate": 1.50606096010327e-06, + "loss": 0.74459714, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40820312, + "step": 9638, + "time_per_iteration": 2.3764748573303223 + }, + { + "auxiliary_loss_clip": 0.01058309, + "auxiliary_loss_mlp": 0.01026204, + "balance_loss_clip": 1.01451564, + "balance_loss_mlp": 1.01973248, + "epoch": 0.5795280324665565, + "flos": 18879212943360.0, + "grad_norm": 1.6930111478712175, + "language_loss": 0.72470707, + "learning_rate": 1.5056948502999386e-06, + "loss": 0.74555218, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38671875, + "step": 9639, + "time_per_iteration": 2.393322229385376 + }, + { + "auxiliary_loss_clip": 0.01061594, + "auxiliary_loss_mlp": 0.01023266, + "balance_loss_clip": 1.01204205, + "balance_loss_mlp": 1.02066088, + "epoch": 0.5795881557192244, + "flos": 13005111070080.0, + "grad_norm": 2.0103999701417403, + "language_loss": 0.69484103, + "learning_rate": 1.5053287581370863e-06, + "loss": 0.7156896, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.41015625, + "step": 9640, + "time_per_iteration": 2.3774187564849854 + }, + { + "auxiliary_loss_clip": 0.01064455, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.01762748, + "balance_loss_mlp": 1.02120924, + "epoch": 0.5796482789718924, + "flos": 19931530366080.0, + "grad_norm": 2.771932698022213, + "language_loss": 0.77789021, + "learning_rate": 1.5049626836277787e-06, + "loss": 0.79883885, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.43164062, + "step": 9641, + "time_per_iteration": 2.371868848800659 + }, + { + "auxiliary_loss_clip": 0.01059092, + "auxiliary_loss_mlp": 0.01022778, + "balance_loss_clip": 1.01179338, + "balance_loss_mlp": 1.01964355, + "epoch": 0.5797084022245603, + "flos": 21649780661760.0, + "grad_norm": 1.8624588923072158, + "language_loss": 0.73936093, + "learning_rate": 1.5045966267850793e-06, + "loss": 0.7601797, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.39453125, + "step": 9642, + "time_per_iteration": 2.382620334625244 + }, + { + "auxiliary_loss_clip": 0.01057598, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.01411819, + "balance_loss_mlp": 1.01813126, + "epoch": 0.5797685254772283, + "flos": 26030367861120.0, + "grad_norm": 1.8130731571975658, + "language_loss": 0.73534441, + "learning_rate": 1.5042305876220515e-06, + "loss": 0.75618261, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 9643, + "time_per_iteration": 2.4354374408721924 + }, + { + "auxiliary_loss_clip": 0.01058606, + "auxiliary_loss_mlp": 0.01023935, + "balance_loss_clip": 1.01234746, + "balance_loss_mlp": 1.01871467, + "epoch": 0.5798286487298963, + "flos": 22704227677440.0, + "grad_norm": 1.8254593899964753, + "language_loss": 0.67567194, + "learning_rate": 1.5038645661517594e-06, + "loss": 0.69649738, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3984375, + "step": 9644, + "time_per_iteration": 2.3789913654327393 + }, + { + "auxiliary_loss_clip": 0.01064313, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.01315641, + "balance_loss_mlp": 1.02172911, + "epoch": 0.5798887719825643, + "flos": 23147868723840.0, + "grad_norm": 2.1150836190652798, + "language_loss": 0.85881174, + "learning_rate": 1.5034985623872647e-06, + "loss": 0.8797164, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42578125, + "step": 9645, + "time_per_iteration": 2.3953747749328613 + }, + { + "auxiliary_loss_clip": 0.01058756, + "auxiliary_loss_mlp": 0.01026114, + "balance_loss_clip": 1.01481891, + "balance_loss_mlp": 1.01921058, + "epoch": 0.5799488952352322, + "flos": 24424886856960.0, + "grad_norm": 1.910323216446836, + "language_loss": 0.75208443, + "learning_rate": 1.5031325763416292e-06, + "loss": 0.77293313, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39453125, + "step": 9646, + "time_per_iteration": 2.441415548324585 + }, + { + "auxiliary_loss_clip": 0.01062203, + "auxiliary_loss_mlp": 0.01023828, + "balance_loss_clip": 1.0113287, + "balance_loss_mlp": 1.02015412, + "epoch": 0.5800090184879002, + "flos": 38394474635520.0, + "grad_norm": 1.9844961415700493, + "language_loss": 0.5948965, + "learning_rate": 1.5027666080279134e-06, + "loss": 0.61575681, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41992188, + "step": 9647, + "time_per_iteration": 2.540083408355713 + }, + { + "auxiliary_loss_clip": 0.01061461, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.019714, + "balance_loss_mlp": 1.02085674, + "epoch": 0.5800691417405681, + "flos": 19784022405120.0, + "grad_norm": 1.568347838939396, + "language_loss": 0.84178418, + "learning_rate": 1.5024006574591788e-06, + "loss": 0.86271763, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 9648, + "time_per_iteration": 2.3878633975982666 + }, + { + "auxiliary_loss_clip": 0.01009503, + "auxiliary_loss_mlp": 0.01003502, + "balance_loss_clip": 1.00247073, + "balance_loss_mlp": 1.00165236, + "epoch": 0.5801292649932361, + "flos": 70311407725440.0, + "grad_norm": 0.8293731874235762, + "language_loss": 0.62962657, + "learning_rate": 1.5020347246484848e-06, + "loss": 0.64975661, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.07861328, + "step": 9649, + "time_per_iteration": 3.0845210552215576 + }, + { + "auxiliary_loss_clip": 0.01056083, + "auxiliary_loss_mlp": 0.01022348, + "balance_loss_clip": 1.01127374, + "balance_loss_mlp": 1.01855195, + "epoch": 0.5801893882459042, + "flos": 18733799664000.0, + "grad_norm": 1.6129111346986322, + "language_loss": 0.81929612, + "learning_rate": 1.50166880960889e-06, + "loss": 0.84008038, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.375, + "step": 9650, + "time_per_iteration": 3.9412097930908203 + }, + { + "auxiliary_loss_clip": 0.0106189, + "auxiliary_loss_mlp": 0.01024731, + "balance_loss_clip": 1.01272058, + "balance_loss_mlp": 1.01906824, + "epoch": 0.5802495114985721, + "flos": 15595596662400.0, + "grad_norm": 2.3816237469360773, + "language_loss": 0.7804302, + "learning_rate": 1.501302912353454e-06, + "loss": 0.80129635, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.42773438, + "step": 9651, + "time_per_iteration": 2.372101306915283 + }, + { + "auxiliary_loss_clip": 0.01061614, + "auxiliary_loss_mlp": 0.01022371, + "balance_loss_clip": 1.01023591, + "balance_loss_mlp": 1.02049243, + "epoch": 0.5803096347512401, + "flos": 18254547164160.0, + "grad_norm": 1.6098350465348172, + "language_loss": 0.74090576, + "learning_rate": 1.500937032895234e-06, + "loss": 0.76174563, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41015625, + "step": 9652, + "time_per_iteration": 2.3691513538360596 + }, + { + "auxiliary_loss_clip": 0.01058828, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.01419103, + "balance_loss_mlp": 1.01825166, + "epoch": 0.580369758003908, + "flos": 22892060125440.0, + "grad_norm": 2.067313599789653, + "language_loss": 0.76400447, + "learning_rate": 1.5005711712472877e-06, + "loss": 0.78485578, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 9653, + "time_per_iteration": 2.432175397872925 + }, + { + "auxiliary_loss_clip": 0.01058765, + "auxiliary_loss_mlp": 0.01022782, + "balance_loss_clip": 1.00993145, + "balance_loss_mlp": 1.01828778, + "epoch": 0.580429881256576, + "flos": 18696687022080.0, + "grad_norm": 2.0549207848919457, + "language_loss": 0.74247074, + "learning_rate": 1.5002053274226718e-06, + "loss": 0.76328623, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40429688, + "step": 9654, + "time_per_iteration": 2.367176055908203 + }, + { + "auxiliary_loss_clip": 0.01057575, + "auxiliary_loss_mlp": 0.01022997, + "balance_loss_clip": 1.0121969, + "balance_loss_mlp": 1.02021658, + "epoch": 0.5804900045092439, + "flos": 24680800189440.0, + "grad_norm": 1.824165049703442, + "language_loss": 0.69713104, + "learning_rate": 1.4998395014344416e-06, + "loss": 0.71793675, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.375, + "step": 9655, + "time_per_iteration": 2.424471616744995 + }, + { + "auxiliary_loss_clip": 0.01062333, + "auxiliary_loss_mlp": 0.01025076, + "balance_loss_clip": 1.01222515, + "balance_loss_mlp": 1.01885819, + "epoch": 0.580550127761912, + "flos": 23111663777280.0, + "grad_norm": 2.32105288965995, + "language_loss": 0.69822824, + "learning_rate": 1.4994736932956536e-06, + "loss": 0.71910226, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.43554688, + "step": 9656, + "time_per_iteration": 2.4004950523376465 + }, + { + "auxiliary_loss_clip": 0.01056026, + "auxiliary_loss_mlp": 0.01023548, + "balance_loss_clip": 1.01299167, + "balance_loss_mlp": 1.01879072, + "epoch": 0.5806102510145799, + "flos": 18474779220480.0, + "grad_norm": 1.5054846030826259, + "language_loss": 0.71751595, + "learning_rate": 1.4991079030193614e-06, + "loss": 0.73831177, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.37304688, + "step": 9657, + "time_per_iteration": 2.3607311248779297 + }, + { + "auxiliary_loss_clip": 0.01059619, + "auxiliary_loss_mlp": 0.01026175, + "balance_loss_clip": 1.0133245, + "balance_loss_mlp": 1.01790047, + "epoch": 0.5806703742672479, + "flos": 23914491557760.0, + "grad_norm": 1.9779917455829785, + "language_loss": 0.7127763, + "learning_rate": 1.4987421306186202e-06, + "loss": 0.73363417, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 9658, + "time_per_iteration": 2.4018075466156006 + }, + { + "auxiliary_loss_clip": 0.01009183, + "auxiliary_loss_mlp": 0.01000387, + "balance_loss_clip": 0.99936163, + "balance_loss_mlp": 1.00117445, + "epoch": 0.5807304975199158, + "flos": 66308649926400.0, + "grad_norm": 0.6497216963578006, + "language_loss": 0.53577077, + "learning_rate": 1.498376376106483e-06, + "loss": 0.55586648, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.08007812, + "step": 9659, + "time_per_iteration": 3.070441961288452 + }, + { + "auxiliary_loss_clip": 0.01061202, + "auxiliary_loss_mlp": 0.01028133, + "balance_loss_clip": 1.01616406, + "balance_loss_mlp": 1.02101755, + "epoch": 0.5807906207725838, + "flos": 31721105940480.0, + "grad_norm": 1.670555914185074, + "language_loss": 0.61831963, + "learning_rate": 1.4980106394960026e-06, + "loss": 0.63921297, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40234375, + "step": 9660, + "time_per_iteration": 2.4807546138763428 + }, + { + "auxiliary_loss_clip": 0.0105889, + "auxiliary_loss_mlp": 0.01023727, + "balance_loss_clip": 1.0111444, + "balance_loss_mlp": 1.01739395, + "epoch": 0.5808507440252517, + "flos": 23800151255040.0, + "grad_norm": 1.5493327530060963, + "language_loss": 0.65449858, + "learning_rate": 1.4976449208002312e-06, + "loss": 0.6753248, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4140625, + "step": 9661, + "time_per_iteration": 2.4427549839019775 + }, + { + "auxiliary_loss_clip": 0.01055385, + "auxiliary_loss_mlp": 0.01020865, + "balance_loss_clip": 1.01019621, + "balance_loss_mlp": 1.01888883, + "epoch": 0.5809108672779197, + "flos": 13697613354240.0, + "grad_norm": 1.8920868896013103, + "language_loss": 0.74442112, + "learning_rate": 1.4972792200322197e-06, + "loss": 0.76518357, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.36523438, + "step": 9662, + "time_per_iteration": 3.855165481567383 + }, + { + "auxiliary_loss_clip": 0.01009506, + "auxiliary_loss_mlp": 0.01000173, + "balance_loss_clip": 0.99910575, + "balance_loss_mlp": 1.00156522, + "epoch": 0.5809709905305876, + "flos": 69131062880640.0, + "grad_norm": 0.8766230194301112, + "language_loss": 0.58373916, + "learning_rate": 1.4969135372050204e-06, + "loss": 0.60383594, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.07910156, + "step": 9663, + "time_per_iteration": 3.119493246078491 + }, + { + "auxiliary_loss_clip": 0.01059703, + "auxiliary_loss_mlp": 0.01023197, + "balance_loss_clip": 1.01162779, + "balance_loss_mlp": 1.02034342, + "epoch": 0.5810311137832557, + "flos": 19826546307840.0, + "grad_norm": 2.246685793028985, + "language_loss": 0.81566441, + "learning_rate": 1.4965478723316826e-06, + "loss": 0.83649343, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39257812, + "step": 9664, + "time_per_iteration": 2.371976613998413 + }, + { + "auxiliary_loss_clip": 0.01058052, + "auxiliary_loss_mlp": 0.01019607, + "balance_loss_clip": 1.00764394, + "balance_loss_mlp": 1.01756763, + "epoch": 0.5810912370359237, + "flos": 29237315063040.0, + "grad_norm": 1.6271126492601666, + "language_loss": 0.81443226, + "learning_rate": 1.496182225425256e-06, + "loss": 0.83520889, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40429688, + "step": 9665, + "time_per_iteration": 3.8796167373657227 + }, + { + "auxiliary_loss_clip": 0.01063454, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.01833916, + "balance_loss_mlp": 1.02019119, + "epoch": 0.5811513602885916, + "flos": 22784422803840.0, + "grad_norm": 2.1572011203103623, + "language_loss": 0.81081909, + "learning_rate": 1.4958165964987904e-06, + "loss": 0.83177429, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43359375, + "step": 9666, + "time_per_iteration": 2.4194865226745605 + }, + { + "auxiliary_loss_clip": 0.01057293, + "auxiliary_loss_mlp": 0.01025, + "balance_loss_clip": 1.01453924, + "balance_loss_mlp": 1.01956677, + "epoch": 0.5812114835412596, + "flos": 18733345816320.0, + "grad_norm": 1.9824174945404238, + "language_loss": 0.80178905, + "learning_rate": 1.4954509855653328e-06, + "loss": 0.82261193, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.37695312, + "step": 9667, + "time_per_iteration": 2.3848822116851807 + }, + { + "auxiliary_loss_clip": 0.01062254, + "auxiliary_loss_mlp": 0.01024552, + "balance_loss_clip": 1.0110333, + "balance_loss_mlp": 1.01985097, + "epoch": 0.5812716067939275, + "flos": 26430123461760.0, + "grad_norm": 1.6520595223473, + "language_loss": 0.78646004, + "learning_rate": 1.4950853926379323e-06, + "loss": 0.8073281, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42382812, + "step": 9668, + "time_per_iteration": 3.883272647857666 + }, + { + "auxiliary_loss_clip": 0.01060966, + "auxiliary_loss_mlp": 0.0102817, + "balance_loss_clip": 1.01651788, + "balance_loss_mlp": 1.02033126, + "epoch": 0.5813317300465956, + "flos": 43396201566720.0, + "grad_norm": 2.14216257908515, + "language_loss": 0.64632636, + "learning_rate": 1.4947198177296347e-06, + "loss": 0.66721773, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40625, + "step": 9669, + "time_per_iteration": 2.5767674446105957 + }, + { + "auxiliary_loss_clip": 0.01061053, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.01670289, + "balance_loss_mlp": 1.02040112, + "epoch": 0.5813918532992635, + "flos": 24784457616000.0, + "grad_norm": 1.786212546078717, + "language_loss": 0.73262495, + "learning_rate": 1.4943542608534877e-06, + "loss": 0.75352526, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 9670, + "time_per_iteration": 2.438523769378662 + }, + { + "auxiliary_loss_clip": 0.01059867, + "auxiliary_loss_mlp": 0.01026971, + "balance_loss_clip": 1.01484203, + "balance_loss_mlp": 1.01900482, + "epoch": 0.5814519765519315, + "flos": 22856239203840.0, + "grad_norm": 2.9642303920664768, + "language_loss": 0.78811783, + "learning_rate": 1.4939887220225361e-06, + "loss": 0.80898619, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40820312, + "step": 9671, + "time_per_iteration": 2.390514612197876 + }, + { + "auxiliary_loss_clip": 0.01062665, + "auxiliary_loss_mlp": 0.01026806, + "balance_loss_clip": 1.01415229, + "balance_loss_mlp": 1.02081728, + "epoch": 0.5815120998045994, + "flos": 24059695368960.0, + "grad_norm": 2.3292579727008715, + "language_loss": 0.60752821, + "learning_rate": 1.4936232012498256e-06, + "loss": 0.62842298, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 9672, + "time_per_iteration": 2.4300453662872314 + }, + { + "auxiliary_loss_clip": 0.01058561, + "auxiliary_loss_mlp": 0.01023482, + "balance_loss_clip": 1.01261604, + "balance_loss_mlp": 1.02055275, + "epoch": 0.5815722230572674, + "flos": 24278356414080.0, + "grad_norm": 1.765938396513948, + "language_loss": 0.80972469, + "learning_rate": 1.4932576985484005e-06, + "loss": 0.83054507, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.38085938, + "step": 9673, + "time_per_iteration": 2.4330592155456543 + }, + { + "auxiliary_loss_clip": 0.01063977, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.01522732, + "balance_loss_mlp": 1.02108943, + "epoch": 0.5816323463099353, + "flos": 22199278550400.0, + "grad_norm": 1.9925965615068664, + "language_loss": 0.88313425, + "learning_rate": 1.492892213931304e-06, + "loss": 0.90405804, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42773438, + "step": 9674, + "time_per_iteration": 2.3893260955810547 + }, + { + "auxiliary_loss_clip": 0.01059574, + "auxiliary_loss_mlp": 0.01025974, + "balance_loss_clip": 1.01401138, + "balance_loss_mlp": 1.01928997, + "epoch": 0.5816924695626033, + "flos": 24133292248320.0, + "grad_norm": 1.4517125789691234, + "language_loss": 0.78949797, + "learning_rate": 1.4925267474115812e-06, + "loss": 0.81035352, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40234375, + "step": 9675, + "time_per_iteration": 2.4341323375701904 + }, + { + "auxiliary_loss_clip": 0.01060771, + "auxiliary_loss_mlp": 0.01024834, + "balance_loss_clip": 1.0133785, + "balance_loss_mlp": 1.02058041, + "epoch": 0.5817525928152713, + "flos": 21323168092800.0, + "grad_norm": 1.851787596325848, + "language_loss": 0.70325559, + "learning_rate": 1.492161299002273e-06, + "loss": 0.72411168, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40234375, + "step": 9676, + "time_per_iteration": 2.382453441619873 + }, + { + "auxiliary_loss_clip": 0.01059654, + "auxiliary_loss_mlp": 0.01024133, + "balance_loss_clip": 1.012218, + "balance_loss_mlp": 1.01930463, + "epoch": 0.5818127160679393, + "flos": 26933536488960.0, + "grad_norm": 2.1791733436175935, + "language_loss": 0.63164616, + "learning_rate": 1.4917958687164212e-06, + "loss": 0.65248406, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 9677, + "time_per_iteration": 2.397329807281494 + }, + { + "auxiliary_loss_clip": 0.01008391, + "auxiliary_loss_mlp": 0.0100864, + "balance_loss_clip": 1.00754917, + "balance_loss_mlp": 1.00068831, + "epoch": 0.5818728393206073, + "flos": 63914934090240.0, + "grad_norm": 0.8087456611229404, + "language_loss": 0.54161894, + "learning_rate": 1.491430456567068e-06, + "loss": 0.56178927, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.07714844, + "step": 9678, + "time_per_iteration": 3.104806423187256 + }, + { + "auxiliary_loss_clip": 0.01008325, + "auxiliary_loss_mlp": 0.01004343, + "balance_loss_clip": 1.00332952, + "balance_loss_mlp": 1.00077176, + "epoch": 0.5819329625732752, + "flos": 64951017730560.0, + "grad_norm": 0.7384268022322326, + "language_loss": 0.56198692, + "learning_rate": 1.491065062567253e-06, + "loss": 0.58211362, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.07519531, + "step": 9679, + "time_per_iteration": 2.9546003341674805 + }, + { + "auxiliary_loss_clip": 0.01058541, + "auxiliary_loss_mlp": 0.01024362, + "balance_loss_clip": 1.01335907, + "balance_loss_mlp": 1.01943135, + "epoch": 0.5819930858259432, + "flos": 21214204139520.0, + "grad_norm": 1.964429206520717, + "language_loss": 0.65655988, + "learning_rate": 1.4906996867300174e-06, + "loss": 0.67738891, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.390625, + "step": 9680, + "time_per_iteration": 2.3893849849700928 + }, + { + "auxiliary_loss_clip": 0.01057735, + "auxiliary_loss_mlp": 0.0102257, + "balance_loss_clip": 1.01127529, + "balance_loss_mlp": 1.01874721, + "epoch": 0.5820532090786111, + "flos": 19457654215680.0, + "grad_norm": 1.9355257873813967, + "language_loss": 0.83136046, + "learning_rate": 1.4903343290683999e-06, + "loss": 0.85216355, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.390625, + "step": 9681, + "time_per_iteration": 2.3936758041381836 + }, + { + "auxiliary_loss_clip": 0.01059913, + "auxiliary_loss_mlp": 0.01025619, + "balance_loss_clip": 1.01371598, + "balance_loss_mlp": 1.01996541, + "epoch": 0.5821133323312792, + "flos": 17711647522560.0, + "grad_norm": 2.9754269102921658, + "language_loss": 0.75658721, + "learning_rate": 1.4899689895954385e-06, + "loss": 0.77744251, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 9682, + "time_per_iteration": 2.370086908340454 + }, + { + "auxiliary_loss_clip": 0.01059516, + "auxiliary_loss_mlp": 0.01023955, + "balance_loss_clip": 1.01196909, + "balance_loss_mlp": 1.01933885, + "epoch": 0.5821734555839471, + "flos": 24570649249920.0, + "grad_norm": 1.9301456596321533, + "language_loss": 0.74576819, + "learning_rate": 1.4896036683241727e-06, + "loss": 0.76660287, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 9683, + "time_per_iteration": 2.4306094646453857 + }, + { + "auxiliary_loss_clip": 0.01061222, + "auxiliary_loss_mlp": 0.01023902, + "balance_loss_clip": 1.0118916, + "balance_loss_mlp": 1.01933336, + "epoch": 0.5822335788366151, + "flos": 22381176067200.0, + "grad_norm": 1.8235397197529613, + "language_loss": 0.73729879, + "learning_rate": 1.4892383652676385e-06, + "loss": 0.75814998, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41796875, + "step": 9684, + "time_per_iteration": 2.404383897781372 + }, + { + "auxiliary_loss_clip": 0.0105961, + "auxiliary_loss_mlp": 0.01024501, + "balance_loss_clip": 1.01247251, + "balance_loss_mlp": 1.01919699, + "epoch": 0.582293702089283, + "flos": 26721334045440.0, + "grad_norm": 2.085410500877047, + "language_loss": 0.78099388, + "learning_rate": 1.4888730804388736e-06, + "loss": 0.801835, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40429688, + "step": 9685, + "time_per_iteration": 2.446990728378296 + }, + { + "auxiliary_loss_clip": 0.01061219, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.0148735, + "balance_loss_mlp": 1.02089953, + "epoch": 0.582353825341951, + "flos": 17347677932160.0, + "grad_norm": 1.6015845974091285, + "language_loss": 0.74964178, + "learning_rate": 1.4885078138509137e-06, + "loss": 0.77052343, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40234375, + "step": 9686, + "time_per_iteration": 2.3989040851593018 + }, + { + "auxiliary_loss_clip": 0.01008488, + "auxiliary_loss_mlp": 0.01000917, + "balance_loss_clip": 0.99993956, + "balance_loss_mlp": 1.00087094, + "epoch": 0.5824139485946189, + "flos": 55470282877440.0, + "grad_norm": 0.8162762493856185, + "language_loss": 0.57401639, + "learning_rate": 1.4881425655167936e-06, + "loss": 0.59411043, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.07617188, + "step": 9687, + "time_per_iteration": 2.8637566566467285 + }, + { + "auxiliary_loss_clip": 0.01058864, + "auxiliary_loss_mlp": 0.01025263, + "balance_loss_clip": 1.01312757, + "balance_loss_mlp": 1.01926684, + "epoch": 0.582474071847287, + "flos": 20301993469440.0, + "grad_norm": 1.9639232507668127, + "language_loss": 0.69901407, + "learning_rate": 1.4877773354495496e-06, + "loss": 0.71985531, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39648438, + "step": 9688, + "time_per_iteration": 2.372792959213257 + }, + { + "auxiliary_loss_clip": 0.01059045, + "auxiliary_loss_mlp": 0.01022943, + "balance_loss_clip": 1.01127219, + "balance_loss_mlp": 1.01828647, + "epoch": 0.5825341950999549, + "flos": 23876890156800.0, + "grad_norm": 2.7015200222925158, + "language_loss": 0.65361989, + "learning_rate": 1.4874121236622141e-06, + "loss": 0.67443985, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40625, + "step": 9689, + "time_per_iteration": 2.4076101779937744 + }, + { + "auxiliary_loss_clip": 0.01059396, + "auxiliary_loss_mlp": 0.0102121, + "balance_loss_clip": 1.00929475, + "balance_loss_mlp": 1.02075744, + "epoch": 0.5825943183526229, + "flos": 23111908156800.0, + "grad_norm": 2.4223929923264746, + "language_loss": 0.73556048, + "learning_rate": 1.4870469301678223e-06, + "loss": 0.75636655, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 9690, + "time_per_iteration": 3.8305461406707764 + }, + { + "auxiliary_loss_clip": 0.01060671, + "auxiliary_loss_mlp": 0.01025224, + "balance_loss_clip": 1.01223648, + "balance_loss_mlp": 1.01938224, + "epoch": 0.5826544416052909, + "flos": 22856309026560.0, + "grad_norm": 2.5545230986163783, + "language_loss": 0.75611347, + "learning_rate": 1.4866817549794053e-06, + "loss": 0.77697241, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4140625, + "step": 9691, + "time_per_iteration": 2.398853302001953 + }, + { + "auxiliary_loss_clip": 0.01061183, + "auxiliary_loss_mlp": 0.0102448, + "balance_loss_clip": 1.0126369, + "balance_loss_mlp": 1.01940525, + "epoch": 0.5827145648579588, + "flos": 31500559681920.0, + "grad_norm": 1.6807880709277152, + "language_loss": 0.80758047, + "learning_rate": 1.4863165981099963e-06, + "loss": 0.82843709, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41796875, + "step": 9692, + "time_per_iteration": 2.462160348892212 + }, + { + "auxiliary_loss_clip": 0.01008482, + "auxiliary_loss_mlp": 0.01001423, + "balance_loss_clip": 1.00044513, + "balance_loss_mlp": 1.00096416, + "epoch": 0.5827746881106268, + "flos": 69808448545920.0, + "grad_norm": 0.7642409221663609, + "language_loss": 0.56935418, + "learning_rate": 1.4859514595726267e-06, + "loss": 0.58945322, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.07519531, + "step": 9693, + "time_per_iteration": 2.992157459259033 + }, + { + "auxiliary_loss_clip": 0.01059027, + "auxiliary_loss_mlp": 0.01025563, + "balance_loss_clip": 1.01444042, + "balance_loss_mlp": 1.01900506, + "epoch": 0.5828348113632947, + "flos": 23111279752320.0, + "grad_norm": 1.6821619545626507, + "language_loss": 0.79540706, + "learning_rate": 1.485586339380327e-06, + "loss": 0.81625295, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.40039062, + "step": 9694, + "time_per_iteration": 2.405369520187378 + }, + { + "auxiliary_loss_clip": 0.01056662, + "auxiliary_loss_mlp": 0.01024039, + "balance_loss_clip": 1.01379883, + "balance_loss_mlp": 1.01959848, + "epoch": 0.5828949346159628, + "flos": 21871967754240.0, + "grad_norm": 1.4139202595984055, + "language_loss": 0.76111448, + "learning_rate": 1.4852212375461277e-06, + "loss": 0.78192139, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.37109375, + "step": 9695, + "time_per_iteration": 2.400343179702759 + }, + { + "auxiliary_loss_clip": 0.01008729, + "auxiliary_loss_mlp": 0.0100178, + "balance_loss_clip": 1.00066531, + "balance_loss_mlp": 1.00116849, + "epoch": 0.5829550578686307, + "flos": 65958784525440.0, + "grad_norm": 0.7759043747763231, + "language_loss": 0.55016708, + "learning_rate": 1.4848561540830579e-06, + "loss": 0.57027215, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.07519531, + "step": 9696, + "time_per_iteration": 3.146028995513916 + }, + { + "auxiliary_loss_clip": 0.01059266, + "auxiliary_loss_mlp": 0.01024477, + "balance_loss_clip": 1.01327121, + "balance_loss_mlp": 1.0191958, + "epoch": 0.5830151811212987, + "flos": 16288866996480.0, + "grad_norm": 13.70039158665993, + "language_loss": 0.76644146, + "learning_rate": 1.4844910890041474e-06, + "loss": 0.78727889, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.40039062, + "step": 9697, + "time_per_iteration": 2.378603219985962 + }, + { + "auxiliary_loss_clip": 0.01061531, + "auxiliary_loss_mlp": 0.0102729, + "balance_loss_clip": 1.01423049, + "balance_loss_mlp": 1.01968217, + "epoch": 0.5830753043739666, + "flos": 24167751626880.0, + "grad_norm": 1.9456585673453322, + "language_loss": 0.77626926, + "learning_rate": 1.4841260423224239e-06, + "loss": 0.79715747, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41796875, + "step": 9698, + "time_per_iteration": 2.431443214416504 + }, + { + "auxiliary_loss_clip": 0.01061019, + "auxiliary_loss_mlp": 0.01027329, + "balance_loss_clip": 1.01519322, + "balance_loss_mlp": 1.02071595, + "epoch": 0.5831354276266346, + "flos": 27057651972480.0, + "grad_norm": 1.6012786378132737, + "language_loss": 0.83555615, + "learning_rate": 1.4837610140509144e-06, + "loss": 0.85643959, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 9699, + "time_per_iteration": 2.523698568344116 + }, + { + "auxiliary_loss_clip": 0.01060995, + "auxiliary_loss_mlp": 0.01024468, + "balance_loss_clip": 1.01165342, + "balance_loss_mlp": 1.01997161, + "epoch": 0.5831955508793025, + "flos": 23622338367360.0, + "grad_norm": 3.1566908782668976, + "language_loss": 0.66338688, + "learning_rate": 1.4833960042026467e-06, + "loss": 0.68424153, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41015625, + "step": 9700, + "time_per_iteration": 2.4546425342559814 + }, + { + "auxiliary_loss_clip": 0.0106013, + "auxiliary_loss_mlp": 0.01023739, + "balance_loss_clip": 1.01110864, + "balance_loss_mlp": 1.01859534, + "epoch": 0.5832556741319705, + "flos": 24972080595840.0, + "grad_norm": 1.566370223872045, + "language_loss": 0.71585411, + "learning_rate": 1.4830310127906467e-06, + "loss": 0.73669285, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41601562, + "step": 9701, + "time_per_iteration": 2.5078213214874268 + }, + { + "auxiliary_loss_clip": 0.01058561, + "auxiliary_loss_mlp": 0.01024173, + "balance_loss_clip": 1.01271129, + "balance_loss_mlp": 1.0190227, + "epoch": 0.5833157973846385, + "flos": 23220453173760.0, + "grad_norm": 2.0703574319892337, + "language_loss": 0.85167891, + "learning_rate": 1.48266603982794e-06, + "loss": 0.87250626, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39648438, + "step": 9702, + "time_per_iteration": 3.812279462814331 + }, + { + "auxiliary_loss_clip": 0.01055299, + "auxiliary_loss_mlp": 0.01022529, + "balance_loss_clip": 1.011729, + "balance_loss_mlp": 1.01811886, + "epoch": 0.5833759206373065, + "flos": 21976951812480.0, + "grad_norm": 1.6478902507707738, + "language_loss": 0.7781378, + "learning_rate": 1.482301085327552e-06, + "loss": 0.79891604, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37109375, + "step": 9703, + "time_per_iteration": 2.3925507068634033 + }, + { + "auxiliary_loss_clip": 0.01062138, + "auxiliary_loss_mlp": 0.01024523, + "balance_loss_clip": 1.01203561, + "balance_loss_mlp": 1.0215261, + "epoch": 0.5834360438899745, + "flos": 21761328055680.0, + "grad_norm": 1.742668985935889, + "language_loss": 0.71356785, + "learning_rate": 1.481936149302506e-06, + "loss": 0.73443437, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 9704, + "time_per_iteration": 3.86053466796875 + }, + { + "auxiliary_loss_clip": 0.0105892, + "auxiliary_loss_mlp": 0.01026414, + "balance_loss_clip": 1.01494002, + "balance_loss_mlp": 1.01909542, + "epoch": 0.5834961671426424, + "flos": 15991791304320.0, + "grad_norm": 2.426007005158113, + "language_loss": 0.81454754, + "learning_rate": 1.4815712317658271e-06, + "loss": 0.83540088, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3984375, + "step": 9705, + "time_per_iteration": 2.3873276710510254 + }, + { + "auxiliary_loss_clip": 0.01059423, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.01440787, + "balance_loss_mlp": 1.01914454, + "epoch": 0.5835562903953104, + "flos": 22817276259840.0, + "grad_norm": 1.7187859697744365, + "language_loss": 0.80627275, + "learning_rate": 1.4812063327305367e-06, + "loss": 0.82712692, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.40234375, + "step": 9706, + "time_per_iteration": 2.3898630142211914 + }, + { + "auxiliary_loss_clip": 0.0105765, + "auxiliary_loss_mlp": 0.0102577, + "balance_loss_clip": 1.01313996, + "balance_loss_mlp": 1.01929855, + "epoch": 0.5836164136479783, + "flos": 48466288673280.0, + "grad_norm": 1.92340514401776, + "language_loss": 0.75295913, + "learning_rate": 1.480841452209658e-06, + "loss": 0.77379328, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3828125, + "step": 9707, + "time_per_iteration": 2.6099698543548584 + }, + { + "auxiliary_loss_clip": 0.01058839, + "auxiliary_loss_mlp": 0.01024124, + "balance_loss_clip": 1.01269209, + "balance_loss_mlp": 1.02051544, + "epoch": 0.5836765369006464, + "flos": 26904802573440.0, + "grad_norm": 4.814687272251012, + "language_loss": 0.81324232, + "learning_rate": 1.4804765902162122e-06, + "loss": 0.83407187, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 9708, + "time_per_iteration": 3.8407580852508545 + }, + { + "auxiliary_loss_clip": 0.01059021, + "auxiliary_loss_mlp": 0.01025873, + "balance_loss_clip": 1.01351678, + "balance_loss_mlp": 1.01969957, + "epoch": 0.5837366601533143, + "flos": 20083018222080.0, + "grad_norm": 1.6967004801206937, + "language_loss": 0.78603327, + "learning_rate": 1.4801117467632204e-06, + "loss": 0.80688214, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 9709, + "time_per_iteration": 2.3762049674987793 + }, + { + "auxiliary_loss_clip": 0.01008832, + "auxiliary_loss_mlp": 0.01000772, + "balance_loss_clip": 0.99977624, + "balance_loss_mlp": 1.00128698, + "epoch": 0.5837967834059823, + "flos": 65360548512000.0, + "grad_norm": 0.7672521281998783, + "language_loss": 0.5658145, + "learning_rate": 1.4797469218637035e-06, + "loss": 0.58591056, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.07568359, + "step": 9710, + "time_per_iteration": 3.061432123184204 + }, + { + "auxiliary_loss_clip": 0.01057681, + "auxiliary_loss_mlp": 0.01024608, + "balance_loss_clip": 1.01337266, + "balance_loss_mlp": 1.01871443, + "epoch": 0.5838569066586502, + "flos": 25337446640640.0, + "grad_norm": 1.462373491570532, + "language_loss": 0.6565522, + "learning_rate": 1.4793821155306803e-06, + "loss": 0.67737514, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.390625, + "step": 9711, + "time_per_iteration": 2.435471296310425 + }, + { + "auxiliary_loss_clip": 0.01064484, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.01412129, + "balance_loss_mlp": 1.020293, + "epoch": 0.5839170299113182, + "flos": 22228361579520.0, + "grad_norm": 1.9031758904783136, + "language_loss": 0.78579324, + "learning_rate": 1.4790173277771705e-06, + "loss": 0.80671716, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44140625, + "step": 9712, + "time_per_iteration": 2.3764429092407227 + }, + { + "auxiliary_loss_clip": 0.01059343, + "auxiliary_loss_mlp": 0.01019301, + "balance_loss_clip": 1.00746405, + "balance_loss_mlp": 1.01970959, + "epoch": 0.5839771531639861, + "flos": 22198929436800.0, + "grad_norm": 1.7463354685323824, + "language_loss": 0.73224014, + "learning_rate": 1.4786525586161917e-06, + "loss": 0.7530266, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39648438, + "step": 9713, + "time_per_iteration": 2.4077279567718506 + }, + { + "auxiliary_loss_clip": 0.01062634, + "auxiliary_loss_mlp": 0.01024458, + "balance_loss_clip": 1.01038551, + "balance_loss_mlp": 1.0195502, + "epoch": 0.5840372764166541, + "flos": 22418253797760.0, + "grad_norm": 2.4939264225404636, + "language_loss": 0.69318962, + "learning_rate": 1.4782878080607627e-06, + "loss": 0.71406054, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4296875, + "step": 9714, + "time_per_iteration": 2.375549793243408 + }, + { + "auxiliary_loss_clip": 0.01059906, + "auxiliary_loss_mlp": 0.0102629, + "balance_loss_clip": 1.01308131, + "balance_loss_mlp": 1.01867104, + "epoch": 0.5840973996693221, + "flos": 19827244535040.0, + "grad_norm": 1.9099880026501421, + "language_loss": 0.80016124, + "learning_rate": 1.4779230761238997e-06, + "loss": 0.82102317, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41210938, + "step": 9715, + "time_per_iteration": 2.4069645404815674 + }, + { + "auxiliary_loss_clip": 0.01059342, + "auxiliary_loss_mlp": 0.01025312, + "balance_loss_clip": 1.01263976, + "balance_loss_mlp": 1.02002001, + "epoch": 0.5841575229219901, + "flos": 21141898980480.0, + "grad_norm": 3.0421535730627616, + "language_loss": 0.7276417, + "learning_rate": 1.4775583628186184e-06, + "loss": 0.74848831, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39257812, + "step": 9716, + "time_per_iteration": 2.4173481464385986 + }, + { + "auxiliary_loss_clip": 0.010579, + "auxiliary_loss_mlp": 0.01022499, + "balance_loss_clip": 1.01128757, + "balance_loss_mlp": 1.01945519, + "epoch": 0.5842176461746581, + "flos": 24639288716160.0, + "grad_norm": 1.6355981634474168, + "language_loss": 0.74963468, + "learning_rate": 1.477193668157936e-06, + "loss": 0.77043867, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38476562, + "step": 9717, + "time_per_iteration": 2.4477646350860596 + }, + { + "auxiliary_loss_clip": 0.0105902, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01474786, + "balance_loss_mlp": 1.02010798, + "epoch": 0.584277769427326, + "flos": 19130273596800.0, + "grad_norm": 2.16370507119852, + "language_loss": 0.7993753, + "learning_rate": 1.4768289921548665e-06, + "loss": 0.82023692, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 9718, + "time_per_iteration": 2.3568241596221924 + }, + { + "auxiliary_loss_clip": 0.01060142, + "auxiliary_loss_mlp": 0.01023404, + "balance_loss_clip": 1.01108408, + "balance_loss_mlp": 1.01995945, + "epoch": 0.584337892679994, + "flos": 22673992573440.0, + "grad_norm": 1.6585919086765906, + "language_loss": 0.67472881, + "learning_rate": 1.4764643348224247e-06, + "loss": 0.69556427, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40234375, + "step": 9719, + "time_per_iteration": 2.4114630222320557 + }, + { + "auxiliary_loss_clip": 0.01061184, + "auxiliary_loss_mlp": 0.01026553, + "balance_loss_clip": 1.01430392, + "balance_loss_mlp": 1.02013016, + "epoch": 0.5843980159326619, + "flos": 31282771420800.0, + "grad_norm": 1.9592361480888179, + "language_loss": 0.75608253, + "learning_rate": 1.4760996961736245e-06, + "loss": 0.7769599, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41015625, + "step": 9720, + "time_per_iteration": 2.474851369857788 + }, + { + "auxiliary_loss_clip": 0.0105982, + "auxiliary_loss_mlp": 0.01025003, + "balance_loss_clip": 1.01339746, + "balance_loss_mlp": 1.01934731, + "epoch": 0.58445813918533, + "flos": 22746995959680.0, + "grad_norm": 1.5563215350808866, + "language_loss": 0.82661784, + "learning_rate": 1.4757350762214778e-06, + "loss": 0.84746611, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40429688, + "step": 9721, + "time_per_iteration": 2.4243500232696533 + }, + { + "auxiliary_loss_clip": 0.01008893, + "auxiliary_loss_mlp": 0.01002034, + "balance_loss_clip": 1.00108588, + "balance_loss_mlp": 1.00124598, + "epoch": 0.5845182624379979, + "flos": 60683548936320.0, + "grad_norm": 0.9870868197768281, + "language_loss": 0.71329486, + "learning_rate": 1.4753704749789976e-06, + "loss": 0.73340404, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.07666016, + "step": 9722, + "time_per_iteration": 3.0693137645721436 + }, + { + "auxiliary_loss_clip": 0.01057526, + "auxiliary_loss_mlp": 0.01025085, + "balance_loss_clip": 1.01350355, + "balance_loss_mlp": 1.01818526, + "epoch": 0.5845783856906659, + "flos": 16361521269120.0, + "grad_norm": 3.029445023080166, + "language_loss": 0.89415497, + "learning_rate": 1.4750058924591957e-06, + "loss": 0.91498113, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39453125, + "step": 9723, + "time_per_iteration": 2.3613576889038086 + }, + { + "auxiliary_loss_clip": 0.01058133, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.01215243, + "balance_loss_mlp": 1.01953292, + "epoch": 0.5846385089433338, + "flos": 20082389817600.0, + "grad_norm": 1.4397501149449619, + "language_loss": 0.6841501, + "learning_rate": 1.4746413286750836e-06, + "loss": 0.70496917, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38476562, + "step": 9724, + "time_per_iteration": 2.3874666690826416 + }, + { + "auxiliary_loss_clip": 0.01063486, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.01743281, + "balance_loss_mlp": 1.02169609, + "epoch": 0.5846986321960018, + "flos": 17310111442560.0, + "grad_norm": 1.8670170416932752, + "language_loss": 0.86262578, + "learning_rate": 1.474276783639671e-06, + "loss": 0.88356441, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 9725, + "time_per_iteration": 2.3573830127716064 + }, + { + "auxiliary_loss_clip": 0.01063147, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01644135, + "balance_loss_mlp": 1.0203737, + "epoch": 0.5847587554486697, + "flos": 17197062860160.0, + "grad_norm": 1.6766376163491281, + "language_loss": 0.8256942, + "learning_rate": 1.473912257365967e-06, + "loss": 0.84661698, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.42578125, + "step": 9726, + "time_per_iteration": 2.360848903656006 + }, + { + "auxiliary_loss_clip": 0.01060923, + "auxiliary_loss_mlp": 0.01026369, + "balance_loss_clip": 1.01451981, + "balance_loss_mlp": 1.02036154, + "epoch": 0.5848188787013378, + "flos": 24528125347200.0, + "grad_norm": 1.8168611232587988, + "language_loss": 0.66670942, + "learning_rate": 1.4735477498669817e-06, + "loss": 0.68758237, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40625, + "step": 9727, + "time_per_iteration": 2.4477767944335938 + }, + { + "auxiliary_loss_clip": 0.01061818, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.0177505, + "balance_loss_mlp": 1.02099264, + "epoch": 0.5848790019540057, + "flos": 20627419052160.0, + "grad_norm": 1.7095122903477644, + "language_loss": 0.81812465, + "learning_rate": 1.4731832611557229e-06, + "loss": 0.83904356, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 9728, + "time_per_iteration": 2.3800384998321533 + }, + { + "auxiliary_loss_clip": 0.01058866, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.01663959, + "balance_loss_mlp": 1.01993167, + "epoch": 0.5849391252066737, + "flos": 22417765038720.0, + "grad_norm": 1.6308977171134618, + "language_loss": 0.77852863, + "learning_rate": 1.4728187912451987e-06, + "loss": 0.79939091, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.38867188, + "step": 9729, + "time_per_iteration": 2.4036667346954346 + }, + { + "auxiliary_loss_clip": 0.0105934, + "auxiliary_loss_mlp": 0.01023513, + "balance_loss_clip": 1.01127028, + "balance_loss_mlp": 1.0189662, + "epoch": 0.5849992484593417, + "flos": 25409751799680.0, + "grad_norm": 1.7041153905450446, + "language_loss": 0.70237195, + "learning_rate": 1.4724543401484155e-06, + "loss": 0.72320044, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 9730, + "time_per_iteration": 3.8973162174224854 + }, + { + "auxiliary_loss_clip": 0.01062684, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.0143882, + "balance_loss_mlp": 1.01954329, + "epoch": 0.5850593717120096, + "flos": 21064217472000.0, + "grad_norm": 3.436417364755712, + "language_loss": 0.7849164, + "learning_rate": 1.4720899078783797e-06, + "loss": 0.80582142, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.43164062, + "step": 9731, + "time_per_iteration": 2.408247947692871 + }, + { + "auxiliary_loss_clip": 0.01060963, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.01368809, + "balance_loss_mlp": 1.02065611, + "epoch": 0.5851194949646776, + "flos": 25300368910080.0, + "grad_norm": 3.23608271986181, + "language_loss": 0.70358312, + "learning_rate": 1.4717254944480978e-06, + "loss": 0.72445786, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40234375, + "step": 9732, + "time_per_iteration": 2.4281299114227295 + }, + { + "auxiliary_loss_clip": 0.01063597, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.01630211, + "balance_loss_mlp": 1.02036917, + "epoch": 0.5851796182173455, + "flos": 23586098509440.0, + "grad_norm": 1.7898178330523908, + "language_loss": 0.5267393, + "learning_rate": 1.471361099870573e-06, + "loss": 0.54767418, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.43164062, + "step": 9733, + "time_per_iteration": 2.399587631225586 + }, + { + "auxiliary_loss_clip": 0.01061076, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.01488864, + "balance_loss_mlp": 1.02028155, + "epoch": 0.5852397414700136, + "flos": 24821674992000.0, + "grad_norm": 2.136080258745772, + "language_loss": 0.87485766, + "learning_rate": 1.4709967241588116e-06, + "loss": 0.89574468, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9734, + "time_per_iteration": 2.4023969173431396 + }, + { + "auxiliary_loss_clip": 0.01057824, + "auxiliary_loss_mlp": 0.01025443, + "balance_loss_clip": 1.01361763, + "balance_loss_mlp": 1.02039647, + "epoch": 0.5852998647226815, + "flos": 19936767070080.0, + "grad_norm": 1.4081342361209193, + "language_loss": 0.72121447, + "learning_rate": 1.4706323673258165e-06, + "loss": 0.74204713, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.375, + "step": 9735, + "time_per_iteration": 2.3977489471435547 + }, + { + "auxiliary_loss_clip": 0.01061655, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.01720262, + "balance_loss_mlp": 1.0193193, + "epoch": 0.5853599879753495, + "flos": 16719625751040.0, + "grad_norm": 2.242606946376082, + "language_loss": 0.74076796, + "learning_rate": 1.4702680293845901e-06, + "loss": 0.76168823, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 9736, + "time_per_iteration": 2.320281982421875 + }, + { + "auxiliary_loss_clip": 0.01059735, + "auxiliary_loss_mlp": 0.01022601, + "balance_loss_clip": 1.01100779, + "balance_loss_mlp": 1.0195334, + "epoch": 0.5854201112280174, + "flos": 44454872856960.0, + "grad_norm": 1.5735659932218051, + "language_loss": 0.70459867, + "learning_rate": 1.4699037103481356e-06, + "loss": 0.72542202, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40234375, + "step": 9737, + "time_per_iteration": 2.579883337020874 + }, + { + "auxiliary_loss_clip": 0.01059213, + "auxiliary_loss_mlp": 0.01029781, + "balance_loss_clip": 1.01816964, + "balance_loss_mlp": 1.01976418, + "epoch": 0.5854802344806854, + "flos": 20338163504640.0, + "grad_norm": 1.7043687572634914, + "language_loss": 0.82390261, + "learning_rate": 1.469539410229453e-06, + "loss": 0.84479249, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39453125, + "step": 9738, + "time_per_iteration": 2.3707892894744873 + }, + { + "auxiliary_loss_clip": 0.01064236, + "auxiliary_loss_mlp": 0.01023687, + "balance_loss_clip": 1.0110389, + "balance_loss_mlp": 1.02135301, + "epoch": 0.5855403577333533, + "flos": 20920061001600.0, + "grad_norm": 2.122850409358142, + "language_loss": 0.68815869, + "learning_rate": 1.4691751290415454e-06, + "loss": 0.7090379, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4296875, + "step": 9739, + "time_per_iteration": 2.386054277420044 + }, + { + "auxiliary_loss_clip": 0.0106092, + "auxiliary_loss_mlp": 0.01021735, + "balance_loss_clip": 1.00924158, + "balance_loss_mlp": 1.019333, + "epoch": 0.5856004809860214, + "flos": 20447616216960.0, + "grad_norm": 1.9556744250134124, + "language_loss": 0.58597863, + "learning_rate": 1.4688108667974115e-06, + "loss": 0.60680521, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4140625, + "step": 9740, + "time_per_iteration": 2.3815054893493652 + }, + { + "auxiliary_loss_clip": 0.01057568, + "auxiliary_loss_mlp": 0.0102363, + "balance_loss_clip": 1.01111293, + "balance_loss_mlp": 1.01723945, + "epoch": 0.5856606042386893, + "flos": 19639900846080.0, + "grad_norm": 2.3269368960377146, + "language_loss": 0.75029081, + "learning_rate": 1.4684466235100517e-06, + "loss": 0.77110279, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 9741, + "time_per_iteration": 3.8382740020751953 + }, + { + "auxiliary_loss_clip": 0.01062002, + "auxiliary_loss_mlp": 0.01023923, + "balance_loss_clip": 1.01155472, + "balance_loss_mlp": 1.02120638, + "epoch": 0.5857207274913573, + "flos": 21685182647040.0, + "grad_norm": 1.66926513258991, + "language_loss": 0.75595856, + "learning_rate": 1.4680823991924645e-06, + "loss": 0.7768178, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40820312, + "step": 9742, + "time_per_iteration": 2.413740634918213 + }, + { + "auxiliary_loss_clip": 0.01059581, + "auxiliary_loss_mlp": 0.01024364, + "balance_loss_clip": 1.01270485, + "balance_loss_mlp": 1.01940501, + "epoch": 0.5857808507440253, + "flos": 23181664786560.0, + "grad_norm": 2.077521269768289, + "language_loss": 0.7505213, + "learning_rate": 1.4677181938576477e-06, + "loss": 0.7713607, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40234375, + "step": 9743, + "time_per_iteration": 2.4060781002044678 + }, + { + "auxiliary_loss_clip": 0.0105906, + "auxiliary_loss_mlp": 0.01023, + "balance_loss_clip": 1.01166296, + "balance_loss_mlp": 1.02019167, + "epoch": 0.5858409739966932, + "flos": 27234068405760.0, + "grad_norm": 1.654215738688301, + "language_loss": 0.81220472, + "learning_rate": 1.4673540075186002e-06, + "loss": 0.83302528, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38867188, + "step": 9744, + "time_per_iteration": 3.9842796325683594 + }, + { + "auxiliary_loss_clip": 0.0106145, + "auxiliary_loss_mlp": 0.01021407, + "balance_loss_clip": 1.00936735, + "balance_loss_mlp": 1.01957035, + "epoch": 0.5859010972493612, + "flos": 27854265530880.0, + "grad_norm": 1.845293359253652, + "language_loss": 0.71236694, + "learning_rate": 1.4669898401883171e-06, + "loss": 0.73319548, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.41796875, + "step": 9745, + "time_per_iteration": 2.423741102218628 + }, + { + "auxiliary_loss_clip": 0.01008965, + "auxiliary_loss_mlp": 0.01001242, + "balance_loss_clip": 1.00016916, + "balance_loss_mlp": 1.00115705, + "epoch": 0.5859612205020291, + "flos": 70003333088640.0, + "grad_norm": 0.7413218744282539, + "language_loss": 0.53276539, + "learning_rate": 1.4666256918797964e-06, + "loss": 0.55286741, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.078125, + "step": 9746, + "time_per_iteration": 3.1090400218963623 + }, + { + "auxiliary_loss_clip": 0.0106147, + "auxiliary_loss_mlp": 0.01025333, + "balance_loss_clip": 1.01213074, + "balance_loss_mlp": 1.01985335, + "epoch": 0.5860213437546972, + "flos": 24055017246720.0, + "grad_norm": 1.898002048150598, + "language_loss": 0.73396003, + "learning_rate": 1.4662615626060325e-06, + "loss": 0.7548281, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41601562, + "step": 9747, + "time_per_iteration": 2.3970961570739746 + }, + { + "auxiliary_loss_clip": 0.01062224, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.01692939, + "balance_loss_mlp": 1.0208199, + "epoch": 0.5860814670073651, + "flos": 18946735246080.0, + "grad_norm": 1.6992459386826904, + "language_loss": 0.81015396, + "learning_rate": 1.4658974523800202e-06, + "loss": 0.83107555, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 9748, + "time_per_iteration": 3.7389895915985107 + }, + { + "auxiliary_loss_clip": 0.01061184, + "auxiliary_loss_mlp": 0.01024387, + "balance_loss_clip": 1.01135731, + "balance_loss_mlp": 1.02005935, + "epoch": 0.5861415902600331, + "flos": 22560839256960.0, + "grad_norm": 1.8727512561841504, + "language_loss": 0.71785867, + "learning_rate": 1.4655333612147542e-06, + "loss": 0.73871434, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 9749, + "time_per_iteration": 2.3944923877716064 + }, + { + "auxiliary_loss_clip": 0.01059049, + "auxiliary_loss_mlp": 0.01025338, + "balance_loss_clip": 1.01362586, + "balance_loss_mlp": 1.01915908, + "epoch": 0.586201713512701, + "flos": 14391162979200.0, + "grad_norm": 2.2035634159424777, + "language_loss": 0.75145954, + "learning_rate": 1.4651692891232279e-06, + "loss": 0.77230346, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3984375, + "step": 9750, + "time_per_iteration": 2.3466882705688477 + }, + { + "auxiliary_loss_clip": 0.01061314, + "auxiliary_loss_mlp": 0.01025979, + "balance_loss_clip": 1.01270461, + "balance_loss_mlp": 1.02078879, + "epoch": 0.586261836765369, + "flos": 19497594677760.0, + "grad_norm": 1.6331484234761369, + "language_loss": 0.71012634, + "learning_rate": 1.4648052361184337e-06, + "loss": 0.73099929, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40429688, + "step": 9751, + "time_per_iteration": 2.398275136947632 + }, + { + "auxiliary_loss_clip": 0.01060661, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01466227, + "balance_loss_mlp": 1.01876974, + "epoch": 0.5863219600180369, + "flos": 20700841374720.0, + "grad_norm": 2.0650973902392127, + "language_loss": 0.7457605, + "learning_rate": 1.4644412022133637e-06, + "loss": 0.76665372, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.41796875, + "step": 9752, + "time_per_iteration": 2.370304822921753 + }, + { + "auxiliary_loss_clip": 0.01058317, + "auxiliary_loss_mlp": 0.01023235, + "balance_loss_clip": 1.01089668, + "balance_loss_mlp": 1.01863813, + "epoch": 0.586382083270705, + "flos": 19791109411200.0, + "grad_norm": 2.0247931545424858, + "language_loss": 0.77412695, + "learning_rate": 1.4640771874210101e-06, + "loss": 0.7949425, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39648438, + "step": 9753, + "time_per_iteration": 2.391984224319458 + }, + { + "auxiliary_loss_clip": 0.01057058, + "auxiliary_loss_mlp": 0.01021355, + "balance_loss_clip": 1.01016104, + "balance_loss_mlp": 1.01869893, + "epoch": 0.5864422065233729, + "flos": 16499847542400.0, + "grad_norm": 1.9785201783238466, + "language_loss": 0.71229672, + "learning_rate": 1.4637131917543628e-06, + "loss": 0.73308086, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.3828125, + "step": 9754, + "time_per_iteration": 2.3670408725738525 + }, + { + "auxiliary_loss_clip": 0.01059698, + "auxiliary_loss_mlp": 0.01023911, + "balance_loss_clip": 1.01104259, + "balance_loss_mlp": 1.01868081, + "epoch": 0.5865023297760409, + "flos": 20412214231680.0, + "grad_norm": 1.8239747153638834, + "language_loss": 0.71943694, + "learning_rate": 1.4633492152264123e-06, + "loss": 0.74027306, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 9755, + "time_per_iteration": 2.4251668453216553 + }, + { + "auxiliary_loss_clip": 0.01008348, + "auxiliary_loss_mlp": 0.01002816, + "balance_loss_clip": 1.00173712, + "balance_loss_mlp": 1.00055027, + "epoch": 0.5865624530287089, + "flos": 63347666319360.0, + "grad_norm": 0.7462979904558632, + "language_loss": 0.5696336, + "learning_rate": 1.462985257850148e-06, + "loss": 0.58974528, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.078125, + "step": 9756, + "time_per_iteration": 3.036965847015381 + }, + { + "auxiliary_loss_clip": 0.01059295, + "auxiliary_loss_mlp": 0.01025859, + "balance_loss_clip": 1.01392603, + "balance_loss_mlp": 1.01927114, + "epoch": 0.5866225762813768, + "flos": 27015058247040.0, + "grad_norm": 1.685511125968246, + "language_loss": 0.76073205, + "learning_rate": 1.4626213196385577e-06, + "loss": 0.78158355, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40039062, + "step": 9757, + "time_per_iteration": 2.444089889526367 + }, + { + "auxiliary_loss_clip": 0.01008553, + "auxiliary_loss_mlp": 0.01001689, + "balance_loss_clip": 1.00074685, + "balance_loss_mlp": 1.00087833, + "epoch": 0.5866826995340448, + "flos": 72983554721280.0, + "grad_norm": 0.8668251187197378, + "language_loss": 0.60498095, + "learning_rate": 1.462257400604631e-06, + "loss": 0.62508333, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.07714844, + "step": 9758, + "time_per_iteration": 3.0251097679138184 + }, + { + "auxiliary_loss_clip": 0.01063475, + "auxiliary_loss_mlp": 0.01029242, + "balance_loss_clip": 1.01571798, + "balance_loss_mlp": 1.01958084, + "epoch": 0.5867428227867127, + "flos": 21284728819200.0, + "grad_norm": 3.837280461531405, + "language_loss": 0.71656793, + "learning_rate": 1.4618935007613544e-06, + "loss": 0.73749512, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 9759, + "time_per_iteration": 2.3975114822387695 + }, + { + "auxiliary_loss_clip": 0.01062112, + "auxiliary_loss_mlp": 0.01026591, + "balance_loss_clip": 1.01488495, + "balance_loss_mlp": 1.02055109, + "epoch": 0.5868029460393808, + "flos": 33467601392640.0, + "grad_norm": 1.4619976562391317, + "language_loss": 0.72962964, + "learning_rate": 1.461529620121714e-06, + "loss": 0.75051671, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.41601562, + "step": 9760, + "time_per_iteration": 2.4996211528778076 + }, + { + "auxiliary_loss_clip": 0.01059616, + "auxiliary_loss_mlp": 0.01024001, + "balance_loss_clip": 1.01133537, + "balance_loss_mlp": 1.01913714, + "epoch": 0.5868630692920487, + "flos": 17888657448960.0, + "grad_norm": 3.32256170583426, + "language_loss": 0.72973943, + "learning_rate": 1.461165758698697e-06, + "loss": 0.75057566, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 9761, + "time_per_iteration": 2.406545639038086 + }, + { + "auxiliary_loss_clip": 0.01063938, + "auxiliary_loss_mlp": 0.01025984, + "balance_loss_clip": 1.01178622, + "balance_loss_mlp": 1.02133536, + "epoch": 0.5869231925447167, + "flos": 21033912545280.0, + "grad_norm": 1.8533469061067966, + "language_loss": 0.74769568, + "learning_rate": 1.4608019165052876e-06, + "loss": 0.76859492, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.42578125, + "step": 9762, + "time_per_iteration": 2.422966241836548 + }, + { + "auxiliary_loss_clip": 0.01059956, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.0134635, + "balance_loss_mlp": 1.01974964, + "epoch": 0.5869833157973846, + "flos": 74735707680000.0, + "grad_norm": 1.2925295996305914, + "language_loss": 0.72460824, + "learning_rate": 1.4604380935544712e-06, + "loss": 0.74547172, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 9763, + "time_per_iteration": 2.8143184185028076 + }, + { + "auxiliary_loss_clip": 0.01059566, + "auxiliary_loss_mlp": 0.01025087, + "balance_loss_clip": 1.01306462, + "balance_loss_mlp": 1.0197804, + "epoch": 0.5870434390500526, + "flos": 17638050643200.0, + "grad_norm": 1.6823323910464527, + "language_loss": 0.80262947, + "learning_rate": 1.4600742898592313e-06, + "loss": 0.82347608, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 9764, + "time_per_iteration": 2.399445056915283 + }, + { + "auxiliary_loss_clip": 0.0106076, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.02065802, + "balance_loss_mlp": 1.01965618, + "epoch": 0.5871035623027205, + "flos": 21505100520960.0, + "grad_norm": 1.8042305034868977, + "language_loss": 0.78647232, + "learning_rate": 1.4597105054325512e-06, + "loss": 0.80741405, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41015625, + "step": 9765, + "time_per_iteration": 2.3968567848205566 + }, + { + "auxiliary_loss_clip": 0.01060187, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.01777887, + "balance_loss_mlp": 1.02040362, + "epoch": 0.5871636855553886, + "flos": 13551048000000.0, + "grad_norm": 2.0203853881906797, + "language_loss": 0.83487034, + "learning_rate": 1.4593467402874132e-06, + "loss": 0.85577685, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39648438, + "step": 9766, + "time_per_iteration": 2.355588436126709 + }, + { + "auxiliary_loss_clip": 0.01062973, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.01567769, + "balance_loss_mlp": 1.01980591, + "epoch": 0.5872238088080565, + "flos": 26211741707520.0, + "grad_norm": 1.5638011954417967, + "language_loss": 0.69559991, + "learning_rate": 1.4589829944367989e-06, + "loss": 0.71651518, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4296875, + "step": 9767, + "time_per_iteration": 2.4597086906433105 + }, + { + "auxiliary_loss_clip": 0.01061443, + "auxiliary_loss_mlp": 0.01026557, + "balance_loss_clip": 1.01423061, + "balance_loss_mlp": 1.01942003, + "epoch": 0.5872839320607245, + "flos": 30663866016000.0, + "grad_norm": 1.919313441021121, + "language_loss": 0.63352525, + "learning_rate": 1.4586192678936903e-06, + "loss": 0.65440524, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.41992188, + "step": 9768, + "time_per_iteration": 2.4504714012145996 + }, + { + "auxiliary_loss_clip": 0.01008835, + "auxiliary_loss_mlp": 0.01001273, + "balance_loss_clip": 1.00021827, + "balance_loss_mlp": 1.00108826, + "epoch": 0.5873440553133924, + "flos": 60300062029440.0, + "grad_norm": 0.8133135506720145, + "language_loss": 0.53921783, + "learning_rate": 1.4582555606710676e-06, + "loss": 0.55931896, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.07714844, + "step": 9769, + "time_per_iteration": 4.417914152145386 + }, + { + "auxiliary_loss_clip": 0.01057827, + "auxiliary_loss_mlp": 0.01026781, + "balance_loss_clip": 1.01460934, + "balance_loss_mlp": 1.01848936, + "epoch": 0.5874041785660604, + "flos": 21538338001920.0, + "grad_norm": 1.7262193813627738, + "language_loss": 0.7077356, + "learning_rate": 1.4578918727819099e-06, + "loss": 0.72858161, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39257812, + "step": 9770, + "time_per_iteration": 2.3947503566741943 + }, + { + "auxiliary_loss_clip": 0.01060039, + "auxiliary_loss_mlp": 0.01025025, + "balance_loss_clip": 1.01176274, + "balance_loss_mlp": 1.02013552, + "epoch": 0.5874643018187284, + "flos": 24387809126400.0, + "grad_norm": 1.816884692959906, + "language_loss": 0.67587984, + "learning_rate": 1.457528204239197e-06, + "loss": 0.69673049, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40039062, + "step": 9771, + "time_per_iteration": 2.410862684249878 + }, + { + "auxiliary_loss_clip": 0.01059989, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.01552832, + "balance_loss_mlp": 1.01841092, + "epoch": 0.5875244250713964, + "flos": 28146453632640.0, + "grad_norm": 1.7390390940009155, + "language_loss": 0.66326386, + "learning_rate": 1.4571645550559068e-06, + "loss": 0.68414849, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41601562, + "step": 9772, + "time_per_iteration": 2.502359390258789 + }, + { + "auxiliary_loss_clip": 0.010633, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01815617, + "balance_loss_mlp": 1.02034581, + "epoch": 0.5875845483240644, + "flos": 25811218056960.0, + "grad_norm": 1.5791225978471402, + "language_loss": 0.73980308, + "learning_rate": 1.4568009252450177e-06, + "loss": 0.76075661, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4296875, + "step": 9773, + "time_per_iteration": 2.4419732093811035 + }, + { + "auxiliary_loss_clip": 0.01060137, + "auxiliary_loss_mlp": 0.01025475, + "balance_loss_clip": 1.01162934, + "balance_loss_mlp": 1.01861084, + "epoch": 0.5876446715767323, + "flos": 26905361155200.0, + "grad_norm": 2.117729946346568, + "language_loss": 0.54024506, + "learning_rate": 1.456437314819506e-06, + "loss": 0.56110126, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4140625, + "step": 9774, + "time_per_iteration": 2.420015573501587 + }, + { + "auxiliary_loss_clip": 0.01062406, + "auxiliary_loss_mlp": 0.01026008, + "balance_loss_clip": 1.01284158, + "balance_loss_mlp": 1.02025843, + "epoch": 0.5877047948294003, + "flos": 36683346257280.0, + "grad_norm": 2.169757723902424, + "language_loss": 0.6537236, + "learning_rate": 1.456073723792349e-06, + "loss": 0.67460775, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 9775, + "time_per_iteration": 2.520303249359131 + }, + { + "auxiliary_loss_clip": 0.01060772, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.01242137, + "balance_loss_mlp": 1.01944065, + "epoch": 0.5877649180820682, + "flos": 26723498549760.0, + "grad_norm": 1.6545338067597002, + "language_loss": 0.73608643, + "learning_rate": 1.4557101521765211e-06, + "loss": 0.75694919, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41210938, + "step": 9776, + "time_per_iteration": 2.425199031829834 + }, + { + "auxiliary_loss_clip": 0.01058856, + "auxiliary_loss_mlp": 0.0102572, + "balance_loss_clip": 1.01388812, + "balance_loss_mlp": 1.01815557, + "epoch": 0.5878250413347362, + "flos": 21031154547840.0, + "grad_norm": 1.6046288907074193, + "language_loss": 0.74837172, + "learning_rate": 1.4553465999849977e-06, + "loss": 0.76921749, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40625, + "step": 9777, + "time_per_iteration": 2.394351005554199 + }, + { + "auxiliary_loss_clip": 0.01058978, + "auxiliary_loss_mlp": 0.01025406, + "balance_loss_clip": 1.01349151, + "balance_loss_mlp": 1.02085006, + "epoch": 0.5878851645874041, + "flos": 25483069388160.0, + "grad_norm": 1.5433817503943055, + "language_loss": 0.74145448, + "learning_rate": 1.4549830672307533e-06, + "loss": 0.76229835, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38085938, + "step": 9778, + "time_per_iteration": 2.421292304992676 + }, + { + "auxiliary_loss_clip": 0.01059448, + "auxiliary_loss_mlp": 0.01025606, + "balance_loss_clip": 1.01364374, + "balance_loss_mlp": 1.01975513, + "epoch": 0.5879452878400722, + "flos": 23767996026240.0, + "grad_norm": 1.8009500131180625, + "language_loss": 0.70427746, + "learning_rate": 1.454619553926761e-06, + "loss": 0.72512805, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3984375, + "step": 9779, + "time_per_iteration": 2.4276845455169678 + }, + { + "auxiliary_loss_clip": 0.01061554, + "auxiliary_loss_mlp": 0.01025009, + "balance_loss_clip": 1.01236713, + "balance_loss_mlp": 1.01963949, + "epoch": 0.5880054110927401, + "flos": 17823474207360.0, + "grad_norm": 2.095744396795089, + "language_loss": 0.67078274, + "learning_rate": 1.4542560600859949e-06, + "loss": 0.69164836, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 9780, + "time_per_iteration": 3.906172752380371 + }, + { + "auxiliary_loss_clip": 0.01061076, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.01363254, + "balance_loss_mlp": 1.02049041, + "epoch": 0.5880655343454081, + "flos": 19462402160640.0, + "grad_norm": 2.7554233152570418, + "language_loss": 0.72411978, + "learning_rate": 1.4538925857214256e-06, + "loss": 0.74499965, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40625, + "step": 9781, + "time_per_iteration": 2.3622043132781982 + }, + { + "auxiliary_loss_clip": 0.01058612, + "auxiliary_loss_mlp": 0.01023903, + "balance_loss_clip": 1.01196408, + "balance_loss_mlp": 1.01842356, + "epoch": 0.588125657598076, + "flos": 21396520592640.0, + "grad_norm": 1.4521549711699897, + "language_loss": 0.70455033, + "learning_rate": 1.453529130846025e-06, + "loss": 0.72537547, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 9782, + "time_per_iteration": 3.816467761993408 + }, + { + "auxiliary_loss_clip": 0.01062203, + "auxiliary_loss_mlp": 0.01026491, + "balance_loss_clip": 1.01375902, + "balance_loss_mlp": 1.01919723, + "epoch": 0.588185780850744, + "flos": 16033721713920.0, + "grad_norm": 2.091199035708525, + "language_loss": 0.70411634, + "learning_rate": 1.4531656954727641e-06, + "loss": 0.72500336, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4296875, + "step": 9783, + "time_per_iteration": 2.355715036392212 + }, + { + "auxiliary_loss_clip": 0.01062805, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.01457179, + "balance_loss_mlp": 1.0207026, + "epoch": 0.588245904103412, + "flos": 23727217691520.0, + "grad_norm": 4.336395215330512, + "language_loss": 0.68906426, + "learning_rate": 1.4528022796146128e-06, + "loss": 0.70998132, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.41992188, + "step": 9784, + "time_per_iteration": 2.39340877532959 + }, + { + "auxiliary_loss_clip": 0.01061326, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.01319098, + "balance_loss_mlp": 1.01908445, + "epoch": 0.58830602735608, + "flos": 33801126410880.0, + "grad_norm": 1.9265991736581494, + "language_loss": 0.69296706, + "learning_rate": 1.452438883284541e-06, + "loss": 0.71384132, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.421875, + "step": 9785, + "time_per_iteration": 2.4919819831848145 + }, + { + "auxiliary_loss_clip": 0.0105751, + "auxiliary_loss_mlp": 0.01023519, + "balance_loss_clip": 1.01170576, + "balance_loss_mlp": 1.01829207, + "epoch": 0.588366150608748, + "flos": 17089809563520.0, + "grad_norm": 1.9516748520270721, + "language_loss": 0.77349746, + "learning_rate": 1.4520755064955165e-06, + "loss": 0.79430777, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 9786, + "time_per_iteration": 2.366201400756836 + }, + { + "auxiliary_loss_clip": 0.01060836, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.01594496, + "balance_loss_mlp": 1.0198828, + "epoch": 0.5884262738614159, + "flos": 22126100607360.0, + "grad_norm": 1.4519404771325948, + "language_loss": 0.7921834, + "learning_rate": 1.4517121492605075e-06, + "loss": 0.8130796, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41015625, + "step": 9787, + "time_per_iteration": 2.3881185054779053 + }, + { + "auxiliary_loss_clip": 0.01059062, + "auxiliary_loss_mlp": 0.0102618, + "balance_loss_clip": 1.01456904, + "balance_loss_mlp": 1.01876903, + "epoch": 0.5884863971140839, + "flos": 21030805434240.0, + "grad_norm": 1.5394731138707058, + "language_loss": 0.66132641, + "learning_rate": 1.4513488115924823e-06, + "loss": 0.68217885, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40234375, + "step": 9788, + "time_per_iteration": 3.758538007736206 + }, + { + "auxiliary_loss_clip": 0.01059704, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_clip": 1.01214623, + "balance_loss_mlp": 1.01973724, + "epoch": 0.5885465203667518, + "flos": 23803991504640.0, + "grad_norm": 2.8337096481105783, + "language_loss": 0.80375206, + "learning_rate": 1.450985493504406e-06, + "loss": 0.82459766, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 9789, + "time_per_iteration": 2.4056553840637207 + }, + { + "auxiliary_loss_clip": 0.01058477, + "auxiliary_loss_mlp": 0.01021036, + "balance_loss_clip": 1.00932932, + "balance_loss_mlp": 1.01887846, + "epoch": 0.5886066436194198, + "flos": 18879562056960.0, + "grad_norm": 1.5959817423082607, + "language_loss": 0.70585895, + "learning_rate": 1.4506221950092457e-06, + "loss": 0.72665405, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39648438, + "step": 9790, + "time_per_iteration": 2.392141580581665 + }, + { + "auxiliary_loss_clip": 0.0105868, + "auxiliary_loss_mlp": 0.01023703, + "balance_loss_clip": 1.01169872, + "balance_loss_mlp": 1.01925683, + "epoch": 0.5886667668720877, + "flos": 24788996092800.0, + "grad_norm": 2.0323185540452062, + "language_loss": 0.83470607, + "learning_rate": 1.450258916119966e-06, + "loss": 0.85552996, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 9791, + "time_per_iteration": 2.4109139442443848 + }, + { + "auxiliary_loss_clip": 0.01060368, + "auxiliary_loss_mlp": 0.01025049, + "balance_loss_clip": 1.01279461, + "balance_loss_mlp": 1.01952648, + "epoch": 0.5887268901247558, + "flos": 21613366247040.0, + "grad_norm": 1.6673039437004533, + "language_loss": 0.76153135, + "learning_rate": 1.4498956568495313e-06, + "loss": 0.78238559, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40820312, + "step": 9792, + "time_per_iteration": 2.41072154045105 + }, + { + "auxiliary_loss_clip": 0.01059972, + "auxiliary_loss_mlp": 0.01026761, + "balance_loss_clip": 1.01441717, + "balance_loss_mlp": 1.01858163, + "epoch": 0.5887870133774237, + "flos": 20480783875200.0, + "grad_norm": 1.9604865057370051, + "language_loss": 0.69331503, + "learning_rate": 1.4495324172109057e-06, + "loss": 0.71418238, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4140625, + "step": 9793, + "time_per_iteration": 2.368412733078003 + }, + { + "auxiliary_loss_clip": 0.01059393, + "auxiliary_loss_mlp": 0.01024067, + "balance_loss_clip": 1.01218152, + "balance_loss_mlp": 1.02036166, + "epoch": 0.5888471366300917, + "flos": 19205336753280.0, + "grad_norm": 1.7196559076594684, + "language_loss": 0.60249084, + "learning_rate": 1.449169197217052e-06, + "loss": 0.62332541, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.390625, + "step": 9794, + "time_per_iteration": 2.3771705627441406 + }, + { + "auxiliary_loss_clip": 0.01061122, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.01654792, + "balance_loss_mlp": 1.02064729, + "epoch": 0.5889072598827596, + "flos": 19971924675840.0, + "grad_norm": 1.5435837919481368, + "language_loss": 0.65422106, + "learning_rate": 1.4488059968809335e-06, + "loss": 0.67512763, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40429688, + "step": 9795, + "time_per_iteration": 2.4012973308563232 + }, + { + "auxiliary_loss_clip": 0.0106028, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.01375151, + "balance_loss_mlp": 1.01996732, + "epoch": 0.5889673831354276, + "flos": 20740188343680.0, + "grad_norm": 1.555166286595855, + "language_loss": 0.74037415, + "learning_rate": 1.4484428162155102e-06, + "loss": 0.76123053, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40234375, + "step": 9796, + "time_per_iteration": 2.3974761962890625 + }, + { + "auxiliary_loss_clip": 0.01058413, + "auxiliary_loss_mlp": 0.01024669, + "balance_loss_clip": 1.01348114, + "balance_loss_mlp": 1.01877975, + "epoch": 0.5890275063880956, + "flos": 25299775416960.0, + "grad_norm": 1.5070930649338998, + "language_loss": 0.83185846, + "learning_rate": 1.4480796552337444e-06, + "loss": 0.85268927, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.39648438, + "step": 9797, + "time_per_iteration": 2.417853593826294 + }, + { + "auxiliary_loss_clip": 0.0105663, + "auxiliary_loss_mlp": 0.01021663, + "balance_loss_clip": 1.00963438, + "balance_loss_mlp": 1.01774478, + "epoch": 0.5890876296407636, + "flos": 11764577174400.0, + "grad_norm": 2.084747139870573, + "language_loss": 0.79048896, + "learning_rate": 1.4477165139485962e-06, + "loss": 0.81127191, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38867188, + "step": 9798, + "time_per_iteration": 2.3943309783935547 + }, + { + "auxiliary_loss_clip": 0.01058672, + "auxiliary_loss_mlp": 0.01023471, + "balance_loss_clip": 1.01205111, + "balance_loss_mlp": 1.01927328, + "epoch": 0.5891477528934316, + "flos": 13588614489600.0, + "grad_norm": 2.3377191302406612, + "language_loss": 0.73563087, + "learning_rate": 1.4473533923730244e-06, + "loss": 0.75645232, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39453125, + "step": 9799, + "time_per_iteration": 2.3620879650115967 + }, + { + "auxiliary_loss_clip": 0.01062263, + "auxiliary_loss_mlp": 0.0102229, + "balance_loss_clip": 1.00895619, + "balance_loss_mlp": 1.02004659, + "epoch": 0.5892078761460995, + "flos": 15048298189440.0, + "grad_norm": 1.8266830791350155, + "language_loss": 0.76227605, + "learning_rate": 1.4469902905199889e-06, + "loss": 0.78312159, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.421875, + "step": 9800, + "time_per_iteration": 2.3762309551239014 + }, + { + "auxiliary_loss_clip": 0.01060503, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.02039528, + "balance_loss_mlp": 1.01914918, + "epoch": 0.5892679993987675, + "flos": 15777319622400.0, + "grad_norm": 1.7458517447191793, + "language_loss": 0.78752846, + "learning_rate": 1.446627208402447e-06, + "loss": 0.8084594, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 9801, + "time_per_iteration": 2.3614466190338135 + }, + { + "auxiliary_loss_clip": 0.01061803, + "auxiliary_loss_mlp": 0.01021145, + "balance_loss_clip": 1.0085566, + "balance_loss_mlp": 1.02033174, + "epoch": 0.5893281226514354, + "flos": 25264024318080.0, + "grad_norm": 1.8787742916517085, + "language_loss": 0.65959519, + "learning_rate": 1.4462641460333572e-06, + "loss": 0.68042463, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4140625, + "step": 9802, + "time_per_iteration": 2.463134765625 + }, + { + "auxiliary_loss_clip": 0.01060171, + "auxiliary_loss_mlp": 0.01023326, + "balance_loss_clip": 1.01070142, + "balance_loss_mlp": 1.0191505, + "epoch": 0.5893882459041034, + "flos": 19457374924800.0, + "grad_norm": 1.5648829176003922, + "language_loss": 0.78328514, + "learning_rate": 1.4459011034256752e-06, + "loss": 0.80412006, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 9803, + "time_per_iteration": 2.4058144092559814 + }, + { + "auxiliary_loss_clip": 0.01061101, + "auxiliary_loss_mlp": 0.01026722, + "balance_loss_clip": 1.01386571, + "balance_loss_mlp": 1.01925707, + "epoch": 0.5894483691567713, + "flos": 20632935047040.0, + "grad_norm": 1.6486123745841301, + "language_loss": 0.72780842, + "learning_rate": 1.4455380805923573e-06, + "loss": 0.74868667, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 9804, + "time_per_iteration": 2.417085647583008 + }, + { + "auxiliary_loss_clip": 0.01059408, + "auxiliary_loss_mlp": 0.01026137, + "balance_loss_clip": 1.01370907, + "balance_loss_mlp": 1.01970351, + "epoch": 0.5895084924094394, + "flos": 17777459168640.0, + "grad_norm": 1.728364401130138, + "language_loss": 0.74728084, + "learning_rate": 1.4451750775463596e-06, + "loss": 0.76813632, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.3984375, + "step": 9805, + "time_per_iteration": 2.3625316619873047 + }, + { + "auxiliary_loss_clip": 0.01063242, + "auxiliary_loss_mlp": 0.01026082, + "balance_loss_clip": 1.01309407, + "balance_loss_mlp": 1.02017844, + "epoch": 0.5895686156621073, + "flos": 20120026129920.0, + "grad_norm": 2.1309477661064875, + "language_loss": 0.78264815, + "learning_rate": 1.4448120943006359e-06, + "loss": 0.80354136, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4296875, + "step": 9806, + "time_per_iteration": 2.389394760131836 + }, + { + "auxiliary_loss_clip": 0.01058789, + "auxiliary_loss_mlp": 0.01025228, + "balance_loss_clip": 1.01261616, + "balance_loss_mlp": 1.01830637, + "epoch": 0.5896287389147753, + "flos": 20849012651520.0, + "grad_norm": 1.810880398548259, + "language_loss": 0.76717496, + "learning_rate": 1.4444491308681404e-06, + "loss": 0.78801513, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40625, + "step": 9807, + "time_per_iteration": 2.3737480640411377 + }, + { + "auxiliary_loss_clip": 0.01061519, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.01512527, + "balance_loss_mlp": 1.02015233, + "epoch": 0.5896888621674432, + "flos": 14537030106240.0, + "grad_norm": 1.7214533611437928, + "language_loss": 0.74448317, + "learning_rate": 1.4440861872618268e-06, + "loss": 0.76537871, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 9808, + "time_per_iteration": 2.3767361640930176 + }, + { + "auxiliary_loss_clip": 0.01063001, + "auxiliary_loss_mlp": 0.01026159, + "balance_loss_clip": 1.01247406, + "balance_loss_mlp": 1.02037358, + "epoch": 0.5897489854201112, + "flos": 20885706357120.0, + "grad_norm": 1.8520034099157943, + "language_loss": 0.71897602, + "learning_rate": 1.4437232634946465e-06, + "loss": 0.73986757, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42578125, + "step": 9809, + "time_per_iteration": 3.7854957580566406 + }, + { + "auxiliary_loss_clip": 0.01062599, + "auxiliary_loss_mlp": 0.01024978, + "balance_loss_clip": 1.01181781, + "balance_loss_mlp": 1.01984096, + "epoch": 0.5898091086727792, + "flos": 20010119569920.0, + "grad_norm": 2.404238629515017, + "language_loss": 0.81957948, + "learning_rate": 1.4433603595795525e-06, + "loss": 0.84045529, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42773438, + "step": 9810, + "time_per_iteration": 2.3740081787109375 + }, + { + "auxiliary_loss_clip": 0.01059323, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.0159893, + "balance_loss_mlp": 1.01946521, + "epoch": 0.5898692319254472, + "flos": 16252312936320.0, + "grad_norm": 1.5638168832621777, + "language_loss": 0.81095839, + "learning_rate": 1.4429974755294956e-06, + "loss": 0.83183247, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 9811, + "time_per_iteration": 2.365499258041382 + }, + { + "auxiliary_loss_clip": 0.01062115, + "auxiliary_loss_mlp": 0.01024678, + "balance_loss_clip": 1.01138067, + "balance_loss_mlp": 1.02029896, + "epoch": 0.5899293551781152, + "flos": 20447511482880.0, + "grad_norm": 1.6352676593683375, + "language_loss": 0.76739573, + "learning_rate": 1.442634611357426e-06, + "loss": 0.78826368, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41796875, + "step": 9812, + "time_per_iteration": 2.4085304737091064 + }, + { + "auxiliary_loss_clip": 0.01059033, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.0170747, + "balance_loss_mlp": 1.01939273, + "epoch": 0.5899894784307831, + "flos": 13880837502720.0, + "grad_norm": 2.101437760817552, + "language_loss": 0.70355284, + "learning_rate": 1.4422717670762932e-06, + "loss": 0.72442985, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39648438, + "step": 9813, + "time_per_iteration": 2.350639820098877 + }, + { + "auxiliary_loss_clip": 0.01059235, + "auxiliary_loss_mlp": 0.01025163, + "balance_loss_clip": 1.01284266, + "balance_loss_mlp": 1.01954269, + "epoch": 0.5900496016834511, + "flos": 20082773842560.0, + "grad_norm": 1.8362905869813004, + "language_loss": 0.72334278, + "learning_rate": 1.441908942699046e-06, + "loss": 0.74418676, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 9814, + "time_per_iteration": 2.3773462772369385 + }, + { + "auxiliary_loss_clip": 0.01060878, + "auxiliary_loss_mlp": 0.01022401, + "balance_loss_clip": 1.00929952, + "balance_loss_mlp": 1.02054214, + "epoch": 0.590109724936119, + "flos": 20258317491840.0, + "grad_norm": 1.8975169363082758, + "language_loss": 0.78309458, + "learning_rate": 1.4415461382386335e-06, + "loss": 0.8039273, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40429688, + "step": 9815, + "time_per_iteration": 2.389234781265259 + }, + { + "auxiliary_loss_clip": 0.01061502, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.01989675, + "balance_loss_mlp": 1.01904869, + "epoch": 0.590169848188787, + "flos": 24234156766080.0, + "grad_norm": 2.9801276388902234, + "language_loss": 0.75858796, + "learning_rate": 1.4411833537080026e-06, + "loss": 0.77953231, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42382812, + "step": 9816, + "time_per_iteration": 2.407169818878174 + }, + { + "auxiliary_loss_clip": 0.01059835, + "auxiliary_loss_mlp": 0.0102283, + "balance_loss_clip": 1.01077211, + "balance_loss_mlp": 1.02025616, + "epoch": 0.590229971441455, + "flos": 17783778124800.0, + "grad_norm": 1.9352122743093496, + "language_loss": 0.81036794, + "learning_rate": 1.4408205891201005e-06, + "loss": 0.83119452, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 9817, + "time_per_iteration": 2.3606791496276855 + }, + { + "auxiliary_loss_clip": 0.01059773, + "auxiliary_loss_mlp": 0.01024404, + "balance_loss_clip": 1.01228595, + "balance_loss_mlp": 1.01981759, + "epoch": 0.590290094694123, + "flos": 22235797699200.0, + "grad_norm": 1.5969149047776752, + "language_loss": 0.76298487, + "learning_rate": 1.4404578444878727e-06, + "loss": 0.78382659, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 9818, + "time_per_iteration": 2.39916729927063 + }, + { + "auxiliary_loss_clip": 0.01058542, + "auxiliary_loss_mlp": 0.01023998, + "balance_loss_clip": 1.01207733, + "balance_loss_mlp": 1.01934195, + "epoch": 0.5903502179467909, + "flos": 19097629608960.0, + "grad_norm": 2.9225048248189553, + "language_loss": 0.66382909, + "learning_rate": 1.440095119824266e-06, + "loss": 0.68465453, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 9819, + "time_per_iteration": 2.3604888916015625 + }, + { + "auxiliary_loss_clip": 0.01061019, + "auxiliary_loss_mlp": 0.01028275, + "balance_loss_clip": 1.01581192, + "balance_loss_mlp": 1.02074635, + "epoch": 0.5904103411994589, + "flos": 24234575702400.0, + "grad_norm": 1.7872414758336648, + "language_loss": 0.77910197, + "learning_rate": 1.439732415142224e-06, + "loss": 0.79999495, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 9820, + "time_per_iteration": 3.8172409534454346 + }, + { + "auxiliary_loss_clip": 0.01008554, + "auxiliary_loss_mlp": 0.01001024, + "balance_loss_clip": 1.00003409, + "balance_loss_mlp": 1.00129652, + "epoch": 0.5904704644521268, + "flos": 64873650424320.0, + "grad_norm": 0.9224351228926618, + "language_loss": 0.65138626, + "learning_rate": 1.43936973045469e-06, + "loss": 0.67148209, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.07275391, + "step": 9821, + "time_per_iteration": 3.031134843826294 + }, + { + "auxiliary_loss_clip": 0.01058841, + "auxiliary_loss_mlp": 0.01025106, + "balance_loss_clip": 1.01320314, + "balance_loss_mlp": 1.01882577, + "epoch": 0.5905305877047948, + "flos": 19608967514880.0, + "grad_norm": 2.34385130192752, + "language_loss": 0.62057924, + "learning_rate": 1.4390070657746093e-06, + "loss": 0.64141876, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40039062, + "step": 9822, + "time_per_iteration": 3.8229594230651855 + }, + { + "auxiliary_loss_clip": 0.01061585, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.02103996, + "balance_loss_mlp": 1.01992369, + "epoch": 0.5905907109574628, + "flos": 18988630744320.0, + "grad_norm": 2.1876466477138625, + "language_loss": 0.82905447, + "learning_rate": 1.4386444211149226e-06, + "loss": 0.85000902, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41796875, + "step": 9823, + "time_per_iteration": 2.3967580795288086 + }, + { + "auxiliary_loss_clip": 0.01058909, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.01019073, + "balance_loss_mlp": 1.01964617, + "epoch": 0.5906508342101308, + "flos": 22199313461760.0, + "grad_norm": 2.3124513588377065, + "language_loss": 0.73227674, + "learning_rate": 1.4382817964885731e-06, + "loss": 0.75309438, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 9824, + "time_per_iteration": 2.3717470169067383 + }, + { + "auxiliary_loss_clip": 0.01066047, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.01347065, + "balance_loss_mlp": 1.02085412, + "epoch": 0.5907109574627988, + "flos": 20885636534400.0, + "grad_norm": 5.756580723259824, + "language_loss": 0.83149058, + "learning_rate": 1.4379191919085014e-06, + "loss": 0.85242409, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.45117188, + "step": 9825, + "time_per_iteration": 2.3732497692108154 + }, + { + "auxiliary_loss_clip": 0.01057206, + "auxiliary_loss_mlp": 0.01022231, + "balance_loss_clip": 1.01129377, + "balance_loss_mlp": 1.01935315, + "epoch": 0.5907710807154667, + "flos": 21505589280000.0, + "grad_norm": 1.8970026958735589, + "language_loss": 0.76742601, + "learning_rate": 1.4375566073876478e-06, + "loss": 0.78822041, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37890625, + "step": 9826, + "time_per_iteration": 2.384979009628296 + }, + { + "auxiliary_loss_clip": 0.01058258, + "auxiliary_loss_mlp": 0.01021658, + "balance_loss_clip": 1.00972521, + "balance_loss_mlp": 1.01860738, + "epoch": 0.5908312039681347, + "flos": 22017276299520.0, + "grad_norm": 2.1736785061471897, + "language_loss": 0.71923482, + "learning_rate": 1.4371940429389523e-06, + "loss": 0.74003398, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 9827, + "time_per_iteration": 3.8325281143188477 + }, + { + "auxiliary_loss_clip": 0.01059856, + "auxiliary_loss_mlp": 0.01022558, + "balance_loss_clip": 1.01072693, + "balance_loss_mlp": 1.0187583, + "epoch": 0.5908913272208026, + "flos": 18478514736000.0, + "grad_norm": 1.7982894109013883, + "language_loss": 0.79586053, + "learning_rate": 1.4368314985753531e-06, + "loss": 0.8166846, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41015625, + "step": 9828, + "time_per_iteration": 2.3658018112182617 + }, + { + "auxiliary_loss_clip": 0.01060656, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_clip": 1.01587677, + "balance_loss_mlp": 1.01950312, + "epoch": 0.5909514504734706, + "flos": 12311386888320.0, + "grad_norm": 2.875980218665986, + "language_loss": 0.8851552, + "learning_rate": 1.4364689743097892e-06, + "loss": 0.90605009, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 9829, + "time_per_iteration": 2.415417194366455 + }, + { + "auxiliary_loss_clip": 0.01008765, + "auxiliary_loss_mlp": 0.01001053, + "balance_loss_clip": 1.00021839, + "balance_loss_mlp": 1.00133216, + "epoch": 0.5910115737261386, + "flos": 70209879891840.0, + "grad_norm": 0.7559068424316181, + "language_loss": 0.57883757, + "learning_rate": 1.4361064701551985e-06, + "loss": 0.59893578, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.00836182, + "router_z_loss_mlp": 0.07421875, + "step": 9830, + "time_per_iteration": 2.9695940017700195 + }, + { + "auxiliary_loss_clip": 0.01064207, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.01179111, + "balance_loss_mlp": 1.01975572, + "epoch": 0.5910716969788066, + "flos": 22016682806400.0, + "grad_norm": 2.1262417334141457, + "language_loss": 0.75786787, + "learning_rate": 1.4357439861245168e-06, + "loss": 0.7787677, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4453125, + "step": 9831, + "time_per_iteration": 2.381779432296753 + }, + { + "auxiliary_loss_clip": 0.01059833, + "auxiliary_loss_mlp": 0.01023972, + "balance_loss_clip": 1.01215851, + "balance_loss_mlp": 1.02044618, + "epoch": 0.5911318202314745, + "flos": 21250583642880.0, + "grad_norm": 1.755341202276288, + "language_loss": 0.77414805, + "learning_rate": 1.4353815222306813e-06, + "loss": 0.79498613, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 9832, + "time_per_iteration": 2.3802645206451416 + }, + { + "auxiliary_loss_clip": 0.010603, + "auxiliary_loss_mlp": 0.01022656, + "balance_loss_clip": 1.01034725, + "balance_loss_mlp": 1.0197165, + "epoch": 0.5911919434841425, + "flos": 17820646387200.0, + "grad_norm": 1.6934628457366516, + "language_loss": 0.83327299, + "learning_rate": 1.4350190784866266e-06, + "loss": 0.85410255, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 9833, + "time_per_iteration": 2.3807802200317383 + }, + { + "auxiliary_loss_clip": 0.01059263, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.01167655, + "balance_loss_mlp": 1.01886368, + "epoch": 0.5912520667368104, + "flos": 20373774958080.0, + "grad_norm": 1.734006253361228, + "language_loss": 0.7423265, + "learning_rate": 1.4346566549052877e-06, + "loss": 0.76315904, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40429688, + "step": 9834, + "time_per_iteration": 2.4049854278564453 + }, + { + "auxiliary_loss_clip": 0.01059882, + "auxiliary_loss_mlp": 0.01026221, + "balance_loss_clip": 1.01413321, + "balance_loss_mlp": 1.01895416, + "epoch": 0.5913121899894784, + "flos": 17929610340480.0, + "grad_norm": 2.4201998085532708, + "language_loss": 0.77702427, + "learning_rate": 1.4342942514995989e-06, + "loss": 0.7978853, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 9835, + "time_per_iteration": 2.354496717453003 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.0168581, + "balance_loss_mlp": 1.01898026, + "epoch": 0.5913723132421465, + "flos": 22125856227840.0, + "grad_norm": 2.865884198863656, + "language_loss": 0.73862612, + "learning_rate": 1.4339318682824924e-06, + "loss": 0.7595154, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40820312, + "step": 9836, + "time_per_iteration": 2.3785784244537354 + }, + { + "auxiliary_loss_clip": 0.01060085, + "auxiliary_loss_mlp": 0.0102523, + "balance_loss_clip": 1.01288009, + "balance_loss_mlp": 1.01902056, + "epoch": 0.5914324364948144, + "flos": 15697229230080.0, + "grad_norm": 2.0011128667697764, + "language_loss": 0.8223117, + "learning_rate": 1.433569505266902e-06, + "loss": 0.84316486, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.41015625, + "step": 9837, + "time_per_iteration": 2.370316505432129 + }, + { + "auxiliary_loss_clip": 0.01059563, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.01378107, + "balance_loss_mlp": 1.01917481, + "epoch": 0.5914925597474824, + "flos": 22746227909760.0, + "grad_norm": 1.6797286098861919, + "language_loss": 0.79607868, + "learning_rate": 1.4332071624657585e-06, + "loss": 0.81694221, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40429688, + "step": 9838, + "time_per_iteration": 2.400765895843506 + }, + { + "auxiliary_loss_clip": 0.01059903, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.01363659, + "balance_loss_mlp": 1.01908994, + "epoch": 0.5915526830001503, + "flos": 18291904185600.0, + "grad_norm": 2.321941912952039, + "language_loss": 0.69785488, + "learning_rate": 1.4328448398919937e-06, + "loss": 0.71871543, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 9839, + "time_per_iteration": 2.3756518363952637 + }, + { + "auxiliary_loss_clip": 0.01059027, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01534927, + "balance_loss_mlp": 1.01880264, + "epoch": 0.5916128062528183, + "flos": 17018132808960.0, + "grad_norm": 2.0267525537645783, + "language_loss": 0.65563583, + "learning_rate": 1.4324825375585379e-06, + "loss": 0.67650211, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 9840, + "time_per_iteration": 2.3581995964050293 + }, + { + "auxiliary_loss_clip": 0.01061054, + "auxiliary_loss_mlp": 0.01023636, + "balance_loss_clip": 1.01108289, + "balance_loss_mlp": 1.01921916, + "epoch": 0.5916729295054862, + "flos": 24753070437120.0, + "grad_norm": 1.6872613113367558, + "language_loss": 0.81675005, + "learning_rate": 1.43212025547832e-06, + "loss": 0.83759689, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41796875, + "step": 9841, + "time_per_iteration": 2.439359426498413 + }, + { + "auxiliary_loss_clip": 0.01057386, + "auxiliary_loss_mlp": 0.01020893, + "balance_loss_clip": 1.00969291, + "balance_loss_mlp": 1.01786673, + "epoch": 0.5917330527581542, + "flos": 15957366837120.0, + "grad_norm": 1.9398182318661343, + "language_loss": 0.6912322, + "learning_rate": 1.4317579936642701e-06, + "loss": 0.71201503, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.39453125, + "step": 9842, + "time_per_iteration": 2.399021625518799 + }, + { + "auxiliary_loss_clip": 0.01060931, + "auxiliary_loss_mlp": 0.0102626, + "balance_loss_clip": 1.01382637, + "balance_loss_mlp": 1.01890469, + "epoch": 0.5917931760108222, + "flos": 23799732318720.0, + "grad_norm": 1.8013498134487045, + "language_loss": 0.82689619, + "learning_rate": 1.431395752129315e-06, + "loss": 0.84776813, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41992188, + "step": 9843, + "time_per_iteration": 2.406480073928833 + }, + { + "auxiliary_loss_clip": 0.01058393, + "auxiliary_loss_mlp": 0.0102355, + "balance_loss_clip": 1.01140881, + "balance_loss_mlp": 1.01776218, + "epoch": 0.5918532992634902, + "flos": 23248733241600.0, + "grad_norm": 2.5422299444967487, + "language_loss": 0.81744504, + "learning_rate": 1.431033530886383e-06, + "loss": 0.83826452, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40625, + "step": 9844, + "time_per_iteration": 2.3705644607543945 + }, + { + "auxiliary_loss_clip": 0.01061817, + "auxiliary_loss_mlp": 0.01024421, + "balance_loss_clip": 1.01255977, + "balance_loss_mlp": 1.02053237, + "epoch": 0.5919134225161581, + "flos": 19498851486720.0, + "grad_norm": 9.306453718379641, + "language_loss": 0.74439013, + "learning_rate": 1.4306713299484008e-06, + "loss": 0.76525259, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.4140625, + "step": 9845, + "time_per_iteration": 2.369810104370117 + }, + { + "auxiliary_loss_clip": 0.0106096, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.01456428, + "balance_loss_mlp": 1.01933074, + "epoch": 0.5919735457688261, + "flos": 38799397117440.0, + "grad_norm": 1.784613657590856, + "language_loss": 0.63895953, + "learning_rate": 1.4303091493282944e-06, + "loss": 0.65985084, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.41601562, + "step": 9846, + "time_per_iteration": 2.52596378326416 + }, + { + "auxiliary_loss_clip": 0.01056246, + "auxiliary_loss_mlp": 0.01019452, + "balance_loss_clip": 1.00875878, + "balance_loss_mlp": 1.01894355, + "epoch": 0.592033669021494, + "flos": 22162899047040.0, + "grad_norm": 1.609297770704961, + "language_loss": 0.7624203, + "learning_rate": 1.4299469890389893e-06, + "loss": 0.78317726, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.37304688, + "step": 9847, + "time_per_iteration": 2.396254301071167 + }, + { + "auxiliary_loss_clip": 0.01058762, + "auxiliary_loss_mlp": 0.0102482, + "balance_loss_clip": 1.01202321, + "balance_loss_mlp": 1.01868832, + "epoch": 0.592093792274162, + "flos": 22709883317760.0, + "grad_norm": 1.5782184123808818, + "language_loss": 0.62247181, + "learning_rate": 1.4295848490934093e-06, + "loss": 0.64330769, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40039062, + "step": 9848, + "time_per_iteration": 2.3931033611297607 + }, + { + "auxiliary_loss_clip": 0.01060114, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.01842844, + "balance_loss_mlp": 1.0191983, + "epoch": 0.59215391552683, + "flos": 22527846155520.0, + "grad_norm": 1.774170329594447, + "language_loss": 0.75847566, + "learning_rate": 1.4292227295044793e-06, + "loss": 0.77937418, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.41015625, + "step": 9849, + "time_per_iteration": 3.8096835613250732 + }, + { + "auxiliary_loss_clip": 0.01060465, + "auxiliary_loss_mlp": 0.01026236, + "balance_loss_clip": 1.01300955, + "balance_loss_mlp": 1.01911497, + "epoch": 0.592214038779498, + "flos": 24497855331840.0, + "grad_norm": 1.6505633874316985, + "language_loss": 0.75641263, + "learning_rate": 1.4288606302851211e-06, + "loss": 0.77727962, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4140625, + "step": 9850, + "time_per_iteration": 2.413339376449585 + }, + { + "auxiliary_loss_clip": 0.01058357, + "auxiliary_loss_mlp": 0.0102721, + "balance_loss_clip": 1.01460993, + "balance_loss_mlp": 1.01833344, + "epoch": 0.592274162032166, + "flos": 21830386458240.0, + "grad_norm": 2.0057026995280705, + "language_loss": 0.75762522, + "learning_rate": 1.4284985514482584e-06, + "loss": 0.77848089, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40039062, + "step": 9851, + "time_per_iteration": 2.3811354637145996 + }, + { + "auxiliary_loss_clip": 0.01059666, + "auxiliary_loss_mlp": 0.01020893, + "balance_loss_clip": 1.00916851, + "balance_loss_mlp": 1.01913095, + "epoch": 0.5923342852848339, + "flos": 24606993841920.0, + "grad_norm": 1.9010700003908299, + "language_loss": 0.70852369, + "learning_rate": 1.4281364930068125e-06, + "loss": 0.72932929, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40625, + "step": 9852, + "time_per_iteration": 2.4028286933898926 + }, + { + "auxiliary_loss_clip": 0.01063195, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.01819062, + "balance_loss_mlp": 1.02008176, + "epoch": 0.5923944085375019, + "flos": 19937116183680.0, + "grad_norm": 1.8213525777711823, + "language_loss": 0.80608541, + "learning_rate": 1.4277744549737035e-06, + "loss": 0.8270334, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4296875, + "step": 9853, + "time_per_iteration": 2.3830811977386475 + }, + { + "auxiliary_loss_clip": 0.01056765, + "auxiliary_loss_mlp": 0.01022171, + "balance_loss_clip": 1.01063752, + "balance_loss_mlp": 1.01853621, + "epoch": 0.5924545317901698, + "flos": 28657232956800.0, + "grad_norm": 1.979418815833492, + "language_loss": 0.67355055, + "learning_rate": 1.427412437361853e-06, + "loss": 0.69433993, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 9854, + "time_per_iteration": 2.4319615364074707 + }, + { + "auxiliary_loss_clip": 0.01057737, + "auxiliary_loss_mlp": 0.01024144, + "balance_loss_clip": 1.01259232, + "balance_loss_mlp": 1.01856041, + "epoch": 0.5925146550428378, + "flos": 19863868417920.0, + "grad_norm": 1.8714917133404478, + "language_loss": 0.88649046, + "learning_rate": 1.4270504401841791e-06, + "loss": 0.90730929, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 9855, + "time_per_iteration": 2.381927490234375 + }, + { + "auxiliary_loss_clip": 0.01056918, + "auxiliary_loss_mlp": 0.01026493, + "balance_loss_clip": 1.01482213, + "balance_loss_mlp": 1.01845574, + "epoch": 0.5925747782955058, + "flos": 15122069625600.0, + "grad_norm": 1.654059807678504, + "language_loss": 0.86571717, + "learning_rate": 1.426688463453602e-06, + "loss": 0.88655132, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3828125, + "step": 9856, + "time_per_iteration": 2.38246488571167 + }, + { + "auxiliary_loss_clip": 0.01058682, + "auxiliary_loss_mlp": 0.01025882, + "balance_loss_clip": 1.01473594, + "balance_loss_mlp": 1.01836395, + "epoch": 0.5926349015481738, + "flos": 18404464008960.0, + "grad_norm": 1.8854527215640218, + "language_loss": 0.77501619, + "learning_rate": 1.4263265071830387e-06, + "loss": 0.79586184, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.40429688, + "step": 9857, + "time_per_iteration": 2.375465154647827 + }, + { + "auxiliary_loss_clip": 0.01060996, + "auxiliary_loss_mlp": 0.01027793, + "balance_loss_clip": 1.0161345, + "balance_loss_mlp": 1.02082419, + "epoch": 0.5926950248008417, + "flos": 23110057854720.0, + "grad_norm": 2.09908190485097, + "language_loss": 0.6501888, + "learning_rate": 1.425964571385406e-06, + "loss": 0.67107671, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40234375, + "step": 9858, + "time_per_iteration": 2.3879668712615967 + }, + { + "auxiliary_loss_clip": 0.01056682, + "auxiliary_loss_mlp": 0.01026222, + "balance_loss_clip": 1.01471233, + "balance_loss_mlp": 1.01796126, + "epoch": 0.5927551480535097, + "flos": 28032776645760.0, + "grad_norm": 1.8300390293311617, + "language_loss": 0.77198148, + "learning_rate": 1.4256026560736218e-06, + "loss": 0.79281056, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38671875, + "step": 9859, + "time_per_iteration": 2.4358577728271484 + }, + { + "auxiliary_loss_clip": 0.01061212, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.01630032, + "balance_loss_mlp": 1.01940262, + "epoch": 0.5928152713061776, + "flos": 21797602824960.0, + "grad_norm": 2.1269753897094477, + "language_loss": 0.75432205, + "learning_rate": 1.4252407612606008e-06, + "loss": 0.77522713, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 9860, + "time_per_iteration": 3.8478927612304688 + }, + { + "auxiliary_loss_clip": 0.0105947, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.01514685, + "balance_loss_mlp": 1.01984453, + "epoch": 0.5928753945588456, + "flos": 24315678524160.0, + "grad_norm": 4.478424527118558, + "language_loss": 0.76153666, + "learning_rate": 1.4248788869592589e-06, + "loss": 0.78239143, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.39648438, + "step": 9861, + "time_per_iteration": 2.423748254776001 + }, + { + "auxiliary_loss_clip": 0.01056943, + "auxiliary_loss_mlp": 0.01023603, + "balance_loss_clip": 1.01227808, + "balance_loss_mlp": 1.01763225, + "epoch": 0.5929355178115137, + "flos": 26463535499520.0, + "grad_norm": 1.500220492385101, + "language_loss": 0.7049641, + "learning_rate": 1.42451703318251e-06, + "loss": 0.72576958, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39453125, + "step": 9862, + "time_per_iteration": 3.8923871517181396 + }, + { + "auxiliary_loss_clip": 0.01058784, + "auxiliary_loss_mlp": 0.01024544, + "balance_loss_clip": 1.01317728, + "balance_loss_mlp": 1.01946008, + "epoch": 0.5929956410641816, + "flos": 24965028501120.0, + "grad_norm": 1.7269473999225171, + "language_loss": 0.78048742, + "learning_rate": 1.424155199943268e-06, + "loss": 0.80132067, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.39453125, + "step": 9863, + "time_per_iteration": 2.4163131713867188 + }, + { + "auxiliary_loss_clip": 0.01059102, + "auxiliary_loss_mlp": 0.01024515, + "balance_loss_clip": 1.01237988, + "balance_loss_mlp": 1.01995897, + "epoch": 0.5930557643168496, + "flos": 26207273053440.0, + "grad_norm": 1.6922519426208391, + "language_loss": 0.70374739, + "learning_rate": 1.4237933872544456e-06, + "loss": 0.72458363, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39257812, + "step": 9864, + "time_per_iteration": 2.444216728210449 + }, + { + "auxiliary_loss_clip": 0.01059501, + "auxiliary_loss_mlp": 0.01022606, + "balance_loss_clip": 1.01112056, + "balance_loss_mlp": 1.0193187, + "epoch": 0.5931158875695175, + "flos": 27853706949120.0, + "grad_norm": 1.4828537340175527, + "language_loss": 0.67562532, + "learning_rate": 1.4234315951289548e-06, + "loss": 0.69644636, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40234375, + "step": 9865, + "time_per_iteration": 2.419546127319336 + }, + { + "auxiliary_loss_clip": 0.01060127, + "auxiliary_loss_mlp": 0.01025126, + "balance_loss_clip": 1.01257348, + "balance_loss_mlp": 1.01910472, + "epoch": 0.5931760108221855, + "flos": 15412756538880.0, + "grad_norm": 2.8508025390464837, + "language_loss": 0.78024983, + "learning_rate": 1.4230698235797073e-06, + "loss": 0.80110234, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 9866, + "time_per_iteration": 2.3640196323394775 + }, + { + "auxiliary_loss_clip": 0.01058659, + "auxiliary_loss_mlp": 0.01020979, + "balance_loss_clip": 1.01017261, + "balance_loss_mlp": 1.01970387, + "epoch": 0.5932361340748534, + "flos": 30187266779520.0, + "grad_norm": 2.6592033471483894, + "language_loss": 0.72450608, + "learning_rate": 1.422708072619614e-06, + "loss": 0.74530244, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.38867188, + "step": 9867, + "time_per_iteration": 3.8214991092681885 + }, + { + "auxiliary_loss_clip": 0.0105969, + "auxiliary_loss_mlp": 0.01022348, + "balance_loss_clip": 1.01017058, + "balance_loss_mlp": 1.02051497, + "epoch": 0.5932962573275214, + "flos": 20156510367360.0, + "grad_norm": 1.5500509822316142, + "language_loss": 0.66632146, + "learning_rate": 1.4223463422615844e-06, + "loss": 0.68714184, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.390625, + "step": 9868, + "time_per_iteration": 2.39166259765625 + }, + { + "auxiliary_loss_clip": 0.01060269, + "auxiliary_loss_mlp": 0.01024201, + "balance_loss_clip": 1.01161253, + "balance_loss_mlp": 1.01942086, + "epoch": 0.5933563805801894, + "flos": 25444769760000.0, + "grad_norm": 1.5919612977392503, + "language_loss": 0.75454623, + "learning_rate": 1.4219846325185282e-06, + "loss": 0.77539092, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40820312, + "step": 9869, + "time_per_iteration": 2.4040157794952393 + }, + { + "auxiliary_loss_clip": 0.01062515, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01684391, + "balance_loss_mlp": 1.02178168, + "epoch": 0.5934165038328574, + "flos": 59993701781760.0, + "grad_norm": 1.5719090682815298, + "language_loss": 0.74090952, + "learning_rate": 1.4216229434033533e-06, + "loss": 0.76183057, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 9870, + "time_per_iteration": 2.7236902713775635 + }, + { + "auxiliary_loss_clip": 0.0106075, + "auxiliary_loss_mlp": 0.01024073, + "balance_loss_clip": 1.01220608, + "balance_loss_mlp": 1.01947546, + "epoch": 0.5934766270855253, + "flos": 24419545418880.0, + "grad_norm": 1.9531951169514759, + "language_loss": 0.71855718, + "learning_rate": 1.4212612749289687e-06, + "loss": 0.73940539, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41210938, + "step": 9871, + "time_per_iteration": 2.3813045024871826 + }, + { + "auxiliary_loss_clip": 0.01061993, + "auxiliary_loss_mlp": 0.01020705, + "balance_loss_clip": 1.00836051, + "balance_loss_mlp": 1.01963425, + "epoch": 0.5935367503381933, + "flos": 23512047782400.0, + "grad_norm": 2.9338735835177125, + "language_loss": 0.74817789, + "learning_rate": 1.4208996271082794e-06, + "loss": 0.76900488, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.42382812, + "step": 9872, + "time_per_iteration": 2.406097412109375 + }, + { + "auxiliary_loss_clip": 0.01060265, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.01583886, + "balance_loss_mlp": 1.01862133, + "epoch": 0.5935968735908612, + "flos": 18947468384640.0, + "grad_norm": 2.076240944757009, + "language_loss": 0.7923373, + "learning_rate": 1.4205379999541935e-06, + "loss": 0.81323349, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41601562, + "step": 9873, + "time_per_iteration": 2.377115249633789 + }, + { + "auxiliary_loss_clip": 0.01060384, + "auxiliary_loss_mlp": 0.01027887, + "balance_loss_clip": 1.01584697, + "balance_loss_mlp": 1.0189476, + "epoch": 0.5936569968435292, + "flos": 25482266426880.0, + "grad_norm": 1.6212139090551687, + "language_loss": 0.84737378, + "learning_rate": 1.4201763934796157e-06, + "loss": 0.86825651, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 9874, + "time_per_iteration": 2.5023183822631836 + }, + { + "auxiliary_loss_clip": 0.01008649, + "auxiliary_loss_mlp": 0.01001518, + "balance_loss_clip": 1.00062418, + "balance_loss_mlp": 1.00124884, + "epoch": 0.5937171200961973, + "flos": 66375194711040.0, + "grad_norm": 0.7077015704125607, + "language_loss": 0.60016096, + "learning_rate": 1.4198148076974503e-06, + "loss": 0.62026262, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.00891113, + "router_z_loss_mlp": 0.07421875, + "step": 9875, + "time_per_iteration": 3.128615379333496 + }, + { + "auxiliary_loss_clip": 0.0106102, + "auxiliary_loss_mlp": 0.01022104, + "balance_loss_clip": 1.00930691, + "balance_loss_mlp": 1.01834023, + "epoch": 0.5937772433488652, + "flos": 14902570707840.0, + "grad_norm": 2.4532246455010394, + "language_loss": 0.77400124, + "learning_rate": 1.4194532426206028e-06, + "loss": 0.79483247, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.42773438, + "step": 9876, + "time_per_iteration": 2.3914594650268555 + }, + { + "auxiliary_loss_clip": 0.01055215, + "auxiliary_loss_mlp": 0.01021548, + "balance_loss_clip": 1.01030612, + "balance_loss_mlp": 1.01759672, + "epoch": 0.5938373666015332, + "flos": 22560490143360.0, + "grad_norm": 1.445799188392346, + "language_loss": 0.75603652, + "learning_rate": 1.4190916982619749e-06, + "loss": 0.77680409, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.375, + "step": 9877, + "time_per_iteration": 2.39890456199646 + }, + { + "auxiliary_loss_clip": 0.0106008, + "auxiliary_loss_mlp": 0.01025282, + "balance_loss_clip": 1.0130688, + "balance_loss_mlp": 1.01846945, + "epoch": 0.5938974898542011, + "flos": 18439935816960.0, + "grad_norm": 2.4816762510054065, + "language_loss": 0.80727911, + "learning_rate": 1.418730174634471e-06, + "loss": 0.82813275, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41601562, + "step": 9878, + "time_per_iteration": 2.3607397079467773 + }, + { + "auxiliary_loss_clip": 0.01059935, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02097011, + "balance_loss_mlp": 1.01934123, + "epoch": 0.5939576131068691, + "flos": 45585011433600.0, + "grad_norm": 1.7496817186175535, + "language_loss": 0.71040869, + "learning_rate": 1.4183686717509913e-06, + "loss": 0.731341, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 9879, + "time_per_iteration": 2.599663019180298 + }, + { + "auxiliary_loss_clip": 0.01059305, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.01561582, + "balance_loss_mlp": 1.01930237, + "epoch": 0.594017736359537, + "flos": 23586552357120.0, + "grad_norm": 1.4687774524828363, + "language_loss": 0.57717884, + "learning_rate": 1.4180071896244375e-06, + "loss": 0.59804207, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40039062, + "step": 9880, + "time_per_iteration": 2.454163074493408 + }, + { + "auxiliary_loss_clip": 0.0105795, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.01477408, + "balance_loss_mlp": 1.01783764, + "epoch": 0.594077859612205, + "flos": 29456045930880.0, + "grad_norm": 2.020712596468703, + "language_loss": 0.77552247, + "learning_rate": 1.4176457282677103e-06, + "loss": 0.79637837, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40039062, + "step": 9881, + "time_per_iteration": 2.478395938873291 + }, + { + "auxiliary_loss_clip": 0.01057382, + "auxiliary_loss_mlp": 0.01023565, + "balance_loss_clip": 1.01145387, + "balance_loss_mlp": 1.01806259, + "epoch": 0.594137982864873, + "flos": 16799157561600.0, + "grad_norm": 2.453350302727312, + "language_loss": 0.8229804, + "learning_rate": 1.4172842876937088e-06, + "loss": 0.84378994, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39257812, + "step": 9882, + "time_per_iteration": 2.4010677337646484 + }, + { + "auxiliary_loss_clip": 0.01058023, + "auxiliary_loss_mlp": 0.01022116, + "balance_loss_clip": 1.01091027, + "balance_loss_mlp": 1.01805091, + "epoch": 0.594198106117541, + "flos": 12749442117120.0, + "grad_norm": 2.019982040019453, + "language_loss": 0.79294169, + "learning_rate": 1.4169228679153324e-06, + "loss": 0.81374311, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3984375, + "step": 9883, + "time_per_iteration": 2.364865303039551 + }, + { + "auxiliary_loss_clip": 0.01063513, + "auxiliary_loss_mlp": 0.01026808, + "balance_loss_clip": 1.01351666, + "balance_loss_mlp": 1.02021217, + "epoch": 0.5942582293702089, + "flos": 20995473271680.0, + "grad_norm": 1.756431152228983, + "language_loss": 0.74385142, + "learning_rate": 1.4165614689454788e-06, + "loss": 0.76475459, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43359375, + "step": 9884, + "time_per_iteration": 2.401526689529419 + }, + { + "auxiliary_loss_clip": 0.01060301, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.01209688, + "balance_loss_mlp": 1.01951027, + "epoch": 0.5943183526228769, + "flos": 28290226078080.0, + "grad_norm": 1.9114471772959944, + "language_loss": 0.65722191, + "learning_rate": 1.416200090797046e-06, + "loss": 0.67807132, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 9885, + "time_per_iteration": 2.601661443710327 + }, + { + "auxiliary_loss_clip": 0.01059856, + "auxiliary_loss_mlp": 0.01027451, + "balance_loss_clip": 1.01471961, + "balance_loss_mlp": 1.01972163, + "epoch": 0.5943784758755448, + "flos": 26613417432960.0, + "grad_norm": 2.2580406364455774, + "language_loss": 0.76845211, + "learning_rate": 1.4158387334829304e-06, + "loss": 0.78932524, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40234375, + "step": 9886, + "time_per_iteration": 2.456775188446045 + }, + { + "auxiliary_loss_clip": 0.01008455, + "auxiliary_loss_mlp": 0.01001778, + "balance_loss_clip": 1.00079417, + "balance_loss_mlp": 1.00100088, + "epoch": 0.5944385991282128, + "flos": 64187781298560.0, + "grad_norm": 0.833816178054357, + "language_loss": 0.64331424, + "learning_rate": 1.4154773970160272e-06, + "loss": 0.6634165, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.07470703, + "step": 9887, + "time_per_iteration": 2.930919885635376 + }, + { + "auxiliary_loss_clip": 0.01058287, + "auxiliary_loss_mlp": 0.010242, + "balance_loss_clip": 1.01248789, + "balance_loss_mlp": 1.01856649, + "epoch": 0.5944987223808808, + "flos": 19571017000320.0, + "grad_norm": 1.7505515186695233, + "language_loss": 0.6905421, + "learning_rate": 1.4151160814092325e-06, + "loss": 0.71136701, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39648438, + "step": 9888, + "time_per_iteration": 4.0275819301605225 + }, + { + "auxiliary_loss_clip": 0.01059669, + "auxiliary_loss_mlp": 0.01025068, + "balance_loss_clip": 1.01297975, + "balance_loss_mlp": 1.01966619, + "epoch": 0.5945588456335488, + "flos": 26176374633600.0, + "grad_norm": 1.5931636231331483, + "language_loss": 0.79478574, + "learning_rate": 1.4147547866754396e-06, + "loss": 0.81563306, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.3984375, + "step": 9889, + "time_per_iteration": 2.449514865875244 + }, + { + "auxiliary_loss_clip": 0.01057238, + "auxiliary_loss_mlp": 0.0102494, + "balance_loss_clip": 1.01323378, + "balance_loss_mlp": 1.01789641, + "epoch": 0.5946189688862168, + "flos": 20445521535360.0, + "grad_norm": 1.5874650742649963, + "language_loss": 0.70489234, + "learning_rate": 1.414393512827544e-06, + "loss": 0.72571415, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 9890, + "time_per_iteration": 2.411473512649536 + }, + { + "auxiliary_loss_clip": 0.01060566, + "auxiliary_loss_mlp": 0.01024783, + "balance_loss_clip": 1.01233125, + "balance_loss_mlp": 1.01878071, + "epoch": 0.5946790921388847, + "flos": 13436847342720.0, + "grad_norm": 1.9265568986364514, + "language_loss": 0.6923129, + "learning_rate": 1.414032259878437e-06, + "loss": 0.71316636, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41796875, + "step": 9891, + "time_per_iteration": 2.406820297241211 + }, + { + "auxiliary_loss_clip": 0.01057065, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.01323974, + "balance_loss_mlp": 1.01885128, + "epoch": 0.5947392153915527, + "flos": 20411236713600.0, + "grad_norm": 2.7283249607047764, + "language_loss": 0.53769821, + "learning_rate": 1.4136710278410111e-06, + "loss": 0.55852264, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3828125, + "step": 9892, + "time_per_iteration": 2.3995773792266846 + }, + { + "auxiliary_loss_clip": 0.01007905, + "auxiliary_loss_mlp": 0.01001333, + "balance_loss_clip": 1.00034332, + "balance_loss_mlp": 1.00050318, + "epoch": 0.5947993386442206, + "flos": 65615798528640.0, + "grad_norm": 0.6599374522607311, + "language_loss": 0.54550803, + "learning_rate": 1.4133098167281583e-06, + "loss": 0.5656004, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.07421875, + "step": 9893, + "time_per_iteration": 3.105945110321045 + }, + { + "auxiliary_loss_clip": 0.01059196, + "auxiliary_loss_mlp": 0.01022235, + "balance_loss_clip": 1.01090407, + "balance_loss_mlp": 1.01979017, + "epoch": 0.5948594618968887, + "flos": 23182048811520.0, + "grad_norm": 1.687701195850652, + "language_loss": 0.74375796, + "learning_rate": 1.4129486265527689e-06, + "loss": 0.76457226, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39453125, + "step": 9894, + "time_per_iteration": 2.412101984024048 + }, + { + "auxiliary_loss_clip": 0.0105834, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.01389647, + "balance_loss_mlp": 1.01846933, + "epoch": 0.5949195851495566, + "flos": 13625901688320.0, + "grad_norm": 2.2572406086242505, + "language_loss": 0.77713621, + "learning_rate": 1.4125874573277333e-06, + "loss": 0.79797703, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3984375, + "step": 9895, + "time_per_iteration": 2.3814268112182617 + }, + { + "auxiliary_loss_clip": 0.01061431, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.01464486, + "balance_loss_mlp": 1.02099669, + "epoch": 0.5949797084022246, + "flos": 19750121608320.0, + "grad_norm": 1.6172676754420598, + "language_loss": 0.78924537, + "learning_rate": 1.41222630906594e-06, + "loss": 0.81012809, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40429688, + "step": 9896, + "time_per_iteration": 2.4046835899353027 + }, + { + "auxiliary_loss_clip": 0.01058984, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01355505, + "balance_loss_mlp": 1.01860762, + "epoch": 0.5950398316548925, + "flos": 25772743872000.0, + "grad_norm": 1.502144464574872, + "language_loss": 0.83444631, + "learning_rate": 1.4118651817802776e-06, + "loss": 0.85529625, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40429688, + "step": 9897, + "time_per_iteration": 2.4553496837615967 + }, + { + "auxiliary_loss_clip": 0.01060123, + "auxiliary_loss_mlp": 0.01024701, + "balance_loss_clip": 1.0125891, + "balance_loss_mlp": 1.01948071, + "epoch": 0.5950999549075605, + "flos": 23037927252480.0, + "grad_norm": 1.669378312699458, + "language_loss": 0.70594662, + "learning_rate": 1.4115040754836344e-06, + "loss": 0.7267949, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 9898, + "time_per_iteration": 2.4512979984283447 + }, + { + "auxiliary_loss_clip": 0.01064079, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.01949072, + "balance_loss_mlp": 1.01971865, + "epoch": 0.5951600781602284, + "flos": 32445169960320.0, + "grad_norm": 2.174884426947218, + "language_loss": 0.6375736, + "learning_rate": 1.4111429901888964e-06, + "loss": 0.65854567, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.44335938, + "step": 9899, + "time_per_iteration": 2.5174663066864014 + }, + { + "auxiliary_loss_clip": 0.0105982, + "auxiliary_loss_mlp": 0.01020614, + "balance_loss_clip": 1.0099504, + "balance_loss_mlp": 1.01962364, + "epoch": 0.5952202014128964, + "flos": 23799871964160.0, + "grad_norm": 1.5691820199686273, + "language_loss": 0.72129214, + "learning_rate": 1.4107819259089514e-06, + "loss": 0.74209642, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.40234375, + "step": 9900, + "time_per_iteration": 3.9191744327545166 + }, + { + "auxiliary_loss_clip": 0.01058216, + "auxiliary_loss_mlp": 0.01023421, + "balance_loss_clip": 1.01149404, + "balance_loss_mlp": 1.01973522, + "epoch": 0.5952803246655644, + "flos": 22491082627200.0, + "grad_norm": 1.6282210960069947, + "language_loss": 0.84600556, + "learning_rate": 1.4104208826566835e-06, + "loss": 0.86682194, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38476562, + "step": 9901, + "time_per_iteration": 3.8608200550079346 + }, + { + "auxiliary_loss_clip": 0.01008567, + "auxiliary_loss_mlp": 0.01002389, + "balance_loss_clip": 1.00145912, + "balance_loss_mlp": 1.00117385, + "epoch": 0.5953404479182324, + "flos": 51232001846400.0, + "grad_norm": 0.7918441524569241, + "language_loss": 0.58204573, + "learning_rate": 1.4100598604449773e-06, + "loss": 0.60215527, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.07421875, + "step": 9902, + "time_per_iteration": 2.921813726425171 + }, + { + "auxiliary_loss_clip": 0.01060059, + "auxiliary_loss_mlp": 0.01027638, + "balance_loss_clip": 1.014835, + "balance_loss_mlp": 1.01873159, + "epoch": 0.5954005711709004, + "flos": 23111559043200.0, + "grad_norm": 1.963277308084482, + "language_loss": 0.77017701, + "learning_rate": 1.4096988592867173e-06, + "loss": 0.79105401, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 9903, + "time_per_iteration": 2.455009698867798 + }, + { + "auxiliary_loss_clip": 0.01060853, + "auxiliary_loss_mlp": 0.01027066, + "balance_loss_clip": 1.01577687, + "balance_loss_mlp": 1.0199914, + "epoch": 0.5954606944235683, + "flos": 35953277483520.0, + "grad_norm": 1.7922452192957508, + "language_loss": 0.77635789, + "learning_rate": 1.4093378791947863e-06, + "loss": 0.79723716, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.41015625, + "step": 9904, + "time_per_iteration": 2.54026460647583 + }, + { + "auxiliary_loss_clip": 0.01060522, + "auxiliary_loss_mlp": 0.01025193, + "balance_loss_clip": 1.0129261, + "balance_loss_mlp": 1.019063, + "epoch": 0.5955208176762363, + "flos": 30442412062080.0, + "grad_norm": 1.7188763204263695, + "language_loss": 0.72964048, + "learning_rate": 1.4089769201820673e-06, + "loss": 0.7504977, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.4140625, + "step": 9905, + "time_per_iteration": 2.556755781173706 + }, + { + "auxiliary_loss_clip": 0.01061131, + "auxiliary_loss_mlp": 0.0102455, + "balance_loss_clip": 1.0119971, + "balance_loss_mlp": 1.01994181, + "epoch": 0.5955809409289042, + "flos": 17639132895360.0, + "grad_norm": 1.965855630531533, + "language_loss": 0.69740617, + "learning_rate": 1.4086159822614417e-06, + "loss": 0.71826303, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41210938, + "step": 9906, + "time_per_iteration": 2.3655567169189453 + }, + { + "auxiliary_loss_clip": 0.01060052, + "auxiliary_loss_mlp": 0.01025054, + "balance_loss_clip": 1.01384258, + "balance_loss_mlp": 1.01905656, + "epoch": 0.5956410641815723, + "flos": 24278740439040.0, + "grad_norm": 1.7799896970412579, + "language_loss": 0.75090337, + "learning_rate": 1.4082550654457906e-06, + "loss": 0.77175438, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.41015625, + "step": 9907, + "time_per_iteration": 3.824955463409424 + }, + { + "auxiliary_loss_clip": 0.01060478, + "auxiliary_loss_mlp": 0.01026081, + "balance_loss_clip": 1.01423144, + "balance_loss_mlp": 1.01947474, + "epoch": 0.5957011874342402, + "flos": 35732870870400.0, + "grad_norm": 1.7678849899728692, + "language_loss": 0.67310447, + "learning_rate": 1.407894169747994e-06, + "loss": 0.69397002, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41015625, + "step": 9908, + "time_per_iteration": 2.530210256576538 + }, + { + "auxiliary_loss_clip": 0.01059511, + "auxiliary_loss_mlp": 0.01023959, + "balance_loss_clip": 1.0119729, + "balance_loss_mlp": 1.02012396, + "epoch": 0.5957613106869082, + "flos": 21244125041280.0, + "grad_norm": 1.9900466422401324, + "language_loss": 0.76356411, + "learning_rate": 1.4075332951809312e-06, + "loss": 0.78439879, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 9909, + "time_per_iteration": 2.3847692012786865 + }, + { + "auxiliary_loss_clip": 0.01008041, + "auxiliary_loss_mlp": 0.01001271, + "balance_loss_clip": 1.00040662, + "balance_loss_mlp": 1.00070179, + "epoch": 0.5958214339395761, + "flos": 65937802798080.0, + "grad_norm": 0.9222208267415828, + "language_loss": 0.73447728, + "learning_rate": 1.4071724417574814e-06, + "loss": 0.75457036, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.00866699, + "router_z_loss_mlp": 0.07324219, + "step": 9910, + "time_per_iteration": 3.0865159034729004 + }, + { + "auxiliary_loss_clip": 0.01059655, + "auxiliary_loss_mlp": 0.01026358, + "balance_loss_clip": 1.0139606, + "balance_loss_mlp": 1.01865315, + "epoch": 0.5958815571922441, + "flos": 23217660264960.0, + "grad_norm": 1.6400220607384801, + "language_loss": 0.69513905, + "learning_rate": 1.4068116094905218e-06, + "loss": 0.71599919, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 9911, + "time_per_iteration": 2.4218623638153076 + }, + { + "auxiliary_loss_clip": 0.01064167, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.01614952, + "balance_loss_mlp": 1.0205071, + "epoch": 0.595941680444912, + "flos": 16537867879680.0, + "grad_norm": 1.9759916228697, + "language_loss": 0.65353978, + "learning_rate": 1.4064507983929304e-06, + "loss": 0.67447448, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4375, + "step": 9912, + "time_per_iteration": 2.424767017364502 + }, + { + "auxiliary_loss_clip": 0.0105943, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.0181905, + "balance_loss_mlp": 1.01909137, + "epoch": 0.59600180369758, + "flos": 27817641648000.0, + "grad_norm": 2.0557930077934983, + "language_loss": 0.72963661, + "learning_rate": 1.4060900084775832e-06, + "loss": 0.75054061, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40234375, + "step": 9913, + "time_per_iteration": 2.4622578620910645 + }, + { + "auxiliary_loss_clip": 0.01061735, + "auxiliary_loss_mlp": 0.01024887, + "balance_loss_clip": 1.01248312, + "balance_loss_mlp": 1.01879585, + "epoch": 0.596061926950248, + "flos": 29490435486720.0, + "grad_norm": 2.000490398368653, + "language_loss": 0.80102074, + "learning_rate": 1.4057292397573553e-06, + "loss": 0.82188696, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.4296875, + "step": 9914, + "time_per_iteration": 2.464299440383911 + }, + { + "auxiliary_loss_clip": 0.01060226, + "auxiliary_loss_mlp": 0.01025174, + "balance_loss_clip": 1.0129137, + "balance_loss_mlp": 1.02042413, + "epoch": 0.596122050202916, + "flos": 16835851267200.0, + "grad_norm": 1.8571964571546897, + "language_loss": 0.67531103, + "learning_rate": 1.405368492245123e-06, + "loss": 0.69616503, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3984375, + "step": 9915, + "time_per_iteration": 2.3982207775115967 + }, + { + "auxiliary_loss_clip": 0.01059685, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.01329994, + "balance_loss_mlp": 1.01895976, + "epoch": 0.596182173455584, + "flos": 20995578005760.0, + "grad_norm": 1.6928497474398452, + "language_loss": 0.78684378, + "learning_rate": 1.4050077659537593e-06, + "loss": 0.80769581, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40820312, + "step": 9916, + "time_per_iteration": 2.4073925018310547 + }, + { + "auxiliary_loss_clip": 0.01058654, + "auxiliary_loss_mlp": 0.01024796, + "balance_loss_clip": 1.01295888, + "balance_loss_mlp": 1.01900899, + "epoch": 0.5962422967082519, + "flos": 16064899424640.0, + "grad_norm": 1.9625131810553087, + "language_loss": 0.82711816, + "learning_rate": 1.404647060896138e-06, + "loss": 0.84795272, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39648438, + "step": 9917, + "time_per_iteration": 2.4015653133392334 + }, + { + "auxiliary_loss_clip": 0.01061077, + "auxiliary_loss_mlp": 0.01023191, + "balance_loss_clip": 1.01132941, + "balance_loss_mlp": 1.02070999, + "epoch": 0.5963024199609199, + "flos": 12166148165760.0, + "grad_norm": 1.8430547123292373, + "language_loss": 0.77012569, + "learning_rate": 1.404286377085132e-06, + "loss": 0.7909683, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40234375, + "step": 9918, + "time_per_iteration": 2.388901948928833 + }, + { + "auxiliary_loss_clip": 0.01060834, + "auxiliary_loss_mlp": 0.01024906, + "balance_loss_clip": 1.01329505, + "balance_loss_mlp": 1.02061796, + "epoch": 0.5963625432135878, + "flos": 28073031310080.0, + "grad_norm": 1.4753566306570225, + "language_loss": 0.78858525, + "learning_rate": 1.4039257145336118e-06, + "loss": 0.80944264, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40234375, + "step": 9919, + "time_per_iteration": 2.4608023166656494 + }, + { + "auxiliary_loss_clip": 0.01059525, + "auxiliary_loss_mlp": 0.0102376, + "balance_loss_clip": 1.01139235, + "balance_loss_mlp": 1.019117, + "epoch": 0.5964226664662559, + "flos": 19859434675200.0, + "grad_norm": 1.9294379320419648, + "language_loss": 0.77792579, + "learning_rate": 1.4035650732544504e-06, + "loss": 0.79875869, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40429688, + "step": 9920, + "time_per_iteration": 2.391240358352661 + }, + { + "auxiliary_loss_clip": 0.01063368, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.01373935, + "balance_loss_mlp": 1.02138281, + "epoch": 0.5964827897189238, + "flos": 12931793481600.0, + "grad_norm": 2.2520764504993216, + "language_loss": 0.82441384, + "learning_rate": 1.4032044532605168e-06, + "loss": 0.84530735, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41992188, + "step": 9921, + "time_per_iteration": 2.4466259479522705 + }, + { + "auxiliary_loss_clip": 0.01059413, + "auxiliary_loss_mlp": 0.01023741, + "balance_loss_clip": 1.01144433, + "balance_loss_mlp": 1.01863623, + "epoch": 0.5965429129715918, + "flos": 18149807485440.0, + "grad_norm": 2.4112114578519783, + "language_loss": 0.80764771, + "learning_rate": 1.4028438545646817e-06, + "loss": 0.82847929, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 9922, + "time_per_iteration": 2.369492530822754 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.0102349, + "balance_loss_clip": 1.01130116, + "balance_loss_mlp": 1.01902032, + "epoch": 0.5966030362242597, + "flos": 21762131016960.0, + "grad_norm": 2.119500939552655, + "language_loss": 0.76967454, + "learning_rate": 1.4024832771798132e-06, + "loss": 0.79049736, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3984375, + "step": 9923, + "time_per_iteration": 2.3974382877349854 + }, + { + "auxiliary_loss_clip": 0.01064224, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.01647425, + "balance_loss_mlp": 1.02088678, + "epoch": 0.5966631594769277, + "flos": 18806209557120.0, + "grad_norm": 1.757928676547696, + "language_loss": 0.75709748, + "learning_rate": 1.4021227211187793e-06, + "loss": 0.77804422, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.43359375, + "step": 9924, + "time_per_iteration": 2.4118402004241943 + }, + { + "auxiliary_loss_clip": 0.010574, + "auxiliary_loss_mlp": 0.01025249, + "balance_loss_clip": 1.01286888, + "balance_loss_mlp": 1.01896882, + "epoch": 0.5967232827295956, + "flos": 14063293601280.0, + "grad_norm": 2.050613806776552, + "language_loss": 0.66244185, + "learning_rate": 1.4017621863944475e-06, + "loss": 0.68326837, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38476562, + "step": 9925, + "time_per_iteration": 2.430246114730835 + }, + { + "auxiliary_loss_clip": 0.01060854, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.01535296, + "balance_loss_mlp": 1.02181578, + "epoch": 0.5967834059822636, + "flos": 17237282613120.0, + "grad_norm": 2.233375260073239, + "language_loss": 0.71599215, + "learning_rate": 1.4014016730196845e-06, + "loss": 0.73687041, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 9926, + "time_per_iteration": 2.410348892211914 + }, + { + "auxiliary_loss_clip": 0.01060667, + "auxiliary_loss_mlp": 0.01023604, + "balance_loss_clip": 1.01081252, + "balance_loss_mlp": 1.01875782, + "epoch": 0.5968435292349316, + "flos": 42518659743360.0, + "grad_norm": 2.1909749973975967, + "language_loss": 0.70025921, + "learning_rate": 1.4010411810073563e-06, + "loss": 0.72110194, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41992188, + "step": 9927, + "time_per_iteration": 2.592897891998291 + }, + { + "auxiliary_loss_clip": 0.01063272, + "auxiliary_loss_mlp": 0.01025943, + "balance_loss_clip": 1.01173294, + "balance_loss_mlp": 1.01977825, + "epoch": 0.5969036524875996, + "flos": 37629457724160.0, + "grad_norm": 3.8598970950553526, + "language_loss": 0.63692588, + "learning_rate": 1.4006807103703271e-06, + "loss": 0.65781808, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.43359375, + "step": 9928, + "time_per_iteration": 4.042337417602539 + }, + { + "auxiliary_loss_clip": 0.01057045, + "auxiliary_loss_mlp": 0.01023837, + "balance_loss_clip": 1.01247025, + "balance_loss_mlp": 1.01795459, + "epoch": 0.5969637757402676, + "flos": 23147275230720.0, + "grad_norm": 1.5971828233228933, + "language_loss": 0.69722867, + "learning_rate": 1.4003202611214623e-06, + "loss": 0.71803749, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.390625, + "step": 9929, + "time_per_iteration": 2.4332847595214844 + }, + { + "auxiliary_loss_clip": 0.0105849, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.01310754, + "balance_loss_mlp": 1.01994693, + "epoch": 0.5970238989929355, + "flos": 24019894552320.0, + "grad_norm": 1.6863159419824827, + "language_loss": 0.76739162, + "learning_rate": 1.3999598332736247e-06, + "loss": 0.78822142, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 9930, + "time_per_iteration": 2.4283952713012695 + }, + { + "auxiliary_loss_clip": 0.01062733, + "auxiliary_loss_mlp": 0.01025161, + "balance_loss_clip": 1.01231599, + "balance_loss_mlp": 1.02109766, + "epoch": 0.5970840222456035, + "flos": 19425883011840.0, + "grad_norm": 2.0012110114273027, + "language_loss": 0.68976188, + "learning_rate": 1.399599426839677e-06, + "loss": 0.71064079, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41601562, + "step": 9931, + "time_per_iteration": 2.405776262283325 + }, + { + "auxiliary_loss_clip": 0.01008801, + "auxiliary_loss_mlp": 0.01001617, + "balance_loss_clip": 1.00073457, + "balance_loss_mlp": 1.00124454, + "epoch": 0.5971441454982714, + "flos": 62973781902720.0, + "grad_norm": 0.8526923606303582, + "language_loss": 0.64215261, + "learning_rate": 1.3992390418324815e-06, + "loss": 0.66225678, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.07568359, + "step": 9932, + "time_per_iteration": 2.994739055633545 + }, + { + "auxiliary_loss_clip": 0.01060774, + "auxiliary_loss_mlp": 0.01026311, + "balance_loss_clip": 1.01304269, + "balance_loss_mlp": 1.01896942, + "epoch": 0.5972042687509395, + "flos": 20265195029760.0, + "grad_norm": 1.86468286611753, + "language_loss": 0.7408849, + "learning_rate": 1.3988786782648992e-06, + "loss": 0.7617557, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41796875, + "step": 9933, + "time_per_iteration": 2.4213833808898926 + }, + { + "auxiliary_loss_clip": 0.01008334, + "auxiliary_loss_mlp": 0.01001071, + "balance_loss_clip": 1.00022507, + "balance_loss_mlp": 1.00092053, + "epoch": 0.5972643920036074, + "flos": 71648510175360.0, + "grad_norm": 0.6659708932318416, + "language_loss": 0.52006334, + "learning_rate": 1.3985183361497906e-06, + "loss": 0.54015738, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.00848389, + "router_z_loss_mlp": 0.07421875, + "step": 9934, + "time_per_iteration": 3.17118763923645 + }, + { + "auxiliary_loss_clip": 0.01008353, + "auxiliary_loss_mlp": 0.01001606, + "balance_loss_clip": 1.00075328, + "balance_loss_mlp": 1.00089359, + "epoch": 0.5973245152562754, + "flos": 56889781735680.0, + "grad_norm": 0.825682229959743, + "language_loss": 0.54190534, + "learning_rate": 1.3981580155000155e-06, + "loss": 0.56200498, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.07470703, + "step": 9935, + "time_per_iteration": 3.0358517169952393 + }, + { + "auxiliary_loss_clip": 0.01060314, + "auxiliary_loss_mlp": 0.01026571, + "balance_loss_clip": 1.0138334, + "balance_loss_mlp": 1.01985097, + "epoch": 0.5973846385089433, + "flos": 24163387706880.0, + "grad_norm": 1.7823125681865264, + "language_loss": 0.65901798, + "learning_rate": 1.3977977163284323e-06, + "loss": 0.67988682, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 9936, + "time_per_iteration": 2.4394497871398926 + }, + { + "auxiliary_loss_clip": 0.01061284, + "auxiliary_loss_mlp": 0.01028182, + "balance_loss_clip": 1.01601076, + "balance_loss_mlp": 1.01997781, + "epoch": 0.5974447617616113, + "flos": 17669786935680.0, + "grad_norm": 1.849837067354614, + "language_loss": 0.67643321, + "learning_rate": 1.3974374386478998e-06, + "loss": 0.69732791, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41210938, + "step": 9937, + "time_per_iteration": 2.383004665374756 + }, + { + "auxiliary_loss_clip": 0.01061928, + "auxiliary_loss_mlp": 0.01023964, + "balance_loss_clip": 1.01089239, + "balance_loss_mlp": 1.01964331, + "epoch": 0.5975048850142792, + "flos": 22891431720960.0, + "grad_norm": 1.957403715767467, + "language_loss": 0.77327627, + "learning_rate": 1.397077182471275e-06, + "loss": 0.79413521, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.421875, + "step": 9938, + "time_per_iteration": 2.4252774715423584 + }, + { + "auxiliary_loss_clip": 0.01061708, + "auxiliary_loss_mlp": 0.0102454, + "balance_loss_clip": 1.01189184, + "balance_loss_mlp": 1.02014792, + "epoch": 0.5975650082669473, + "flos": 24351953293440.0, + "grad_norm": 1.4998672606712242, + "language_loss": 0.76067567, + "learning_rate": 1.3967169478114149e-06, + "loss": 0.78153813, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41601562, + "step": 9939, + "time_per_iteration": 2.4636402130126953 + }, + { + "auxiliary_loss_clip": 0.01063458, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01270854, + "balance_loss_mlp": 1.01986623, + "epoch": 0.5976251315196152, + "flos": 20922295328640.0, + "grad_norm": 2.138720292730782, + "language_loss": 0.68433541, + "learning_rate": 1.396356734681175e-06, + "loss": 0.70524502, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.4375, + "step": 9940, + "time_per_iteration": 5.312073707580566 + }, + { + "auxiliary_loss_clip": 0.01056374, + "auxiliary_loss_mlp": 0.01025261, + "balance_loss_clip": 1.01402521, + "balance_loss_mlp": 1.01834321, + "epoch": 0.5976852547722832, + "flos": 35843161455360.0, + "grad_norm": 1.4887739470797738, + "language_loss": 0.70003664, + "learning_rate": 1.3959965430934105e-06, + "loss": 0.72085303, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38085938, + "step": 9941, + "time_per_iteration": 2.5497381687164307 + }, + { + "auxiliary_loss_clip": 0.01058075, + "auxiliary_loss_mlp": 0.01026584, + "balance_loss_clip": 1.01448393, + "balance_loss_mlp": 1.01822972, + "epoch": 0.5977453780249512, + "flos": 12855229136640.0, + "grad_norm": 1.8444948583520258, + "language_loss": 0.77311158, + "learning_rate": 1.3956363730609757e-06, + "loss": 0.79395819, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 9942, + "time_per_iteration": 2.378892660140991 + }, + { + "auxiliary_loss_clip": 0.01060863, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.01861215, + "balance_loss_mlp": 1.01929808, + "epoch": 0.5978055012776191, + "flos": 20958116250240.0, + "grad_norm": 1.9055244525773571, + "language_loss": 0.75840783, + "learning_rate": 1.3952762245967239e-06, + "loss": 0.77932972, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41601562, + "step": 9943, + "time_per_iteration": 2.438880205154419 + }, + { + "auxiliary_loss_clip": 0.01058672, + "auxiliary_loss_mlp": 0.01025175, + "balance_loss_clip": 1.01391602, + "balance_loss_mlp": 1.0195241, + "epoch": 0.5978656245302871, + "flos": 34056585895680.0, + "grad_norm": 1.8332694255027133, + "language_loss": 0.62485653, + "learning_rate": 1.3949160977135084e-06, + "loss": 0.64569503, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.390625, + "step": 9944, + "time_per_iteration": 2.5190088748931885 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.01380515, + "balance_loss_mlp": 1.02010226, + "epoch": 0.597925747782955, + "flos": 37371903557760.0, + "grad_norm": 1.6578843364189941, + "language_loss": 0.72619617, + "learning_rate": 1.394555992424181e-06, + "loss": 0.7470724, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41015625, + "step": 9945, + "time_per_iteration": 2.5550003051757812 + }, + { + "auxiliary_loss_clip": 0.01058833, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.01423764, + "balance_loss_mlp": 1.01862669, + "epoch": 0.5979858710356231, + "flos": 25373616675840.0, + "grad_norm": 2.7814654066493594, + "language_loss": 0.67154264, + "learning_rate": 1.394195908741593e-06, + "loss": 0.69239932, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 9946, + "time_per_iteration": 2.43046236038208 + }, + { + "auxiliary_loss_clip": 0.01061047, + "auxiliary_loss_mlp": 0.01024793, + "balance_loss_clip": 1.01226449, + "balance_loss_mlp": 1.01938105, + "epoch": 0.598045994288291, + "flos": 13697578442880.0, + "grad_norm": 1.9023008827092676, + "language_loss": 0.88251251, + "learning_rate": 1.3938358466785944e-06, + "loss": 0.90337086, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41601562, + "step": 9947, + "time_per_iteration": 3.804002285003662 + }, + { + "auxiliary_loss_clip": 0.01056861, + "auxiliary_loss_mlp": 0.01026843, + "balance_loss_clip": 1.0154289, + "balance_loss_mlp": 1.01882911, + "epoch": 0.598106117540959, + "flos": 21980268391680.0, + "grad_norm": 1.735068164743745, + "language_loss": 0.71681011, + "learning_rate": 1.3934758062480347e-06, + "loss": 0.73764718, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38085938, + "step": 9948, + "time_per_iteration": 2.4055275917053223 + }, + { + "auxiliary_loss_clip": 0.01060693, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.01502681, + "balance_loss_mlp": 1.01982784, + "epoch": 0.5981662407936269, + "flos": 20558290826880.0, + "grad_norm": 1.8200483361138495, + "language_loss": 0.89613712, + "learning_rate": 1.3931157874627642e-06, + "loss": 0.91702843, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.40820312, + "step": 9949, + "time_per_iteration": 2.4299471378326416 + }, + { + "auxiliary_loss_clip": 0.01058826, + "auxiliary_loss_mlp": 0.01024381, + "balance_loss_clip": 1.01275182, + "balance_loss_mlp": 1.01965547, + "epoch": 0.5982263640462949, + "flos": 14062979399040.0, + "grad_norm": 1.7137670897407429, + "language_loss": 0.70631427, + "learning_rate": 1.3927557903356294e-06, + "loss": 0.72714639, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 9950, + "time_per_iteration": 2.4046216011047363 + }, + { + "auxiliary_loss_clip": 0.0106033, + "auxiliary_loss_mlp": 0.01023337, + "balance_loss_clip": 1.01052165, + "balance_loss_mlp": 1.01899791, + "epoch": 0.5982864872989628, + "flos": 17706410818560.0, + "grad_norm": 1.574048508622112, + "language_loss": 0.78083611, + "learning_rate": 1.3923958148794788e-06, + "loss": 0.80167282, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 9951, + "time_per_iteration": 2.422919511795044 + }, + { + "auxiliary_loss_clip": 0.01061235, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.01512992, + "balance_loss_mlp": 1.01965809, + "epoch": 0.5983466105516309, + "flos": 16763825399040.0, + "grad_norm": 1.467257453634355, + "language_loss": 0.73667169, + "learning_rate": 1.3920358611071587e-06, + "loss": 0.75757045, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4140625, + "step": 9952, + "time_per_iteration": 2.393123149871826 + }, + { + "auxiliary_loss_clip": 0.01063155, + "auxiliary_loss_mlp": 0.01026209, + "balance_loss_clip": 1.01277363, + "balance_loss_mlp": 1.02031207, + "epoch": 0.5984067338042988, + "flos": 20041820951040.0, + "grad_norm": 2.547876209300964, + "language_loss": 0.78808844, + "learning_rate": 1.3916759290315145e-06, + "loss": 0.80898207, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4296875, + "step": 9953, + "time_per_iteration": 2.4096486568450928 + }, + { + "auxiliary_loss_clip": 0.01058364, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.01390624, + "balance_loss_mlp": 1.01832438, + "epoch": 0.5984668570569668, + "flos": 26318785536000.0, + "grad_norm": 1.368139918843678, + "language_loss": 0.69174349, + "learning_rate": 1.391316018665392e-06, + "loss": 0.71258634, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40039062, + "step": 9954, + "time_per_iteration": 2.462873697280884 + }, + { + "auxiliary_loss_clip": 0.0105913, + "auxiliary_loss_mlp": 0.010237, + "balance_loss_clip": 1.01179647, + "balance_loss_mlp": 1.01878977, + "epoch": 0.5985269803096348, + "flos": 20592715294080.0, + "grad_norm": 2.151677076874081, + "language_loss": 0.73325032, + "learning_rate": 1.3909561300216343e-06, + "loss": 0.75407863, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 9955, + "time_per_iteration": 2.4102463722229004 + }, + { + "auxiliary_loss_clip": 0.01060135, + "auxiliary_loss_mlp": 0.01021648, + "balance_loss_clip": 1.00980425, + "balance_loss_mlp": 1.01918459, + "epoch": 0.5985871035623027, + "flos": 26864303529600.0, + "grad_norm": 1.526228355928898, + "language_loss": 0.68459713, + "learning_rate": 1.3905962631130867e-06, + "loss": 0.70541501, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41015625, + "step": 9956, + "time_per_iteration": 2.4787652492523193 + }, + { + "auxiliary_loss_clip": 0.01060738, + "auxiliary_loss_mlp": 0.01025807, + "balance_loss_clip": 1.01356375, + "balance_loss_mlp": 1.0194931, + "epoch": 0.5986472268149707, + "flos": 19608688224000.0, + "grad_norm": 2.527732461840852, + "language_loss": 0.74079418, + "learning_rate": 1.3902364179525905e-06, + "loss": 0.76165962, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 9957, + "time_per_iteration": 2.40362548828125 + }, + { + "auxiliary_loss_clip": 0.01058013, + "auxiliary_loss_mlp": 0.01020869, + "balance_loss_clip": 1.00959802, + "balance_loss_mlp": 1.01919389, + "epoch": 0.5987073500676386, + "flos": 21793657841280.0, + "grad_norm": 1.796116456023039, + "language_loss": 0.84440672, + "learning_rate": 1.3898765945529878e-06, + "loss": 0.86519551, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38867188, + "step": 9958, + "time_per_iteration": 2.4159646034240723 + }, + { + "auxiliary_loss_clip": 0.01060479, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.01488662, + "balance_loss_mlp": 1.01901317, + "epoch": 0.5987674733203067, + "flos": 24313269640320.0, + "grad_norm": 1.9206898141023188, + "language_loss": 0.66487885, + "learning_rate": 1.3895167929271203e-06, + "loss": 0.68575537, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4140625, + "step": 9959, + "time_per_iteration": 2.4311280250549316 + }, + { + "auxiliary_loss_clip": 0.01061981, + "auxiliary_loss_mlp": 0.01021162, + "balance_loss_clip": 1.00922894, + "balance_loss_mlp": 1.02126133, + "epoch": 0.5988275965729746, + "flos": 21319258020480.0, + "grad_norm": 1.658560720696304, + "language_loss": 0.83656919, + "learning_rate": 1.3891570130878276e-06, + "loss": 0.85740066, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 9960, + "time_per_iteration": 2.417999029159546 + }, + { + "auxiliary_loss_clip": 0.01057963, + "auxiliary_loss_mlp": 0.01021759, + "balance_loss_clip": 1.01040483, + "balance_loss_mlp": 1.01758969, + "epoch": 0.5988877198256426, + "flos": 25116900382080.0, + "grad_norm": 1.758920224764302, + "language_loss": 0.79576576, + "learning_rate": 1.388797255047951e-06, + "loss": 0.81656295, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.40429688, + "step": 9961, + "time_per_iteration": 2.468175172805786 + }, + { + "auxiliary_loss_clip": 0.01057472, + "auxiliary_loss_mlp": 0.01023509, + "balance_loss_clip": 1.01170146, + "balance_loss_mlp": 1.01848602, + "epoch": 0.5989478430783105, + "flos": 26427993868800.0, + "grad_norm": 1.6077686480224327, + "language_loss": 0.6735431, + "learning_rate": 1.3884375188203278e-06, + "loss": 0.69435298, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38867188, + "step": 9962, + "time_per_iteration": 2.43542742729187 + }, + { + "auxiliary_loss_clip": 0.01058494, + "auxiliary_loss_mlp": 0.01026042, + "balance_loss_clip": 1.01437783, + "balance_loss_mlp": 1.01873636, + "epoch": 0.5990079663309785, + "flos": 25777177614720.0, + "grad_norm": 1.334096397463718, + "language_loss": 0.74499071, + "learning_rate": 1.3880778044177955e-06, + "loss": 0.76583612, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39648438, + "step": 9963, + "time_per_iteration": 2.4981422424316406 + }, + { + "auxiliary_loss_clip": 0.01057373, + "auxiliary_loss_mlp": 0.01023056, + "balance_loss_clip": 1.01162398, + "balance_loss_mlp": 1.0184679, + "epoch": 0.5990680895836464, + "flos": 36830260725120.0, + "grad_norm": 2.343530369203257, + "language_loss": 0.68290806, + "learning_rate": 1.387718111853193e-06, + "loss": 0.70371234, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 9964, + "time_per_iteration": 2.5552687644958496 + }, + { + "auxiliary_loss_clip": 0.01057114, + "auxiliary_loss_mlp": 0.01023648, + "balance_loss_clip": 1.01139307, + "balance_loss_mlp": 1.018121, + "epoch": 0.5991282128363145, + "flos": 24132419464320.0, + "grad_norm": 2.4514670011907627, + "language_loss": 0.76807117, + "learning_rate": 1.3873584411393557e-06, + "loss": 0.7888788, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.390625, + "step": 9965, + "time_per_iteration": 2.4199771881103516 + }, + { + "auxiliary_loss_clip": 0.01059207, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.01451731, + "balance_loss_mlp": 1.01925576, + "epoch": 0.5991883360889824, + "flos": 10303357374720.0, + "grad_norm": 1.826834369972989, + "language_loss": 0.77054691, + "learning_rate": 1.3869987922891202e-06, + "loss": 0.79140329, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 9966, + "time_per_iteration": 2.373837471008301 + }, + { + "auxiliary_loss_clip": 0.01059635, + "auxiliary_loss_mlp": 0.01023403, + "balance_loss_clip": 1.01160157, + "balance_loss_mlp": 1.02054346, + "epoch": 0.5992484593416504, + "flos": 23950068099840.0, + "grad_norm": 1.6071520917111732, + "language_loss": 0.73482597, + "learning_rate": 1.3866391653153208e-06, + "loss": 0.75565636, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.390625, + "step": 9967, + "time_per_iteration": 2.4332873821258545 + }, + { + "auxiliary_loss_clip": 0.01062752, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.01732385, + "balance_loss_mlp": 1.02017617, + "epoch": 0.5993085825943184, + "flos": 11943402491520.0, + "grad_norm": 2.1405303452957654, + "language_loss": 0.68401945, + "learning_rate": 1.3862795602307914e-06, + "loss": 0.70494521, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.42578125, + "step": 9968, + "time_per_iteration": 3.8119924068450928 + }, + { + "auxiliary_loss_clip": 0.01059597, + "auxiliary_loss_mlp": 0.01023443, + "balance_loss_clip": 1.01106882, + "balance_loss_mlp": 1.0189544, + "epoch": 0.5993687058469863, + "flos": 19025813208960.0, + "grad_norm": 1.6302541225716554, + "language_loss": 0.79332817, + "learning_rate": 1.3859199770483665e-06, + "loss": 0.81415856, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40625, + "step": 9969, + "time_per_iteration": 2.418381452560425 + }, + { + "auxiliary_loss_clip": 0.01058275, + "auxiliary_loss_mlp": 0.01020146, + "balance_loss_clip": 1.00790358, + "balance_loss_mlp": 1.01791525, + "epoch": 0.5994288290996543, + "flos": 14282094291840.0, + "grad_norm": 1.5649503539711018, + "language_loss": 0.80983365, + "learning_rate": 1.3855604157808776e-06, + "loss": 0.8306179, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40429688, + "step": 9970, + "time_per_iteration": 2.3552842140197754 + }, + { + "auxiliary_loss_clip": 0.0105972, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.01411152, + "balance_loss_mlp": 1.0180459, + "epoch": 0.5994889523523222, + "flos": 19205685866880.0, + "grad_norm": 1.8485756080081581, + "language_loss": 0.61615431, + "learning_rate": 1.385200876441157e-06, + "loss": 0.63703698, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.41796875, + "step": 9971, + "time_per_iteration": 2.4256865978240967 + }, + { + "auxiliary_loss_clip": 0.01056728, + "auxiliary_loss_mlp": 0.01019707, + "balance_loss_clip": 1.00858486, + "balance_loss_mlp": 1.01955962, + "epoch": 0.5995490756049903, + "flos": 28035813934080.0, + "grad_norm": 1.6851388965256542, + "language_loss": 0.7906183, + "learning_rate": 1.3848413590420358e-06, + "loss": 0.81138265, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37109375, + "step": 9972, + "time_per_iteration": 2.4620494842529297 + }, + { + "auxiliary_loss_clip": 0.01063005, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.01494956, + "balance_loss_mlp": 1.01883793, + "epoch": 0.5996091988576582, + "flos": 29051856587520.0, + "grad_norm": 2.301329967712529, + "language_loss": 0.7630142, + "learning_rate": 1.3844818635963442e-06, + "loss": 0.7839281, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.44140625, + "step": 9973, + "time_per_iteration": 2.4622061252593994 + }, + { + "auxiliary_loss_clip": 0.01057381, + "auxiliary_loss_mlp": 0.01026147, + "balance_loss_clip": 1.01402974, + "balance_loss_mlp": 1.01726246, + "epoch": 0.5996693221103262, + "flos": 20812912439040.0, + "grad_norm": 1.731935869010283, + "language_loss": 0.65512115, + "learning_rate": 1.3841223901169116e-06, + "loss": 0.67595649, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 9974, + "time_per_iteration": 2.4369640350341797 + }, + { + "auxiliary_loss_clip": 0.01059554, + "auxiliary_loss_mlp": 0.01027654, + "balance_loss_clip": 1.01549482, + "balance_loss_mlp": 1.01905882, + "epoch": 0.5997294453629941, + "flos": 23767786558080.0, + "grad_norm": 1.3636784799132171, + "language_loss": 0.74242842, + "learning_rate": 1.383762938616566e-06, + "loss": 0.76330054, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40429688, + "step": 9975, + "time_per_iteration": 2.4805078506469727 + }, + { + "auxiliary_loss_clip": 0.01058493, + "auxiliary_loss_mlp": 0.0102663, + "balance_loss_clip": 1.01386237, + "balance_loss_mlp": 1.01823187, + "epoch": 0.5997895686156621, + "flos": 20958954122880.0, + "grad_norm": 1.743679547919733, + "language_loss": 0.76987934, + "learning_rate": 1.3834035091081374e-06, + "loss": 0.79073054, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40234375, + "step": 9976, + "time_per_iteration": 2.3991613388061523 + }, + { + "auxiliary_loss_clip": 0.01058465, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.01391053, + "balance_loss_mlp": 1.01883054, + "epoch": 0.59984969186833, + "flos": 28364206982400.0, + "grad_norm": 2.4317658531332684, + "language_loss": 0.74737275, + "learning_rate": 1.38304410160445e-06, + "loss": 0.76822585, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39648438, + "step": 9977, + "time_per_iteration": 2.4751062393188477 + }, + { + "auxiliary_loss_clip": 0.01058497, + "auxiliary_loss_mlp": 0.01022012, + "balance_loss_clip": 1.00981045, + "balance_loss_mlp": 1.01800144, + "epoch": 0.5999098151209981, + "flos": 22564784240640.0, + "grad_norm": 1.5914917078983444, + "language_loss": 0.69168842, + "learning_rate": 1.3826847161183324e-06, + "loss": 0.71249354, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40429688, + "step": 9978, + "time_per_iteration": 2.4297478199005127 + }, + { + "auxiliary_loss_clip": 0.01064033, + "auxiliary_loss_mlp": 0.01026475, + "balance_loss_clip": 1.01446486, + "balance_loss_mlp": 1.02105927, + "epoch": 0.599969938373666, + "flos": 18767770283520.0, + "grad_norm": 2.0846023683665678, + "language_loss": 0.81359541, + "learning_rate": 1.3823253526626095e-06, + "loss": 0.83450055, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4296875, + "step": 9979, + "time_per_iteration": 3.8964648246765137 + }, + { + "auxiliary_loss_clip": 0.01058052, + "auxiliary_loss_mlp": 0.0102142, + "balance_loss_clip": 1.00970161, + "balance_loss_mlp": 1.01948261, + "epoch": 0.600030061626334, + "flos": 11326452122880.0, + "grad_norm": 1.909232121854755, + "language_loss": 0.69989979, + "learning_rate": 1.3819660112501052e-06, + "loss": 0.72069454, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 9980, + "time_per_iteration": 2.4201719760894775 + }, + { + "auxiliary_loss_clip": 0.01061784, + "auxiliary_loss_mlp": 0.01023084, + "balance_loss_clip": 1.0094111, + "balance_loss_mlp": 1.01937461, + "epoch": 0.600090184879002, + "flos": 16577808341760.0, + "grad_norm": 3.256339102122809, + "language_loss": 0.68646371, + "learning_rate": 1.3816066918936446e-06, + "loss": 0.70731235, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42382812, + "step": 9981, + "time_per_iteration": 2.368241310119629 + }, + { + "auxiliary_loss_clip": 0.01058257, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.01349795, + "balance_loss_mlp": 1.01954007, + "epoch": 0.6001503081316699, + "flos": 23617625333760.0, + "grad_norm": 1.948676756862926, + "language_loss": 0.78568643, + "learning_rate": 1.3812473946060504e-06, + "loss": 0.80652326, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 9982, + "time_per_iteration": 2.4516148567199707 + }, + { + "auxiliary_loss_clip": 0.0105917, + "auxiliary_loss_mlp": 0.01026277, + "balance_loss_clip": 1.01375413, + "balance_loss_mlp": 1.01996469, + "epoch": 0.6002104313843379, + "flos": 20666626375680.0, + "grad_norm": 2.7369647909664736, + "language_loss": 0.73180258, + "learning_rate": 1.3808881194001451e-06, + "loss": 0.752657, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 9983, + "time_per_iteration": 2.4427974224090576 + }, + { + "auxiliary_loss_clip": 0.01059148, + "auxiliary_loss_mlp": 0.01029604, + "balance_loss_clip": 1.01674759, + "balance_loss_mlp": 1.01848197, + "epoch": 0.6002705546370058, + "flos": 22454144542080.0, + "grad_norm": 2.0016449881238754, + "language_loss": 0.77833319, + "learning_rate": 1.3805288662887504e-06, + "loss": 0.79922068, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 9984, + "time_per_iteration": 2.4015328884124756 + }, + { + "auxiliary_loss_clip": 0.01058365, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.0150609, + "balance_loss_mlp": 1.01850271, + "epoch": 0.6003306778896739, + "flos": 25190811463680.0, + "grad_norm": 1.5607037361293095, + "language_loss": 0.83431518, + "learning_rate": 1.3801696352846865e-06, + "loss": 0.85516816, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3984375, + "step": 9985, + "time_per_iteration": 2.548151731491089 + }, + { + "auxiliary_loss_clip": 0.01059263, + "auxiliary_loss_mlp": 0.01025952, + "balance_loss_clip": 1.01446033, + "balance_loss_mlp": 1.01997459, + "epoch": 0.6003908011423418, + "flos": 26686525553280.0, + "grad_norm": 1.7135726009183276, + "language_loss": 0.7118119, + "learning_rate": 1.3798104264007745e-06, + "loss": 0.73266399, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39257812, + "step": 9986, + "time_per_iteration": 3.9938440322875977 + }, + { + "auxiliary_loss_clip": 0.01062604, + "auxiliary_loss_mlp": 0.01028508, + "balance_loss_clip": 1.01601458, + "balance_loss_mlp": 1.02120876, + "epoch": 0.6004509243950098, + "flos": 22563981279360.0, + "grad_norm": 1.3654424801963925, + "language_loss": 0.71609926, + "learning_rate": 1.3794512396498326e-06, + "loss": 0.73701036, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4140625, + "step": 9987, + "time_per_iteration": 2.4825005531311035 + }, + { + "auxiliary_loss_clip": 0.01058328, + "auxiliary_loss_mlp": 0.01021818, + "balance_loss_clip": 1.01041007, + "balance_loss_mlp": 1.0180614, + "epoch": 0.6005110476476777, + "flos": 19718280581760.0, + "grad_norm": 1.610486903957175, + "language_loss": 0.84603256, + "learning_rate": 1.3790920750446801e-06, + "loss": 0.86683404, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40234375, + "step": 9988, + "time_per_iteration": 2.3838963508605957 + }, + { + "auxiliary_loss_clip": 0.01058292, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.01500833, + "balance_loss_mlp": 1.01896822, + "epoch": 0.6005711709003457, + "flos": 17711577699840.0, + "grad_norm": 2.368964556986323, + "language_loss": 0.65369606, + "learning_rate": 1.3787329325981343e-06, + "loss": 0.67455113, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39257812, + "step": 9989, + "time_per_iteration": 2.3632748126983643 + }, + { + "auxiliary_loss_clip": 0.01058192, + "auxiliary_loss_mlp": 0.01027044, + "balance_loss_clip": 1.01538539, + "balance_loss_mlp": 1.0195483, + "epoch": 0.6006312941530136, + "flos": 18513497784960.0, + "grad_norm": 1.5288236883256663, + "language_loss": 0.80336601, + "learning_rate": 1.3783738123230114e-06, + "loss": 0.82421839, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38671875, + "step": 9990, + "time_per_iteration": 2.4084198474884033 + }, + { + "auxiliary_loss_clip": 0.01059784, + "auxiliary_loss_mlp": 0.01026189, + "balance_loss_clip": 1.01394582, + "balance_loss_mlp": 1.01919508, + "epoch": 0.6006914174056817, + "flos": 21389957256960.0, + "grad_norm": 2.3273772733687124, + "language_loss": 0.8272754, + "learning_rate": 1.3780147142321292e-06, + "loss": 0.84813511, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40625, + "step": 9991, + "time_per_iteration": 2.4004998207092285 + }, + { + "auxiliary_loss_clip": 0.01057703, + "auxiliary_loss_mlp": 0.01023423, + "balance_loss_clip": 1.01126373, + "balance_loss_mlp": 1.0189153, + "epoch": 0.6007515406583496, + "flos": 12749686496640.0, + "grad_norm": 2.4362155929473506, + "language_loss": 0.82467163, + "learning_rate": 1.3776556383383011e-06, + "loss": 0.84548283, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38671875, + "step": 9992, + "time_per_iteration": 2.3883461952209473 + }, + { + "auxiliary_loss_clip": 0.01058839, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.01736033, + "balance_loss_mlp": 1.0192188, + "epoch": 0.6008116639110176, + "flos": 19205930246400.0, + "grad_norm": 2.048543923337106, + "language_loss": 0.66063571, + "learning_rate": 1.377296584654343e-06, + "loss": 0.68151784, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 9993, + "time_per_iteration": 2.3771979808807373 + }, + { + "auxiliary_loss_clip": 0.01057907, + "auxiliary_loss_mlp": 0.01022614, + "balance_loss_clip": 1.01103282, + "balance_loss_mlp": 1.01907182, + "epoch": 0.6008717871636855, + "flos": 17054407578240.0, + "grad_norm": 2.3208388863424294, + "language_loss": 0.80940467, + "learning_rate": 1.3769375531930672e-06, + "loss": 0.83020991, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38867188, + "step": 9994, + "time_per_iteration": 2.3631606101989746 + }, + { + "auxiliary_loss_clip": 0.0105607, + "auxiliary_loss_mlp": 0.01023698, + "balance_loss_clip": 1.01310015, + "balance_loss_mlp": 1.01974344, + "epoch": 0.6009319104163535, + "flos": 20297769194880.0, + "grad_norm": 1.792276530245563, + "language_loss": 0.7203027, + "learning_rate": 1.376578543967288e-06, + "loss": 0.74110043, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.36328125, + "step": 9995, + "time_per_iteration": 2.411578893661499 + }, + { + "auxiliary_loss_clip": 0.01057947, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.0142467, + "balance_loss_mlp": 1.01720881, + "epoch": 0.6009920336690215, + "flos": 21835658073600.0, + "grad_norm": 1.8327671676565993, + "language_loss": 0.80893666, + "learning_rate": 1.376219556989817e-06, + "loss": 0.82978058, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 9996, + "time_per_iteration": 2.4075119495391846 + }, + { + "auxiliary_loss_clip": 0.01059005, + "auxiliary_loss_mlp": 0.01023799, + "balance_loss_clip": 1.01177073, + "balance_loss_mlp": 1.01954412, + "epoch": 0.6010521569216895, + "flos": 22595158990080.0, + "grad_norm": 1.7119617884302647, + "language_loss": 0.78069967, + "learning_rate": 1.3758605922734648e-06, + "loss": 0.80152774, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 9997, + "time_per_iteration": 2.430967330932617 + }, + { + "auxiliary_loss_clip": 0.01059778, + "auxiliary_loss_mlp": 0.01026504, + "balance_loss_clip": 1.01378441, + "balance_loss_mlp": 1.01942003, + "epoch": 0.6011122801743575, + "flos": 19170702817920.0, + "grad_norm": 1.620591146984094, + "language_loss": 0.73082876, + "learning_rate": 1.3755016498310432e-06, + "loss": 0.75169164, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 9998, + "time_per_iteration": 2.40390682220459 + }, + { + "auxiliary_loss_clip": 0.01057641, + "auxiliary_loss_mlp": 0.01022905, + "balance_loss_clip": 1.01175261, + "balance_loss_mlp": 1.0188036, + "epoch": 0.6011724034270254, + "flos": 25008844124160.0, + "grad_norm": 1.424608095596136, + "language_loss": 0.7245912, + "learning_rate": 1.3751427296753608e-06, + "loss": 0.74539661, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38867188, + "step": 9999, + "time_per_iteration": 2.458123207092285 + }, + { + "auxiliary_loss_clip": 0.01060181, + "auxiliary_loss_mlp": 0.01024195, + "balance_loss_clip": 1.01193404, + "balance_loss_mlp": 1.01804483, + "epoch": 0.6012325266796934, + "flos": 21796625306880.0, + "grad_norm": 1.975589215334161, + "language_loss": 0.78539741, + "learning_rate": 1.3747838318192275e-06, + "loss": 0.80624121, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.421875, + "step": 10000, + "time_per_iteration": 2.4383649826049805 + }, + { + "auxiliary_loss_clip": 0.01062961, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.01129723, + "balance_loss_mlp": 1.02118385, + "epoch": 0.6012926499323613, + "flos": 19571994518400.0, + "grad_norm": 1.9431884168441023, + "language_loss": 0.77568984, + "learning_rate": 1.3744249562754511e-06, + "loss": 0.79656303, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41796875, + "step": 10001, + "time_per_iteration": 2.3886537551879883 + }, + { + "auxiliary_loss_clip": 0.01061155, + "auxiliary_loss_mlp": 0.01021253, + "balance_loss_clip": 1.00846827, + "balance_loss_mlp": 1.0194695, + "epoch": 0.6013527731850293, + "flos": 34859343853440.0, + "grad_norm": 1.8455375057669703, + "language_loss": 0.58966517, + "learning_rate": 1.3740661030568385e-06, + "loss": 0.61048925, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41796875, + "step": 10002, + "time_per_iteration": 2.5146665573120117 + }, + { + "auxiliary_loss_clip": 0.01062599, + "auxiliary_loss_mlp": 0.01027927, + "balance_loss_clip": 1.01419353, + "balance_loss_mlp": 1.02137816, + "epoch": 0.6014128964376972, + "flos": 23290908030720.0, + "grad_norm": 1.5020662323308243, + "language_loss": 0.7772553, + "learning_rate": 1.3737072721761966e-06, + "loss": 0.79816049, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.41210938, + "step": 10003, + "time_per_iteration": 2.449726104736328 + }, + { + "auxiliary_loss_clip": 0.01060975, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.01569819, + "balance_loss_mlp": 1.01911068, + "epoch": 0.6014730196903653, + "flos": 24819929424000.0, + "grad_norm": 2.071435647334491, + "language_loss": 0.77278626, + "learning_rate": 1.373348463646331e-06, + "loss": 0.79367667, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.41796875, + "step": 10004, + "time_per_iteration": 2.4167749881744385 + }, + { + "auxiliary_loss_clip": 0.01057622, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01357996, + "balance_loss_mlp": 1.01863551, + "epoch": 0.6015331429430332, + "flos": 23111244840960.0, + "grad_norm": 1.4699738100311346, + "language_loss": 0.72173035, + "learning_rate": 1.3729896774800474e-06, + "loss": 0.74256098, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.390625, + "step": 10005, + "time_per_iteration": 2.470848560333252 + }, + { + "auxiliary_loss_clip": 0.01057796, + "auxiliary_loss_mlp": 0.01021691, + "balance_loss_clip": 1.0103898, + "balance_loss_mlp": 1.01940703, + "epoch": 0.6015932661957012, + "flos": 19200553896960.0, + "grad_norm": 1.9752034989447318, + "language_loss": 0.74521106, + "learning_rate": 1.3726309136901495e-06, + "loss": 0.76600587, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3828125, + "step": 10006, + "time_per_iteration": 3.8439695835113525 + }, + { + "auxiliary_loss_clip": 0.01057499, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.01634216, + "balance_loss_mlp": 1.01765275, + "epoch": 0.6016533894483691, + "flos": 18112659932160.0, + "grad_norm": 1.7444237223578507, + "language_loss": 0.73855394, + "learning_rate": 1.3722721722894397e-06, + "loss": 0.75941658, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.3984375, + "step": 10007, + "time_per_iteration": 2.3821957111358643 + }, + { + "auxiliary_loss_clip": 0.01058561, + "auxiliary_loss_mlp": 0.01020896, + "balance_loss_clip": 1.00901651, + "balance_loss_mlp": 1.01864469, + "epoch": 0.6017135127010371, + "flos": 16215968344320.0, + "grad_norm": 1.7070884122245367, + "language_loss": 0.70418948, + "learning_rate": 1.371913453290722e-06, + "loss": 0.72498405, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3984375, + "step": 10008, + "time_per_iteration": 2.4022836685180664 + }, + { + "auxiliary_loss_clip": 0.0105717, + "auxiliary_loss_mlp": 0.01022761, + "balance_loss_clip": 1.01165628, + "balance_loss_mlp": 1.01841187, + "epoch": 0.6017736359537051, + "flos": 23443024291200.0, + "grad_norm": 1.5784933959381424, + "language_loss": 0.76180351, + "learning_rate": 1.3715547567067968e-06, + "loss": 0.78260291, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38671875, + "step": 10009, + "time_per_iteration": 2.48907732963562 + }, + { + "auxiliary_loss_clip": 0.01061132, + "auxiliary_loss_mlp": 0.01022939, + "balance_loss_clip": 1.01107764, + "balance_loss_mlp": 1.01989996, + "epoch": 0.601833759206373, + "flos": 23512920566400.0, + "grad_norm": 1.9073277659320103, + "language_loss": 0.74480337, + "learning_rate": 1.3711960825504662e-06, + "loss": 0.76564407, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.41210938, + "step": 10010, + "time_per_iteration": 2.443746566772461 + }, + { + "auxiliary_loss_clip": 0.0100862, + "auxiliary_loss_mlp": 0.01002021, + "balance_loss_clip": 1.00120461, + "balance_loss_mlp": 1.00133348, + "epoch": 0.6018938824590411, + "flos": 63987972387840.0, + "grad_norm": 0.8085625237317737, + "language_loss": 0.55072993, + "learning_rate": 1.37083743083453e-06, + "loss": 0.57083637, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.00817871, + "router_z_loss_mlp": 0.07324219, + "step": 10011, + "time_per_iteration": 3.1015262603759766 + }, + { + "auxiliary_loss_clip": 0.0106, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.01796877, + "balance_loss_mlp": 1.01998031, + "epoch": 0.601954005711709, + "flos": 34638623038080.0, + "grad_norm": 1.6473953056296082, + "language_loss": 0.62189329, + "learning_rate": 1.3704788015717872e-06, + "loss": 0.64278829, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40039062, + "step": 10012, + "time_per_iteration": 2.5464494228363037 + }, + { + "auxiliary_loss_clip": 0.01058412, + "auxiliary_loss_mlp": 0.01023002, + "balance_loss_clip": 1.01236224, + "balance_loss_mlp": 1.0193696, + "epoch": 0.602014128964377, + "flos": 19426057568640.0, + "grad_norm": 1.639268490232705, + "language_loss": 0.74864113, + "learning_rate": 1.3701201947750368e-06, + "loss": 0.76945525, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.390625, + "step": 10013, + "time_per_iteration": 2.4100770950317383 + }, + { + "auxiliary_loss_clip": 0.0105452, + "auxiliary_loss_mlp": 0.01023086, + "balance_loss_clip": 1.01257825, + "balance_loss_mlp": 1.01819479, + "epoch": 0.6020742522170449, + "flos": 28328141681280.0, + "grad_norm": 3.0961329492175773, + "language_loss": 0.81519997, + "learning_rate": 1.3697616104570764e-06, + "loss": 0.835976, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.36328125, + "step": 10014, + "time_per_iteration": 2.490976095199585 + }, + { + "auxiliary_loss_clip": 0.01055462, + "auxiliary_loss_mlp": 0.01020498, + "balance_loss_clip": 1.00944138, + "balance_loss_mlp": 1.01880574, + "epoch": 0.6021343754697129, + "flos": 22745948618880.0, + "grad_norm": 1.490457949485537, + "language_loss": 0.72292918, + "learning_rate": 1.369403048630703e-06, + "loss": 0.74368882, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3671875, + "step": 10015, + "time_per_iteration": 2.471421718597412 + }, + { + "auxiliary_loss_clip": 0.01008072, + "auxiliary_loss_mlp": 0.01001902, + "balance_loss_clip": 1.00108588, + "balance_loss_mlp": 1.00081968, + "epoch": 0.6021944987223808, + "flos": 65749027743360.0, + "grad_norm": 0.8302735404182662, + "language_loss": 0.5477733, + "learning_rate": 1.3690445093087125e-06, + "loss": 0.56787306, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.00817871, + "router_z_loss_mlp": 0.07226562, + "step": 10016, + "time_per_iteration": 2.9765419960021973 + }, + { + "auxiliary_loss_clip": 0.01058678, + "auxiliary_loss_mlp": 0.01022275, + "balance_loss_clip": 1.01075387, + "balance_loss_mlp": 1.02004123, + "epoch": 0.6022546219750489, + "flos": 16104316216320.0, + "grad_norm": 1.4373349254709202, + "language_loss": 0.83011222, + "learning_rate": 1.3686859925039009e-06, + "loss": 0.85092175, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38671875, + "step": 10017, + "time_per_iteration": 2.4274795055389404 + }, + { + "auxiliary_loss_clip": 0.01058273, + "auxiliary_loss_mlp": 0.01022423, + "balance_loss_clip": 1.01097941, + "balance_loss_mlp": 1.01981568, + "epoch": 0.6023147452277168, + "flos": 25511593835520.0, + "grad_norm": 1.7648663257966284, + "language_loss": 0.83437878, + "learning_rate": 1.3683274982290622e-06, + "loss": 0.85518575, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38476562, + "step": 10018, + "time_per_iteration": 3.9884235858917236 + }, + { + "auxiliary_loss_clip": 0.01060658, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.01690507, + "balance_loss_mlp": 1.01920533, + "epoch": 0.6023748684803848, + "flos": 22635029629440.0, + "grad_norm": 5.9427548149624725, + "language_loss": 0.78565794, + "learning_rate": 1.36796902649699e-06, + "loss": 0.80655742, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4140625, + "step": 10019, + "time_per_iteration": 3.8633954524993896 + }, + { + "auxiliary_loss_clip": 0.01055338, + "auxiliary_loss_mlp": 0.01024408, + "balance_loss_clip": 1.01350582, + "balance_loss_mlp": 1.01663005, + "epoch": 0.6024349917330527, + "flos": 26209332823680.0, + "grad_norm": 1.4213399840796035, + "language_loss": 0.73851335, + "learning_rate": 1.3676105773204774e-06, + "loss": 0.75931078, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.38671875, + "step": 10020, + "time_per_iteration": 2.53448748588562 + }, + { + "auxiliary_loss_clip": 0.01057491, + "auxiliary_loss_mlp": 0.01027728, + "balance_loss_clip": 1.0157181, + "balance_loss_mlp": 1.01836634, + "epoch": 0.6024951149857207, + "flos": 21250688376960.0, + "grad_norm": 1.5343367786727278, + "language_loss": 0.64255619, + "learning_rate": 1.3672521507123169e-06, + "loss": 0.66340834, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 10021, + "time_per_iteration": 2.4300155639648438 + }, + { + "auxiliary_loss_clip": 0.01057004, + "auxiliary_loss_mlp": 0.01024152, + "balance_loss_clip": 1.01183724, + "balance_loss_mlp": 1.01788306, + "epoch": 0.6025552382383887, + "flos": 26942229417600.0, + "grad_norm": 2.465216698389869, + "language_loss": 0.80688095, + "learning_rate": 1.3668937466852994e-06, + "loss": 0.82769251, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 10022, + "time_per_iteration": 2.4347758293151855 + }, + { + "auxiliary_loss_clip": 0.01060142, + "auxiliary_loss_mlp": 0.01024358, + "balance_loss_clip": 1.01226401, + "balance_loss_mlp": 1.01863575, + "epoch": 0.6026153614910567, + "flos": 31683085603200.0, + "grad_norm": 1.518646515212516, + "language_loss": 0.66675746, + "learning_rate": 1.3665353652522157e-06, + "loss": 0.68760246, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.4140625, + "step": 10023, + "time_per_iteration": 2.621558666229248 + }, + { + "auxiliary_loss_clip": 0.0106262, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.01256132, + "balance_loss_mlp": 1.02102995, + "epoch": 0.6026754847437247, + "flos": 29311505435520.0, + "grad_norm": 2.113338151361432, + "language_loss": 0.74316686, + "learning_rate": 1.3661770064258549e-06, + "loss": 0.76404178, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41601562, + "step": 10024, + "time_per_iteration": 2.52400541305542 + }, + { + "auxiliary_loss_clip": 0.0105827, + "auxiliary_loss_mlp": 0.01021607, + "balance_loss_clip": 1.00994802, + "balance_loss_mlp": 1.01888847, + "epoch": 0.6027356079963926, + "flos": 23585644661760.0, + "grad_norm": 2.1370839976426352, + "language_loss": 0.73054016, + "learning_rate": 1.3658186702190068e-06, + "loss": 0.75133896, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39453125, + "step": 10025, + "time_per_iteration": 2.578584671020508 + }, + { + "auxiliary_loss_clip": 0.0106286, + "auxiliary_loss_mlp": 0.01025199, + "balance_loss_clip": 1.0118897, + "balance_loss_mlp": 1.02044606, + "epoch": 0.6027957312490606, + "flos": 20812702970880.0, + "grad_norm": 2.1363202683071663, + "language_loss": 0.67300516, + "learning_rate": 1.3654603566444585e-06, + "loss": 0.69388574, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42382812, + "step": 10026, + "time_per_iteration": 3.9013588428497314 + }, + { + "auxiliary_loss_clip": 0.01054468, + "auxiliary_loss_mlp": 0.0102242, + "balance_loss_clip": 1.01151192, + "balance_loss_mlp": 1.01746726, + "epoch": 0.6028558545017285, + "flos": 19934812033920.0, + "grad_norm": 1.8186737474404493, + "language_loss": 0.80104601, + "learning_rate": 1.3651020657149986e-06, + "loss": 0.82181489, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36914062, + "step": 10027, + "time_per_iteration": 2.4329888820648193 + }, + { + "auxiliary_loss_clip": 0.01060774, + "auxiliary_loss_mlp": 0.01026151, + "balance_loss_clip": 1.01387215, + "balance_loss_mlp": 1.01943374, + "epoch": 0.6029159777543965, + "flos": 22819720055040.0, + "grad_norm": 1.9167883669689874, + "language_loss": 0.70906466, + "learning_rate": 1.3647437974434124e-06, + "loss": 0.72993398, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4140625, + "step": 10028, + "time_per_iteration": 2.4643681049346924 + }, + { + "auxiliary_loss_clip": 0.01061041, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01804996, + "balance_loss_mlp": 1.01937866, + "epoch": 0.6029761010070644, + "flos": 23586098509440.0, + "grad_norm": 1.99708898068798, + "language_loss": 0.84135962, + "learning_rate": 1.3643855518424859e-06, + "loss": 0.86228251, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41601562, + "step": 10029, + "time_per_iteration": 2.5175681114196777 + }, + { + "auxiliary_loss_clip": 0.01057268, + "auxiliary_loss_mlp": 0.01025042, + "balance_loss_clip": 1.01358056, + "balance_loss_mlp": 1.0183624, + "epoch": 0.6030362242597325, + "flos": 13661582964480.0, + "grad_norm": 1.9178669498112961, + "language_loss": 0.80065137, + "learning_rate": 1.3640273289250043e-06, + "loss": 0.82147443, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38867188, + "step": 10030, + "time_per_iteration": 2.4043290615081787 + }, + { + "auxiliary_loss_clip": 0.01059413, + "auxiliary_loss_mlp": 0.01025619, + "balance_loss_clip": 1.01380527, + "balance_loss_mlp": 1.01970017, + "epoch": 0.6030963475124004, + "flos": 24421814657280.0, + "grad_norm": 1.6215141523189227, + "language_loss": 0.71078104, + "learning_rate": 1.363669128703751e-06, + "loss": 0.7316314, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 10031, + "time_per_iteration": 2.575995922088623 + }, + { + "auxiliary_loss_clip": 0.01061452, + "auxiliary_loss_mlp": 0.01025939, + "balance_loss_clip": 1.01233709, + "balance_loss_mlp": 1.01903558, + "epoch": 0.6031564707650684, + "flos": 29642726304000.0, + "grad_norm": 2.4877452869434515, + "language_loss": 0.73390174, + "learning_rate": 1.3633109511915099e-06, + "loss": 0.75477564, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42382812, + "step": 10032, + "time_per_iteration": 2.5277183055877686 + }, + { + "auxiliary_loss_clip": 0.01061281, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.01687098, + "balance_loss_mlp": 1.02049971, + "epoch": 0.6032165940177363, + "flos": 16617818626560.0, + "grad_norm": 1.6953092290627891, + "language_loss": 0.68281317, + "learning_rate": 1.3629527964010635e-06, + "loss": 0.70371854, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40820312, + "step": 10033, + "time_per_iteration": 2.436659097671509 + }, + { + "auxiliary_loss_clip": 0.0106135, + "auxiliary_loss_mlp": 0.01023997, + "balance_loss_clip": 1.01202238, + "balance_loss_mlp": 1.02047181, + "epoch": 0.6032767172704043, + "flos": 17487365748480.0, + "grad_norm": 1.864303571846902, + "language_loss": 0.75554967, + "learning_rate": 1.3625946643451924e-06, + "loss": 0.77640319, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40820312, + "step": 10034, + "time_per_iteration": 2.4394800662994385 + }, + { + "auxiliary_loss_clip": 0.01060962, + "auxiliary_loss_mlp": 0.01024446, + "balance_loss_clip": 1.01156497, + "balance_loss_mlp": 1.01995707, + "epoch": 0.6033368405230723, + "flos": 26831764275840.0, + "grad_norm": 2.3294251089064923, + "language_loss": 0.74276733, + "learning_rate": 1.3622365550366789e-06, + "loss": 0.76362145, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 10035, + "time_per_iteration": 2.460775136947632 + }, + { + "auxiliary_loss_clip": 0.01056724, + "auxiliary_loss_mlp": 0.01022593, + "balance_loss_clip": 1.01131582, + "balance_loss_mlp": 1.01807523, + "epoch": 0.6033969637757403, + "flos": 16908959387520.0, + "grad_norm": 1.5843376343018567, + "language_loss": 0.86483556, + "learning_rate": 1.3618784684883019e-06, + "loss": 0.8856287, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38671875, + "step": 10036, + "time_per_iteration": 2.412907838821411 + }, + { + "auxiliary_loss_clip": 0.01060188, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.01287913, + "balance_loss_mlp": 1.01907516, + "epoch": 0.6034570870284083, + "flos": 22928963299200.0, + "grad_norm": 1.8015242783444412, + "language_loss": 0.69966304, + "learning_rate": 1.361520404712841e-06, + "loss": 0.72052759, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41015625, + "step": 10037, + "time_per_iteration": 2.475799560546875 + }, + { + "auxiliary_loss_clip": 0.0105866, + "auxiliary_loss_mlp": 0.01023234, + "balance_loss_clip": 1.01223111, + "balance_loss_mlp": 1.02011216, + "epoch": 0.6035172102810762, + "flos": 23365238048640.0, + "grad_norm": 1.7603665915987357, + "language_loss": 0.85162777, + "learning_rate": 1.3611623637230743e-06, + "loss": 0.87244672, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.38671875, + "step": 10038, + "time_per_iteration": 2.481666326522827 + }, + { + "auxiliary_loss_clip": 0.01060624, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.01424742, + "balance_loss_mlp": 1.01889133, + "epoch": 0.6035773335337442, + "flos": 20886020559360.0, + "grad_norm": 2.187248423703803, + "language_loss": 0.7628879, + "learning_rate": 1.36080434553178e-06, + "loss": 0.78376657, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 10039, + "time_per_iteration": 2.459665298461914 + }, + { + "auxiliary_loss_clip": 0.0105845, + "auxiliary_loss_mlp": 0.01022045, + "balance_loss_clip": 1.01012385, + "balance_loss_mlp": 1.01850009, + "epoch": 0.6036374567864121, + "flos": 24935142510720.0, + "grad_norm": 1.9130330546770513, + "language_loss": 0.713548, + "learning_rate": 1.3604463501517338e-06, + "loss": 0.73435295, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 10040, + "time_per_iteration": 2.467000961303711 + }, + { + "auxiliary_loss_clip": 0.01063576, + "auxiliary_loss_mlp": 0.01028624, + "balance_loss_clip": 1.01368666, + "balance_loss_mlp": 1.02037287, + "epoch": 0.6036975800390801, + "flos": 23147170496640.0, + "grad_norm": 2.5811069226079355, + "language_loss": 0.78050649, + "learning_rate": 1.3600883775957123e-06, + "loss": 0.8014285, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.4296875, + "step": 10041, + "time_per_iteration": 2.459270477294922 + }, + { + "auxiliary_loss_clip": 0.01058927, + "auxiliary_loss_mlp": 0.01024626, + "balance_loss_clip": 1.01255608, + "balance_loss_mlp": 1.0196557, + "epoch": 0.603757703291748, + "flos": 18659748936960.0, + "grad_norm": 2.0167288460136237, + "language_loss": 0.75879222, + "learning_rate": 1.3597304278764909e-06, + "loss": 0.77962774, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39257812, + "step": 10042, + "time_per_iteration": 2.4039766788482666 + }, + { + "auxiliary_loss_clip": 0.0105844, + "auxiliary_loss_mlp": 0.01023633, + "balance_loss_clip": 1.01222444, + "balance_loss_mlp": 1.01894367, + "epoch": 0.6038178265444161, + "flos": 19681586876160.0, + "grad_norm": 1.756696058717129, + "language_loss": 0.84486639, + "learning_rate": 1.3593725010068431e-06, + "loss": 0.86568713, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39453125, + "step": 10043, + "time_per_iteration": 2.4603335857391357 + }, + { + "auxiliary_loss_clip": 0.01060398, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.01441026, + "balance_loss_mlp": 1.01872301, + "epoch": 0.603877949797084, + "flos": 22637124311040.0, + "grad_norm": 1.7293087843058994, + "language_loss": 0.69512212, + "learning_rate": 1.3590145969995434e-06, + "loss": 0.71599829, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41601562, + "step": 10044, + "time_per_iteration": 2.454848051071167 + }, + { + "auxiliary_loss_clip": 0.0106413, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.0143764, + "balance_loss_mlp": 1.02192903, + "epoch": 0.603938073049752, + "flos": 25446689884800.0, + "grad_norm": 2.977725423622307, + "language_loss": 0.7790432, + "learning_rate": 1.3586567158673639e-06, + "loss": 0.79995894, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.421875, + "step": 10045, + "time_per_iteration": 2.465317487716675 + }, + { + "auxiliary_loss_clip": 0.01059939, + "auxiliary_loss_mlp": 0.01024075, + "balance_loss_clip": 1.01199961, + "balance_loss_mlp": 1.01990438, + "epoch": 0.6039981963024199, + "flos": 22339210746240.0, + "grad_norm": 1.5630393370404703, + "language_loss": 0.77203631, + "learning_rate": 1.3582988576230761e-06, + "loss": 0.79287648, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40039062, + "step": 10046, + "time_per_iteration": 3.866551637649536 + }, + { + "auxiliary_loss_clip": 0.01057487, + "auxiliary_loss_mlp": 0.01023261, + "balance_loss_clip": 1.01194835, + "balance_loss_mlp": 1.01791596, + "epoch": 0.6040583195550879, + "flos": 20702133095040.0, + "grad_norm": 1.7882589297991134, + "language_loss": 0.86991817, + "learning_rate": 1.3579410222794515e-06, + "loss": 0.89072567, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39648438, + "step": 10047, + "time_per_iteration": 2.414015531539917 + }, + { + "auxiliary_loss_clip": 0.01059362, + "auxiliary_loss_mlp": 0.01022381, + "balance_loss_clip": 1.01081216, + "balance_loss_mlp": 1.02062273, + "epoch": 0.604118442807756, + "flos": 27161867980800.0, + "grad_norm": 1.5505485352413462, + "language_loss": 0.75572681, + "learning_rate": 1.3575832098492601e-06, + "loss": 0.77654433, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38671875, + "step": 10048, + "time_per_iteration": 2.4726457595825195 + }, + { + "auxiliary_loss_clip": 0.01065824, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.0202024, + "balance_loss_mlp": 1.0210228, + "epoch": 0.6041785660604239, + "flos": 30880257822720.0, + "grad_norm": 2.135837218992464, + "language_loss": 0.66111505, + "learning_rate": 1.357225420345272e-06, + "loss": 0.68212712, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.44921875, + "step": 10049, + "time_per_iteration": 2.510687828063965 + }, + { + "auxiliary_loss_clip": 0.01058193, + "auxiliary_loss_mlp": 0.01027081, + "balance_loss_clip": 1.01528549, + "balance_loss_mlp": 1.01786923, + "epoch": 0.6042386893130919, + "flos": 19937186006400.0, + "grad_norm": 1.6721440448909923, + "language_loss": 0.67104399, + "learning_rate": 1.356867653780255e-06, + "loss": 0.69189668, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40234375, + "step": 10050, + "time_per_iteration": 2.408267021179199 + }, + { + "auxiliary_loss_clip": 0.01062078, + "auxiliary_loss_mlp": 0.01025334, + "balance_loss_clip": 1.01312089, + "balance_loss_mlp": 1.02127969, + "epoch": 0.6042988125657598, + "flos": 32414550831360.0, + "grad_norm": 1.7548628541656095, + "language_loss": 0.69551688, + "learning_rate": 1.356509910166977e-06, + "loss": 0.71639097, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40820312, + "step": 10051, + "time_per_iteration": 2.5090036392211914 + }, + { + "auxiliary_loss_clip": 0.01056538, + "auxiliary_loss_mlp": 0.01021304, + "balance_loss_clip": 1.00949025, + "balance_loss_mlp": 1.01740217, + "epoch": 0.6043589358184278, + "flos": 17419843445760.0, + "grad_norm": 1.7362820607899454, + "language_loss": 0.64489174, + "learning_rate": 1.3561521895182054e-06, + "loss": 0.66567016, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 10052, + "time_per_iteration": 2.3893930912017822 + }, + { + "auxiliary_loss_clip": 0.01058095, + "auxiliary_loss_mlp": 0.01024691, + "balance_loss_clip": 1.01202536, + "balance_loss_mlp": 1.01862979, + "epoch": 0.6044190590710957, + "flos": 27671599964160.0, + "grad_norm": 1.781879776632941, + "language_loss": 0.68593621, + "learning_rate": 1.3557944918467052e-06, + "loss": 0.7067641, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 10053, + "time_per_iteration": 2.4784655570983887 + }, + { + "auxiliary_loss_clip": 0.01008286, + "auxiliary_loss_mlp": 0.01002146, + "balance_loss_clip": 1.00125194, + "balance_loss_mlp": 1.00104547, + "epoch": 0.6044791823237637, + "flos": 65313241752960.0, + "grad_norm": 0.730038338033742, + "language_loss": 0.60461658, + "learning_rate": 1.355436817165243e-06, + "loss": 0.62472087, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.00891113, + "router_z_loss_mlp": 0.07226562, + "step": 10054, + "time_per_iteration": 3.1508424282073975 + }, + { + "auxiliary_loss_clip": 0.01058787, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.01436126, + "balance_loss_mlp": 1.01863146, + "epoch": 0.6045393055764317, + "flos": 24491396730240.0, + "grad_norm": 1.5745472744735172, + "language_loss": 0.8554188, + "learning_rate": 1.355079165486583e-06, + "loss": 0.8762747, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 10055, + "time_per_iteration": 2.427682399749756 + }, + { + "auxiliary_loss_clip": 0.01064281, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.01812553, + "balance_loss_mlp": 1.02179122, + "epoch": 0.6045994288290997, + "flos": 19053569606400.0, + "grad_norm": 2.7825705353015193, + "language_loss": 0.7382313, + "learning_rate": 1.3547215368234879e-06, + "loss": 0.75918579, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 10056, + "time_per_iteration": 2.3999783992767334 + }, + { + "auxiliary_loss_clip": 0.01060551, + "auxiliary_loss_mlp": 0.01027352, + "balance_loss_clip": 1.01494789, + "balance_loss_mlp": 1.02054572, + "epoch": 0.6046595520817676, + "flos": 26575536741120.0, + "grad_norm": 1.4939722907135333, + "language_loss": 0.80362272, + "learning_rate": 1.3543639311887221e-06, + "loss": 0.82450175, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 10057, + "time_per_iteration": 2.5155692100524902 + }, + { + "auxiliary_loss_clip": 0.01058214, + "auxiliary_loss_mlp": 0.01022269, + "balance_loss_clip": 1.01049709, + "balance_loss_mlp": 1.01875985, + "epoch": 0.6047196753344356, + "flos": 13581632217600.0, + "grad_norm": 3.7306288874553957, + "language_loss": 0.75259137, + "learning_rate": 1.3540063485950462e-06, + "loss": 0.77339613, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.39453125, + "step": 10058, + "time_per_iteration": 5.185436487197876 + }, + { + "auxiliary_loss_clip": 0.01057699, + "auxiliary_loss_mlp": 0.0102299, + "balance_loss_clip": 1.01162326, + "balance_loss_mlp": 1.01901615, + "epoch": 0.6047797985871035, + "flos": 25519274334720.0, + "grad_norm": 1.7811951348426767, + "language_loss": 0.70686495, + "learning_rate": 1.3536487890552224e-06, + "loss": 0.72767186, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38671875, + "step": 10059, + "time_per_iteration": 2.451303482055664 + }, + { + "auxiliary_loss_clip": 0.01058453, + "auxiliary_loss_mlp": 0.01024096, + "balance_loss_clip": 1.0118829, + "balance_loss_mlp": 1.0184567, + "epoch": 0.6048399218397715, + "flos": 20519153326080.0, + "grad_norm": 1.5722098494484682, + "language_loss": 0.76120257, + "learning_rate": 1.3532912525820104e-06, + "loss": 0.78202808, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40039062, + "step": 10060, + "time_per_iteration": 2.4746201038360596 + }, + { + "auxiliary_loss_clip": 0.01058094, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.01623702, + "balance_loss_mlp": 1.01845646, + "epoch": 0.6049000450924396, + "flos": 20407850311680.0, + "grad_norm": 1.8923265189235394, + "language_loss": 0.73718071, + "learning_rate": 1.3529337391881704e-06, + "loss": 0.75804913, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39648438, + "step": 10061, + "time_per_iteration": 2.4415483474731445 + }, + { + "auxiliary_loss_clip": 0.01060294, + "auxiliary_loss_mlp": 0.01026747, + "balance_loss_clip": 1.01464772, + "balance_loss_mlp": 1.02010918, + "epoch": 0.6049601683451075, + "flos": 20740293077760.0, + "grad_norm": 1.6195053237375039, + "language_loss": 0.67398894, + "learning_rate": 1.3525762488864606e-06, + "loss": 0.69485933, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 10062, + "time_per_iteration": 2.410047769546509 + }, + { + "auxiliary_loss_clip": 0.01059137, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.01528633, + "balance_loss_mlp": 1.01848865, + "epoch": 0.6050202915977755, + "flos": 20082110526720.0, + "grad_norm": 2.136332619776739, + "language_loss": 0.71651077, + "learning_rate": 1.3522187816896392e-06, + "loss": 0.73737961, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 10063, + "time_per_iteration": 2.4592976570129395 + }, + { + "auxiliary_loss_clip": 0.0105646, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.01429713, + "balance_loss_mlp": 1.01797986, + "epoch": 0.6050804148504434, + "flos": 15959915366400.0, + "grad_norm": 1.7122876253655122, + "language_loss": 0.84262902, + "learning_rate": 1.351861337610463e-06, + "loss": 0.86345685, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38476562, + "step": 10064, + "time_per_iteration": 2.3899526596069336 + }, + { + "auxiliary_loss_clip": 0.01059163, + "auxiliary_loss_mlp": 0.01024693, + "balance_loss_clip": 1.0126524, + "balance_loss_mlp": 1.01937032, + "epoch": 0.6051405381031114, + "flos": 17455699278720.0, + "grad_norm": 3.0600257006899003, + "language_loss": 0.79660213, + "learning_rate": 1.3515039166616885e-06, + "loss": 0.81744063, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.3984375, + "step": 10065, + "time_per_iteration": 2.362933874130249 + }, + { + "auxiliary_loss_clip": 0.01062347, + "auxiliary_loss_mlp": 0.01027503, + "balance_loss_clip": 1.0137881, + "balance_loss_mlp": 1.02039313, + "epoch": 0.6052006613557793, + "flos": 11399350775040.0, + "grad_norm": 1.7843409477141794, + "language_loss": 0.78805828, + "learning_rate": 1.3511465188560717e-06, + "loss": 0.80895674, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.41992188, + "step": 10066, + "time_per_iteration": 3.7603158950805664 + }, + { + "auxiliary_loss_clip": 0.01059919, + "auxiliary_loss_mlp": 0.01026373, + "balance_loss_clip": 1.01407695, + "balance_loss_mlp": 1.01954687, + "epoch": 0.6052607846084473, + "flos": 24749928414720.0, + "grad_norm": 1.508178259237224, + "language_loss": 0.73156947, + "learning_rate": 1.350789144206366e-06, + "loss": 0.75243235, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 10067, + "time_per_iteration": 2.4390926361083984 + }, + { + "auxiliary_loss_clip": 0.01058934, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.01376522, + "balance_loss_mlp": 1.01893342, + "epoch": 0.6053209078611153, + "flos": 20740083609600.0, + "grad_norm": 1.377285247037293, + "language_loss": 0.69155908, + "learning_rate": 1.350431792725326e-06, + "loss": 0.71240973, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40039062, + "step": 10068, + "time_per_iteration": 2.557666063308716 + }, + { + "auxiliary_loss_clip": 0.01057991, + "auxiliary_loss_mlp": 0.01025647, + "balance_loss_clip": 1.01339781, + "balance_loss_mlp": 1.01782727, + "epoch": 0.6053810311137833, + "flos": 18952146506880.0, + "grad_norm": 1.8724528002113345, + "language_loss": 0.87142664, + "learning_rate": 1.3500744644257043e-06, + "loss": 0.892263, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40234375, + "step": 10069, + "time_per_iteration": 2.3725132942199707 + }, + { + "auxiliary_loss_clip": 0.01059156, + "auxiliary_loss_mlp": 0.01020579, + "balance_loss_clip": 1.00863457, + "balance_loss_mlp": 1.01864672, + "epoch": 0.6054411543664512, + "flos": 23949998277120.0, + "grad_norm": 1.7014120384230806, + "language_loss": 0.83383358, + "learning_rate": 1.349717159320253e-06, + "loss": 0.85463089, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40429688, + "step": 10070, + "time_per_iteration": 2.417117118835449 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.01426482, + "balance_loss_mlp": 1.01831794, + "epoch": 0.6055012776191192, + "flos": 20592959673600.0, + "grad_norm": 1.6517255393136279, + "language_loss": 0.86062455, + "learning_rate": 1.349359877421724e-06, + "loss": 0.88148582, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 10071, + "time_per_iteration": 2.4208977222442627 + }, + { + "auxiliary_loss_clip": 0.01062912, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01443303, + "balance_loss_mlp": 1.02137005, + "epoch": 0.6055614008717871, + "flos": 30296928960000.0, + "grad_norm": 1.563872119908849, + "language_loss": 0.77034974, + "learning_rate": 1.3490026187428668e-06, + "loss": 0.79126191, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.41601562, + "step": 10072, + "time_per_iteration": 2.507807970046997 + }, + { + "auxiliary_loss_clip": 0.0105911, + "auxiliary_loss_mlp": 0.01024251, + "balance_loss_clip": 1.01069736, + "balance_loss_mlp": 1.01804519, + "epoch": 0.6056215241244551, + "flos": 27123812732160.0, + "grad_norm": 1.5719712959641008, + "language_loss": 0.74175262, + "learning_rate": 1.3486453832964318e-06, + "loss": 0.7625863, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41015625, + "step": 10073, + "time_per_iteration": 2.4903976917266846 + }, + { + "auxiliary_loss_clip": 0.01008276, + "auxiliary_loss_mlp": 0.01003383, + "balance_loss_clip": 1.00238216, + "balance_loss_mlp": 1.00089526, + "epoch": 0.6056816473771232, + "flos": 56004699058560.0, + "grad_norm": 0.758727780744924, + "language_loss": 0.55212539, + "learning_rate": 1.3482881710951674e-06, + "loss": 0.57224202, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.07421875, + "step": 10074, + "time_per_iteration": 3.0904295444488525 + }, + { + "auxiliary_loss_clip": 0.01060883, + "auxiliary_loss_mlp": 0.01023181, + "balance_loss_clip": 1.01113522, + "balance_loss_mlp": 1.02109671, + "epoch": 0.6057417706297911, + "flos": 23548392374400.0, + "grad_norm": 1.798147709202773, + "language_loss": 0.81754625, + "learning_rate": 1.347930982151822e-06, + "loss": 0.83838689, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 10075, + "time_per_iteration": 2.426853895187378 + }, + { + "auxiliary_loss_clip": 0.01059976, + "auxiliary_loss_mlp": 0.01020548, + "balance_loss_clip": 1.00760818, + "balance_loss_mlp": 1.01940858, + "epoch": 0.6058018938824591, + "flos": 27743102161920.0, + "grad_norm": 1.8012150145706673, + "language_loss": 0.78127944, + "learning_rate": 1.3475738164791425e-06, + "loss": 0.80208468, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 10076, + "time_per_iteration": 2.460216522216797 + }, + { + "auxiliary_loss_clip": 0.01060667, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.01283753, + "balance_loss_mlp": 1.01919866, + "epoch": 0.605862017135127, + "flos": 22782293210880.0, + "grad_norm": 1.984557658110188, + "language_loss": 0.71913481, + "learning_rate": 1.3472166740898754e-06, + "loss": 0.74000347, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 10077, + "time_per_iteration": 2.4338700771331787 + }, + { + "auxiliary_loss_clip": 0.01061027, + "auxiliary_loss_mlp": 0.01025023, + "balance_loss_clip": 1.01269698, + "balance_loss_mlp": 1.02000856, + "epoch": 0.605922140387795, + "flos": 21213959760000.0, + "grad_norm": 2.128502727949298, + "language_loss": 0.55408561, + "learning_rate": 1.3468595549967657e-06, + "loss": 0.57494617, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41015625, + "step": 10078, + "time_per_iteration": 2.4168038368225098 + }, + { + "auxiliary_loss_clip": 0.01061316, + "auxiliary_loss_mlp": 0.01027325, + "balance_loss_clip": 1.01402724, + "balance_loss_mlp": 1.01978993, + "epoch": 0.6059822636404629, + "flos": 27267236064000.0, + "grad_norm": 1.5461172628922617, + "language_loss": 0.70635545, + "learning_rate": 1.3465024592125588e-06, + "loss": 0.72724187, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41601562, + "step": 10079, + "time_per_iteration": 2.456432342529297 + }, + { + "auxiliary_loss_clip": 0.01059422, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.02010202, + "balance_loss_mlp": 1.01898777, + "epoch": 0.606042386893131, + "flos": 20630281783680.0, + "grad_norm": 1.9718775753106026, + "language_loss": 0.6251018, + "learning_rate": 1.3461453867499975e-06, + "loss": 0.64602125, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40429688, + "step": 10080, + "time_per_iteration": 2.399351119995117 + }, + { + "auxiliary_loss_clip": 0.01061047, + "auxiliary_loss_mlp": 0.01021133, + "balance_loss_clip": 1.00814486, + "balance_loss_mlp": 1.01981235, + "epoch": 0.6061025101457989, + "flos": 23001198635520.0, + "grad_norm": 1.7899818951817563, + "language_loss": 0.71185821, + "learning_rate": 1.3457883376218262e-06, + "loss": 0.73267996, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41210938, + "step": 10081, + "time_per_iteration": 2.4023830890655518 + }, + { + "auxiliary_loss_clip": 0.01059108, + "auxiliary_loss_mlp": 0.01023061, + "balance_loss_clip": 1.01053166, + "balance_loss_mlp": 1.01896715, + "epoch": 0.6061626333984669, + "flos": 29897627207040.0, + "grad_norm": 1.565822464993628, + "language_loss": 0.69173259, + "learning_rate": 1.345431311840786e-06, + "loss": 0.71255422, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 10082, + "time_per_iteration": 2.4919323921203613 + }, + { + "auxiliary_loss_clip": 0.0106073, + "auxiliary_loss_mlp": 0.01022544, + "balance_loss_clip": 1.01098633, + "balance_loss_mlp": 1.02084243, + "epoch": 0.6062227566511348, + "flos": 25008041162880.0, + "grad_norm": 1.36707456071708, + "language_loss": 0.65854263, + "learning_rate": 1.3450743094196183e-06, + "loss": 0.67937535, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3984375, + "step": 10083, + "time_per_iteration": 2.4490649700164795 + }, + { + "auxiliary_loss_clip": 0.01060159, + "auxiliary_loss_mlp": 0.01023119, + "balance_loss_clip": 1.01081681, + "balance_loss_mlp": 1.01927924, + "epoch": 0.6062828799038028, + "flos": 19462925831040.0, + "grad_norm": 2.0025865586117204, + "language_loss": 0.74280024, + "learning_rate": 1.3447173303710644e-06, + "loss": 0.76363301, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41015625, + "step": 10084, + "time_per_iteration": 2.371859073638916 + }, + { + "auxiliary_loss_clip": 0.01060719, + "auxiliary_loss_mlp": 0.01025611, + "balance_loss_clip": 1.01245642, + "balance_loss_mlp": 1.01937985, + "epoch": 0.6063430031564707, + "flos": 13588719223680.0, + "grad_norm": 2.475896225352487, + "language_loss": 0.72226119, + "learning_rate": 1.3443603747078625e-06, + "loss": 0.74312449, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4140625, + "step": 10085, + "time_per_iteration": 3.831359624862671 + }, + { + "auxiliary_loss_clip": 0.01058734, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.01493585, + "balance_loss_mlp": 1.01852846, + "epoch": 0.6064031264091387, + "flos": 23254458704640.0, + "grad_norm": 1.9372479918710326, + "language_loss": 0.76354146, + "learning_rate": 1.344003442442753e-06, + "loss": 0.78439724, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 10086, + "time_per_iteration": 2.414367914199829 + }, + { + "auxiliary_loss_clip": 0.01058997, + "auxiliary_loss_mlp": 0.01023579, + "balance_loss_clip": 1.01118743, + "balance_loss_mlp": 1.01931369, + "epoch": 0.6064632496618068, + "flos": 22457286564480.0, + "grad_norm": 2.844181174410112, + "language_loss": 0.73045444, + "learning_rate": 1.3436465335884728e-06, + "loss": 0.75128019, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39648438, + "step": 10087, + "time_per_iteration": 2.4291491508483887 + }, + { + "auxiliary_loss_clip": 0.01056907, + "auxiliary_loss_mlp": 0.01024524, + "balance_loss_clip": 1.01325226, + "balance_loss_mlp": 1.01865149, + "epoch": 0.6065233729144747, + "flos": 25117493875200.0, + "grad_norm": 1.6134838434762302, + "language_loss": 0.83577192, + "learning_rate": 1.3432896481577597e-06, + "loss": 0.85658622, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3828125, + "step": 10088, + "time_per_iteration": 2.414510488510132 + }, + { + "auxiliary_loss_clip": 0.01059136, + "auxiliary_loss_mlp": 0.01023421, + "balance_loss_clip": 1.01069546, + "balance_loss_mlp": 1.01910257, + "epoch": 0.6065834961671427, + "flos": 23476226860800.0, + "grad_norm": 2.150559510367106, + "language_loss": 0.71140087, + "learning_rate": 1.3429327861633501e-06, + "loss": 0.73222649, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 10089, + "time_per_iteration": 2.4054834842681885 + }, + { + "auxiliary_loss_clip": 0.01060821, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.01732695, + "balance_loss_mlp": 1.01999235, + "epoch": 0.6066436194198106, + "flos": 17018447011200.0, + "grad_norm": 5.737668442139397, + "language_loss": 0.83150041, + "learning_rate": 1.3425759476179785e-06, + "loss": 0.85239756, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.40820312, + "step": 10090, + "time_per_iteration": 2.383201837539673 + }, + { + "auxiliary_loss_clip": 0.01064009, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.01756692, + "balance_loss_mlp": 1.02016926, + "epoch": 0.6067037426724786, + "flos": 18513777075840.0, + "grad_norm": 2.2943365690353725, + "language_loss": 0.7545166, + "learning_rate": 1.3422191325343808e-06, + "loss": 0.77546722, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 10091, + "time_per_iteration": 2.380610704421997 + }, + { + "auxiliary_loss_clip": 0.01056821, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.01741612, + "balance_loss_mlp": 1.01805854, + "epoch": 0.6067638659251465, + "flos": 22344901297920.0, + "grad_norm": 1.6682737992786107, + "language_loss": 0.79678893, + "learning_rate": 1.3418623409252899e-06, + "loss": 0.81765008, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 10092, + "time_per_iteration": 2.3959174156188965 + }, + { + "auxiliary_loss_clip": 0.01061034, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.01760912, + "balance_loss_mlp": 1.01900125, + "epoch": 0.6068239891778145, + "flos": 12450411388800.0, + "grad_norm": 1.910013258439716, + "language_loss": 0.75907427, + "learning_rate": 1.3415055728034394e-06, + "loss": 0.77999187, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41992188, + "step": 10093, + "time_per_iteration": 2.386808156967163 + }, + { + "auxiliary_loss_clip": 0.01058592, + "auxiliary_loss_mlp": 0.01024427, + "balance_loss_clip": 1.01221395, + "balance_loss_mlp": 1.01977909, + "epoch": 0.6068841124304825, + "flos": 23184736986240.0, + "grad_norm": 1.8545789748385564, + "language_loss": 0.87095857, + "learning_rate": 1.3411488281815611e-06, + "loss": 0.89178872, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38867188, + "step": 10094, + "time_per_iteration": 2.3948378562927246 + }, + { + "auxiliary_loss_clip": 0.01059338, + "auxiliary_loss_mlp": 0.01023526, + "balance_loss_clip": 1.0108366, + "balance_loss_mlp": 1.01944709, + "epoch": 0.6069442356831505, + "flos": 18586920107520.0, + "grad_norm": 1.7474945117945409, + "language_loss": 0.72198772, + "learning_rate": 1.340792107072386e-06, + "loss": 0.74281633, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 10095, + "time_per_iteration": 2.3834445476531982 + }, + { + "auxiliary_loss_clip": 0.01059742, + "auxiliary_loss_mlp": 0.01023665, + "balance_loss_clip": 1.01191688, + "balance_loss_mlp": 1.01870155, + "epoch": 0.6070043589358184, + "flos": 20959268325120.0, + "grad_norm": 1.6440985154547803, + "language_loss": 0.76706266, + "learning_rate": 1.3404354094886454e-06, + "loss": 0.78789675, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.41015625, + "step": 10096, + "time_per_iteration": 2.437160015106201 + }, + { + "auxiliary_loss_clip": 0.01062529, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01983631, + "balance_loss_mlp": 1.02117074, + "epoch": 0.6070644821884864, + "flos": 11691643610880.0, + "grad_norm": 2.021773012769265, + "language_loss": 0.79817069, + "learning_rate": 1.3400787354430683e-06, + "loss": 0.81911808, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4140625, + "step": 10097, + "time_per_iteration": 5.192333936691284 + }, + { + "auxiliary_loss_clip": 0.01057672, + "auxiliary_loss_mlp": 0.01023942, + "balance_loss_clip": 1.01172948, + "balance_loss_mlp": 1.01869917, + "epoch": 0.6071246054411543, + "flos": 19309762229760.0, + "grad_norm": 2.4646678828603976, + "language_loss": 0.67179877, + "learning_rate": 1.3397220849483837e-06, + "loss": 0.69261485, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.390625, + "step": 10098, + "time_per_iteration": 2.389936923980713 + }, + { + "auxiliary_loss_clip": 0.01058862, + "auxiliary_loss_mlp": 0.01021416, + "balance_loss_clip": 1.00903034, + "balance_loss_mlp": 1.01884174, + "epoch": 0.6071847286938223, + "flos": 17748061937280.0, + "grad_norm": 2.124356036880353, + "language_loss": 0.80683076, + "learning_rate": 1.3393654580173194e-06, + "loss": 0.82763362, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40039062, + "step": 10099, + "time_per_iteration": 2.497847080230713 + }, + { + "auxiliary_loss_clip": 0.01061478, + "auxiliary_loss_mlp": 0.01026096, + "balance_loss_clip": 1.01235723, + "balance_loss_mlp": 1.01808131, + "epoch": 0.6072448519464904, + "flos": 22636426083840.0, + "grad_norm": 3.4114837787120607, + "language_loss": 0.70493507, + "learning_rate": 1.3390088546626023e-06, + "loss": 0.72581077, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43359375, + "step": 10100, + "time_per_iteration": 2.4043829441070557 + }, + { + "auxiliary_loss_clip": 0.01063347, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.01926994, + "balance_loss_mlp": 1.02193749, + "epoch": 0.6073049751991583, + "flos": 19536278330880.0, + "grad_norm": 2.1216844361896445, + "language_loss": 0.71045589, + "learning_rate": 1.338652274896959e-06, + "loss": 0.73141861, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4140625, + "step": 10101, + "time_per_iteration": 2.386899948120117 + }, + { + "auxiliary_loss_clip": 0.01057572, + "auxiliary_loss_mlp": 0.0102253, + "balance_loss_clip": 1.01047254, + "balance_loss_mlp": 1.01796567, + "epoch": 0.6073650984518263, + "flos": 28292949164160.0, + "grad_norm": 2.266929206694677, + "language_loss": 0.69236195, + "learning_rate": 1.3382957187331147e-06, + "loss": 0.71316296, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39648438, + "step": 10102, + "time_per_iteration": 2.5272927284240723 + }, + { + "auxiliary_loss_clip": 0.01060438, + "auxiliary_loss_mlp": 0.01023476, + "balance_loss_clip": 1.01135206, + "balance_loss_mlp": 1.01949489, + "epoch": 0.6074252217044942, + "flos": 25663291159680.0, + "grad_norm": 2.0996789300882495, + "language_loss": 0.73623222, + "learning_rate": 1.3379391861837945e-06, + "loss": 0.75707138, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 10103, + "time_per_iteration": 2.4325759410858154 + }, + { + "auxiliary_loss_clip": 0.01063003, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.01526582, + "balance_loss_mlp": 1.02062488, + "epoch": 0.6074853449571622, + "flos": 22855994824320.0, + "grad_norm": 1.8265724607805396, + "language_loss": 0.76500416, + "learning_rate": 1.3375826772617212e-06, + "loss": 0.78592396, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42382812, + "step": 10104, + "time_per_iteration": 3.88008975982666 + }, + { + "auxiliary_loss_clip": 0.01008937, + "auxiliary_loss_mlp": 0.01003434, + "balance_loss_clip": 1.00256944, + "balance_loss_mlp": 1.00153732, + "epoch": 0.6075454682098301, + "flos": 67108126216320.0, + "grad_norm": 1.1972975575733364, + "language_loss": 0.55899799, + "learning_rate": 1.3372261919796187e-06, + "loss": 0.57912171, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.00866699, + "router_z_loss_mlp": 0.07421875, + "step": 10105, + "time_per_iteration": 3.051769256591797 + }, + { + "auxiliary_loss_clip": 0.01008674, + "auxiliary_loss_mlp": 0.01007834, + "balance_loss_clip": 1.00690997, + "balance_loss_mlp": 1.00120568, + "epoch": 0.6076055914624982, + "flos": 70708963910400.0, + "grad_norm": 0.7602531517446522, + "language_loss": 0.56837088, + "learning_rate": 1.3368697303502083e-06, + "loss": 0.5885359, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.07470703, + "step": 10106, + "time_per_iteration": 3.007040500640869 + }, + { + "auxiliary_loss_clip": 0.01059054, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.01416218, + "balance_loss_mlp": 1.01927865, + "epoch": 0.6076657147151661, + "flos": 28363334198400.0, + "grad_norm": 1.5468700187885218, + "language_loss": 0.68542677, + "learning_rate": 1.3365132923862112e-06, + "loss": 0.70627815, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 10107, + "time_per_iteration": 2.480841875076294 + }, + { + "auxiliary_loss_clip": 0.0106196, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01426852, + "balance_loss_mlp": 1.01934445, + "epoch": 0.6077258379678341, + "flos": 15048856771200.0, + "grad_norm": 2.069859786908637, + "language_loss": 0.81014293, + "learning_rate": 1.3361568781003485e-06, + "loss": 0.83103615, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.42578125, + "step": 10108, + "time_per_iteration": 2.358222007751465 + }, + { + "auxiliary_loss_clip": 0.0106482, + "auxiliary_loss_mlp": 0.01028075, + "balance_loss_clip": 1.01437187, + "balance_loss_mlp": 1.02108705, + "epoch": 0.607785961220502, + "flos": 36165968686080.0, + "grad_norm": 1.787845417599924, + "language_loss": 0.71391237, + "learning_rate": 1.3358004875053387e-06, + "loss": 0.73484129, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4375, + "step": 10109, + "time_per_iteration": 2.511730670928955 + }, + { + "auxiliary_loss_clip": 0.01056935, + "auxiliary_loss_mlp": 0.0102623, + "balance_loss_clip": 1.01408839, + "balance_loss_mlp": 1.01852226, + "epoch": 0.60784608447317, + "flos": 22523272767360.0, + "grad_norm": 1.6151049410768874, + "language_loss": 0.80200899, + "learning_rate": 1.3354441206139012e-06, + "loss": 0.82284063, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3828125, + "step": 10110, + "time_per_iteration": 2.4349071979522705 + }, + { + "auxiliary_loss_clip": 0.0106143, + "auxiliary_loss_mlp": 0.01025006, + "balance_loss_clip": 1.01160073, + "balance_loss_mlp": 1.01894391, + "epoch": 0.6079062077258379, + "flos": 23840056805760.0, + "grad_norm": 2.2393158578211283, + "language_loss": 0.70080209, + "learning_rate": 1.3350877774387541e-06, + "loss": 0.72166646, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42578125, + "step": 10111, + "time_per_iteration": 2.4269590377807617 + }, + { + "auxiliary_loss_clip": 0.01061917, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.01295733, + "balance_loss_mlp": 1.02045286, + "epoch": 0.6079663309785059, + "flos": 23365936275840.0, + "grad_norm": 2.2863804932467855, + "language_loss": 0.65015197, + "learning_rate": 1.3347314579926137e-06, + "loss": 0.67104125, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.41601562, + "step": 10112, + "time_per_iteration": 2.4038312435150146 + }, + { + "auxiliary_loss_clip": 0.01008316, + "auxiliary_loss_mlp": 0.01000805, + "balance_loss_clip": 0.99980378, + "balance_loss_mlp": 1.00072742, + "epoch": 0.6080264542311739, + "flos": 71379400348800.0, + "grad_norm": 0.6284891156999933, + "language_loss": 0.49576026, + "learning_rate": 1.334375162288196e-06, + "loss": 0.51585144, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.07617188, + "step": 10113, + "time_per_iteration": 3.06559157371521 + }, + { + "auxiliary_loss_clip": 0.01008008, + "auxiliary_loss_mlp": 0.01001367, + "balance_loss_clip": 1.0004313, + "balance_loss_mlp": 1.0006336, + "epoch": 0.6080865774838419, + "flos": 66526508010240.0, + "grad_norm": 0.8337760811816706, + "language_loss": 0.57001346, + "learning_rate": 1.3340188903382164e-06, + "loss": 0.59010726, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.07373047, + "step": 10114, + "time_per_iteration": 2.9751296043395996 + }, + { + "auxiliary_loss_clip": 0.01062177, + "auxiliary_loss_mlp": 0.01023526, + "balance_loss_clip": 1.00940013, + "balance_loss_mlp": 1.02006936, + "epoch": 0.6081467007365099, + "flos": 19206942675840.0, + "grad_norm": 1.86821180700226, + "language_loss": 0.70665431, + "learning_rate": 1.3336626421553897e-06, + "loss": 0.72751129, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.421875, + "step": 10115, + "time_per_iteration": 2.4013431072235107 + }, + { + "auxiliary_loss_clip": 0.01060112, + "auxiliary_loss_mlp": 0.01025374, + "balance_loss_clip": 1.01279759, + "balance_loss_mlp": 1.01904345, + "epoch": 0.6082068239891778, + "flos": 24166669374720.0, + "grad_norm": 2.9116154840244066, + "language_loss": 0.72578228, + "learning_rate": 1.3333064177524296e-06, + "loss": 0.74663717, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 10116, + "time_per_iteration": 2.433985471725464 + }, + { + "auxiliary_loss_clip": 0.0106226, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.0149008, + "balance_loss_mlp": 1.01992631, + "epoch": 0.6082669472418458, + "flos": 37411844019840.0, + "grad_norm": 2.216538819385793, + "language_loss": 0.53914458, + "learning_rate": 1.3329502171420478e-06, + "loss": 0.56005335, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42382812, + "step": 10117, + "time_per_iteration": 2.5650510787963867 + }, + { + "auxiliary_loss_clip": 0.01060572, + "auxiliary_loss_mlp": 0.01022271, + "balance_loss_clip": 1.00953293, + "balance_loss_mlp": 1.01945949, + "epoch": 0.6083270704945137, + "flos": 15084642781440.0, + "grad_norm": 1.6405143215937017, + "language_loss": 0.69876337, + "learning_rate": 1.3325940403369575e-06, + "loss": 0.71959174, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 10118, + "time_per_iteration": 2.53532338142395 + }, + { + "auxiliary_loss_clip": 0.01060291, + "auxiliary_loss_mlp": 0.01025331, + "balance_loss_clip": 1.01259923, + "balance_loss_mlp": 1.01981807, + "epoch": 0.6083871937471818, + "flos": 20667394425600.0, + "grad_norm": 1.576817607672624, + "language_loss": 0.74612057, + "learning_rate": 1.3322378873498685e-06, + "loss": 0.76697683, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 10119, + "time_per_iteration": 2.4504475593566895 + }, + { + "auxiliary_loss_clip": 0.01059308, + "auxiliary_loss_mlp": 0.01023254, + "balance_loss_clip": 1.01103449, + "balance_loss_mlp": 1.01942956, + "epoch": 0.6084473169998497, + "flos": 21505833659520.0, + "grad_norm": 1.957295350306346, + "language_loss": 0.67976868, + "learning_rate": 1.3318817581934922e-06, + "loss": 0.70059431, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3984375, + "step": 10120, + "time_per_iteration": 2.4228646755218506 + }, + { + "auxiliary_loss_clip": 0.01063958, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.02114177, + "epoch": 0.6085074402525177, + "flos": 26868842006400.0, + "grad_norm": 1.6696301100083302, + "language_loss": 0.74020267, + "learning_rate": 1.3315256528805373e-06, + "loss": 0.76116598, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42773438, + "step": 10121, + "time_per_iteration": 2.4917800426483154 + }, + { + "auxiliary_loss_clip": 0.0105966, + "auxiliary_loss_mlp": 0.01022744, + "balance_loss_clip": 1.01073384, + "balance_loss_mlp": 1.01905465, + "epoch": 0.6085675635051856, + "flos": 10889060209920.0, + "grad_norm": 1.707601768742778, + "language_loss": 0.81001359, + "learning_rate": 1.3311695714237118e-06, + "loss": 0.83083761, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40625, + "step": 10122, + "time_per_iteration": 2.3765275478363037 + }, + { + "auxiliary_loss_clip": 0.01061245, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.01730442, + "balance_loss_mlp": 1.01986599, + "epoch": 0.6086276867578536, + "flos": 34860705396480.0, + "grad_norm": 1.6942116747717935, + "language_loss": 0.5569635, + "learning_rate": 1.3308135138357247e-06, + "loss": 0.57788235, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 10123, + "time_per_iteration": 2.5266385078430176 + }, + { + "auxiliary_loss_clip": 0.01061022, + "auxiliary_loss_mlp": 0.01029237, + "balance_loss_clip": 1.0162015, + "balance_loss_mlp": 1.02028704, + "epoch": 0.6086878100105215, + "flos": 20046673630080.0, + "grad_norm": 1.7773814086253021, + "language_loss": 0.74229521, + "learning_rate": 1.330457480129281e-06, + "loss": 0.76319778, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 10124, + "time_per_iteration": 3.8486602306365967 + }, + { + "auxiliary_loss_clip": 0.01061169, + "auxiliary_loss_mlp": 0.01028531, + "balance_loss_clip": 1.01578152, + "balance_loss_mlp": 1.0198158, + "epoch": 0.6087479332631895, + "flos": 18331495534080.0, + "grad_norm": 3.6795914397132954, + "language_loss": 0.65899432, + "learning_rate": 1.3301014703170883e-06, + "loss": 0.67989129, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 10125, + "time_per_iteration": 2.38214373588562 + }, + { + "auxiliary_loss_clip": 0.01060402, + "auxiliary_loss_mlp": 0.01025281, + "balance_loss_clip": 1.01320457, + "balance_loss_mlp": 1.02019989, + "epoch": 0.6088080565158575, + "flos": 24492409159680.0, + "grad_norm": 1.687964897259549, + "language_loss": 0.76059294, + "learning_rate": 1.3297454844118503e-06, + "loss": 0.78144979, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40234375, + "step": 10126, + "time_per_iteration": 2.480034112930298 + }, + { + "auxiliary_loss_clip": 0.01061899, + "auxiliary_loss_mlp": 0.01027748, + "balance_loss_clip": 1.01400936, + "balance_loss_mlp": 1.01902103, + "epoch": 0.6088681797685255, + "flos": 10268269591680.0, + "grad_norm": 2.890987104544346, + "language_loss": 0.71355212, + "learning_rate": 1.3293895224262728e-06, + "loss": 0.73444855, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4296875, + "step": 10127, + "time_per_iteration": 2.3941612243652344 + }, + { + "auxiliary_loss_clip": 0.01063608, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.01210618, + "balance_loss_mlp": 1.02062368, + "epoch": 0.6089283030211935, + "flos": 21972832272000.0, + "grad_norm": 2.838351933547376, + "language_loss": 0.73057103, + "learning_rate": 1.3290335843730578e-06, + "loss": 0.75146866, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4296875, + "step": 10128, + "time_per_iteration": 2.4131827354431152 + }, + { + "auxiliary_loss_clip": 0.01060143, + "auxiliary_loss_mlp": 0.0102117, + "balance_loss_clip": 1.0086832, + "balance_loss_mlp": 1.01949859, + "epoch": 0.6089884262738614, + "flos": 17784231972480.0, + "grad_norm": 2.2850768402934643, + "language_loss": 0.72941685, + "learning_rate": 1.3286776702649078e-06, + "loss": 0.75022995, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 10129, + "time_per_iteration": 2.4092605113983154 + }, + { + "auxiliary_loss_clip": 0.01058406, + "auxiliary_loss_mlp": 0.01024149, + "balance_loss_clip": 1.01172733, + "balance_loss_mlp": 1.01805186, + "epoch": 0.6090485495265294, + "flos": 36908745194880.0, + "grad_norm": 1.4442497911680925, + "language_loss": 0.65453547, + "learning_rate": 1.3283217801145255e-06, + "loss": 0.67536104, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 10130, + "time_per_iteration": 2.606034278869629 + }, + { + "auxiliary_loss_clip": 0.01056747, + "auxiliary_loss_mlp": 0.01022428, + "balance_loss_clip": 1.01097226, + "balance_loss_mlp": 1.01828551, + "epoch": 0.6091086727791973, + "flos": 19898083416960.0, + "grad_norm": 1.7217626033298703, + "language_loss": 0.76917672, + "learning_rate": 1.3279659139346104e-06, + "loss": 0.78996849, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38476562, + "step": 10131, + "time_per_iteration": 2.420240640640259 + }, + { + "auxiliary_loss_clip": 0.01059239, + "auxiliary_loss_mlp": 0.01023782, + "balance_loss_clip": 1.01215911, + "balance_loss_mlp": 1.01931989, + "epoch": 0.6091687960318654, + "flos": 22162549933440.0, + "grad_norm": 1.866087670796876, + "language_loss": 0.76897305, + "learning_rate": 1.327610071737864e-06, + "loss": 0.78980327, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3984375, + "step": 10132, + "time_per_iteration": 2.4269139766693115 + }, + { + "auxiliary_loss_clip": 0.01058223, + "auxiliary_loss_mlp": 0.01024663, + "balance_loss_clip": 1.01321888, + "balance_loss_mlp": 1.02030015, + "epoch": 0.6092289192845333, + "flos": 21464357097600.0, + "grad_norm": 1.6635122483285183, + "language_loss": 0.75518584, + "learning_rate": 1.3272542535369841e-06, + "loss": 0.77601475, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37890625, + "step": 10133, + "time_per_iteration": 2.4003336429595947 + }, + { + "auxiliary_loss_clip": 0.01057757, + "auxiliary_loss_mlp": 0.01023396, + "balance_loss_clip": 1.01088452, + "balance_loss_mlp": 1.01856303, + "epoch": 0.6092890425372013, + "flos": 28693647371520.0, + "grad_norm": 1.6655948429022038, + "language_loss": 0.6372, + "learning_rate": 1.3268984593446693e-06, + "loss": 0.65801156, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 10134, + "time_per_iteration": 2.463099479675293 + }, + { + "auxiliary_loss_clip": 0.01056979, + "auxiliary_loss_mlp": 0.01020996, + "balance_loss_clip": 1.0090636, + "balance_loss_mlp": 1.01803076, + "epoch": 0.6093491657898692, + "flos": 20812144389120.0, + "grad_norm": 1.900531839840363, + "language_loss": 0.78231812, + "learning_rate": 1.326542689173617e-06, + "loss": 0.8030979, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 10135, + "time_per_iteration": 2.40346097946167 + }, + { + "auxiliary_loss_clip": 0.01059943, + "auxiliary_loss_mlp": 0.01026956, + "balance_loss_clip": 1.01537526, + "balance_loss_mlp": 1.01960289, + "epoch": 0.6094092890425372, + "flos": 25445817100800.0, + "grad_norm": 1.827838465847294, + "language_loss": 0.72049493, + "learning_rate": 1.3261869430365237e-06, + "loss": 0.74136388, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.40234375, + "step": 10136, + "time_per_iteration": 3.9472806453704834 + }, + { + "auxiliary_loss_clip": 0.01062707, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.01358843, + "balance_loss_mlp": 1.02105248, + "epoch": 0.6094694122952051, + "flos": 35619961933440.0, + "grad_norm": 1.687566967758485, + "language_loss": 0.74189425, + "learning_rate": 1.3258312209460859e-06, + "loss": 0.76278591, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41601562, + "step": 10137, + "time_per_iteration": 3.96246600151062 + }, + { + "auxiliary_loss_clip": 0.01063049, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01562381, + "balance_loss_mlp": 1.02079082, + "epoch": 0.6095295355478731, + "flos": 24455959833600.0, + "grad_norm": 1.6544730379487367, + "language_loss": 0.7991479, + "learning_rate": 1.325475522914997e-06, + "loss": 0.82006711, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 10138, + "time_per_iteration": 2.430776596069336 + }, + { + "auxiliary_loss_clip": 0.01055795, + "auxiliary_loss_mlp": 0.0102524, + "balance_loss_clip": 1.01356399, + "balance_loss_mlp": 1.01811874, + "epoch": 0.6095896588005411, + "flos": 15632290368000.0, + "grad_norm": 1.5303323693005817, + "language_loss": 0.81012237, + "learning_rate": 1.3251198489559517e-06, + "loss": 0.83093274, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37695312, + "step": 10139, + "time_per_iteration": 2.4290049076080322 + }, + { + "auxiliary_loss_clip": 0.01063234, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01658976, + "balance_loss_mlp": 1.02066755, + "epoch": 0.6096497820532091, + "flos": 15549930737280.0, + "grad_norm": 2.0254438909239347, + "language_loss": 0.83552706, + "learning_rate": 1.3247641990816432e-06, + "loss": 0.85646129, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42578125, + "step": 10140, + "time_per_iteration": 2.363118886947632 + }, + { + "auxiliary_loss_clip": 0.01060388, + "auxiliary_loss_mlp": 0.01025022, + "balance_loss_clip": 1.0127492, + "balance_loss_mlp": 1.01930737, + "epoch": 0.6097099053058771, + "flos": 24203397991680.0, + "grad_norm": 1.6658740918197168, + "language_loss": 0.75635552, + "learning_rate": 1.324408573304763e-06, + "loss": 0.77720964, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41015625, + "step": 10141, + "time_per_iteration": 2.494422674179077 + }, + { + "auxiliary_loss_clip": 0.01059735, + "auxiliary_loss_mlp": 0.01024848, + "balance_loss_clip": 1.01261735, + "balance_loss_mlp": 1.01949561, + "epoch": 0.609770028558545, + "flos": 19569306343680.0, + "grad_norm": 1.9238310386750712, + "language_loss": 0.76233447, + "learning_rate": 1.3240529716380038e-06, + "loss": 0.78318036, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40234375, + "step": 10142, + "time_per_iteration": 2.398796558380127 + }, + { + "auxiliary_loss_clip": 0.0106, + "auxiliary_loss_mlp": 0.01024329, + "balance_loss_clip": 1.01078057, + "balance_loss_mlp": 1.02004457, + "epoch": 0.609830151811213, + "flos": 23948113063680.0, + "grad_norm": 4.250772222655423, + "language_loss": 0.76590788, + "learning_rate": 1.3236973940940552e-06, + "loss": 0.78675115, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.3984375, + "step": 10143, + "time_per_iteration": 2.433637857437134 + }, + { + "auxiliary_loss_clip": 0.01059447, + "auxiliary_loss_mlp": 0.01022823, + "balance_loss_clip": 1.00981152, + "balance_loss_mlp": 1.01882577, + "epoch": 0.6098902750638809, + "flos": 16178820791040.0, + "grad_norm": 1.7423951225492496, + "language_loss": 0.75114858, + "learning_rate": 1.323341840685606e-06, + "loss": 0.77197123, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 10144, + "time_per_iteration": 3.7797529697418213 + }, + { + "auxiliary_loss_clip": 0.01060985, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.01230621, + "balance_loss_mlp": 1.01953518, + "epoch": 0.609950398316549, + "flos": 44524769132160.0, + "grad_norm": 1.6867365351985724, + "language_loss": 0.72591794, + "learning_rate": 1.322986311425347e-06, + "loss": 0.74678469, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 10145, + "time_per_iteration": 2.6154091358184814 + }, + { + "auxiliary_loss_clip": 0.0105837, + "auxiliary_loss_mlp": 0.01022488, + "balance_loss_clip": 1.01097846, + "balance_loss_mlp": 1.01951361, + "epoch": 0.6100105215692169, + "flos": 23220627730560.0, + "grad_norm": 2.475606693488003, + "language_loss": 0.69307351, + "learning_rate": 1.3226308063259643e-06, + "loss": 0.71388209, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38867188, + "step": 10146, + "time_per_iteration": 2.394796848297119 + }, + { + "auxiliary_loss_clip": 0.01057731, + "auxiliary_loss_mlp": 0.010218, + "balance_loss_clip": 1.0107131, + "balance_loss_mlp": 1.01995862, + "epoch": 0.6100706448218849, + "flos": 15011674306560.0, + "grad_norm": 1.8050659307875752, + "language_loss": 0.7690205, + "learning_rate": 1.3222753254001462e-06, + "loss": 0.78981578, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37890625, + "step": 10147, + "time_per_iteration": 2.3780605792999268 + }, + { + "auxiliary_loss_clip": 0.01056792, + "auxiliary_loss_mlp": 0.01022091, + "balance_loss_clip": 1.01053965, + "balance_loss_mlp": 1.01739883, + "epoch": 0.6101307680745528, + "flos": 19639132796160.0, + "grad_norm": 2.3250835978484354, + "language_loss": 0.7839191, + "learning_rate": 1.321919868660578e-06, + "loss": 0.80470794, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39453125, + "step": 10148, + "time_per_iteration": 2.3911051750183105 + }, + { + "auxiliary_loss_clip": 0.01063007, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.01636851, + "balance_loss_mlp": 1.01938963, + "epoch": 0.6101908913272208, + "flos": 29934251089920.0, + "grad_norm": 3.0452013185874844, + "language_loss": 0.54329497, + "learning_rate": 1.321564436119946e-06, + "loss": 0.56422913, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4375, + "step": 10149, + "time_per_iteration": 2.4729928970336914 + }, + { + "auxiliary_loss_clip": 0.01058508, + "auxiliary_loss_mlp": 0.01027225, + "balance_loss_clip": 1.015221, + "balance_loss_mlp": 1.02068257, + "epoch": 0.6102510145798887, + "flos": 21797567913600.0, + "grad_norm": 2.680301724988097, + "language_loss": 0.8040235, + "learning_rate": 1.3212090277909335e-06, + "loss": 0.82488078, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37890625, + "step": 10150, + "time_per_iteration": 2.396583318710327 + }, + { + "auxiliary_loss_clip": 0.01061346, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.01780403, + "balance_loss_mlp": 1.0200752, + "epoch": 0.6103111378325567, + "flos": 20705030737920.0, + "grad_norm": 1.4912760090755688, + "language_loss": 0.68975282, + "learning_rate": 1.320853643686225e-06, + "loss": 0.71067941, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4140625, + "step": 10151, + "time_per_iteration": 2.4363982677459717 + }, + { + "auxiliary_loss_clip": 0.01057131, + "auxiliary_loss_mlp": 0.01022302, + "balance_loss_clip": 1.01088214, + "balance_loss_mlp": 1.01843858, + "epoch": 0.6103712610852247, + "flos": 29380528926720.0, + "grad_norm": 1.6372441996908322, + "language_loss": 0.71030092, + "learning_rate": 1.320498283818503e-06, + "loss": 0.73109525, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 10152, + "time_per_iteration": 2.4704315662384033 + }, + { + "auxiliary_loss_clip": 0.01061354, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.01573682, + "balance_loss_mlp": 1.01991427, + "epoch": 0.6104313843378927, + "flos": 20812004743680.0, + "grad_norm": 1.6027711588022817, + "language_loss": 0.77970648, + "learning_rate": 1.3201429482004493e-06, + "loss": 0.80060089, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.4140625, + "step": 10153, + "time_per_iteration": 2.4208462238311768 + }, + { + "auxiliary_loss_clip": 0.0106094, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.01398671, + "balance_loss_mlp": 1.01968884, + "epoch": 0.6104915075905607, + "flos": 26577247397760.0, + "grad_norm": 1.7939259033295059, + "language_loss": 0.78947097, + "learning_rate": 1.3197876368447452e-06, + "loss": 0.81033993, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.41210938, + "step": 10154, + "time_per_iteration": 2.4439799785614014 + }, + { + "auxiliary_loss_clip": 0.01057725, + "auxiliary_loss_mlp": 0.01021294, + "balance_loss_clip": 1.00968885, + "balance_loss_mlp": 1.01912892, + "epoch": 0.6105516308432286, + "flos": 23914631203200.0, + "grad_norm": 3.9057089285661877, + "language_loss": 0.69959998, + "learning_rate": 1.3194323497640707e-06, + "loss": 0.7203902, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38671875, + "step": 10155, + "time_per_iteration": 2.5041229724884033 + }, + { + "auxiliary_loss_clip": 0.01059945, + "auxiliary_loss_mlp": 0.01026068, + "balance_loss_clip": 1.01290774, + "balance_loss_mlp": 1.01975989, + "epoch": 0.6106117540958966, + "flos": 31576006863360.0, + "grad_norm": 1.695102042324128, + "language_loss": 0.71433806, + "learning_rate": 1.3190770869711045e-06, + "loss": 0.73519826, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40234375, + "step": 10156, + "time_per_iteration": 2.500065326690674 + }, + { + "auxiliary_loss_clip": 0.01057679, + "auxiliary_loss_mlp": 0.01025693, + "balance_loss_clip": 1.01362872, + "balance_loss_mlp": 1.01951718, + "epoch": 0.6106718773485645, + "flos": 19607187035520.0, + "grad_norm": 1.444333733150839, + "language_loss": 0.67304504, + "learning_rate": 1.3187218484785264e-06, + "loss": 0.69387877, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38085938, + "step": 10157, + "time_per_iteration": 2.40519642829895 + }, + { + "auxiliary_loss_clip": 0.010583, + "auxiliary_loss_mlp": 0.01020606, + "balance_loss_clip": 1.00925708, + "balance_loss_mlp": 1.0183177, + "epoch": 0.6107320006012326, + "flos": 17123081955840.0, + "grad_norm": 1.941669502745173, + "language_loss": 0.72588539, + "learning_rate": 1.3183666342990122e-06, + "loss": 0.74667442, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3984375, + "step": 10158, + "time_per_iteration": 2.3862416744232178 + }, + { + "auxiliary_loss_clip": 0.0106048, + "auxiliary_loss_mlp": 0.01023738, + "balance_loss_clip": 1.01218688, + "balance_loss_mlp": 1.02043104, + "epoch": 0.6107921238539005, + "flos": 30147081937920.0, + "grad_norm": 1.4704761539988658, + "language_loss": 0.69226396, + "learning_rate": 1.3180114444452398e-06, + "loss": 0.71310616, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40039062, + "step": 10159, + "time_per_iteration": 2.4630959033966064 + }, + { + "auxiliary_loss_clip": 0.01060189, + "auxiliary_loss_mlp": 0.01022482, + "balance_loss_clip": 1.01027465, + "balance_loss_mlp": 1.01900804, + "epoch": 0.6108522471065685, + "flos": 18439342323840.0, + "grad_norm": 1.82373918578496, + "language_loss": 0.78929573, + "learning_rate": 1.3176562789298852e-06, + "loss": 0.81012249, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41210938, + "step": 10160, + "time_per_iteration": 2.390261173248291 + }, + { + "auxiliary_loss_clip": 0.01008882, + "auxiliary_loss_mlp": 0.01001299, + "balance_loss_clip": 1.00035691, + "balance_loss_mlp": 1.00152171, + "epoch": 0.6109123703592364, + "flos": 64131814656000.0, + "grad_norm": 0.8194331988742699, + "language_loss": 0.6148504, + "learning_rate": 1.3173011377656217e-06, + "loss": 0.63495219, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.07373047, + "step": 10161, + "time_per_iteration": 3.2238516807556152 + }, + { + "auxiliary_loss_clip": 0.01060986, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.0131464, + "balance_loss_mlp": 1.01995873, + "epoch": 0.6109724936119044, + "flos": 20666800932480.0, + "grad_norm": 1.8492239879723862, + "language_loss": 0.75919431, + "learning_rate": 1.3169460209651253e-06, + "loss": 0.78006691, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41015625, + "step": 10162, + "time_per_iteration": 2.410004138946533 + }, + { + "auxiliary_loss_clip": 0.01058578, + "auxiliary_loss_mlp": 0.01027119, + "balance_loss_clip": 1.01507294, + "balance_loss_mlp": 1.01929855, + "epoch": 0.6110326168645723, + "flos": 31350712659840.0, + "grad_norm": 2.383325619365867, + "language_loss": 0.7158832, + "learning_rate": 1.3165909285410676e-06, + "loss": 0.73674023, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39257812, + "step": 10163, + "time_per_iteration": 2.46996808052063 + }, + { + "auxiliary_loss_clip": 0.01057406, + "auxiliary_loss_mlp": 0.01022045, + "balance_loss_clip": 1.01131058, + "balance_loss_mlp": 1.01911247, + "epoch": 0.6110927401172404, + "flos": 25002385522560.0, + "grad_norm": 1.777224001582439, + "language_loss": 0.59434956, + "learning_rate": 1.3162358605061226e-06, + "loss": 0.61514407, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.3828125, + "step": 10164, + "time_per_iteration": 3.8153011798858643 + }, + { + "auxiliary_loss_clip": 0.01057298, + "auxiliary_loss_mlp": 0.01024473, + "balance_loss_clip": 1.01282644, + "balance_loss_mlp": 1.01788819, + "epoch": 0.6111528633699083, + "flos": 26246934224640.0, + "grad_norm": 2.5059108465012456, + "language_loss": 0.75723577, + "learning_rate": 1.3158808168729607e-06, + "loss": 0.77805352, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39453125, + "step": 10165, + "time_per_iteration": 2.4422407150268555 + }, + { + "auxiliary_loss_clip": 0.01058089, + "auxiliary_loss_mlp": 0.01024162, + "balance_loss_clip": 1.01203215, + "balance_loss_mlp": 1.020298, + "epoch": 0.6112129866225763, + "flos": 22381385535360.0, + "grad_norm": 2.0642140626627263, + "language_loss": 0.79075259, + "learning_rate": 1.3155257976542523e-06, + "loss": 0.81157511, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.37695312, + "step": 10166, + "time_per_iteration": 2.410322666168213 + }, + { + "auxiliary_loss_clip": 0.01059998, + "auxiliary_loss_mlp": 0.01022914, + "balance_loss_clip": 1.01021779, + "balance_loss_mlp": 1.01967418, + "epoch": 0.6112731098752443, + "flos": 25226737119360.0, + "grad_norm": 1.9973732775774977, + "language_loss": 0.67569411, + "learning_rate": 1.3151708028626676e-06, + "loss": 0.69652319, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 10167, + "time_per_iteration": 2.427908182144165 + }, + { + "auxiliary_loss_clip": 0.01058392, + "auxiliary_loss_mlp": 0.01025786, + "balance_loss_clip": 1.01472926, + "balance_loss_mlp": 1.019135, + "epoch": 0.6113332331279122, + "flos": 22892060125440.0, + "grad_norm": 2.2320817384967975, + "language_loss": 0.78928816, + "learning_rate": 1.3148158325108754e-06, + "loss": 0.81012994, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.39257812, + "step": 10168, + "time_per_iteration": 2.429468870162964 + }, + { + "auxiliary_loss_clip": 0.01063498, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.01524293, + "balance_loss_mlp": 1.01949704, + "epoch": 0.6113933563805802, + "flos": 18619459361280.0, + "grad_norm": 2.6478881625581074, + "language_loss": 0.85699129, + "learning_rate": 1.3144608866115437e-06, + "loss": 0.87792218, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.43945312, + "step": 10169, + "time_per_iteration": 2.4192230701446533 + }, + { + "auxiliary_loss_clip": 0.01057698, + "auxiliary_loss_mlp": 0.01021214, + "balance_loss_clip": 1.01037157, + "balance_loss_mlp": 1.01893783, + "epoch": 0.6114534796332481, + "flos": 41864631644160.0, + "grad_norm": 2.0361382157111985, + "language_loss": 0.78490829, + "learning_rate": 1.3141059651773395e-06, + "loss": 0.80569744, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.38671875, + "step": 10170, + "time_per_iteration": 2.562845230102539 + }, + { + "auxiliary_loss_clip": 0.01062469, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.01671791, + "balance_loss_mlp": 1.01993132, + "epoch": 0.6115136028859162, + "flos": 21907369739520.0, + "grad_norm": 3.74857606734124, + "language_loss": 0.68684864, + "learning_rate": 1.3137510682209293e-06, + "loss": 0.7077781, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.42578125, + "step": 10171, + "time_per_iteration": 2.392066717147827 + }, + { + "auxiliary_loss_clip": 0.01060996, + "auxiliary_loss_mlp": 0.01024939, + "balance_loss_clip": 1.011814, + "balance_loss_mlp": 1.01921785, + "epoch": 0.6115737261385841, + "flos": 28803553931520.0, + "grad_norm": 1.6096413189314216, + "language_loss": 0.79316163, + "learning_rate": 1.3133961957549783e-06, + "loss": 0.81402099, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41796875, + "step": 10172, + "time_per_iteration": 2.4427504539489746 + }, + { + "auxiliary_loss_clip": 0.01063601, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01439691, + "balance_loss_mlp": 1.02077413, + "epoch": 0.6116338493912521, + "flos": 21250409086080.0, + "grad_norm": 2.8006469001788648, + "language_loss": 0.67078519, + "learning_rate": 1.3130413477921504e-06, + "loss": 0.69170254, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42773438, + "step": 10173, + "time_per_iteration": 2.431795120239258 + }, + { + "auxiliary_loss_clip": 0.01060777, + "auxiliary_loss_mlp": 0.01022989, + "balance_loss_clip": 1.01071095, + "balance_loss_mlp": 1.01991904, + "epoch": 0.61169397264392, + "flos": 17529226335360.0, + "grad_norm": 1.7610221885611683, + "language_loss": 0.56024539, + "learning_rate": 1.3126865243451102e-06, + "loss": 0.58108306, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 10174, + "time_per_iteration": 2.397397756576538 + }, + { + "auxiliary_loss_clip": 0.01062345, + "auxiliary_loss_mlp": 0.01026099, + "balance_loss_clip": 1.0129025, + "balance_loss_mlp": 1.0202961, + "epoch": 0.611754095896588, + "flos": 23950417213440.0, + "grad_norm": 1.7404681101405932, + "language_loss": 0.66317022, + "learning_rate": 1.3123317254265195e-06, + "loss": 0.68405467, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 10175, + "time_per_iteration": 2.433487892150879 + }, + { + "auxiliary_loss_clip": 0.01057509, + "auxiliary_loss_mlp": 0.01024479, + "balance_loss_clip": 1.01350594, + "balance_loss_mlp": 1.01754856, + "epoch": 0.6118142191492559, + "flos": 25993674155520.0, + "grad_norm": 1.9352327103418012, + "language_loss": 0.83012182, + "learning_rate": 1.311976951049041e-06, + "loss": 0.85094166, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3984375, + "step": 10176, + "time_per_iteration": 5.33783221244812 + }, + { + "auxiliary_loss_clip": 0.01061566, + "auxiliary_loss_mlp": 0.01026538, + "balance_loss_clip": 1.01372886, + "balance_loss_mlp": 1.01998425, + "epoch": 0.611874342401924, + "flos": 24602176074240.0, + "grad_norm": 1.6989551741962574, + "language_loss": 0.77640194, + "learning_rate": 1.3116222012253354e-06, + "loss": 0.79728305, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41601562, + "step": 10177, + "time_per_iteration": 2.4123623371124268 + }, + { + "auxiliary_loss_clip": 0.01059173, + "auxiliary_loss_mlp": 0.0102367, + "balance_loss_clip": 1.01170146, + "balance_loss_mlp": 1.01943302, + "epoch": 0.6119344656545919, + "flos": 15886248664320.0, + "grad_norm": 1.9867134216035875, + "language_loss": 0.76066422, + "learning_rate": 1.3112674759680622e-06, + "loss": 0.78149271, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3984375, + "step": 10178, + "time_per_iteration": 2.4021682739257812 + }, + { + "auxiliary_loss_clip": 0.01060086, + "auxiliary_loss_mlp": 0.01025355, + "balance_loss_clip": 1.01323783, + "balance_loss_mlp": 1.02020836, + "epoch": 0.6119945889072599, + "flos": 21651805520640.0, + "grad_norm": 1.6236245441177806, + "language_loss": 0.66917276, + "learning_rate": 1.3109127752898817e-06, + "loss": 0.69002718, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 10179, + "time_per_iteration": 2.409400224685669 + }, + { + "auxiliary_loss_clip": 0.01061485, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01935554, + "balance_loss_mlp": 1.02066624, + "epoch": 0.6120547121599279, + "flos": 13771664081280.0, + "grad_norm": 1.9723142465101273, + "language_loss": 0.8344568, + "learning_rate": 1.3105580992034511e-06, + "loss": 0.85538983, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 10180, + "time_per_iteration": 2.427004814147949 + }, + { + "auxiliary_loss_clip": 0.01059455, + "auxiliary_loss_mlp": 0.01021932, + "balance_loss_clip": 1.010041, + "balance_loss_mlp": 1.0202136, + "epoch": 0.6121148354125958, + "flos": 20078270277120.0, + "grad_norm": 1.625936439185128, + "language_loss": 0.78110826, + "learning_rate": 1.310203447721429e-06, + "loss": 0.80192208, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 10181, + "time_per_iteration": 2.41294002532959 + }, + { + "auxiliary_loss_clip": 0.01060525, + "auxiliary_loss_mlp": 0.01024328, + "balance_loss_clip": 1.01173377, + "balance_loss_mlp": 1.02021873, + "epoch": 0.6121749586652638, + "flos": 13470713228160.0, + "grad_norm": 1.9891915579484225, + "language_loss": 0.80358094, + "learning_rate": 1.3098488208564712e-06, + "loss": 0.82442951, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 10182, + "time_per_iteration": 2.3884713649749756 + }, + { + "auxiliary_loss_clip": 0.01056718, + "auxiliary_loss_mlp": 0.01019827, + "balance_loss_clip": 1.00944996, + "balance_loss_mlp": 1.01920724, + "epoch": 0.6122350819179317, + "flos": 20119502459520.0, + "grad_norm": 2.0455918993692093, + "language_loss": 0.82854384, + "learning_rate": 1.309494218621234e-06, + "loss": 0.84930927, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.375, + "step": 10183, + "time_per_iteration": 2.370802640914917 + }, + { + "auxiliary_loss_clip": 0.01008216, + "auxiliary_loss_mlp": 0.01002652, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00096595, + "epoch": 0.6122952051705998, + "flos": 65426115778560.0, + "grad_norm": 0.7025063946524467, + "language_loss": 0.62913626, + "learning_rate": 1.3091396410283718e-06, + "loss": 0.6492449, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.07226562, + "step": 10184, + "time_per_iteration": 4.5329365730285645 + }, + { + "auxiliary_loss_clip": 0.01061952, + "auxiliary_loss_mlp": 0.01023641, + "balance_loss_clip": 1.01185107, + "balance_loss_mlp": 1.02131295, + "epoch": 0.6123553284232677, + "flos": 20375206323840.0, + "grad_norm": 6.703639395432398, + "language_loss": 0.81922895, + "learning_rate": 1.3087850880905383e-06, + "loss": 0.84008491, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.40625, + "step": 10185, + "time_per_iteration": 2.387993574142456 + }, + { + "auxiliary_loss_clip": 0.01062776, + "auxiliary_loss_mlp": 0.01025438, + "balance_loss_clip": 1.01189625, + "balance_loss_mlp": 1.02033627, + "epoch": 0.6124154516759357, + "flos": 23986517425920.0, + "grad_norm": 1.851730999505361, + "language_loss": 0.74459094, + "learning_rate": 1.3084305598203874e-06, + "loss": 0.76547313, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42382812, + "step": 10186, + "time_per_iteration": 2.423654556274414 + }, + { + "auxiliary_loss_clip": 0.01058374, + "auxiliary_loss_mlp": 0.01023784, + "balance_loss_clip": 1.01271582, + "balance_loss_mlp": 1.0194453, + "epoch": 0.6124755749286036, + "flos": 21467778410880.0, + "grad_norm": 1.5927164747261326, + "language_loss": 0.77989423, + "learning_rate": 1.3080760562305715e-06, + "loss": 0.8007158, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.390625, + "step": 10187, + "time_per_iteration": 2.4446613788604736 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01023833, + "balance_loss_clip": 1.01181722, + "balance_loss_mlp": 1.01850319, + "epoch": 0.6125356981812716, + "flos": 23878042231680.0, + "grad_norm": 1.55037639330821, + "language_loss": 0.82994998, + "learning_rate": 1.3077215773337405e-06, + "loss": 0.85076785, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 10188, + "time_per_iteration": 2.447507858276367 + }, + { + "auxiliary_loss_clip": 0.01059233, + "auxiliary_loss_mlp": 0.01023094, + "balance_loss_clip": 1.01098216, + "balance_loss_mlp": 1.01865363, + "epoch": 0.6125958214339395, + "flos": 14424819396480.0, + "grad_norm": 1.9337051128905303, + "language_loss": 0.75870252, + "learning_rate": 1.3073671231425461e-06, + "loss": 0.77952576, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 10189, + "time_per_iteration": 2.3919103145599365 + }, + { + "auxiliary_loss_clip": 0.01059909, + "auxiliary_loss_mlp": 0.01023665, + "balance_loss_clip": 1.01145804, + "balance_loss_mlp": 1.01943159, + "epoch": 0.6126559446866076, + "flos": 23257949840640.0, + "grad_norm": 1.4454481642589434, + "language_loss": 0.74087214, + "learning_rate": 1.3070126936696366e-06, + "loss": 0.7617079, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 10190, + "time_per_iteration": 2.4240522384643555 + }, + { + "auxiliary_loss_clip": 0.01060392, + "auxiliary_loss_mlp": 0.01024156, + "balance_loss_clip": 1.01180553, + "balance_loss_mlp": 1.01957071, + "epoch": 0.6127160679392755, + "flos": 26863744947840.0, + "grad_norm": 1.5309681005515257, + "language_loss": 0.68608052, + "learning_rate": 1.3066582889276622e-06, + "loss": 0.70692599, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 10191, + "time_per_iteration": 2.4340806007385254 + }, + { + "auxiliary_loss_clip": 0.01058578, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.01857567, + "balance_loss_mlp": 1.01801276, + "epoch": 0.6127761911919435, + "flos": 26395210235520.0, + "grad_norm": 1.9384968478347677, + "language_loss": 0.73786902, + "learning_rate": 1.3063039089292696e-06, + "loss": 0.75876868, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40625, + "step": 10192, + "time_per_iteration": 2.4367637634277344 + }, + { + "auxiliary_loss_clip": 0.01058513, + "auxiliary_loss_mlp": 0.01022686, + "balance_loss_clip": 1.01116407, + "balance_loss_mlp": 1.01989865, + "epoch": 0.6128363144446115, + "flos": 22633737909120.0, + "grad_norm": 2.1245703034316095, + "language_loss": 0.83120102, + "learning_rate": 1.3059495536871063e-06, + "loss": 0.85201299, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38671875, + "step": 10193, + "time_per_iteration": 2.430626630783081 + }, + { + "auxiliary_loss_clip": 0.01058926, + "auxiliary_loss_mlp": 0.01025437, + "balance_loss_clip": 1.01286674, + "balance_loss_mlp": 1.01875782, + "epoch": 0.6128964376972794, + "flos": 26757888105600.0, + "grad_norm": 2.131916130774769, + "language_loss": 0.69991219, + "learning_rate": 1.3055952232138184e-06, + "loss": 0.72075582, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 10194, + "time_per_iteration": 2.4620232582092285 + }, + { + "auxiliary_loss_clip": 0.01008422, + "auxiliary_loss_mlp": 0.01002809, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00100255, + "epoch": 0.6129565609499474, + "flos": 65565000633600.0, + "grad_norm": 0.8264408088795477, + "language_loss": 0.58668101, + "learning_rate": 1.3052409175220502e-06, + "loss": 0.60679328, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.00866699, + "router_z_loss_mlp": 0.07421875, + "step": 10195, + "time_per_iteration": 2.852365732192993 + }, + { + "auxiliary_loss_clip": 0.01059979, + "auxiliary_loss_mlp": 0.01023422, + "balance_loss_clip": 1.01068449, + "balance_loss_mlp": 1.01932371, + "epoch": 0.6130166842026153, + "flos": 16361172155520.0, + "grad_norm": 1.779327763456003, + "language_loss": 0.69683456, + "learning_rate": 1.304886636624447e-06, + "loss": 0.71766859, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 10196, + "time_per_iteration": 2.379678964614868 + }, + { + "auxiliary_loss_clip": 0.01058892, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.01458454, + "balance_loss_mlp": 1.01940632, + "epoch": 0.6130768074552834, + "flos": 23651526130560.0, + "grad_norm": 1.81943916984255, + "language_loss": 0.71198791, + "learning_rate": 1.3045323805336512e-06, + "loss": 0.73284596, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 10197, + "time_per_iteration": 2.443697214126587 + }, + { + "auxiliary_loss_clip": 0.01061563, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.01528633, + "balance_loss_mlp": 1.02088082, + "epoch": 0.6131369307079513, + "flos": 20046429250560.0, + "grad_norm": 1.796392644272621, + "language_loss": 0.78883541, + "learning_rate": 1.3041781492623064e-06, + "loss": 0.80973136, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40820312, + "step": 10198, + "time_per_iteration": 2.3949217796325684 + }, + { + "auxiliary_loss_clip": 0.0106156, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.01527357, + "balance_loss_mlp": 1.02028692, + "epoch": 0.6131970539606193, + "flos": 22671129841920.0, + "grad_norm": 1.4166701478872705, + "language_loss": 0.74510026, + "learning_rate": 1.3038239428230534e-06, + "loss": 0.76599658, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41210938, + "step": 10199, + "time_per_iteration": 2.4259393215179443 + }, + { + "auxiliary_loss_clip": 0.01060974, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.01779222, + "balance_loss_mlp": 1.01920748, + "epoch": 0.6132571772132872, + "flos": 26869679879040.0, + "grad_norm": 1.7529516516686288, + "language_loss": 0.61901593, + "learning_rate": 1.3034697612285324e-06, + "loss": 0.63993406, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41796875, + "step": 10200, + "time_per_iteration": 2.454989194869995 + }, + { + "auxiliary_loss_clip": 0.01058652, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.01474738, + "balance_loss_mlp": 1.01966667, + "epoch": 0.6133173004659552, + "flos": 22891571366400.0, + "grad_norm": 1.7464196428137055, + "language_loss": 0.68652689, + "learning_rate": 1.3031156044913847e-06, + "loss": 0.70738214, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 10201, + "time_per_iteration": 2.4658243656158447 + }, + { + "auxiliary_loss_clip": 0.01057771, + "auxiliary_loss_mlp": 0.01020813, + "balance_loss_clip": 1.00882685, + "balance_loss_mlp": 1.01761246, + "epoch": 0.6133774237186231, + "flos": 20484065543040.0, + "grad_norm": 1.9459782442240452, + "language_loss": 0.83205736, + "learning_rate": 1.3027614726242485e-06, + "loss": 0.85284317, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 10202, + "time_per_iteration": 2.399785280227661 + }, + { + "auxiliary_loss_clip": 0.01008051, + "auxiliary_loss_mlp": 0.0100188, + "balance_loss_clip": 1.00088465, + "balance_loss_mlp": 1.00076103, + "epoch": 0.6134375469712912, + "flos": 69409635552000.0, + "grad_norm": 0.8726954234933949, + "language_loss": 0.67144978, + "learning_rate": 1.3024073656397616e-06, + "loss": 0.69154906, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.07324219, + "step": 10203, + "time_per_iteration": 4.566222906112671 + }, + { + "auxiliary_loss_clip": 0.01059507, + "auxiliary_loss_mlp": 0.01020496, + "balance_loss_clip": 1.00869381, + "balance_loss_mlp": 1.019611, + "epoch": 0.6134976702239591, + "flos": 41279941238400.0, + "grad_norm": 1.9636619624067455, + "language_loss": 0.61469585, + "learning_rate": 1.3020532835505615e-06, + "loss": 0.6354959, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 10204, + "time_per_iteration": 2.575319528579712 + }, + { + "auxiliary_loss_clip": 0.01057029, + "auxiliary_loss_mlp": 0.01022345, + "balance_loss_clip": 1.01072824, + "balance_loss_mlp": 1.01869917, + "epoch": 0.6135577934766271, + "flos": 22345494791040.0, + "grad_norm": 10.278318541343157, + "language_loss": 0.7247802, + "learning_rate": 1.301699226369284e-06, + "loss": 0.74557394, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 10205, + "time_per_iteration": 2.4698758125305176 + }, + { + "auxiliary_loss_clip": 0.0105662, + "auxiliary_loss_mlp": 0.01021468, + "balance_loss_clip": 1.00878453, + "balance_loss_mlp": 1.01859343, + "epoch": 0.6136179167292951, + "flos": 23727147868800.0, + "grad_norm": 1.949012343947043, + "language_loss": 0.75943327, + "learning_rate": 1.3013451941085655e-06, + "loss": 0.78021407, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.37890625, + "step": 10206, + "time_per_iteration": 2.409734010696411 + }, + { + "auxiliary_loss_clip": 0.01058663, + "auxiliary_loss_mlp": 0.01025534, + "balance_loss_clip": 1.01296973, + "balance_loss_mlp": 1.01796341, + "epoch": 0.613678039981963, + "flos": 26023664880000.0, + "grad_norm": 1.9979484857250196, + "language_loss": 0.75527245, + "learning_rate": 1.3009911867810393e-06, + "loss": 0.77611446, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 10207, + "time_per_iteration": 2.458686590194702 + }, + { + "auxiliary_loss_clip": 0.01060055, + "auxiliary_loss_mlp": 0.01025289, + "balance_loss_clip": 1.0132612, + "balance_loss_mlp": 1.02026772, + "epoch": 0.613738163234631, + "flos": 9859437037440.0, + "grad_norm": 1.9288956193304243, + "language_loss": 0.82339591, + "learning_rate": 1.3006372043993396e-06, + "loss": 0.84424937, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 10208, + "time_per_iteration": 2.3566153049468994 + }, + { + "auxiliary_loss_clip": 0.01062208, + "auxiliary_loss_mlp": 0.01027081, + "balance_loss_clip": 1.01373577, + "balance_loss_mlp": 1.01993454, + "epoch": 0.613798286487299, + "flos": 33180684906240.0, + "grad_norm": 2.7028438913006063, + "language_loss": 0.72441959, + "learning_rate": 1.3002832469760997e-06, + "loss": 0.74531245, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.421875, + "step": 10209, + "time_per_iteration": 2.5410354137420654 + }, + { + "auxiliary_loss_clip": 0.01058924, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.01636505, + "balance_loss_mlp": 1.02040935, + "epoch": 0.613858409739967, + "flos": 25626772010880.0, + "grad_norm": 1.6296162580047324, + "language_loss": 0.74271214, + "learning_rate": 1.29992931452395e-06, + "loss": 0.76357937, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38671875, + "step": 10210, + "time_per_iteration": 2.429305076599121 + }, + { + "auxiliary_loss_clip": 0.01007746, + "auxiliary_loss_mlp": 0.01000683, + "balance_loss_clip": 0.99972361, + "balance_loss_mlp": 1.00058901, + "epoch": 0.6139185329926349, + "flos": 67188810101760.0, + "grad_norm": 0.8512843926431403, + "language_loss": 0.64938885, + "learning_rate": 1.2995754070555229e-06, + "loss": 0.66947323, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.07177734, + "step": 10211, + "time_per_iteration": 3.2051875591278076 + }, + { + "auxiliary_loss_clip": 0.01060763, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.01697016, + "balance_loss_mlp": 1.02077889, + "epoch": 0.6139786562453029, + "flos": 21542562276480.0, + "grad_norm": 1.6525022328436154, + "language_loss": 0.75669003, + "learning_rate": 1.2992215245834472e-06, + "loss": 0.77759385, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 10212, + "time_per_iteration": 2.4175729751586914 + }, + { + "auxiliary_loss_clip": 0.01060162, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01418662, + "balance_loss_mlp": 1.01984596, + "epoch": 0.6140387794979708, + "flos": 26067271034880.0, + "grad_norm": 1.4016268136339314, + "language_loss": 0.73853832, + "learning_rate": 1.298867667120353e-06, + "loss": 0.75940752, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 10213, + "time_per_iteration": 2.489621877670288 + }, + { + "auxiliary_loss_clip": 0.01061784, + "auxiliary_loss_mlp": 0.01024772, + "balance_loss_clip": 1.01154602, + "balance_loss_mlp": 1.01996017, + "epoch": 0.6140989027506388, + "flos": 23693526362880.0, + "grad_norm": 1.5253043070070924, + "language_loss": 0.72896492, + "learning_rate": 1.2985138346788685e-06, + "loss": 0.74983048, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41796875, + "step": 10214, + "time_per_iteration": 2.4181723594665527 + }, + { + "auxiliary_loss_clip": 0.01059532, + "auxiliary_loss_mlp": 0.01024711, + "balance_loss_clip": 1.01199114, + "balance_loss_mlp": 1.01880682, + "epoch": 0.6141590260033067, + "flos": 22230770463360.0, + "grad_norm": 1.543709959625947, + "language_loss": 0.7424686, + "learning_rate": 1.2981600272716207e-06, + "loss": 0.76331097, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 10215, + "time_per_iteration": 2.46167254447937 + }, + { + "auxiliary_loss_clip": 0.01057947, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01724672, + "balance_loss_mlp": 1.01910996, + "epoch": 0.6142191492559748, + "flos": 23870710846080.0, + "grad_norm": 1.7728801762059678, + "language_loss": 0.78433293, + "learning_rate": 1.2978062449112362e-06, + "loss": 0.80520803, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38867188, + "step": 10216, + "time_per_iteration": 5.229642868041992 + }, + { + "auxiliary_loss_clip": 0.01059294, + "auxiliary_loss_mlp": 0.01026552, + "balance_loss_clip": 1.01283693, + "balance_loss_mlp": 1.01926684, + "epoch": 0.6142792725086427, + "flos": 15041804676480.0, + "grad_norm": 2.4425289660569365, + "language_loss": 0.79561687, + "learning_rate": 1.2974524876103404e-06, + "loss": 0.81647539, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.40039062, + "step": 10217, + "time_per_iteration": 2.365567922592163 + }, + { + "auxiliary_loss_clip": 0.01057772, + "auxiliary_loss_mlp": 0.01023698, + "balance_loss_clip": 1.0117414, + "balance_loss_mlp": 1.01766276, + "epoch": 0.6143393957613107, + "flos": 23329836063360.0, + "grad_norm": 2.336840763798742, + "language_loss": 0.78356314, + "learning_rate": 1.2970987553815584e-06, + "loss": 0.80437785, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40039062, + "step": 10218, + "time_per_iteration": 2.4787256717681885 + }, + { + "auxiliary_loss_clip": 0.01059176, + "auxiliary_loss_mlp": 0.01026982, + "balance_loss_clip": 1.01440501, + "balance_loss_mlp": 1.01955867, + "epoch": 0.6143995190139786, + "flos": 20116150968960.0, + "grad_norm": 1.7802903297145451, + "language_loss": 0.82006174, + "learning_rate": 1.2967450482375133e-06, + "loss": 0.84092331, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39648438, + "step": 10219, + "time_per_iteration": 2.436577320098877 + }, + { + "auxiliary_loss_clip": 0.01059948, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.01072359, + "balance_loss_mlp": 1.01865935, + "epoch": 0.6144596422666466, + "flos": 42301918823040.0, + "grad_norm": 1.8114888356092862, + "language_loss": 0.66865343, + "learning_rate": 1.2963913661908287e-06, + "loss": 0.68949145, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 10220, + "time_per_iteration": 2.606074810028076 + }, + { + "auxiliary_loss_clip": 0.0105717, + "auxiliary_loss_mlp": 0.01026487, + "balance_loss_clip": 1.01491785, + "balance_loss_mlp": 1.0184648, + "epoch": 0.6145197655193146, + "flos": 21572727557760.0, + "grad_norm": 1.7857785945402962, + "language_loss": 0.71694469, + "learning_rate": 1.2960377092541267e-06, + "loss": 0.73778123, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38671875, + "step": 10221, + "time_per_iteration": 2.3827075958251953 + }, + { + "auxiliary_loss_clip": 0.01057221, + "auxiliary_loss_mlp": 0.0102236, + "balance_loss_clip": 1.01113009, + "balance_loss_mlp": 1.01789618, + "epoch": 0.6145798887719826, + "flos": 21470012737920.0, + "grad_norm": 1.6711643307894113, + "language_loss": 0.79492968, + "learning_rate": 1.2956840774400274e-06, + "loss": 0.81572545, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.39453125, + "step": 10222, + "time_per_iteration": 2.38411808013916 + }, + { + "auxiliary_loss_clip": 0.01060959, + "auxiliary_loss_mlp": 0.01025974, + "balance_loss_clip": 1.01382113, + "balance_loss_mlp": 1.02048731, + "epoch": 0.6146400120246506, + "flos": 20775974353920.0, + "grad_norm": 2.6152039595843553, + "language_loss": 0.76059359, + "learning_rate": 1.295330470761152e-06, + "loss": 0.78146291, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40429688, + "step": 10223, + "time_per_iteration": 3.9391915798187256 + }, + { + "auxiliary_loss_clip": 0.01056032, + "auxiliary_loss_mlp": 0.01022999, + "balance_loss_clip": 1.01142979, + "balance_loss_mlp": 1.01846516, + "epoch": 0.6147001352773185, + "flos": 13880453477760.0, + "grad_norm": 1.835377750855553, + "language_loss": 0.78431177, + "learning_rate": 1.294976889230119e-06, + "loss": 0.80510211, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.375, + "step": 10224, + "time_per_iteration": 2.3881332874298096 + }, + { + "auxiliary_loss_clip": 0.0105399, + "auxiliary_loss_mlp": 0.01021847, + "balance_loss_clip": 1.01152372, + "balance_loss_mlp": 1.01684284, + "epoch": 0.6147602585299865, + "flos": 56639770289280.0, + "grad_norm": 1.3858643181978332, + "language_loss": 0.6645962, + "learning_rate": 1.2946233328595479e-06, + "loss": 0.68535459, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.37109375, + "step": 10225, + "time_per_iteration": 2.7172627449035645 + }, + { + "auxiliary_loss_clip": 0.01058874, + "auxiliary_loss_mlp": 0.0102418, + "balance_loss_clip": 1.0116334, + "balance_loss_mlp": 1.01827359, + "epoch": 0.6148203817826544, + "flos": 32815842531840.0, + "grad_norm": 2.678724431641214, + "language_loss": 0.62823606, + "learning_rate": 1.2942698016620554e-06, + "loss": 0.64906657, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 10226, + "time_per_iteration": 2.4780192375183105 + }, + { + "auxiliary_loss_clip": 0.01060678, + "auxiliary_loss_mlp": 0.01023914, + "balance_loss_clip": 1.01155782, + "balance_loss_mlp": 1.02038169, + "epoch": 0.6148805050353224, + "flos": 18331076597760.0, + "grad_norm": 2.6009597499703494, + "language_loss": 0.75444424, + "learning_rate": 1.2939162956502582e-06, + "loss": 0.77529013, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40234375, + "step": 10227, + "time_per_iteration": 2.4049737453460693 + }, + { + "auxiliary_loss_clip": 0.01059964, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.01437461, + "balance_loss_mlp": 1.01901627, + "epoch": 0.6149406282879903, + "flos": 14063119044480.0, + "grad_norm": 3.732567938100343, + "language_loss": 0.62090933, + "learning_rate": 1.2935628148367724e-06, + "loss": 0.64177936, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 10228, + "time_per_iteration": 2.3616652488708496 + }, + { + "auxiliary_loss_clip": 0.0105714, + "auxiliary_loss_mlp": 0.01022995, + "balance_loss_clip": 1.01127648, + "balance_loss_mlp": 1.01906395, + "epoch": 0.6150007515406584, + "flos": 25189065895680.0, + "grad_norm": 1.4461196630104702, + "language_loss": 0.74447834, + "learning_rate": 1.2932093592342122e-06, + "loss": 0.76527965, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 10229, + "time_per_iteration": 2.435697317123413 + }, + { + "auxiliary_loss_clip": 0.01060735, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.0150466, + "balance_loss_mlp": 1.02056265, + "epoch": 0.6150608747933263, + "flos": 21944168179200.0, + "grad_norm": 1.651102590350646, + "language_loss": 0.7808888, + "learning_rate": 1.2928559288551921e-06, + "loss": 0.8017711, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 10230, + "time_per_iteration": 2.390890598297119 + }, + { + "auxiliary_loss_clip": 0.01056533, + "auxiliary_loss_mlp": 0.01021486, + "balance_loss_clip": 1.00952923, + "balance_loss_mlp": 1.01782227, + "epoch": 0.6151209980459943, + "flos": 30116148606720.0, + "grad_norm": 1.4872448794137623, + "language_loss": 0.73106587, + "learning_rate": 1.2925025237123253e-06, + "loss": 0.75184608, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38671875, + "step": 10231, + "time_per_iteration": 2.4942030906677246 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.01024651, + "balance_loss_clip": 1.01329601, + "balance_loss_mlp": 1.01912367, + "epoch": 0.6151811212986622, + "flos": 30007045008000.0, + "grad_norm": 1.668043532449051, + "language_loss": 0.69916475, + "learning_rate": 1.2921491438182232e-06, + "loss": 0.71999907, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39648438, + "step": 10232, + "time_per_iteration": 2.497506618499756 + }, + { + "auxiliary_loss_clip": 0.01056088, + "auxiliary_loss_mlp": 0.01021588, + "balance_loss_clip": 1.01031709, + "balance_loss_mlp": 1.0190028, + "epoch": 0.6152412445513302, + "flos": 18872091025920.0, + "grad_norm": 1.8644668005422242, + "language_loss": 0.91955614, + "learning_rate": 1.2917957891854974e-06, + "loss": 0.94033289, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37109375, + "step": 10233, + "time_per_iteration": 2.404421091079712 + }, + { + "auxiliary_loss_clip": 0.01055464, + "auxiliary_loss_mlp": 0.01026131, + "balance_loss_clip": 1.01512814, + "balance_loss_mlp": 1.01789391, + "epoch": 0.6153013678039982, + "flos": 25702393749120.0, + "grad_norm": 1.8028148931165415, + "language_loss": 0.71615714, + "learning_rate": 1.2914424598267577e-06, + "loss": 0.73697311, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37695312, + "step": 10234, + "time_per_iteration": 2.414937973022461 + }, + { + "auxiliary_loss_clip": 0.01058733, + "auxiliary_loss_mlp": 0.01026166, + "balance_loss_clip": 1.01349401, + "balance_loss_mlp": 1.01797366, + "epoch": 0.6153614910566662, + "flos": 28509061680000.0, + "grad_norm": 1.9826172310222032, + "language_loss": 0.67293257, + "learning_rate": 1.2910891557546144e-06, + "loss": 0.69378155, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 10235, + "time_per_iteration": 2.4615471363067627 + }, + { + "auxiliary_loss_clip": 0.01059531, + "auxiliary_loss_mlp": 0.01026101, + "balance_loss_clip": 1.01445985, + "balance_loss_mlp": 1.01867008, + "epoch": 0.6154216143093342, + "flos": 23548671665280.0, + "grad_norm": 1.6924163289092478, + "language_loss": 0.81300718, + "learning_rate": 1.2907358769816755e-06, + "loss": 0.8338635, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40820312, + "step": 10236, + "time_per_iteration": 2.4351491928100586 + }, + { + "auxiliary_loss_clip": 0.01058951, + "auxiliary_loss_mlp": 0.01026061, + "balance_loss_clip": 1.01386571, + "balance_loss_mlp": 1.01811719, + "epoch": 0.6154817375620021, + "flos": 22746961048320.0, + "grad_norm": 1.3737171835926925, + "language_loss": 0.80586427, + "learning_rate": 1.2903826235205487e-06, + "loss": 0.8267144, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40820312, + "step": 10237, + "time_per_iteration": 2.41434907913208 + }, + { + "auxiliary_loss_clip": 0.01059418, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.01512933, + "balance_loss_mlp": 1.01941431, + "epoch": 0.6155418608146701, + "flos": 27161728335360.0, + "grad_norm": 1.5176624774268568, + "language_loss": 0.75835866, + "learning_rate": 1.2900293953838408e-06, + "loss": 0.77922797, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40039062, + "step": 10238, + "time_per_iteration": 2.4461004734039307 + }, + { + "auxiliary_loss_clip": 0.01007524, + "auxiliary_loss_mlp": 0.01000721, + "balance_loss_clip": 0.99974984, + "balance_loss_mlp": 1.00032735, + "epoch": 0.615601984067338, + "flos": 68808257516160.0, + "grad_norm": 0.755116529306874, + "language_loss": 0.57657939, + "learning_rate": 1.2896761925841575e-06, + "loss": 0.59666181, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.07226562, + "step": 10239, + "time_per_iteration": 3.056715488433838 + }, + { + "auxiliary_loss_clip": 0.01057938, + "auxiliary_loss_mlp": 0.01022727, + "balance_loss_clip": 1.01037097, + "balance_loss_mlp": 1.01852727, + "epoch": 0.615662107320006, + "flos": 15516413965440.0, + "grad_norm": 1.828982952970772, + "language_loss": 0.77250516, + "learning_rate": 1.2893230151341038e-06, + "loss": 0.79331183, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 10240, + "time_per_iteration": 2.369738817214966 + }, + { + "auxiliary_loss_clip": 0.01061818, + "auxiliary_loss_mlp": 0.01027764, + "balance_loss_clip": 1.01527691, + "balance_loss_mlp": 1.02088976, + "epoch": 0.615722230572674, + "flos": 21062786106240.0, + "grad_norm": 2.5867473817974336, + "language_loss": 0.72470182, + "learning_rate": 1.288969863046283e-06, + "loss": 0.74559766, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 10241, + "time_per_iteration": 2.400981903076172 + }, + { + "auxiliary_loss_clip": 0.01057496, + "auxiliary_loss_mlp": 0.01019868, + "balance_loss_clip": 1.00895476, + "balance_loss_mlp": 1.01922488, + "epoch": 0.615782353825342, + "flos": 23255715513600.0, + "grad_norm": 1.6275488097100714, + "language_loss": 0.71975183, + "learning_rate": 1.2886167363332996e-06, + "loss": 0.74052548, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3828125, + "step": 10242, + "time_per_iteration": 2.400918960571289 + }, + { + "auxiliary_loss_clip": 0.01058307, + "auxiliary_loss_mlp": 0.01023172, + "balance_loss_clip": 1.01173377, + "balance_loss_mlp": 1.01903307, + "epoch": 0.6158424770780099, + "flos": 21102901125120.0, + "grad_norm": 1.8096410376710985, + "language_loss": 0.72429794, + "learning_rate": 1.288263635007755e-06, + "loss": 0.74511266, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39257812, + "step": 10243, + "time_per_iteration": 3.8289990425109863 + }, + { + "auxiliary_loss_clip": 0.01008036, + "auxiliary_loss_mlp": 0.0100074, + "balance_loss_clip": 0.99989343, + "balance_loss_mlp": 1.00078213, + "epoch": 0.6159026003306779, + "flos": 70329596544000.0, + "grad_norm": 0.7804590657161882, + "language_loss": 0.56822276, + "learning_rate": 1.2879105590822497e-06, + "loss": 0.58831048, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.00848389, + "router_z_loss_mlp": 0.07226562, + "step": 10244, + "time_per_iteration": 3.092496395111084 + }, + { + "auxiliary_loss_clip": 0.01057934, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.01342893, + "balance_loss_mlp": 1.01844764, + "epoch": 0.6159627235833458, + "flos": 33872977722240.0, + "grad_norm": 1.5553737866751574, + "language_loss": 0.6400162, + "learning_rate": 1.2875575085693853e-06, + "loss": 0.66085184, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 10245, + "time_per_iteration": 2.502309799194336 + }, + { + "auxiliary_loss_clip": 0.01055918, + "auxiliary_loss_mlp": 0.01022225, + "balance_loss_clip": 1.0108881, + "balance_loss_mlp": 1.01822865, + "epoch": 0.6160228468360138, + "flos": 26574314843520.0, + "grad_norm": 1.643597574284487, + "language_loss": 0.79217148, + "learning_rate": 1.2872044834817606e-06, + "loss": 0.81295288, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37695312, + "step": 10246, + "time_per_iteration": 2.4281363487243652 + }, + { + "auxiliary_loss_clip": 0.01060989, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.02153206, + "balance_loss_mlp": 1.01974392, + "epoch": 0.6160829700886818, + "flos": 17192559294720.0, + "grad_norm": 2.7712860416572815, + "language_loss": 0.87822139, + "learning_rate": 1.286851483831975e-06, + "loss": 0.89917934, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41210938, + "step": 10247, + "time_per_iteration": 2.3627915382385254 + }, + { + "auxiliary_loss_clip": 0.01059963, + "auxiliary_loss_mlp": 0.01025605, + "balance_loss_clip": 1.01427984, + "balance_loss_mlp": 1.01996708, + "epoch": 0.6161430933413498, + "flos": 23622408190080.0, + "grad_norm": 1.5249457212657294, + "language_loss": 0.79802167, + "learning_rate": 1.2864985096326253e-06, + "loss": 0.8188774, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.40039062, + "step": 10248, + "time_per_iteration": 2.4175097942352295 + }, + { + "auxiliary_loss_clip": 0.01057961, + "auxiliary_loss_mlp": 0.01022685, + "balance_loss_clip": 1.01189077, + "balance_loss_mlp": 1.01932287, + "epoch": 0.6162032165940178, + "flos": 23001338280960.0, + "grad_norm": 1.9495518207715479, + "language_loss": 0.87429643, + "learning_rate": 1.286145560896308e-06, + "loss": 0.89510292, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.38671875, + "step": 10249, + "time_per_iteration": 2.4013166427612305 + }, + { + "auxiliary_loss_clip": 0.01058524, + "auxiliary_loss_mlp": 0.01022919, + "balance_loss_clip": 1.01054561, + "balance_loss_mlp": 1.01924098, + "epoch": 0.6162633398466857, + "flos": 39420397203840.0, + "grad_norm": 2.1105870726236216, + "language_loss": 0.68783057, + "learning_rate": 1.2857926376356196e-06, + "loss": 0.70864499, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39257812, + "step": 10250, + "time_per_iteration": 2.5377144813537598 + }, + { + "auxiliary_loss_clip": 0.01052908, + "auxiliary_loss_mlp": 0.010193, + "balance_loss_clip": 1.008816, + "balance_loss_mlp": 1.01732481, + "epoch": 0.6163234630993537, + "flos": 19243671292800.0, + "grad_norm": 2.48645402671104, + "language_loss": 0.74066633, + "learning_rate": 1.2854397398631544e-06, + "loss": 0.76138842, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.35546875, + "step": 10251, + "time_per_iteration": 2.381608724594116 + }, + { + "auxiliary_loss_clip": 0.01055555, + "auxiliary_loss_mlp": 0.01020871, + "balance_loss_clip": 1.00868177, + "balance_loss_mlp": 1.01769996, + "epoch": 0.6163835863520216, + "flos": 15960857973120.0, + "grad_norm": 2.703309084317094, + "language_loss": 0.68027711, + "learning_rate": 1.2850868675915071e-06, + "loss": 0.70104134, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.37890625, + "step": 10252, + "time_per_iteration": 2.3910634517669678 + }, + { + "auxiliary_loss_clip": 0.01007469, + "auxiliary_loss_mlp": 0.01000678, + "balance_loss_clip": 0.99980152, + "balance_loss_mlp": 1.0005511, + "epoch": 0.6164437096046896, + "flos": 68100322544640.0, + "grad_norm": 0.8785569096544834, + "language_loss": 0.57818586, + "learning_rate": 1.2847340208332705e-06, + "loss": 0.59826732, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06933594, + "step": 10253, + "time_per_iteration": 3.0759451389312744 + }, + { + "auxiliary_loss_clip": 0.01056903, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.0150373, + "balance_loss_mlp": 1.01787889, + "epoch": 0.6165038328573575, + "flos": 21360141089280.0, + "grad_norm": 2.932487983048658, + "language_loss": 0.7971549, + "learning_rate": 1.2843811996010372e-06, + "loss": 0.81799388, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 10254, + "time_per_iteration": 2.4281630516052246 + }, + { + "auxiliary_loss_clip": 0.01060387, + "auxiliary_loss_mlp": 0.01025839, + "balance_loss_clip": 1.01405501, + "balance_loss_mlp": 1.01858735, + "epoch": 0.6165639561100256, + "flos": 21101015911680.0, + "grad_norm": 1.664810310591659, + "language_loss": 0.80176866, + "learning_rate": 1.284028403907398e-06, + "loss": 0.82263088, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41796875, + "step": 10255, + "time_per_iteration": 3.8471121788024902 + }, + { + "auxiliary_loss_clip": 0.01057819, + "auxiliary_loss_mlp": 0.01024608, + "balance_loss_clip": 1.01243126, + "balance_loss_mlp": 1.01830602, + "epoch": 0.6166240793626935, + "flos": 25337341906560.0, + "grad_norm": 5.382248356109734, + "language_loss": 0.66018695, + "learning_rate": 1.2836756337649429e-06, + "loss": 0.6810112, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 10256, + "time_per_iteration": 3.873898983001709 + }, + { + "auxiliary_loss_clip": 0.01058093, + "auxiliary_loss_mlp": 0.01020599, + "balance_loss_clip": 1.00915456, + "balance_loss_mlp": 1.01996422, + "epoch": 0.6166842026153615, + "flos": 19681621787520.0, + "grad_norm": 1.653762059341226, + "language_loss": 0.71767074, + "learning_rate": 1.2833228891862619e-06, + "loss": 0.73845768, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38085938, + "step": 10257, + "time_per_iteration": 2.4154086112976074 + }, + { + "auxiliary_loss_clip": 0.01061372, + "auxiliary_loss_mlp": 0.01023886, + "balance_loss_clip": 1.01163113, + "balance_loss_mlp": 1.02102578, + "epoch": 0.6167443258680294, + "flos": 19317337994880.0, + "grad_norm": 1.6202373364864144, + "language_loss": 0.7185539, + "learning_rate": 1.2829701701839434e-06, + "loss": 0.73940653, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 10258, + "time_per_iteration": 2.424778461456299 + }, + { + "auxiliary_loss_clip": 0.01058555, + "auxiliary_loss_mlp": 0.0102435, + "balance_loss_clip": 1.01202989, + "balance_loss_mlp": 1.01943851, + "epoch": 0.6168044491206974, + "flos": 25264059229440.0, + "grad_norm": 3.413467541221682, + "language_loss": 0.63824683, + "learning_rate": 1.2826174767705758e-06, + "loss": 0.65907586, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 10259, + "time_per_iteration": 2.425518274307251 + }, + { + "auxiliary_loss_clip": 0.0105664, + "auxiliary_loss_mlp": 0.01023181, + "balance_loss_clip": 1.01127839, + "balance_loss_mlp": 1.01920772, + "epoch": 0.6168645723733654, + "flos": 13219198727040.0, + "grad_norm": 2.611647422079873, + "language_loss": 0.70988393, + "learning_rate": 1.282264808958745e-06, + "loss": 0.73068213, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.375, + "step": 10260, + "time_per_iteration": 2.390970230102539 + }, + { + "auxiliary_loss_clip": 0.01062156, + "auxiliary_loss_mlp": 0.01026579, + "balance_loss_clip": 1.01331663, + "balance_loss_mlp": 1.02115858, + "epoch": 0.6169246956260334, + "flos": 26650809365760.0, + "grad_norm": 2.9549392422939618, + "language_loss": 0.68706024, + "learning_rate": 1.2819121667610363e-06, + "loss": 0.70794755, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41015625, + "step": 10261, + "time_per_iteration": 2.436706304550171 + }, + { + "auxiliary_loss_clip": 0.01056448, + "auxiliary_loss_mlp": 0.01025199, + "balance_loss_clip": 1.01404691, + "balance_loss_mlp": 1.01795673, + "epoch": 0.6169848188787014, + "flos": 23147310142080.0, + "grad_norm": 2.2043830849942356, + "language_loss": 0.71074593, + "learning_rate": 1.2815595501900358e-06, + "loss": 0.73156244, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38476562, + "step": 10262, + "time_per_iteration": 3.8188726902008057 + }, + { + "auxiliary_loss_clip": 0.01056361, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.01340032, + "balance_loss_mlp": 1.01743853, + "epoch": 0.6170449421313693, + "flos": 23330778670080.0, + "grad_norm": 2.1793912046968664, + "language_loss": 0.69780129, + "learning_rate": 1.2812069592583265e-06, + "loss": 0.71862304, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 10263, + "time_per_iteration": 2.4215192794799805 + }, + { + "auxiliary_loss_clip": 0.01058975, + "auxiliary_loss_mlp": 0.01021109, + "balance_loss_clip": 1.00897396, + "balance_loss_mlp": 1.01984024, + "epoch": 0.6171050653840373, + "flos": 15850707033600.0, + "grad_norm": 2.0209835608275997, + "language_loss": 0.76415175, + "learning_rate": 1.2808543939784922e-06, + "loss": 0.78495258, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 10264, + "time_per_iteration": 2.3788557052612305 + }, + { + "auxiliary_loss_clip": 0.01058525, + "auxiliary_loss_mlp": 0.01022192, + "balance_loss_clip": 1.00954378, + "balance_loss_mlp": 1.01917017, + "epoch": 0.6171651886367052, + "flos": 20044544037120.0, + "grad_norm": 2.2951190137805573, + "language_loss": 0.85135853, + "learning_rate": 1.2805018543631148e-06, + "loss": 0.87216574, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 10265, + "time_per_iteration": 2.382431983947754 + }, + { + "auxiliary_loss_clip": 0.01057047, + "auxiliary_loss_mlp": 0.01021434, + "balance_loss_clip": 1.01022828, + "balance_loss_mlp": 1.01928234, + "epoch": 0.6172253118893732, + "flos": 26431485004800.0, + "grad_norm": 1.8059067069592323, + "language_loss": 0.6467706, + "learning_rate": 1.2801493404247748e-06, + "loss": 0.66755539, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37890625, + "step": 10266, + "time_per_iteration": 2.451178789138794 + }, + { + "auxiliary_loss_clip": 0.01055701, + "auxiliary_loss_mlp": 0.01021533, + "balance_loss_clip": 1.01027989, + "balance_loss_mlp": 1.0171833, + "epoch": 0.6172854351420412, + "flos": 22631922518400.0, + "grad_norm": 1.6158032783309477, + "language_loss": 0.75652909, + "learning_rate": 1.279796852176054e-06, + "loss": 0.77730137, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38476562, + "step": 10267, + "time_per_iteration": 2.4147942066192627 + }, + { + "auxiliary_loss_clip": 0.01057855, + "auxiliary_loss_mlp": 0.01022784, + "balance_loss_clip": 1.01038575, + "balance_loss_mlp": 1.0181812, + "epoch": 0.6173455583947092, + "flos": 21211934901120.0, + "grad_norm": 1.616301024743452, + "language_loss": 0.75208765, + "learning_rate": 1.2794443896295299e-06, + "loss": 0.77289402, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39648438, + "step": 10268, + "time_per_iteration": 2.4435675144195557 + }, + { + "auxiliary_loss_clip": 0.01058297, + "auxiliary_loss_mlp": 0.01022116, + "balance_loss_clip": 1.0107317, + "balance_loss_mlp": 1.01789796, + "epoch": 0.6174056816473771, + "flos": 19499270423040.0, + "grad_norm": 1.5878555545325805, + "language_loss": 0.75761855, + "learning_rate": 1.279091952797783e-06, + "loss": 0.77842271, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.40429688, + "step": 10269, + "time_per_iteration": 2.402297019958496 + }, + { + "auxiliary_loss_clip": 0.01054999, + "auxiliary_loss_mlp": 0.01020689, + "balance_loss_clip": 1.00935209, + "balance_loss_mlp": 1.01673996, + "epoch": 0.6174658049000451, + "flos": 15996434515200.0, + "grad_norm": 2.3279365868999617, + "language_loss": 0.76557493, + "learning_rate": 1.2787395416933895e-06, + "loss": 0.78633183, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 10270, + "time_per_iteration": 2.365673065185547 + }, + { + "auxiliary_loss_clip": 0.01057973, + "auxiliary_loss_mlp": 0.01023118, + "balance_loss_clip": 1.01151919, + "balance_loss_mlp": 1.01894045, + "epoch": 0.617525928152713, + "flos": 21902935996800.0, + "grad_norm": 1.7484490824576155, + "language_loss": 0.82474005, + "learning_rate": 1.2783871563289263e-06, + "loss": 0.84555101, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 10271, + "time_per_iteration": 2.436485528945923 + }, + { + "auxiliary_loss_clip": 0.01056739, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.0168457, + "balance_loss_mlp": 1.01811409, + "epoch": 0.617586051405381, + "flos": 21104891072640.0, + "grad_norm": 1.4871843341208713, + "language_loss": 0.76934344, + "learning_rate": 1.2780347967169697e-06, + "loss": 0.79019427, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38671875, + "step": 10272, + "time_per_iteration": 2.4341375827789307 + }, + { + "auxiliary_loss_clip": 0.01007424, + "auxiliary_loss_mlp": 0.01001446, + "balance_loss_clip": 1.0005641, + "balance_loss_mlp": 1.00043511, + "epoch": 0.617646174658049, + "flos": 58607717829120.0, + "grad_norm": 0.8084606760302465, + "language_loss": 0.59169996, + "learning_rate": 1.2776824628700938e-06, + "loss": 0.61178869, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.0703125, + "step": 10273, + "time_per_iteration": 2.9334659576416016 + }, + { + "auxiliary_loss_clip": 0.01056238, + "auxiliary_loss_mlp": 0.01022849, + "balance_loss_clip": 1.01052248, + "balance_loss_mlp": 1.01788092, + "epoch": 0.617706297910717, + "flos": 13877904948480.0, + "grad_norm": 2.288836877905841, + "language_loss": 0.73186934, + "learning_rate": 1.2773301548008728e-06, + "loss": 0.75266016, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38476562, + "step": 10274, + "time_per_iteration": 2.3630454540252686 + }, + { + "auxiliary_loss_clip": 0.01056634, + "auxiliary_loss_mlp": 0.01022063, + "balance_loss_clip": 1.01136994, + "balance_loss_mlp": 1.01882052, + "epoch": 0.617766421163385, + "flos": 19207431434880.0, + "grad_norm": 1.800015227889881, + "language_loss": 0.77242881, + "learning_rate": 1.2769778725218797e-06, + "loss": 0.79321575, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.37890625, + "step": 10275, + "time_per_iteration": 2.39947772026062 + }, + { + "auxiliary_loss_clip": 0.01057316, + "auxiliary_loss_mlp": 0.01023166, + "balance_loss_clip": 1.01149499, + "balance_loss_mlp": 1.01835763, + "epoch": 0.6178265444160529, + "flos": 22564854063360.0, + "grad_norm": 1.6508365231303246, + "language_loss": 0.79423082, + "learning_rate": 1.2766256160456866e-06, + "loss": 0.81503564, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38867188, + "step": 10276, + "time_per_iteration": 2.4261984825134277 + }, + { + "auxiliary_loss_clip": 0.01058327, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01573801, + "balance_loss_mlp": 1.01891506, + "epoch": 0.6178866676687209, + "flos": 11983482599040.0, + "grad_norm": 2.434186229025769, + "language_loss": 0.80627918, + "learning_rate": 1.2762733853848647e-06, + "loss": 0.82714689, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 10277, + "time_per_iteration": 2.3734562397003174 + }, + { + "auxiliary_loss_clip": 0.01058742, + "auxiliary_loss_mlp": 0.01022889, + "balance_loss_clip": 1.01046801, + "balance_loss_mlp": 1.01929128, + "epoch": 0.6179467909213888, + "flos": 20990585681280.0, + "grad_norm": 1.6753677684489185, + "language_loss": 0.8132714, + "learning_rate": 1.2759211805519835e-06, + "loss": 0.83408773, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 10278, + "time_per_iteration": 2.423753499984741 + }, + { + "auxiliary_loss_clip": 0.01057599, + "auxiliary_loss_mlp": 0.01024652, + "balance_loss_clip": 1.01357126, + "balance_loss_mlp": 1.01814914, + "epoch": 0.6180069141740568, + "flos": 25336922970240.0, + "grad_norm": 1.937649971200201, + "language_loss": 0.74097258, + "learning_rate": 1.2755690015596133e-06, + "loss": 0.76179504, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.39453125, + "step": 10279, + "time_per_iteration": 2.443000316619873 + }, + { + "auxiliary_loss_clip": 0.01054374, + "auxiliary_loss_mlp": 0.01021403, + "balance_loss_clip": 1.00994694, + "balance_loss_mlp": 1.01634383, + "epoch": 0.6180670374267248, + "flos": 19644718613760.0, + "grad_norm": 1.4457302026575638, + "language_loss": 0.70650613, + "learning_rate": 1.2752168484203215e-06, + "loss": 0.72726393, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38085938, + "step": 10280, + "time_per_iteration": 2.3918004035949707 + }, + { + "auxiliary_loss_clip": 0.01056458, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.01638699, + "balance_loss_mlp": 1.01812351, + "epoch": 0.6181271606793928, + "flos": 19463833526400.0, + "grad_norm": 1.4191822377263112, + "language_loss": 0.72657824, + "learning_rate": 1.2748647211466766e-06, + "loss": 0.7474249, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 10281, + "time_per_iteration": 2.378755569458008 + }, + { + "auxiliary_loss_clip": 0.01054899, + "auxiliary_loss_mlp": 0.01020868, + "balance_loss_clip": 1.01061058, + "balance_loss_mlp": 1.01840174, + "epoch": 0.6181872839320607, + "flos": 25593080682240.0, + "grad_norm": 1.57410305488097, + "language_loss": 0.76406729, + "learning_rate": 1.274512619751244e-06, + "loss": 0.78482497, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.36523438, + "step": 10282, + "time_per_iteration": 2.465322494506836 + }, + { + "auxiliary_loss_clip": 0.01057551, + "auxiliary_loss_mlp": 0.01025476, + "balance_loss_clip": 1.01227379, + "balance_loss_mlp": 1.01735449, + "epoch": 0.6182474071847287, + "flos": 25550766247680.0, + "grad_norm": 2.174993992347473, + "language_loss": 0.69224131, + "learning_rate": 1.27416054424659e-06, + "loss": 0.71307158, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40234375, + "step": 10283, + "time_per_iteration": 3.971557855606079 + }, + { + "auxiliary_loss_clip": 0.01060451, + "auxiliary_loss_mlp": 0.01024462, + "balance_loss_clip": 1.01195109, + "balance_loss_mlp": 1.01834023, + "epoch": 0.6183075304373966, + "flos": 22122749116800.0, + "grad_norm": 3.329384105300463, + "language_loss": 0.75025636, + "learning_rate": 1.2738084946452791e-06, + "loss": 0.77110547, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.421875, + "step": 10284, + "time_per_iteration": 2.390752077102661 + }, + { + "auxiliary_loss_clip": 0.01054911, + "auxiliary_loss_mlp": 0.01024758, + "balance_loss_clip": 1.01252115, + "balance_loss_mlp": 1.01828957, + "epoch": 0.6183676536900646, + "flos": 22454493655680.0, + "grad_norm": 1.9741130084711724, + "language_loss": 0.75434262, + "learning_rate": 1.273456470959875e-06, + "loss": 0.77513933, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.36523438, + "step": 10285, + "time_per_iteration": 2.4281628131866455 + }, + { + "auxiliary_loss_clip": 0.010583, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.01425934, + "balance_loss_mlp": 1.01884198, + "epoch": 0.6184277769427327, + "flos": 23363108455680.0, + "grad_norm": 1.5704184423251049, + "language_loss": 0.73136544, + "learning_rate": 1.2731044732029406e-06, + "loss": 0.75220364, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.39453125, + "step": 10286, + "time_per_iteration": 2.4307827949523926 + }, + { + "auxiliary_loss_clip": 0.01056289, + "auxiliary_loss_mlp": 0.01020154, + "balance_loss_clip": 1.00913906, + "balance_loss_mlp": 1.01846123, + "epoch": 0.6184879001954006, + "flos": 22709953140480.0, + "grad_norm": 3.5427708388578814, + "language_loss": 0.76496017, + "learning_rate": 1.272752501387038e-06, + "loss": 0.78572458, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37890625, + "step": 10287, + "time_per_iteration": 2.425414562225342 + }, + { + "auxiliary_loss_clip": 0.01056129, + "auxiliary_loss_mlp": 0.01024538, + "balance_loss_clip": 1.01293898, + "balance_loss_mlp": 1.01872349, + "epoch": 0.6185480234480686, + "flos": 23840789944320.0, + "grad_norm": 1.708626895305514, + "language_loss": 0.73091507, + "learning_rate": 1.2724005555247273e-06, + "loss": 0.75172174, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 10288, + "time_per_iteration": 2.4130003452301025 + }, + { + "auxiliary_loss_clip": 0.01053914, + "auxiliary_loss_mlp": 0.01020837, + "balance_loss_clip": 1.01073408, + "balance_loss_mlp": 1.01863551, + "epoch": 0.6186081467007365, + "flos": 45475872923520.0, + "grad_norm": 1.4990596895365251, + "language_loss": 0.72729546, + "learning_rate": 1.2720486356285698e-06, + "loss": 0.74804294, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.3515625, + "step": 10289, + "time_per_iteration": 2.603806257247925 + }, + { + "auxiliary_loss_clip": 0.01058436, + "auxiliary_loss_mlp": 0.01022077, + "balance_loss_clip": 1.01016808, + "balance_loss_mlp": 1.017766, + "epoch": 0.6186682699534045, + "flos": 23549719006080.0, + "grad_norm": 1.4691976177295438, + "language_loss": 0.79405224, + "learning_rate": 1.2716967417111235e-06, + "loss": 0.81485736, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 10290, + "time_per_iteration": 2.4208359718322754 + }, + { + "auxiliary_loss_clip": 0.01059095, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.01516998, + "balance_loss_mlp": 1.01898992, + "epoch": 0.6187283932060724, + "flos": 25773058074240.0, + "grad_norm": 1.6150580418767293, + "language_loss": 0.8153013, + "learning_rate": 1.2713448737849474e-06, + "loss": 0.83616388, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 10291, + "time_per_iteration": 2.4430923461914062 + }, + { + "auxiliary_loss_clip": 0.01056798, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.01409554, + "balance_loss_mlp": 1.01840043, + "epoch": 0.6187885164587404, + "flos": 25264024318080.0, + "grad_norm": 1.7469862095640203, + "language_loss": 0.77307391, + "learning_rate": 1.2709930318625989e-06, + "loss": 0.79389399, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38476562, + "step": 10292, + "time_per_iteration": 2.427053928375244 + }, + { + "auxiliary_loss_clip": 0.01061085, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.01754308, + "balance_loss_mlp": 1.01966429, + "epoch": 0.6188486397114084, + "flos": 26249552576640.0, + "grad_norm": 2.1562525591323385, + "language_loss": 0.73584419, + "learning_rate": 1.270641215956633e-06, + "loss": 0.75676787, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4140625, + "step": 10293, + "time_per_iteration": 2.447779655456543 + }, + { + "auxiliary_loss_clip": 0.01058204, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.0161972, + "balance_loss_mlp": 1.02003956, + "epoch": 0.6189087629640764, + "flos": 20922330240000.0, + "grad_norm": 1.5968270152797912, + "language_loss": 0.82711446, + "learning_rate": 1.2702894260796062e-06, + "loss": 0.84798205, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.3828125, + "step": 10294, + "time_per_iteration": 2.3963077068328857 + }, + { + "auxiliary_loss_clip": 0.01058932, + "auxiliary_loss_mlp": 0.01025594, + "balance_loss_clip": 1.013798, + "balance_loss_mlp": 1.01963627, + "epoch": 0.6189688862167443, + "flos": 14828938917120.0, + "grad_norm": 2.0459274687184577, + "language_loss": 0.70778048, + "learning_rate": 1.2699376622440727e-06, + "loss": 0.72862566, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 10295, + "time_per_iteration": 5.268973112106323 + }, + { + "auxiliary_loss_clip": 0.01057898, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.01909828, + "balance_loss_mlp": 1.01953983, + "epoch": 0.6190290094694123, + "flos": 24283767674880.0, + "grad_norm": 1.527906646752715, + "language_loss": 0.70027089, + "learning_rate": 1.2695859244625864e-06, + "loss": 0.72116363, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3828125, + "step": 10296, + "time_per_iteration": 2.4867348670959473 + }, + { + "auxiliary_loss_clip": 0.01059067, + "auxiliary_loss_mlp": 0.01026186, + "balance_loss_clip": 1.01318002, + "balance_loss_mlp": 1.01931477, + "epoch": 0.6190891327220802, + "flos": 22528334914560.0, + "grad_norm": 2.252544904433591, + "language_loss": 0.71567249, + "learning_rate": 1.269234212747699e-06, + "loss": 0.736525, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.3984375, + "step": 10297, + "time_per_iteration": 2.45354962348938 + }, + { + "auxiliary_loss_clip": 0.01007453, + "auxiliary_loss_mlp": 0.01000632, + "balance_loss_clip": 0.99970782, + "balance_loss_mlp": 1.00041425, + "epoch": 0.6191492559747482, + "flos": 67726123925760.0, + "grad_norm": 0.8783601746671468, + "language_loss": 0.6415897, + "learning_rate": 1.2688825271119634e-06, + "loss": 0.66167051, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.0703125, + "step": 10298, + "time_per_iteration": 2.9732346534729004 + }, + { + "auxiliary_loss_clip": 0.01059026, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.01523626, + "balance_loss_mlp": 1.02006984, + "epoch": 0.6192093792274163, + "flos": 22345564613760.0, + "grad_norm": 1.651475415588866, + "language_loss": 0.73979264, + "learning_rate": 1.2685308675679295e-06, + "loss": 0.76065195, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38867188, + "step": 10299, + "time_per_iteration": 2.3932275772094727 + }, + { + "auxiliary_loss_clip": 0.01059574, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.01417327, + "balance_loss_mlp": 1.0189606, + "epoch": 0.6192695024800842, + "flos": 13553072858880.0, + "grad_norm": 1.7741180302550732, + "language_loss": 0.68311727, + "learning_rate": 1.2681792341281474e-06, + "loss": 0.70397705, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 10300, + "time_per_iteration": 2.4194388389587402 + }, + { + "auxiliary_loss_clip": 0.01060538, + "auxiliary_loss_mlp": 0.01024983, + "balance_loss_clip": 1.01306152, + "balance_loss_mlp": 1.02034235, + "epoch": 0.6193296257327522, + "flos": 17414502007680.0, + "grad_norm": 1.776501124317583, + "language_loss": 0.66799563, + "learning_rate": 1.267827626805166e-06, + "loss": 0.68885088, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 10301, + "time_per_iteration": 3.8185672760009766 + }, + { + "auxiliary_loss_clip": 0.01057656, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.01279998, + "balance_loss_mlp": 1.01804566, + "epoch": 0.6193897489854201, + "flos": 31099826563200.0, + "grad_norm": 1.7182465377426757, + "language_loss": 0.72480506, + "learning_rate": 1.267476045611533e-06, + "loss": 0.74562138, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.39648438, + "step": 10302, + "time_per_iteration": 2.4748284816741943 + }, + { + "auxiliary_loss_clip": 0.01057709, + "auxiliary_loss_mlp": 0.01022566, + "balance_loss_clip": 1.00968504, + "balance_loss_mlp": 1.0193342, + "epoch": 0.6194498722380881, + "flos": 19133066505600.0, + "grad_norm": 1.7123466934314093, + "language_loss": 0.80238217, + "learning_rate": 1.267124490559796e-06, + "loss": 0.82318491, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3828125, + "step": 10303, + "time_per_iteration": 2.373781204223633 + }, + { + "auxiliary_loss_clip": 0.01058445, + "auxiliary_loss_mlp": 0.01023287, + "balance_loss_clip": 1.01057911, + "balance_loss_mlp": 1.01827848, + "epoch": 0.619509995490756, + "flos": 21834017239680.0, + "grad_norm": 1.601806810427852, + "language_loss": 0.76547182, + "learning_rate": 1.2667729616625006e-06, + "loss": 0.7862891, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 10304, + "time_per_iteration": 2.4085164070129395 + }, + { + "auxiliary_loss_clip": 0.01061365, + "auxiliary_loss_mlp": 0.01025584, + "balance_loss_clip": 1.01210165, + "balance_loss_mlp": 1.01919615, + "epoch": 0.619570118743424, + "flos": 23805387959040.0, + "grad_norm": 1.8776957287434821, + "language_loss": 0.76920229, + "learning_rate": 1.266421458932192e-06, + "loss": 0.79007185, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.421875, + "step": 10305, + "time_per_iteration": 2.410708427429199 + }, + { + "auxiliary_loss_clip": 0.0106234, + "auxiliary_loss_mlp": 0.01027149, + "balance_loss_clip": 1.01422095, + "balance_loss_mlp": 1.02082169, + "epoch": 0.619630241996092, + "flos": 21100666798080.0, + "grad_norm": 1.6605327126584544, + "language_loss": 0.78777218, + "learning_rate": 1.2660699823814147e-06, + "loss": 0.80866706, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41601562, + "step": 10306, + "time_per_iteration": 2.4003000259399414 + }, + { + "auxiliary_loss_clip": 0.01053369, + "auxiliary_loss_mlp": 0.01019873, + "balance_loss_clip": 1.00938225, + "balance_loss_mlp": 1.01740527, + "epoch": 0.61969036524876, + "flos": 27307036880640.0, + "grad_norm": 1.6813386735576092, + "language_loss": 0.80102038, + "learning_rate": 1.2657185320227122e-06, + "loss": 0.82175279, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.359375, + "step": 10307, + "time_per_iteration": 2.4623067378997803 + }, + { + "auxiliary_loss_clip": 0.0100705, + "auxiliary_loss_mlp": 0.00999791, + "balance_loss_clip": 0.99887341, + "balance_loss_mlp": 1.00019145, + "epoch": 0.6197504885014279, + "flos": 51645896547840.0, + "grad_norm": 0.8052155571387685, + "language_loss": 0.59402251, + "learning_rate": 1.2653671078686261e-06, + "loss": 0.61409092, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.06835938, + "step": 10308, + "time_per_iteration": 3.05623197555542 + }, + { + "auxiliary_loss_clip": 0.01055457, + "auxiliary_loss_mlp": 0.01019671, + "balance_loss_clip": 1.00871611, + "balance_loss_mlp": 1.01816642, + "epoch": 0.6198106117540959, + "flos": 30555739935360.0, + "grad_norm": 1.604695853588542, + "language_loss": 0.68107009, + "learning_rate": 1.2650157099316982e-06, + "loss": 0.70182133, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 10309, + "time_per_iteration": 2.480698585510254 + }, + { + "auxiliary_loss_clip": 0.01057426, + "auxiliary_loss_mlp": 0.0102636, + "balance_loss_clip": 1.01440334, + "balance_loss_mlp": 1.01936018, + "epoch": 0.6198707350067638, + "flos": 18908924376960.0, + "grad_norm": 1.585864983535532, + "language_loss": 0.71988386, + "learning_rate": 1.264664338224469e-06, + "loss": 0.7407217, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38085938, + "step": 10310, + "time_per_iteration": 2.429055690765381 + }, + { + "auxiliary_loss_clip": 0.01060021, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.01471996, + "balance_loss_mlp": 1.02037406, + "epoch": 0.6199308582594318, + "flos": 21432795361920.0, + "grad_norm": 1.8163092548668465, + "language_loss": 0.79078007, + "learning_rate": 1.2643129927594781e-06, + "loss": 0.81165916, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.39453125, + "step": 10311, + "time_per_iteration": 2.410108804702759 + }, + { + "auxiliary_loss_clip": 0.01056361, + "auxiliary_loss_mlp": 0.0102366, + "balance_loss_clip": 1.01149487, + "balance_loss_mlp": 1.01705253, + "epoch": 0.6199909815120999, + "flos": 18406349222400.0, + "grad_norm": 1.6230839984943923, + "language_loss": 0.72465777, + "learning_rate": 1.2639616735492639e-06, + "loss": 0.74545801, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39453125, + "step": 10312, + "time_per_iteration": 2.401395559310913 + }, + { + "auxiliary_loss_clip": 0.01057922, + "auxiliary_loss_mlp": 0.01023448, + "balance_loss_clip": 1.01076448, + "balance_loss_mlp": 1.01887119, + "epoch": 0.6200511047647678, + "flos": 21465893197440.0, + "grad_norm": 2.2843264817103677, + "language_loss": 0.72007382, + "learning_rate": 1.2636103806063644e-06, + "loss": 0.74088752, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 10313, + "time_per_iteration": 2.3887908458709717 + }, + { + "auxiliary_loss_clip": 0.01060213, + "auxiliary_loss_mlp": 0.01022555, + "balance_loss_clip": 1.00980568, + "balance_loss_mlp": 1.02004337, + "epoch": 0.6201112280174358, + "flos": 18215130372480.0, + "grad_norm": 1.861079223857737, + "language_loss": 0.78696239, + "learning_rate": 1.2632591139433167e-06, + "loss": 0.80779004, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 10314, + "time_per_iteration": 2.3802709579467773 + }, + { + "auxiliary_loss_clip": 0.01058036, + "auxiliary_loss_mlp": 0.01025662, + "balance_loss_clip": 1.012954, + "balance_loss_mlp": 1.01843596, + "epoch": 0.6201713512701037, + "flos": 20010154481280.0, + "grad_norm": 2.590357373127183, + "language_loss": 0.7798748, + "learning_rate": 1.2629078735726553e-06, + "loss": 0.80071181, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39648438, + "step": 10315, + "time_per_iteration": 2.3843588829040527 + }, + { + "auxiliary_loss_clip": 0.01061356, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.01739693, + "balance_loss_mlp": 1.02077937, + "epoch": 0.6202314745227717, + "flos": 22486718707200.0, + "grad_norm": 1.6816262963418063, + "language_loss": 0.76003993, + "learning_rate": 1.2625566595069162e-06, + "loss": 0.78094637, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 10316, + "time_per_iteration": 2.4242680072784424 + }, + { + "auxiliary_loss_clip": 0.01059201, + "auxiliary_loss_mlp": 0.0102412, + "balance_loss_clip": 1.01116168, + "balance_loss_mlp": 1.01869476, + "epoch": 0.6202915977754396, + "flos": 26827609824000.0, + "grad_norm": 1.9593595691986636, + "language_loss": 0.77306628, + "learning_rate": 1.2622054717586328e-06, + "loss": 0.79389948, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 10317, + "time_per_iteration": 2.464308023452759 + }, + { + "auxiliary_loss_clip": 0.01008455, + "auxiliary_loss_mlp": 0.01004291, + "balance_loss_clip": 1.00339735, + "balance_loss_mlp": 1.00146675, + "epoch": 0.6203517210281076, + "flos": 62739269233920.0, + "grad_norm": 0.6936785327475918, + "language_loss": 0.59046519, + "learning_rate": 1.2618543103403385e-06, + "loss": 0.61059272, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.00891113, + "router_z_loss_mlp": 0.06982422, + "step": 10318, + "time_per_iteration": 3.0946767330169678 + }, + { + "auxiliary_loss_clip": 0.01060055, + "auxiliary_loss_mlp": 0.01025685, + "balance_loss_clip": 1.0131557, + "balance_loss_mlp": 1.02109993, + "epoch": 0.6204118442807756, + "flos": 23403153651840.0, + "grad_norm": 1.7559616983610047, + "language_loss": 0.77890575, + "learning_rate": 1.261503175264565e-06, + "loss": 0.79976314, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 10319, + "time_per_iteration": 2.5019028186798096 + }, + { + "auxiliary_loss_clip": 0.01058491, + "auxiliary_loss_mlp": 0.0102482, + "balance_loss_clip": 1.01338792, + "balance_loss_mlp": 1.01947808, + "epoch": 0.6204719675334436, + "flos": 20192610579840.0, + "grad_norm": 2.983741313616062, + "language_loss": 0.66192877, + "learning_rate": 1.2611520665438435e-06, + "loss": 0.68276191, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 10320, + "time_per_iteration": 2.384251117706299 + }, + { + "auxiliary_loss_clip": 0.01056922, + "auxiliary_loss_mlp": 0.01024663, + "balance_loss_clip": 1.01367831, + "balance_loss_mlp": 1.01900291, + "epoch": 0.6205320907861115, + "flos": 13187218055040.0, + "grad_norm": 2.087581959424266, + "language_loss": 0.76102006, + "learning_rate": 1.2608009841907046e-06, + "loss": 0.78183591, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37890625, + "step": 10321, + "time_per_iteration": 2.357180118560791 + }, + { + "auxiliary_loss_clip": 0.01055593, + "auxiliary_loss_mlp": 0.01021577, + "balance_loss_clip": 1.00973928, + "balance_loss_mlp": 1.01746964, + "epoch": 0.6205922140387795, + "flos": 20667324602880.0, + "grad_norm": 2.094182404880054, + "language_loss": 0.7368412, + "learning_rate": 1.2604499282176768e-06, + "loss": 0.75761282, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38085938, + "step": 10322, + "time_per_iteration": 2.4001572132110596 + }, + { + "auxiliary_loss_clip": 0.01056568, + "auxiliary_loss_mlp": 0.01019998, + "balance_loss_clip": 1.00836933, + "balance_loss_mlp": 1.01811242, + "epoch": 0.6206523372914474, + "flos": 23876715600000.0, + "grad_norm": 1.9578770232763008, + "language_loss": 0.72567868, + "learning_rate": 1.260098898637289e-06, + "loss": 0.74644428, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38476562, + "step": 10323, + "time_per_iteration": 4.0130109786987305 + }, + { + "auxiliary_loss_clip": 0.01063902, + "auxiliary_loss_mlp": 0.01024275, + "balance_loss_clip": 1.01129353, + "balance_loss_mlp": 1.02118981, + "epoch": 0.6207124605441154, + "flos": 13405774366080.0, + "grad_norm": 3.190161607885941, + "language_loss": 0.58684832, + "learning_rate": 1.2597478954620677e-06, + "loss": 0.60773009, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42578125, + "step": 10324, + "time_per_iteration": 2.4275107383728027 + }, + { + "auxiliary_loss_clip": 0.01058736, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.01515055, + "balance_loss_mlp": 1.01948631, + "epoch": 0.6207725837967835, + "flos": 18915348067200.0, + "grad_norm": 1.6670455178143277, + "language_loss": 0.76778817, + "learning_rate": 1.2593969187045402e-06, + "loss": 0.78865254, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39257812, + "step": 10325, + "time_per_iteration": 2.386425256729126 + }, + { + "auxiliary_loss_clip": 0.0106035, + "auxiliary_loss_mlp": 0.0102173, + "balance_loss_clip": 1.00902867, + "balance_loss_mlp": 1.0195992, + "epoch": 0.6208327070494514, + "flos": 23979290774400.0, + "grad_norm": 1.730879142420144, + "language_loss": 0.72709304, + "learning_rate": 1.2590459683772317e-06, + "loss": 0.74791384, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 10326, + "time_per_iteration": 2.4187400341033936 + }, + { + "auxiliary_loss_clip": 0.01061838, + "auxiliary_loss_mlp": 0.0102969, + "balance_loss_clip": 1.0165534, + "balance_loss_mlp": 1.01996303, + "epoch": 0.6208928303021194, + "flos": 22819301118720.0, + "grad_norm": 2.7255478618550413, + "language_loss": 0.74104697, + "learning_rate": 1.2586950444926663e-06, + "loss": 0.76196223, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41796875, + "step": 10327, + "time_per_iteration": 2.4025745391845703 + }, + { + "auxiliary_loss_clip": 0.01060956, + "auxiliary_loss_mlp": 0.01025116, + "balance_loss_clip": 1.01239586, + "balance_loss_mlp": 1.01932192, + "epoch": 0.6209529535547873, + "flos": 17563615891200.0, + "grad_norm": 2.8866247785067585, + "language_loss": 0.79706895, + "learning_rate": 1.2583441470633683e-06, + "loss": 0.81792969, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41601562, + "step": 10328, + "time_per_iteration": 2.3449184894561768 + }, + { + "auxiliary_loss_clip": 0.01059779, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01596653, + "balance_loss_mlp": 1.01902187, + "epoch": 0.6210130768074553, + "flos": 22010992254720.0, + "grad_norm": 2.1581446398879796, + "language_loss": 0.7817691, + "learning_rate": 1.2579932761018596e-06, + "loss": 0.80265427, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 10329, + "time_per_iteration": 2.4257168769836426 + }, + { + "auxiliary_loss_clip": 0.01007565, + "auxiliary_loss_mlp": 0.01001034, + "balance_loss_clip": 1.00005674, + "balance_loss_mlp": 1.00064254, + "epoch": 0.6210732000601232, + "flos": 63673825040640.0, + "grad_norm": 0.8299303214185655, + "language_loss": 0.62173408, + "learning_rate": 1.2576424316206624e-06, + "loss": 0.64182007, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.06933594, + "step": 10330, + "time_per_iteration": 2.998696804046631 + }, + { + "auxiliary_loss_clip": 0.010616, + "auxiliary_loss_mlp": 0.01027885, + "balance_loss_clip": 1.0148437, + "balance_loss_mlp": 1.0199728, + "epoch": 0.6211333233127913, + "flos": 24242221290240.0, + "grad_norm": 2.636138061431643, + "language_loss": 0.77568126, + "learning_rate": 1.2572916136322974e-06, + "loss": 0.79657614, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 10331, + "time_per_iteration": 2.4400112628936768 + }, + { + "auxiliary_loss_clip": 0.01061056, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.01553273, + "balance_loss_mlp": 1.01966012, + "epoch": 0.6211934465654592, + "flos": 16942930007040.0, + "grad_norm": 1.7870287002243714, + "language_loss": 0.67494988, + "learning_rate": 1.2569408221492835e-06, + "loss": 0.69584394, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 10332, + "time_per_iteration": 2.4551053047180176 + }, + { + "auxiliary_loss_clip": 0.01057767, + "auxiliary_loss_mlp": 0.01022189, + "balance_loss_clip": 1.01072121, + "balance_loss_mlp": 1.01866651, + "epoch": 0.6212535698181272, + "flos": 15266505386880.0, + "grad_norm": 1.5522236076225577, + "language_loss": 0.76349247, + "learning_rate": 1.256590057184141e-06, + "loss": 0.78429204, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.390625, + "step": 10333, + "time_per_iteration": 2.3964481353759766 + }, + { + "auxiliary_loss_clip": 0.01058492, + "auxiliary_loss_mlp": 0.01023847, + "balance_loss_clip": 1.01241481, + "balance_loss_mlp": 1.0196619, + "epoch": 0.6213136930707951, + "flos": 13443096476160.0, + "grad_norm": 2.0153459389584065, + "language_loss": 0.69855368, + "learning_rate": 1.2562393187493866e-06, + "loss": 0.71937716, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 10334, + "time_per_iteration": 5.219463109970093 + }, + { + "auxiliary_loss_clip": 0.01058658, + "auxiliary_loss_mlp": 0.01023534, + "balance_loss_clip": 1.01223946, + "balance_loss_mlp": 1.02009153, + "epoch": 0.6213738163234631, + "flos": 18110320871040.0, + "grad_norm": 1.6509680613649065, + "language_loss": 0.69991601, + "learning_rate": 1.2558886068575381e-06, + "loss": 0.72073793, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38671875, + "step": 10335, + "time_per_iteration": 2.3915858268737793 + }, + { + "auxiliary_loss_clip": 0.0105502, + "auxiliary_loss_mlp": 0.01022767, + "balance_loss_clip": 1.01148403, + "balance_loss_mlp": 1.01721156, + "epoch": 0.621433939576131, + "flos": 25336189831680.0, + "grad_norm": 1.5024979678111523, + "language_loss": 0.79952371, + "learning_rate": 1.2555379215211113e-06, + "loss": 0.82030153, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 10336, + "time_per_iteration": 2.5040295124053955 + }, + { + "auxiliary_loss_clip": 0.01056025, + "auxiliary_loss_mlp": 0.01024698, + "balance_loss_clip": 1.01306283, + "balance_loss_mlp": 1.01831114, + "epoch": 0.621494062828799, + "flos": 22564504949760.0, + "grad_norm": 1.9221686515031722, + "language_loss": 0.73389459, + "learning_rate": 1.2551872627526208e-06, + "loss": 0.75470185, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37695312, + "step": 10337, + "time_per_iteration": 2.400878429412842 + }, + { + "auxiliary_loss_clip": 0.0106109, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.01621425, + "balance_loss_mlp": 1.01964855, + "epoch": 0.621554186081467, + "flos": 27416733972480.0, + "grad_norm": 1.9096159358027391, + "language_loss": 0.63620234, + "learning_rate": 1.2548366305645815e-06, + "loss": 0.65710723, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.4140625, + "step": 10338, + "time_per_iteration": 2.4701740741729736 + }, + { + "auxiliary_loss_clip": 0.01059029, + "auxiliary_loss_mlp": 0.01022579, + "balance_loss_clip": 1.01099205, + "balance_loss_mlp": 1.01937914, + "epoch": 0.621614309334135, + "flos": 22345704259200.0, + "grad_norm": 1.7133174650651068, + "language_loss": 0.71298802, + "learning_rate": 1.2544860249695052e-06, + "loss": 0.73380411, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39648438, + "step": 10339, + "time_per_iteration": 2.390756607055664 + }, + { + "auxiliary_loss_clip": 0.01056992, + "auxiliary_loss_mlp": 0.01022995, + "balance_loss_clip": 1.01131201, + "balance_loss_mlp": 1.01842856, + "epoch": 0.621674432586803, + "flos": 19280225352960.0, + "grad_norm": 1.8860532556294547, + "language_loss": 0.69691509, + "learning_rate": 1.2541354459799057e-06, + "loss": 0.71771497, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38671875, + "step": 10340, + "time_per_iteration": 2.395681619644165 + }, + { + "auxiliary_loss_clip": 0.01055826, + "auxiliary_loss_mlp": 0.01022037, + "balance_loss_clip": 1.0101521, + "balance_loss_mlp": 1.0177381, + "epoch": 0.6217345558394709, + "flos": 21608653213440.0, + "grad_norm": 2.7339324386592843, + "language_loss": 0.68564492, + "learning_rate": 1.2537848936082926e-06, + "loss": 0.70642352, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38085938, + "step": 10341, + "time_per_iteration": 3.80718994140625 + }, + { + "auxiliary_loss_clip": 0.01060278, + "auxiliary_loss_mlp": 0.01025018, + "balance_loss_clip": 1.01245964, + "balance_loss_mlp": 1.01960325, + "epoch": 0.6217946790921389, + "flos": 18003137397120.0, + "grad_norm": 2.5954562044288125, + "language_loss": 0.80699492, + "learning_rate": 1.253434367867178e-06, + "loss": 0.8278479, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40625, + "step": 10342, + "time_per_iteration": 2.3974125385284424 + }, + { + "auxiliary_loss_clip": 0.01007813, + "auxiliary_loss_mlp": 0.0100633, + "balance_loss_clip": 1.0053587, + "balance_loss_mlp": 1.00061202, + "epoch": 0.6218548023448068, + "flos": 61970307338880.0, + "grad_norm": 0.7707126155534422, + "language_loss": 0.57320517, + "learning_rate": 1.2530838687690704e-06, + "loss": 0.59334654, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.07226562, + "step": 10343, + "time_per_iteration": 2.94726824760437 + }, + { + "auxiliary_loss_clip": 0.01054651, + "auxiliary_loss_mlp": 0.0102273, + "balance_loss_clip": 1.01238894, + "balance_loss_mlp": 1.0181191, + "epoch": 0.6219149255974749, + "flos": 25737970291200.0, + "grad_norm": 1.8817797789324373, + "language_loss": 0.72749853, + "learning_rate": 1.2527333963264777e-06, + "loss": 0.7482723, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.36523438, + "step": 10344, + "time_per_iteration": 2.437208652496338 + }, + { + "auxiliary_loss_clip": 0.01059883, + "auxiliary_loss_mlp": 0.01033144, + "balance_loss_clip": 1.02113342, + "balance_loss_mlp": 1.02004254, + "epoch": 0.6219750488501428, + "flos": 25409891445120.0, + "grad_norm": 1.7186655992765343, + "language_loss": 0.6010651, + "learning_rate": 1.2523829505519083e-06, + "loss": 0.62199533, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 10345, + "time_per_iteration": 2.416667938232422 + }, + { + "auxiliary_loss_clip": 0.01058321, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.01474595, + "balance_loss_mlp": 1.01897156, + "epoch": 0.6220351721028108, + "flos": 20046359427840.0, + "grad_norm": 1.8521801503180355, + "language_loss": 0.78351808, + "learning_rate": 1.252032531457868e-06, + "loss": 0.80437291, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39257812, + "step": 10346, + "time_per_iteration": 2.3964521884918213 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.01613951, + "balance_loss_mlp": 1.0185864, + "epoch": 0.6220952953554787, + "flos": 27487223740800.0, + "grad_norm": 2.8979087197311206, + "language_loss": 0.71389717, + "learning_rate": 1.251682139056863e-06, + "loss": 0.73478043, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 10347, + "time_per_iteration": 2.4540412425994873 + }, + { + "auxiliary_loss_clip": 0.0105653, + "auxiliary_loss_mlp": 0.01023672, + "balance_loss_clip": 1.01195931, + "balance_loss_mlp": 1.01715612, + "epoch": 0.6221554186081467, + "flos": 19206628473600.0, + "grad_norm": 1.643088758265078, + "language_loss": 0.71910095, + "learning_rate": 1.2513317733613976e-06, + "loss": 0.73990303, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 10348, + "time_per_iteration": 2.40598464012146 + }, + { + "auxiliary_loss_clip": 0.01008101, + "auxiliary_loss_mlp": 0.01002725, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00106275, + "epoch": 0.6222155418608146, + "flos": 62947805984640.0, + "grad_norm": 0.8077876378227741, + "language_loss": 0.5414834, + "learning_rate": 1.2509814343839748e-06, + "loss": 0.56159163, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.0703125, + "step": 10349, + "time_per_iteration": 3.049462080001831 + }, + { + "auxiliary_loss_clip": 0.01058804, + "auxiliary_loss_mlp": 0.01020866, + "balance_loss_clip": 1.00890875, + "balance_loss_mlp": 1.01934338, + "epoch": 0.6222756651134826, + "flos": 22600011669120.0, + "grad_norm": 1.8909533476261102, + "language_loss": 0.69456255, + "learning_rate": 1.2506311221370984e-06, + "loss": 0.71535927, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 10350, + "time_per_iteration": 2.43404483795166 + }, + { + "auxiliary_loss_clip": 0.01058842, + "auxiliary_loss_mlp": 0.01028683, + "balance_loss_clip": 1.0165298, + "balance_loss_mlp": 1.01926184, + "epoch": 0.6223357883661506, + "flos": 21141165841920.0, + "grad_norm": 1.9915430963437437, + "language_loss": 0.5697369, + "learning_rate": 1.2502808366332694e-06, + "loss": 0.59061217, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39453125, + "step": 10351, + "time_per_iteration": 2.400771141052246 + }, + { + "auxiliary_loss_clip": 0.01007761, + "auxiliary_loss_mlp": 0.01000249, + "balance_loss_clip": 0.99926555, + "balance_loss_mlp": 1.00068307, + "epoch": 0.6223959116188186, + "flos": 63761595932160.0, + "grad_norm": 0.8048586489464741, + "language_loss": 0.61203438, + "learning_rate": 1.2499305778849895e-06, + "loss": 0.63211447, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.07080078, + "step": 10352, + "time_per_iteration": 3.003286123275757 + }, + { + "auxiliary_loss_clip": 0.01055042, + "auxiliary_loss_mlp": 0.01024544, + "balance_loss_clip": 1.01327872, + "balance_loss_mlp": 1.01729321, + "epoch": 0.6224560348714866, + "flos": 22564609683840.0, + "grad_norm": 1.848460457046505, + "language_loss": 0.81879795, + "learning_rate": 1.2495803459047576e-06, + "loss": 0.83959383, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37695312, + "step": 10353, + "time_per_iteration": 2.4102888107299805 + }, + { + "auxiliary_loss_clip": 0.01054495, + "auxiliary_loss_mlp": 0.01024682, + "balance_loss_clip": 1.01378608, + "balance_loss_mlp": 1.01787448, + "epoch": 0.6225161581241545, + "flos": 24096598542720.0, + "grad_norm": 1.6922544881334223, + "language_loss": 0.75636315, + "learning_rate": 1.2492301407050722e-06, + "loss": 0.77715492, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36523438, + "step": 10354, + "time_per_iteration": 2.43788743019104 + }, + { + "auxiliary_loss_clip": 0.01058163, + "auxiliary_loss_mlp": 0.01021797, + "balance_loss_clip": 1.0102458, + "balance_loss_mlp": 1.01927245, + "epoch": 0.6225762813768225, + "flos": 20442623892480.0, + "grad_norm": 1.454495111882195, + "language_loss": 0.78785521, + "learning_rate": 1.2488799622984325e-06, + "loss": 0.80865479, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38867188, + "step": 10355, + "time_per_iteration": 2.4081759452819824 + }, + { + "auxiliary_loss_clip": 0.01057127, + "auxiliary_loss_mlp": 0.01022831, + "balance_loss_clip": 1.01153612, + "balance_loss_mlp": 1.0188911, + "epoch": 0.6226364046294904, + "flos": 27196920852480.0, + "grad_norm": 2.0408338974506486, + "language_loss": 0.77944148, + "learning_rate": 1.2485298106973344e-06, + "loss": 0.80024099, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3828125, + "step": 10356, + "time_per_iteration": 2.4729557037353516 + }, + { + "auxiliary_loss_clip": 0.01061734, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.01598465, + "balance_loss_mlp": 1.02049422, + "epoch": 0.6226965278821585, + "flos": 20444823308160.0, + "grad_norm": 2.029023709791706, + "language_loss": 0.67676151, + "learning_rate": 1.2481796859142745e-06, + "loss": 0.69767153, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 10357, + "time_per_iteration": 2.4267823696136475 + }, + { + "auxiliary_loss_clip": 0.01062646, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01468158, + "balance_loss_mlp": 1.02095485, + "epoch": 0.6227566511348264, + "flos": 22161677149440.0, + "grad_norm": 3.2407146348334304, + "language_loss": 0.75768894, + "learning_rate": 1.247829587961748e-06, + "loss": 0.77859646, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41796875, + "step": 10358, + "time_per_iteration": 2.4231438636779785 + }, + { + "auxiliary_loss_clip": 0.0105679, + "auxiliary_loss_mlp": 0.01025489, + "balance_loss_clip": 1.01301944, + "balance_loss_mlp": 1.01764059, + "epoch": 0.6228167743874944, + "flos": 18039900925440.0, + "grad_norm": 1.94638183917122, + "language_loss": 0.8179208, + "learning_rate": 1.2474795168522483e-06, + "loss": 0.83874357, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 10359, + "time_per_iteration": 2.359069585800171 + }, + { + "auxiliary_loss_clip": 0.01056428, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.01690388, + "balance_loss_mlp": 1.01872396, + "epoch": 0.6228768976401623, + "flos": 17742057183360.0, + "grad_norm": 1.9859159430111668, + "language_loss": 0.73337793, + "learning_rate": 1.247129472598269e-06, + "loss": 0.75422943, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37695312, + "step": 10360, + "time_per_iteration": 2.369205951690674 + }, + { + "auxiliary_loss_clip": 0.01056966, + "auxiliary_loss_mlp": 0.01027076, + "balance_loss_clip": 1.01573324, + "balance_loss_mlp": 1.0181334, + "epoch": 0.6229370208928303, + "flos": 17893963975680.0, + "grad_norm": 1.951479696291086, + "language_loss": 0.77552724, + "learning_rate": 1.246779455212302e-06, + "loss": 0.79636765, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38671875, + "step": 10361, + "time_per_iteration": 2.3714044094085693 + }, + { + "auxiliary_loss_clip": 0.01058499, + "auxiliary_loss_mlp": 0.01026529, + "balance_loss_clip": 1.01477551, + "balance_loss_mlp": 1.01991534, + "epoch": 0.6229971441454982, + "flos": 17346805148160.0, + "grad_norm": 1.603596787055431, + "language_loss": 0.67548531, + "learning_rate": 1.2464294647068392e-06, + "loss": 0.69633555, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 10362, + "time_per_iteration": 3.8430521488189697 + }, + { + "auxiliary_loss_clip": 0.01058851, + "auxiliary_loss_mlp": 0.01022851, + "balance_loss_clip": 1.01073956, + "balance_loss_mlp": 1.01948273, + "epoch": 0.6230572673981662, + "flos": 29240107971840.0, + "grad_norm": 1.8398429513547925, + "language_loss": 0.67687273, + "learning_rate": 1.24607950109437e-06, + "loss": 0.69768977, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 10363, + "time_per_iteration": 2.4843883514404297 + }, + { + "auxiliary_loss_clip": 0.01059863, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.01485384, + "balance_loss_mlp": 1.01926017, + "epoch": 0.6231173906508342, + "flos": 16325037031680.0, + "grad_norm": 1.9179406003858062, + "language_loss": 0.86856651, + "learning_rate": 1.2457295643873845e-06, + "loss": 0.88944483, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 10364, + "time_per_iteration": 2.371103048324585 + }, + { + "auxiliary_loss_clip": 0.01008213, + "auxiliary_loss_mlp": 0.0100189, + "balance_loss_clip": 1.00103211, + "balance_loss_mlp": 1.00111103, + "epoch": 0.6231775139035022, + "flos": 68699119006080.0, + "grad_norm": 0.8854275143271574, + "language_loss": 0.60787392, + "learning_rate": 1.2453796545983704e-06, + "loss": 0.62797493, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.00860596, + "router_z_loss_mlp": 0.07128906, + "step": 10365, + "time_per_iteration": 3.0430266857147217 + }, + { + "auxiliary_loss_clip": 0.01062564, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.01566029, + "balance_loss_mlp": 1.0205245, + "epoch": 0.6232376371561702, + "flos": 19820227351680.0, + "grad_norm": 1.742814216958052, + "language_loss": 0.75011086, + "learning_rate": 1.2450297717398151e-06, + "loss": 0.77104062, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.41992188, + "step": 10366, + "time_per_iteration": 2.3846287727355957 + }, + { + "auxiliary_loss_clip": 0.01059358, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01420689, + "balance_loss_mlp": 1.01906252, + "epoch": 0.6232977604088381, + "flos": 23257146879360.0, + "grad_norm": 1.8957360485716999, + "language_loss": 0.76670241, + "learning_rate": 1.2446799158242056e-06, + "loss": 0.78756714, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 10367, + "time_per_iteration": 2.416487455368042 + }, + { + "auxiliary_loss_clip": 0.0105838, + "auxiliary_loss_mlp": 0.01022817, + "balance_loss_clip": 1.01151633, + "balance_loss_mlp": 1.02086866, + "epoch": 0.6233578836615061, + "flos": 21105344920320.0, + "grad_norm": 2.052904564273824, + "language_loss": 0.78593892, + "learning_rate": 1.244330086864027e-06, + "loss": 0.80675089, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 10368, + "time_per_iteration": 2.402827501296997 + }, + { + "auxiliary_loss_clip": 0.01060915, + "auxiliary_loss_mlp": 0.01023612, + "balance_loss_clip": 1.01095176, + "balance_loss_mlp": 1.02063251, + "epoch": 0.623418006914174, + "flos": 23768275317120.0, + "grad_norm": 1.680514080108386, + "language_loss": 0.68741196, + "learning_rate": 1.2439802848717637e-06, + "loss": 0.7082572, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 10369, + "time_per_iteration": 2.4594600200653076 + }, + { + "auxiliary_loss_clip": 0.01060144, + "auxiliary_loss_mlp": 0.01025252, + "balance_loss_clip": 1.01300371, + "balance_loss_mlp": 1.02033448, + "epoch": 0.6234781301668421, + "flos": 17889634967040.0, + "grad_norm": 2.1158678438641356, + "language_loss": 0.8764236, + "learning_rate": 1.2436305098598997e-06, + "loss": 0.89727759, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3984375, + "step": 10370, + "time_per_iteration": 2.3638453483581543 + }, + { + "auxiliary_loss_clip": 0.01055128, + "auxiliary_loss_mlp": 0.01020646, + "balance_loss_clip": 1.00911832, + "balance_loss_mlp": 1.01792145, + "epoch": 0.62353825341951, + "flos": 26174349774720.0, + "grad_norm": 6.215288636351014, + "language_loss": 0.66029066, + "learning_rate": 1.2432807618409163e-06, + "loss": 0.68104839, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37109375, + "step": 10371, + "time_per_iteration": 2.449317693710327 + }, + { + "auxiliary_loss_clip": 0.01052373, + "auxiliary_loss_mlp": 0.0102251, + "balance_loss_clip": 1.01242542, + "balance_loss_mlp": 1.01673806, + "epoch": 0.623598376672178, + "flos": 31138545127680.0, + "grad_norm": 1.6600189808549104, + "language_loss": 0.77847588, + "learning_rate": 1.2429310408272966e-06, + "loss": 0.79922462, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.35546875, + "step": 10372, + "time_per_iteration": 2.496441602706909 + }, + { + "auxiliary_loss_clip": 0.01056756, + "auxiliary_loss_mlp": 0.01025036, + "balance_loss_clip": 1.01260841, + "balance_loss_mlp": 1.01743472, + "epoch": 0.6236584999248459, + "flos": 23729137816320.0, + "grad_norm": 1.6988964167999878, + "language_loss": 0.77448374, + "learning_rate": 1.24258134683152e-06, + "loss": 0.79530168, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39257812, + "step": 10373, + "time_per_iteration": 3.876476764678955 + }, + { + "auxiliary_loss_clip": 0.01055596, + "auxiliary_loss_mlp": 0.01024406, + "balance_loss_clip": 1.0131762, + "balance_loss_mlp": 1.01853752, + "epoch": 0.6237186231775139, + "flos": 21761677169280.0, + "grad_norm": 1.55215783859111, + "language_loss": 0.69813102, + "learning_rate": 1.2422316798660677e-06, + "loss": 0.71893108, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37109375, + "step": 10374, + "time_per_iteration": 3.84773325920105 + }, + { + "auxiliary_loss_clip": 0.01057602, + "auxiliary_loss_mlp": 0.01022346, + "balance_loss_clip": 1.01115811, + "balance_loss_mlp": 1.01852381, + "epoch": 0.6237787464301818, + "flos": 14938601097600.0, + "grad_norm": 2.7706759717602223, + "language_loss": 0.76879406, + "learning_rate": 1.2418820399434171e-06, + "loss": 0.78959358, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.390625, + "step": 10375, + "time_per_iteration": 2.406522035598755 + }, + { + "auxiliary_loss_clip": 0.01056356, + "auxiliary_loss_mlp": 0.01020259, + "balance_loss_clip": 1.00905955, + "balance_loss_mlp": 1.01863968, + "epoch": 0.6238388696828499, + "flos": 35588854045440.0, + "grad_norm": 1.4617674170077832, + "language_loss": 0.70106542, + "learning_rate": 1.241532427076046e-06, + "loss": 0.72183156, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37695312, + "step": 10376, + "time_per_iteration": 2.5562903881073 + }, + { + "auxiliary_loss_clip": 0.01059315, + "auxiliary_loss_mlp": 0.01027165, + "balance_loss_clip": 1.01402783, + "balance_loss_mlp": 1.01953125, + "epoch": 0.6238989929355178, + "flos": 23622373278720.0, + "grad_norm": 1.612639630707966, + "language_loss": 0.76307732, + "learning_rate": 1.2411828412764322e-06, + "loss": 0.7839421, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.3984375, + "step": 10377, + "time_per_iteration": 2.4334006309509277 + }, + { + "auxiliary_loss_clip": 0.01058987, + "auxiliary_loss_mlp": 0.01021183, + "balance_loss_clip": 1.00994158, + "balance_loss_mlp": 1.02018332, + "epoch": 0.6239591161881858, + "flos": 22086474347520.0, + "grad_norm": 1.793965365316984, + "language_loss": 0.67922997, + "learning_rate": 1.2408332825570504e-06, + "loss": 0.7000317, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38867188, + "step": 10378, + "time_per_iteration": 2.416011095046997 + }, + { + "auxiliary_loss_clip": 0.01059285, + "auxiliary_loss_mlp": 0.01023566, + "balance_loss_clip": 1.01153779, + "balance_loss_mlp": 1.01946568, + "epoch": 0.6240192394408538, + "flos": 24534758505600.0, + "grad_norm": 2.715901235348566, + "language_loss": 0.76578939, + "learning_rate": 1.2404837509303763e-06, + "loss": 0.78661788, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 10379, + "time_per_iteration": 2.418097972869873 + }, + { + "auxiliary_loss_clip": 0.01055223, + "auxiliary_loss_mlp": 0.01023732, + "balance_loss_clip": 1.01199007, + "balance_loss_mlp": 1.01773107, + "epoch": 0.6240793626935217, + "flos": 27930585496320.0, + "grad_norm": 1.4198518453286229, + "language_loss": 0.7954669, + "learning_rate": 1.2401342464088835e-06, + "loss": 0.81625646, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.375, + "step": 10380, + "time_per_iteration": 2.511629819869995 + }, + { + "auxiliary_loss_clip": 0.01056652, + "auxiliary_loss_mlp": 0.01024856, + "balance_loss_clip": 1.01425231, + "balance_loss_mlp": 1.01909566, + "epoch": 0.6241394859461897, + "flos": 22891431720960.0, + "grad_norm": 1.528488367517126, + "language_loss": 0.79932433, + "learning_rate": 1.2397847690050442e-06, + "loss": 0.82013941, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.375, + "step": 10381, + "time_per_iteration": 3.868790864944458 + }, + { + "auxiliary_loss_clip": 0.01058007, + "auxiliary_loss_mlp": 0.01026545, + "balance_loss_clip": 1.01464224, + "balance_loss_mlp": 1.01857853, + "epoch": 0.6241996091988576, + "flos": 12749930876160.0, + "grad_norm": 1.8979433515100117, + "language_loss": 0.74432874, + "learning_rate": 1.2394353187313318e-06, + "loss": 0.76517421, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 10382, + "time_per_iteration": 2.3834540843963623 + }, + { + "auxiliary_loss_clip": 0.01054433, + "auxiliary_loss_mlp": 0.01023618, + "balance_loss_clip": 1.01250148, + "balance_loss_mlp": 1.01780379, + "epoch": 0.6242597324515257, + "flos": 25850041355520.0, + "grad_norm": 1.4282607768198388, + "language_loss": 0.78137994, + "learning_rate": 1.2390858956002163e-06, + "loss": 0.8021605, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.3671875, + "step": 10383, + "time_per_iteration": 2.4444901943206787 + }, + { + "auxiliary_loss_clip": 0.01056594, + "auxiliary_loss_mlp": 0.0102323, + "balance_loss_clip": 1.0125668, + "balance_loss_mlp": 1.01860285, + "epoch": 0.6243198557041936, + "flos": 19936697247360.0, + "grad_norm": 1.7232845921864133, + "language_loss": 0.7532075, + "learning_rate": 1.2387364996241678e-06, + "loss": 0.77400565, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37890625, + "step": 10384, + "time_per_iteration": 2.3893842697143555 + }, + { + "auxiliary_loss_clip": 0.01057731, + "auxiliary_loss_mlp": 0.01021773, + "balance_loss_clip": 1.00950086, + "balance_loss_mlp": 1.01831937, + "epoch": 0.6243799789568616, + "flos": 18405197147520.0, + "grad_norm": 1.784956813465803, + "language_loss": 0.66360724, + "learning_rate": 1.238387130815655e-06, + "loss": 0.68440229, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 10385, + "time_per_iteration": 2.371142864227295 + }, + { + "auxiliary_loss_clip": 0.01008002, + "auxiliary_loss_mlp": 0.01001971, + "balance_loss_clip": 1.00108862, + "balance_loss_mlp": 1.0010916, + "epoch": 0.6244401022095295, + "flos": 66024037589760.0, + "grad_norm": 0.7569617852129333, + "language_loss": 0.62075537, + "learning_rate": 1.2380377891871469e-06, + "loss": 0.64085513, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.06933594, + "step": 10386, + "time_per_iteration": 3.0640921592712402 + }, + { + "auxiliary_loss_clip": 0.01058237, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01473665, + "balance_loss_mlp": 1.0188489, + "epoch": 0.6245002254621975, + "flos": 24570125579520.0, + "grad_norm": 3.295007284198793, + "language_loss": 0.72255647, + "learning_rate": 1.23768847475111e-06, + "loss": 0.74340689, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 10387, + "time_per_iteration": 2.406561851501465 + }, + { + "auxiliary_loss_clip": 0.01062104, + "auxiliary_loss_mlp": 0.01026048, + "balance_loss_clip": 1.01276803, + "balance_loss_mlp": 1.0189991, + "epoch": 0.6245603487148654, + "flos": 29167558433280.0, + "grad_norm": 4.809574010899178, + "language_loss": 0.65484607, + "learning_rate": 1.23733918752001e-06, + "loss": 0.67572749, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.43164062, + "step": 10388, + "time_per_iteration": 2.493380546569824 + }, + { + "auxiliary_loss_clip": 0.01056495, + "auxiliary_loss_mlp": 0.01020642, + "balance_loss_clip": 1.00991893, + "balance_loss_mlp": 1.01841629, + "epoch": 0.6246204719675335, + "flos": 14789312657280.0, + "grad_norm": 1.6331048473878274, + "language_loss": 0.78721845, + "learning_rate": 1.2369899275063133e-06, + "loss": 0.80798978, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.38085938, + "step": 10389, + "time_per_iteration": 2.396672487258911 + }, + { + "auxiliary_loss_clip": 0.01057736, + "auxiliary_loss_mlp": 0.01023009, + "balance_loss_clip": 1.01074791, + "balance_loss_mlp": 1.01931107, + "epoch": 0.6246805952202014, + "flos": 12492760734720.0, + "grad_norm": 1.8616594591592985, + "language_loss": 0.60341781, + "learning_rate": 1.236640694722483e-06, + "loss": 0.62422526, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38476562, + "step": 10390, + "time_per_iteration": 2.3780105113983154 + }, + { + "auxiliary_loss_clip": 0.01057389, + "auxiliary_loss_mlp": 0.01021993, + "balance_loss_clip": 1.01004815, + "balance_loss_mlp": 1.01774192, + "epoch": 0.6247407184728694, + "flos": 12785856531840.0, + "grad_norm": 8.00471462848224, + "language_loss": 0.77234793, + "learning_rate": 1.2362914891809828e-06, + "loss": 0.79314178, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39648438, + "step": 10391, + "time_per_iteration": 2.458235502243042 + }, + { + "auxiliary_loss_clip": 0.01059551, + "auxiliary_loss_mlp": 0.01022532, + "balance_loss_clip": 1.01024771, + "balance_loss_mlp": 1.02052355, + "epoch": 0.6248008417255374, + "flos": 40627484150400.0, + "grad_norm": 1.4882105524576765, + "language_loss": 0.64859128, + "learning_rate": 1.2359423108942752e-06, + "loss": 0.66941202, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 10392, + "time_per_iteration": 2.6122260093688965 + }, + { + "auxiliary_loss_clip": 0.0106003, + "auxiliary_loss_mlp": 0.01025791, + "balance_loss_clip": 1.01351857, + "balance_loss_mlp": 1.02026463, + "epoch": 0.6248609649782053, + "flos": 19900981059840.0, + "grad_norm": 1.7490603610204642, + "language_loss": 0.76922894, + "learning_rate": 1.2355931598748206e-06, + "loss": 0.7900871, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39648438, + "step": 10393, + "time_per_iteration": 2.413374185562134 + }, + { + "auxiliary_loss_clip": 0.01059442, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.01480663, + "balance_loss_mlp": 1.01975703, + "epoch": 0.6249210882308733, + "flos": 19681726521600.0, + "grad_norm": 1.6121573823545716, + "language_loss": 0.82236701, + "learning_rate": 1.2352440361350803e-06, + "loss": 0.84323293, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39648438, + "step": 10394, + "time_per_iteration": 2.426682710647583 + }, + { + "auxiliary_loss_clip": 0.01059757, + "auxiliary_loss_mlp": 0.01024012, + "balance_loss_clip": 1.0115304, + "balance_loss_mlp": 1.01934338, + "epoch": 0.6249812114835412, + "flos": 13989871278720.0, + "grad_norm": 1.7672682653794567, + "language_loss": 0.75256729, + "learning_rate": 1.2348949396875125e-06, + "loss": 0.77340496, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40429688, + "step": 10395, + "time_per_iteration": 2.384826898574829 + }, + { + "auxiliary_loss_clip": 0.01058837, + "auxiliary_loss_mlp": 0.01023305, + "balance_loss_clip": 1.01165199, + "balance_loss_mlp": 1.01885533, + "epoch": 0.6250413347362093, + "flos": 14529384518400.0, + "grad_norm": 2.1601043635683777, + "language_loss": 0.72974712, + "learning_rate": 1.2345458705445771e-06, + "loss": 0.75056851, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40039062, + "step": 10396, + "time_per_iteration": 2.3907690048217773 + }, + { + "auxiliary_loss_clip": 0.01058724, + "auxiliary_loss_mlp": 0.01023026, + "balance_loss_clip": 1.01173723, + "balance_loss_mlp": 1.02019072, + "epoch": 0.6251014579888772, + "flos": 22961991312000.0, + "grad_norm": 1.7048049317451766, + "language_loss": 0.76076829, + "learning_rate": 1.23419682871873e-06, + "loss": 0.78158581, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38476562, + "step": 10397, + "time_per_iteration": 2.411170721054077 + }, + { + "auxiliary_loss_clip": 0.01056838, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.01369643, + "balance_loss_mlp": 1.01866317, + "epoch": 0.6251615812415452, + "flos": 28109969395200.0, + "grad_norm": 1.9859372551429846, + "language_loss": 0.75071758, + "learning_rate": 1.2338478142224285e-06, + "loss": 0.77154189, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38085938, + "step": 10398, + "time_per_iteration": 2.457659959793091 + }, + { + "auxiliary_loss_clip": 0.01058227, + "auxiliary_loss_mlp": 0.0102524, + "balance_loss_clip": 1.0123055, + "balance_loss_mlp": 1.0178349, + "epoch": 0.6252217044942131, + "flos": 26723254170240.0, + "grad_norm": 2.157195099202031, + "language_loss": 0.738446, + "learning_rate": 1.2334988270681277e-06, + "loss": 0.75928062, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40429688, + "step": 10399, + "time_per_iteration": 2.4353506565093994 + }, + { + "auxiliary_loss_clip": 0.01057747, + "auxiliary_loss_mlp": 0.01025312, + "balance_loss_clip": 1.01333189, + "balance_loss_mlp": 1.01885235, + "epoch": 0.6252818277468811, + "flos": 20005860384000.0, + "grad_norm": 1.543781630454466, + "language_loss": 0.74600172, + "learning_rate": 1.2331498672682819e-06, + "loss": 0.76683235, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38867188, + "step": 10400, + "time_per_iteration": 2.4022443294525146 + }, + { + "auxiliary_loss_clip": 0.01008145, + "auxiliary_loss_mlp": 0.01002202, + "balance_loss_clip": 1.00133824, + "balance_loss_mlp": 1.00136924, + "epoch": 0.625341950999549, + "flos": 59274907511040.0, + "grad_norm": 0.8331251295343464, + "language_loss": 0.56482047, + "learning_rate": 1.232800934835345e-06, + "loss": 0.58492398, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.00866699, + "router_z_loss_mlp": 0.06787109, + "step": 10401, + "time_per_iteration": 4.589165210723877 + }, + { + "auxiliary_loss_clip": 0.01058821, + "auxiliary_loss_mlp": 0.01027281, + "balance_loss_clip": 1.0147109, + "balance_loss_mlp": 1.01926458, + "epoch": 0.625402074252217, + "flos": 20156056519680.0, + "grad_norm": 1.9639499604818134, + "language_loss": 0.82813823, + "learning_rate": 1.2324520297817693e-06, + "loss": 0.84899926, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39453125, + "step": 10402, + "time_per_iteration": 2.4352450370788574 + }, + { + "auxiliary_loss_clip": 0.01055473, + "auxiliary_loss_mlp": 0.01022051, + "balance_loss_clip": 1.01047611, + "balance_loss_mlp": 1.01757669, + "epoch": 0.625462197504885, + "flos": 29131248752640.0, + "grad_norm": 2.145448231157836, + "language_loss": 0.68586457, + "learning_rate": 1.2321031521200057e-06, + "loss": 0.70663983, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37890625, + "step": 10403, + "time_per_iteration": 2.455071449279785 + }, + { + "auxiliary_loss_clip": 0.01059611, + "auxiliary_loss_mlp": 0.01020971, + "balance_loss_clip": 1.00829875, + "balance_loss_mlp": 1.01855135, + "epoch": 0.625522320757553, + "flos": 26103231601920.0, + "grad_norm": 1.521457276332652, + "language_loss": 0.71943879, + "learning_rate": 1.2317543018625058e-06, + "loss": 0.74024463, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 10404, + "time_per_iteration": 2.436936855316162 + }, + { + "auxiliary_loss_clip": 0.0105825, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.0148015, + "balance_loss_mlp": 1.01939559, + "epoch": 0.625582444010221, + "flos": 20629932670080.0, + "grad_norm": 2.093500366884469, + "language_loss": 0.69750118, + "learning_rate": 1.2314054790217184e-06, + "loss": 0.71835029, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38867188, + "step": 10405, + "time_per_iteration": 2.4035158157348633 + }, + { + "auxiliary_loss_clip": 0.0105789, + "auxiliary_loss_mlp": 0.01023137, + "balance_loss_clip": 1.01126337, + "balance_loss_mlp": 1.01959395, + "epoch": 0.6256425672628889, + "flos": 20520479957760.0, + "grad_norm": 1.6889792807589612, + "language_loss": 0.79115647, + "learning_rate": 1.2310566836100927e-06, + "loss": 0.81196678, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3828125, + "step": 10406, + "time_per_iteration": 2.3984835147857666 + }, + { + "auxiliary_loss_clip": 0.01058669, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.01625252, + "balance_loss_mlp": 1.01957297, + "epoch": 0.6257026905155569, + "flos": 29528036887680.0, + "grad_norm": 1.7109954272107677, + "language_loss": 0.68121016, + "learning_rate": 1.2307079156400756e-06, + "loss": 0.70207834, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.390625, + "step": 10407, + "time_per_iteration": 2.487356662750244 + }, + { + "auxiliary_loss_clip": 0.01056338, + "auxiliary_loss_mlp": 0.01022773, + "balance_loss_clip": 1.01198411, + "balance_loss_mlp": 1.01905847, + "epoch": 0.6257628137682248, + "flos": 24023734801920.0, + "grad_norm": 2.0731966925833016, + "language_loss": 0.85572082, + "learning_rate": 1.2303591751241146e-06, + "loss": 0.87651193, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37304688, + "step": 10408, + "time_per_iteration": 2.4152045249938965 + }, + { + "auxiliary_loss_clip": 0.01056978, + "auxiliary_loss_mlp": 0.01024364, + "balance_loss_clip": 1.01378465, + "balance_loss_mlp": 1.01912725, + "epoch": 0.6258229370208929, + "flos": 20849885435520.0, + "grad_norm": 1.6423951805944128, + "language_loss": 0.79698455, + "learning_rate": 1.230010462074655e-06, + "loss": 0.8177979, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.37890625, + "step": 10409, + "time_per_iteration": 2.4246833324432373 + }, + { + "auxiliary_loss_clip": 0.01057523, + "auxiliary_loss_mlp": 0.01021871, + "balance_loss_clip": 1.01022398, + "balance_loss_mlp": 1.01891851, + "epoch": 0.6258830602735608, + "flos": 22230595906560.0, + "grad_norm": 3.392140120301666, + "language_loss": 0.69603884, + "learning_rate": 1.2296617765041408e-06, + "loss": 0.71683276, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 10410, + "time_per_iteration": 2.4009859561920166 + }, + { + "auxiliary_loss_clip": 0.01056362, + "auxiliary_loss_mlp": 0.01022708, + "balance_loss_clip": 1.0124203, + "balance_loss_mlp": 1.01931345, + "epoch": 0.6259431835262288, + "flos": 25075877667840.0, + "grad_norm": 1.6782206805389166, + "language_loss": 0.67704642, + "learning_rate": 1.2293131184250167e-06, + "loss": 0.69783711, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.37109375, + "step": 10411, + "time_per_iteration": 2.4234259128570557 + }, + { + "auxiliary_loss_clip": 0.01058194, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_clip": 1.01458168, + "balance_loss_mlp": 1.01858306, + "epoch": 0.6260033067788967, + "flos": 28251158400000.0, + "grad_norm": 1.7636175824631908, + "language_loss": 0.69485211, + "learning_rate": 1.2289644878497244e-06, + "loss": 0.71570086, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 10412, + "time_per_iteration": 2.4737114906311035 + }, + { + "auxiliary_loss_clip": 0.01054271, + "auxiliary_loss_mlp": 0.01021231, + "balance_loss_clip": 1.01017404, + "balance_loss_mlp": 1.01704776, + "epoch": 0.6260634300315647, + "flos": 23366320300800.0, + "grad_norm": 1.5028537207772883, + "language_loss": 0.692662, + "learning_rate": 1.2286158847907074e-06, + "loss": 0.71341699, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37109375, + "step": 10413, + "time_per_iteration": 5.251358270645142 + }, + { + "auxiliary_loss_clip": 0.01058265, + "auxiliary_loss_mlp": 0.01024337, + "balance_loss_clip": 1.01084888, + "balance_loss_mlp": 1.01774037, + "epoch": 0.6261235532842326, + "flos": 18034489664640.0, + "grad_norm": 2.8352289419867636, + "language_loss": 0.80841875, + "learning_rate": 1.2282673092604045e-06, + "loss": 0.82924473, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.40429688, + "step": 10414, + "time_per_iteration": 2.397747755050659 + }, + { + "auxiliary_loss_clip": 0.01058951, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.01834869, + "balance_loss_mlp": 1.01966918, + "epoch": 0.6261836765369007, + "flos": 22010363850240.0, + "grad_norm": 1.5924377791533966, + "language_loss": 0.74796361, + "learning_rate": 1.227918761271256e-06, + "loss": 0.76886201, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 10415, + "time_per_iteration": 2.4678773880004883 + }, + { + "auxiliary_loss_clip": 0.01057229, + "auxiliary_loss_mlp": 0.01025245, + "balance_loss_clip": 1.01413465, + "balance_loss_mlp": 1.01882935, + "epoch": 0.6262437997895686, + "flos": 24934863219840.0, + "grad_norm": 1.5371684563999861, + "language_loss": 0.74301469, + "learning_rate": 1.227570240835701e-06, + "loss": 0.76383948, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.3828125, + "step": 10416, + "time_per_iteration": 2.425384044647217 + }, + { + "auxiliary_loss_clip": 0.01056388, + "auxiliary_loss_mlp": 0.01022151, + "balance_loss_clip": 1.01128507, + "balance_loss_mlp": 1.01917696, + "epoch": 0.6263039230422366, + "flos": 31607219485440.0, + "grad_norm": 1.5484630891273743, + "language_loss": 0.8434965, + "learning_rate": 1.2272217479661771e-06, + "loss": 0.86428189, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37109375, + "step": 10417, + "time_per_iteration": 2.4842324256896973 + }, + { + "auxiliary_loss_clip": 0.01057277, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.01405287, + "balance_loss_mlp": 1.01832604, + "epoch": 0.6263640462949046, + "flos": 17638504490880.0, + "grad_norm": 2.695557959219408, + "language_loss": 0.57284611, + "learning_rate": 1.2268732826751214e-06, + "loss": 0.59368598, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 10418, + "time_per_iteration": 2.3642637729644775 + }, + { + "auxiliary_loss_clip": 0.0106063, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.01493335, + "balance_loss_mlp": 1.01912594, + "epoch": 0.6264241695475725, + "flos": 19973914623360.0, + "grad_norm": 1.868190826091013, + "language_loss": 0.85061967, + "learning_rate": 1.2265248449749694e-06, + "loss": 0.87150371, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 10419, + "time_per_iteration": 2.426812171936035 + }, + { + "auxiliary_loss_clip": 0.01058696, + "auxiliary_loss_mlp": 0.01024419, + "balance_loss_clip": 1.01261151, + "balance_loss_mlp": 1.01895916, + "epoch": 0.6264842928002405, + "flos": 27343102181760.0, + "grad_norm": 1.8001015843094568, + "language_loss": 0.64782417, + "learning_rate": 1.2261764348781558e-06, + "loss": 0.6686554, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 10420, + "time_per_iteration": 3.8192625045776367 + }, + { + "auxiliary_loss_clip": 0.01059791, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.01125669, + "balance_loss_mlp": 1.01960361, + "epoch": 0.6265444160529084, + "flos": 22996311045120.0, + "grad_norm": 1.8226732092660662, + "language_loss": 0.85948455, + "learning_rate": 1.2258280523971154e-06, + "loss": 0.88032883, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.40234375, + "step": 10421, + "time_per_iteration": 2.411940097808838 + }, + { + "auxiliary_loss_clip": 0.01058734, + "auxiliary_loss_mlp": 0.01026508, + "balance_loss_clip": 1.0148375, + "balance_loss_mlp": 1.0196569, + "epoch": 0.6266045393055765, + "flos": 19937290740480.0, + "grad_norm": 2.1028380109810034, + "language_loss": 0.79704255, + "learning_rate": 1.2254796975442795e-06, + "loss": 0.817895, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 10422, + "time_per_iteration": 2.3910913467407227 + }, + { + "auxiliary_loss_clip": 0.0105489, + "auxiliary_loss_mlp": 0.01022099, + "balance_loss_clip": 1.01008832, + "balance_loss_mlp": 1.01720643, + "epoch": 0.6266646625582444, + "flos": 24387948771840.0, + "grad_norm": 2.3290145185068676, + "language_loss": 0.74702144, + "learning_rate": 1.2251313703320816e-06, + "loss": 0.76779133, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37695312, + "step": 10423, + "time_per_iteration": 2.408616065979004 + }, + { + "auxiliary_loss_clip": 0.01057203, + "auxiliary_loss_mlp": 0.01021778, + "balance_loss_clip": 1.00986898, + "balance_loss_mlp": 1.01944387, + "epoch": 0.6267247858109124, + "flos": 14682932144640.0, + "grad_norm": 1.9908235670919254, + "language_loss": 0.77795005, + "learning_rate": 1.2247830707729518e-06, + "loss": 0.79873979, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37890625, + "step": 10424, + "time_per_iteration": 2.3835675716400146 + }, + { + "auxiliary_loss_clip": 0.01057335, + "auxiliary_loss_mlp": 0.01024067, + "balance_loss_clip": 1.0114603, + "balance_loss_mlp": 1.01903057, + "epoch": 0.6267849090635803, + "flos": 24928998111360.0, + "grad_norm": 1.785540311575651, + "language_loss": 0.69087934, + "learning_rate": 1.2244347988793198e-06, + "loss": 0.71169335, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3828125, + "step": 10425, + "time_per_iteration": 2.432279348373413 + }, + { + "auxiliary_loss_clip": 0.01054441, + "auxiliary_loss_mlp": 0.01023392, + "balance_loss_clip": 1.01330638, + "balance_loss_mlp": 1.01881933, + "epoch": 0.6268450323162483, + "flos": 25336678590720.0, + "grad_norm": 1.540688454810348, + "language_loss": 0.74257046, + "learning_rate": 1.2240865546636152e-06, + "loss": 0.76334882, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.35546875, + "step": 10426, + "time_per_iteration": 2.428976535797119 + }, + { + "auxiliary_loss_clip": 0.01057526, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.01700926, + "balance_loss_mlp": 1.0186075, + "epoch": 0.6269051555689162, + "flos": 26176095342720.0, + "grad_norm": 1.3611111449237792, + "language_loss": 0.80416679, + "learning_rate": 1.2237383381382652e-06, + "loss": 0.82503688, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 10427, + "time_per_iteration": 2.451932907104492 + }, + { + "auxiliary_loss_clip": 0.01060181, + "auxiliary_loss_mlp": 0.01023856, + "balance_loss_clip": 1.01165485, + "balance_loss_mlp": 1.02002609, + "epoch": 0.6269652788215843, + "flos": 18255978529920.0, + "grad_norm": 2.3321324005499506, + "language_loss": 0.6898337, + "learning_rate": 1.2233901493156978e-06, + "loss": 0.71067405, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 10428, + "time_per_iteration": 2.3714382648468018 + }, + { + "auxiliary_loss_clip": 0.01057589, + "auxiliary_loss_mlp": 0.01025781, + "balance_loss_clip": 1.01411068, + "balance_loss_mlp": 1.01939034, + "epoch": 0.6270254020742522, + "flos": 11764612085760.0, + "grad_norm": 1.8626623449741035, + "language_loss": 0.71450746, + "learning_rate": 1.2230419882083375e-06, + "loss": 0.73534119, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38085938, + "step": 10429, + "time_per_iteration": 2.3859360218048096 + }, + { + "auxiliary_loss_clip": 0.01060576, + "auxiliary_loss_mlp": 0.01025424, + "balance_loss_clip": 1.01308608, + "balance_loss_mlp": 1.0202837, + "epoch": 0.6270855253269202, + "flos": 23474551115520.0, + "grad_norm": 1.4490588152463628, + "language_loss": 0.8039428, + "learning_rate": 1.2226938548286105e-06, + "loss": 0.82480288, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40234375, + "step": 10430, + "time_per_iteration": 2.4386234283447266 + }, + { + "auxiliary_loss_clip": 0.01007913, + "auxiliary_loss_mlp": 0.01000275, + "balance_loss_clip": 0.99947619, + "balance_loss_mlp": 1.00115716, + "epoch": 0.6271456485795882, + "flos": 70061219856000.0, + "grad_norm": 0.8166104260647936, + "language_loss": 0.59111297, + "learning_rate": 1.2223457491889404e-06, + "loss": 0.61119485, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.00799561, + "router_z_loss_mlp": 0.06738281, + "step": 10431, + "time_per_iteration": 3.152205467224121 + }, + { + "auxiliary_loss_clip": 0.01007777, + "auxiliary_loss_mlp": 0.01000637, + "balance_loss_clip": 0.99978423, + "balance_loss_mlp": 1.00111949, + "epoch": 0.6272057718322561, + "flos": 65153059102080.0, + "grad_norm": 0.8744204241134932, + "language_loss": 0.63773066, + "learning_rate": 1.22199767130175e-06, + "loss": 0.6578148, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.06640625, + "step": 10432, + "time_per_iteration": 3.1809139251708984 + }, + { + "auxiliary_loss_clip": 0.01056387, + "auxiliary_loss_mlp": 0.01025605, + "balance_loss_clip": 1.01498342, + "balance_loss_mlp": 1.01829362, + "epoch": 0.6272658950849241, + "flos": 24388193151360.0, + "grad_norm": 1.8171967553102641, + "language_loss": 0.64827085, + "learning_rate": 1.2216496211794609e-06, + "loss": 0.66909081, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.38085938, + "step": 10433, + "time_per_iteration": 2.4216973781585693 + }, + { + "auxiliary_loss_clip": 0.01061689, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.01368117, + "balance_loss_mlp": 1.02066159, + "epoch": 0.627326018337592, + "flos": 17965082148480.0, + "grad_norm": 2.2070128391319974, + "language_loss": 0.61729485, + "learning_rate": 1.221301598834496e-06, + "loss": 0.63818043, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41015625, + "step": 10434, + "time_per_iteration": 2.388514757156372 + }, + { + "auxiliary_loss_clip": 0.01056332, + "auxiliary_loss_mlp": 0.01023072, + "balance_loss_clip": 1.01159787, + "balance_loss_mlp": 1.01775503, + "epoch": 0.6273861415902601, + "flos": 20229059905920.0, + "grad_norm": 1.6345155461340086, + "language_loss": 0.84496635, + "learning_rate": 1.220953604279273e-06, + "loss": 0.86576039, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38476562, + "step": 10435, + "time_per_iteration": 2.3929190635681152 + }, + { + "auxiliary_loss_clip": 0.01007399, + "auxiliary_loss_mlp": 0.01001673, + "balance_loss_clip": 1.00082088, + "balance_loss_mlp": 1.00069928, + "epoch": 0.627446264842928, + "flos": 64950144946560.0, + "grad_norm": 0.7439953025349773, + "language_loss": 0.61536372, + "learning_rate": 1.2206056375262116e-06, + "loss": 0.63545442, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.06738281, + "step": 10436, + "time_per_iteration": 3.1012892723083496 + }, + { + "auxiliary_loss_clip": 0.01059047, + "auxiliary_loss_mlp": 0.01024847, + "balance_loss_clip": 1.01216352, + "balance_loss_mlp": 1.02007461, + "epoch": 0.627506388095596, + "flos": 23583200866560.0, + "grad_norm": 1.5811709489595676, + "language_loss": 0.77734601, + "learning_rate": 1.2202576985877312e-06, + "loss": 0.79818499, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 10437, + "time_per_iteration": 2.416121006011963 + }, + { + "auxiliary_loss_clip": 0.01007337, + "auxiliary_loss_mlp": 0.01004402, + "balance_loss_clip": 1.00351954, + "balance_loss_mlp": 1.00060678, + "epoch": 0.6275665113482639, + "flos": 67580396444160.0, + "grad_norm": 0.7272630183098491, + "language_loss": 0.54280841, + "learning_rate": 1.2199097874762472e-06, + "loss": 0.56292576, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.06738281, + "step": 10438, + "time_per_iteration": 3.00494647026062 + }, + { + "auxiliary_loss_clip": 0.01057475, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.0204742, + "balance_loss_mlp": 1.01834285, + "epoch": 0.6276266346009319, + "flos": 27635674308480.0, + "grad_norm": 1.7705472253017593, + "language_loss": 0.84387875, + "learning_rate": 1.219561904204176e-06, + "loss": 0.86478281, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 10439, + "time_per_iteration": 2.43599271774292 + }, + { + "auxiliary_loss_clip": 0.01060439, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01649415, + "balance_loss_mlp": 1.01965833, + "epoch": 0.6276867578535998, + "flos": 22745075834880.0, + "grad_norm": 1.9993326361858246, + "language_loss": 0.80227435, + "learning_rate": 1.2192140487839328e-06, + "loss": 0.82316899, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 10440, + "time_per_iteration": 2.3998799324035645 + }, + { + "auxiliary_loss_clip": 0.01053946, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.01581049, + "balance_loss_mlp": 1.01756442, + "epoch": 0.6277468811062679, + "flos": 24643059143040.0, + "grad_norm": 1.489801628924187, + "language_loss": 0.74373579, + "learning_rate": 1.218866221227933e-06, + "loss": 0.76453876, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36328125, + "step": 10441, + "time_per_iteration": 3.8923678398132324 + }, + { + "auxiliary_loss_clip": 0.01056867, + "auxiliary_loss_mlp": 0.01021134, + "balance_loss_clip": 1.00929677, + "balance_loss_mlp": 1.01848125, + "epoch": 0.6278070043589358, + "flos": 19678060828800.0, + "grad_norm": 1.726738294630841, + "language_loss": 0.72307789, + "learning_rate": 1.2185184215485873e-06, + "loss": 0.74385792, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 10442, + "time_per_iteration": 2.390047550201416 + }, + { + "auxiliary_loss_clip": 0.01058044, + "auxiliary_loss_mlp": 0.01023768, + "balance_loss_clip": 1.01225233, + "balance_loss_mlp": 1.01964498, + "epoch": 0.6278671276116038, + "flos": 22120898814720.0, + "grad_norm": 1.4392318825295243, + "language_loss": 0.71588558, + "learning_rate": 1.2181706497583096e-06, + "loss": 0.73670369, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38476562, + "step": 10443, + "time_per_iteration": 2.4422624111175537 + }, + { + "auxiliary_loss_clip": 0.01058206, + "auxiliary_loss_mlp": 0.01023106, + "balance_loss_clip": 1.01192474, + "balance_loss_mlp": 1.01975262, + "epoch": 0.6279272508642717, + "flos": 23037473404800.0, + "grad_norm": 3.8473620626483025, + "language_loss": 0.7314446, + "learning_rate": 1.2178229058695104e-06, + "loss": 0.7522577, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38476562, + "step": 10444, + "time_per_iteration": 2.4346444606781006 + }, + { + "auxiliary_loss_clip": 0.01057526, + "auxiliary_loss_mlp": 0.01025805, + "balance_loss_clip": 1.01359761, + "balance_loss_mlp": 1.01960659, + "epoch": 0.6279873741169397, + "flos": 19823194817280.0, + "grad_norm": 1.9177791357791025, + "language_loss": 0.72616661, + "learning_rate": 1.217475189894599e-06, + "loss": 0.74699992, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.37890625, + "step": 10445, + "time_per_iteration": 2.412992477416992 + }, + { + "auxiliary_loss_clip": 0.01056836, + "auxiliary_loss_mlp": 0.01023407, + "balance_loss_clip": 1.01125324, + "balance_loss_mlp": 1.01825881, + "epoch": 0.6280474973696077, + "flos": 23914247178240.0, + "grad_norm": 1.4709267886188675, + "language_loss": 0.68547177, + "learning_rate": 1.2171275018459853e-06, + "loss": 0.70627421, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38671875, + "step": 10446, + "time_per_iteration": 2.4453186988830566 + }, + { + "auxiliary_loss_clip": 0.01061579, + "auxiliary_loss_mlp": 0.01024701, + "balance_loss_clip": 1.011814, + "balance_loss_mlp": 1.01961374, + "epoch": 0.6281076206222757, + "flos": 17967002273280.0, + "grad_norm": 8.244677614966093, + "language_loss": 0.73182988, + "learning_rate": 1.216779841736078e-06, + "loss": 0.75269264, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41992188, + "step": 10447, + "time_per_iteration": 2.432509660720825 + }, + { + "auxiliary_loss_clip": 0.01007831, + "auxiliary_loss_mlp": 0.01001635, + "balance_loss_clip": 1.00089324, + "balance_loss_mlp": 1.00112081, + "epoch": 0.6281677438749437, + "flos": 66775229602560.0, + "grad_norm": 0.678854495010334, + "language_loss": 0.52359986, + "learning_rate": 1.2164322095772826e-06, + "loss": 0.54369462, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.00741577, + "router_z_loss_mlp": 0.06738281, + "step": 10448, + "time_per_iteration": 2.926494836807251 + }, + { + "auxiliary_loss_clip": 0.01061484, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.02077758, + "balance_loss_mlp": 1.02067053, + "epoch": 0.6282278671276116, + "flos": 11655368841600.0, + "grad_norm": 2.8234384175745086, + "language_loss": 0.91078162, + "learning_rate": 1.216084605382006e-06, + "loss": 0.93173492, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40820312, + "step": 10449, + "time_per_iteration": 2.4098057746887207 + }, + { + "auxiliary_loss_clip": 0.01056978, + "auxiliary_loss_mlp": 0.01025315, + "balance_loss_clip": 1.01402545, + "balance_loss_mlp": 1.01775873, + "epoch": 0.6282879903802796, + "flos": 42739939140480.0, + "grad_norm": 1.486712202544255, + "language_loss": 0.60054159, + "learning_rate": 1.2157370291626534e-06, + "loss": 0.62136453, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.39257812, + "step": 10450, + "time_per_iteration": 2.607388496398926 + }, + { + "auxiliary_loss_clip": 0.01055647, + "auxiliary_loss_mlp": 0.01024543, + "balance_loss_clip": 1.01323009, + "balance_loss_mlp": 1.01834226, + "epoch": 0.6283481136329475, + "flos": 20008234356480.0, + "grad_norm": 1.8237817755988865, + "language_loss": 0.76792032, + "learning_rate": 1.2153894809316297e-06, + "loss": 0.78872222, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37304688, + "step": 10451, + "time_per_iteration": 2.4004359245300293 + }, + { + "auxiliary_loss_clip": 0.01058279, + "auxiliary_loss_mlp": 0.01022786, + "balance_loss_clip": 1.01052594, + "balance_loss_mlp": 1.01787305, + "epoch": 0.6284082368856155, + "flos": 21903459667200.0, + "grad_norm": 1.5368724574703565, + "language_loss": 0.77267146, + "learning_rate": 1.2150419607013365e-06, + "loss": 0.79348218, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 10452, + "time_per_iteration": 3.9383530616760254 + }, + { + "auxiliary_loss_clip": 0.01058616, + "auxiliary_loss_mlp": 0.01024104, + "balance_loss_clip": 1.01234984, + "balance_loss_mlp": 1.02037847, + "epoch": 0.6284683601382834, + "flos": 25482999565440.0, + "grad_norm": 1.6514943964405588, + "language_loss": 0.72685057, + "learning_rate": 1.2146944684841764e-06, + "loss": 0.7476778, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 10453, + "time_per_iteration": 3.881088972091675 + }, + { + "auxiliary_loss_clip": 0.01055924, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.01384163, + "balance_loss_mlp": 1.01703548, + "epoch": 0.6285284833909515, + "flos": 16537937702400.0, + "grad_norm": 1.8885044458908946, + "language_loss": 0.81216371, + "learning_rate": 1.2143470042925516e-06, + "loss": 0.83298469, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38867188, + "step": 10454, + "time_per_iteration": 2.372993230819702 + }, + { + "auxiliary_loss_clip": 0.01057367, + "auxiliary_loss_mlp": 0.01024087, + "balance_loss_clip": 1.01310229, + "balance_loss_mlp": 1.01860785, + "epoch": 0.6285886066436194, + "flos": 22819580409600.0, + "grad_norm": 1.9621895926523178, + "language_loss": 0.82480156, + "learning_rate": 1.2139995681388603e-06, + "loss": 0.8456161, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.38671875, + "step": 10455, + "time_per_iteration": 2.425513505935669 + }, + { + "auxiliary_loss_clip": 0.01057697, + "auxiliary_loss_mlp": 0.01023759, + "balance_loss_clip": 1.01229656, + "balance_loss_mlp": 1.01863503, + "epoch": 0.6286487298962874, + "flos": 24714631163520.0, + "grad_norm": 1.6412525143973191, + "language_loss": 0.81612217, + "learning_rate": 1.2136521600355028e-06, + "loss": 0.83693677, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 10456, + "time_per_iteration": 2.4456627368927 + }, + { + "auxiliary_loss_clip": 0.01059458, + "auxiliary_loss_mlp": 0.01023192, + "balance_loss_clip": 1.01110983, + "balance_loss_mlp": 1.01916432, + "epoch": 0.6287088531489553, + "flos": 20739769407360.0, + "grad_norm": 2.832274065405238, + "language_loss": 0.72120005, + "learning_rate": 1.2133047799948776e-06, + "loss": 0.74202657, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 10457, + "time_per_iteration": 2.404127836227417 + }, + { + "auxiliary_loss_clip": 0.01060536, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.01336098, + "balance_loss_mlp": 1.01838923, + "epoch": 0.6287689764016233, + "flos": 23069663544960.0, + "grad_norm": 1.6684111955250993, + "language_loss": 0.79927218, + "learning_rate": 1.2129574280293808e-06, + "loss": 0.82014436, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.421875, + "step": 10458, + "time_per_iteration": 2.422454833984375 + }, + { + "auxiliary_loss_clip": 0.01059858, + "auxiliary_loss_mlp": 0.01027602, + "balance_loss_clip": 1.01519847, + "balance_loss_mlp": 1.01980567, + "epoch": 0.6288290996542913, + "flos": 32232304200960.0, + "grad_norm": 1.758593309217106, + "language_loss": 0.61130041, + "learning_rate": 1.2126101041514085e-06, + "loss": 0.63217503, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40039062, + "step": 10459, + "time_per_iteration": 2.4964170455932617 + }, + { + "auxiliary_loss_clip": 0.01059052, + "auxiliary_loss_mlp": 0.01022115, + "balance_loss_clip": 1.0107069, + "balance_loss_mlp": 1.01966238, + "epoch": 0.6288892229069593, + "flos": 24640266234240.0, + "grad_norm": 1.8854402056890953, + "language_loss": 0.79021084, + "learning_rate": 1.2122628083733562e-06, + "loss": 0.81102252, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.39453125, + "step": 10460, + "time_per_iteration": 3.8022756576538086 + }, + { + "auxiliary_loss_clip": 0.01057936, + "auxiliary_loss_mlp": 0.01025334, + "balance_loss_clip": 1.01392531, + "balance_loss_mlp": 1.01885974, + "epoch": 0.6289493461596273, + "flos": 17857375004160.0, + "grad_norm": 1.5905831569147304, + "language_loss": 0.73785168, + "learning_rate": 1.211915540707619e-06, + "loss": 0.7586844, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 10461, + "time_per_iteration": 2.361142158508301 + }, + { + "auxiliary_loss_clip": 0.01058172, + "auxiliary_loss_mlp": 0.01026081, + "balance_loss_clip": 1.01411211, + "balance_loss_mlp": 1.01954806, + "epoch": 0.6290094694122952, + "flos": 22344307804800.0, + "grad_norm": 1.799606841290098, + "language_loss": 0.70071685, + "learning_rate": 1.2115683011665877e-06, + "loss": 0.72155935, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38671875, + "step": 10462, + "time_per_iteration": 2.407729387283325 + }, + { + "auxiliary_loss_clip": 0.01054429, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.0136168, + "balance_loss_mlp": 1.01898265, + "epoch": 0.6290695926649632, + "flos": 28401179978880.0, + "grad_norm": 2.6249826883203617, + "language_loss": 0.72391337, + "learning_rate": 1.211221089762656e-06, + "loss": 0.74469739, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.35546875, + "step": 10463, + "time_per_iteration": 2.4317352771759033 + }, + { + "auxiliary_loss_clip": 0.01056841, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.01654792, + "balance_loss_mlp": 1.01968145, + "epoch": 0.6291297159176311, + "flos": 21504437205120.0, + "grad_norm": 1.6644455870547348, + "language_loss": 0.72597933, + "learning_rate": 1.2108739065082155e-06, + "loss": 0.74683088, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.37109375, + "step": 10464, + "time_per_iteration": 2.389374017715454 + }, + { + "auxiliary_loss_clip": 0.01057266, + "auxiliary_loss_mlp": 0.01025264, + "balance_loss_clip": 1.01416564, + "balance_loss_mlp": 1.01934564, + "epoch": 0.6291898391702991, + "flos": 12202492757760.0, + "grad_norm": 1.6144155687037784, + "language_loss": 0.68972272, + "learning_rate": 1.2105267514156544e-06, + "loss": 0.71054804, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37890625, + "step": 10465, + "time_per_iteration": 2.3736841678619385 + }, + { + "auxiliary_loss_clip": 0.01007104, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_clip": 1.0012784, + "balance_loss_mlp": 1.00048566, + "epoch": 0.629249962422967, + "flos": 69296168033280.0, + "grad_norm": 0.6798604934857725, + "language_loss": 0.59803843, + "learning_rate": 1.2101796244973626e-06, + "loss": 0.61812985, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.00759888, + "router_z_loss_mlp": 0.06640625, + "step": 10466, + "time_per_iteration": 3.1333208084106445 + }, + { + "auxiliary_loss_clip": 0.01053553, + "auxiliary_loss_mlp": 0.01023735, + "balance_loss_clip": 1.0124166, + "balance_loss_mlp": 1.0159502, + "epoch": 0.6293100856756351, + "flos": 40076310516480.0, + "grad_norm": 1.863557274541058, + "language_loss": 0.64637637, + "learning_rate": 1.2098325257657286e-06, + "loss": 0.66714931, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 10467, + "time_per_iteration": 2.539336919784546 + }, + { + "auxiliary_loss_clip": 0.01007967, + "auxiliary_loss_mlp": 0.0100189, + "balance_loss_clip": 1.00094271, + "balance_loss_mlp": 1.00137496, + "epoch": 0.629370208928303, + "flos": 67498141547520.0, + "grad_norm": 0.8002594368804994, + "language_loss": 0.57000428, + "learning_rate": 1.2094854552331398e-06, + "loss": 0.59010285, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.06591797, + "step": 10468, + "time_per_iteration": 2.906493663787842 + }, + { + "auxiliary_loss_clip": 0.01007313, + "auxiliary_loss_mlp": 0.0100089, + "balance_loss_clip": 1.00007367, + "balance_loss_mlp": 1.0007025, + "epoch": 0.629430332180971, + "flos": 60657154081920.0, + "grad_norm": 0.7390820835932422, + "language_loss": 0.57969642, + "learning_rate": 1.2091384129119809e-06, + "loss": 0.59977853, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.00817871, + "router_z_loss_mlp": 0.06640625, + "step": 10469, + "time_per_iteration": 2.997147560119629 + }, + { + "auxiliary_loss_clip": 0.01007388, + "auxiliary_loss_mlp": 0.01001062, + "balance_loss_clip": 1.00018561, + "balance_loss_mlp": 1.00079441, + "epoch": 0.6294904554336389, + "flos": 66866107605120.0, + "grad_norm": 0.6766634888962791, + "language_loss": 0.52120507, + "learning_rate": 1.2087913988146379e-06, + "loss": 0.54128957, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06640625, + "step": 10470, + "time_per_iteration": 3.1302459239959717 + }, + { + "auxiliary_loss_clip": 0.01058407, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.01377642, + "balance_loss_mlp": 1.01893663, + "epoch": 0.6295505786863069, + "flos": 42521138449920.0, + "grad_norm": 1.6417532717101435, + "language_loss": 0.74513918, + "learning_rate": 1.2084444129534951e-06, + "loss": 0.76598227, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 10471, + "time_per_iteration": 2.6057558059692383 + }, + { + "auxiliary_loss_clip": 0.01057249, + "auxiliary_loss_mlp": 0.01024136, + "balance_loss_clip": 1.01218557, + "balance_loss_mlp": 1.01828003, + "epoch": 0.629610701938975, + "flos": 17383184651520.0, + "grad_norm": 1.9516305352710963, + "language_loss": 0.62269944, + "learning_rate": 1.2080974553409347e-06, + "loss": 0.64351326, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 10472, + "time_per_iteration": 2.3856801986694336 + }, + { + "auxiliary_loss_clip": 0.01062728, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.01676822, + "balance_loss_mlp": 1.0207665, + "epoch": 0.6296708251916429, + "flos": 24241802353920.0, + "grad_norm": 1.65443736587884, + "language_loss": 0.69635344, + "learning_rate": 1.2077505259893392e-06, + "loss": 0.71727419, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41992188, + "step": 10473, + "time_per_iteration": 2.4359183311462402 + }, + { + "auxiliary_loss_clip": 0.01056963, + "auxiliary_loss_mlp": 0.01023162, + "balance_loss_clip": 1.01142001, + "balance_loss_mlp": 1.01890802, + "epoch": 0.6297309484443109, + "flos": 19277607000960.0, + "grad_norm": 1.5045582065591216, + "language_loss": 0.74827904, + "learning_rate": 1.2074036249110901e-06, + "loss": 0.76908028, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 10474, + "time_per_iteration": 2.4637320041656494 + }, + { + "auxiliary_loss_clip": 0.01058196, + "auxiliary_loss_mlp": 0.01024075, + "balance_loss_clip": 1.01204097, + "balance_loss_mlp": 1.01920271, + "epoch": 0.6297910716969788, + "flos": 30661422220800.0, + "grad_norm": 1.4619239553697827, + "language_loss": 0.66175872, + "learning_rate": 1.2070567521185656e-06, + "loss": 0.68258142, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 10475, + "time_per_iteration": 2.504405975341797 + }, + { + "auxiliary_loss_clip": 0.01054262, + "auxiliary_loss_mlp": 0.01024313, + "balance_loss_clip": 1.01350689, + "balance_loss_mlp": 1.0179143, + "epoch": 0.6298511949496468, + "flos": 14422305778560.0, + "grad_norm": 1.8777958442939309, + "language_loss": 0.89877832, + "learning_rate": 1.2067099076241465e-06, + "loss": 0.91956401, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.36328125, + "step": 10476, + "time_per_iteration": 2.4161221981048584 + }, + { + "auxiliary_loss_clip": 0.01058039, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.01751292, + "balance_loss_mlp": 1.01966095, + "epoch": 0.6299113182023147, + "flos": 23513025300480.0, + "grad_norm": 1.6138887836530078, + "language_loss": 0.75213683, + "learning_rate": 1.20636309144021e-06, + "loss": 0.7730031, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 10477, + "time_per_iteration": 2.434692859649658 + }, + { + "auxiliary_loss_clip": 0.01058614, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.0132457, + "balance_loss_mlp": 1.019104, + "epoch": 0.6299714414549827, + "flos": 22673399080320.0, + "grad_norm": 1.8028755965249168, + "language_loss": 0.70455205, + "learning_rate": 1.2060163035791341e-06, + "loss": 0.72539866, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 10478, + "time_per_iteration": 2.444063186645508 + }, + { + "auxiliary_loss_clip": 0.01061446, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.01151156, + "balance_loss_mlp": 1.01961076, + "epoch": 0.6300315647076506, + "flos": 14500860071040.0, + "grad_norm": 2.083810345073692, + "language_loss": 0.67329377, + "learning_rate": 1.2056695440532932e-06, + "loss": 0.69416189, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.41796875, + "step": 10479, + "time_per_iteration": 2.3906569480895996 + }, + { + "auxiliary_loss_clip": 0.01058769, + "auxiliary_loss_mlp": 0.01025912, + "balance_loss_clip": 1.0137887, + "balance_loss_mlp": 1.02042663, + "epoch": 0.6300916879603187, + "flos": 21870606211200.0, + "grad_norm": 1.6665090876362598, + "language_loss": 0.73669571, + "learning_rate": 1.205322812875063e-06, + "loss": 0.75754249, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3828125, + "step": 10480, + "time_per_iteration": 2.445596933364868 + }, + { + "auxiliary_loss_clip": 0.01058223, + "auxiliary_loss_mlp": 0.01029201, + "balance_loss_clip": 1.01627898, + "balance_loss_mlp": 1.01862645, + "epoch": 0.6301518112129866, + "flos": 21833004810240.0, + "grad_norm": 1.7189655182564032, + "language_loss": 0.78962737, + "learning_rate": 1.2049761100568182e-06, + "loss": 0.81050158, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39648438, + "step": 10481, + "time_per_iteration": 3.882920980453491 + }, + { + "auxiliary_loss_clip": 0.01007346, + "auxiliary_loss_mlp": 0.01005142, + "balance_loss_clip": 1.00435555, + "balance_loss_mlp": 1.00080562, + "epoch": 0.6302119344656546, + "flos": 44331872670720.0, + "grad_norm": 0.8948433422880964, + "language_loss": 0.61793208, + "learning_rate": 1.2046294356109302e-06, + "loss": 0.63805699, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.00787354, + "router_z_loss_mlp": 0.06542969, + "step": 10482, + "time_per_iteration": 2.823683738708496 + }, + { + "auxiliary_loss_clip": 0.0106258, + "auxiliary_loss_mlp": 0.0102534, + "balance_loss_clip": 1.01173818, + "balance_loss_mlp": 1.0197916, + "epoch": 0.6302720577183225, + "flos": 11217139056000.0, + "grad_norm": 1.9978517832100666, + "language_loss": 0.82861197, + "learning_rate": 1.2042827895497714e-06, + "loss": 0.84949124, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42773438, + "step": 10483, + "time_per_iteration": 2.39579701423645 + }, + { + "auxiliary_loss_clip": 0.01054593, + "auxiliary_loss_mlp": 0.01027579, + "balance_loss_clip": 1.01568198, + "balance_loss_mlp": 1.01807392, + "epoch": 0.6303321809709905, + "flos": 27963683331840.0, + "grad_norm": 1.6176223213762064, + "language_loss": 0.72406101, + "learning_rate": 1.2039361718857132e-06, + "loss": 0.7448827, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.36523438, + "step": 10484, + "time_per_iteration": 2.440159320831299 + }, + { + "auxiliary_loss_clip": 0.01057531, + "auxiliary_loss_mlp": 0.01023219, + "balance_loss_clip": 1.01092207, + "balance_loss_mlp": 1.01815736, + "epoch": 0.6303923042236586, + "flos": 28219491930240.0, + "grad_norm": 1.7612820350948497, + "language_loss": 0.74620891, + "learning_rate": 1.2035895826311265e-06, + "loss": 0.76701635, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 10485, + "time_per_iteration": 2.4458961486816406 + }, + { + "auxiliary_loss_clip": 0.01060731, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.01368189, + "balance_loss_mlp": 1.01950657, + "epoch": 0.6304524274763265, + "flos": 27629948845440.0, + "grad_norm": 4.004485650421458, + "language_loss": 0.81267637, + "learning_rate": 1.2032430217983778e-06, + "loss": 0.8335427, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41210938, + "step": 10486, + "time_per_iteration": 2.4416091442108154 + }, + { + "auxiliary_loss_clip": 0.01057608, + "auxiliary_loss_mlp": 0.010252, + "balance_loss_clip": 1.01384568, + "balance_loss_mlp": 1.01904416, + "epoch": 0.6305125507289945, + "flos": 17310355822080.0, + "grad_norm": 1.5855647230145495, + "language_loss": 0.63782394, + "learning_rate": 1.2028964893998362e-06, + "loss": 0.65865195, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38671875, + "step": 10487, + "time_per_iteration": 2.3886818885803223 + }, + { + "auxiliary_loss_clip": 0.01058666, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.01594925, + "balance_loss_mlp": 1.01999426, + "epoch": 0.6305726739816624, + "flos": 25807203250560.0, + "grad_norm": 1.5277303672219822, + "language_loss": 0.67149043, + "learning_rate": 1.2025499854478698e-06, + "loss": 0.69235468, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38671875, + "step": 10488, + "time_per_iteration": 2.471667528152466 + }, + { + "auxiliary_loss_clip": 0.01058908, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.01610565, + "balance_loss_mlp": 1.01950073, + "epoch": 0.6306327972343304, + "flos": 21796415838720.0, + "grad_norm": 1.6700361275524422, + "language_loss": 0.67030084, + "learning_rate": 1.2022035099548418e-06, + "loss": 0.6911667, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39453125, + "step": 10489, + "time_per_iteration": 2.436645984649658 + }, + { + "auxiliary_loss_clip": 0.01061909, + "auxiliary_loss_mlp": 0.01026566, + "balance_loss_clip": 1.01345921, + "balance_loss_mlp": 1.01957273, + "epoch": 0.6306929204869983, + "flos": 20776323467520.0, + "grad_norm": 1.8173916941396318, + "language_loss": 0.6930331, + "learning_rate": 1.2018570629331184e-06, + "loss": 0.71391785, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.421875, + "step": 10490, + "time_per_iteration": 2.4098074436187744 + }, + { + "auxiliary_loss_clip": 0.01060296, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.01626492, + "balance_loss_mlp": 1.02000046, + "epoch": 0.6307530437396663, + "flos": 23653236787200.0, + "grad_norm": 1.742479775577464, + "language_loss": 0.78240347, + "learning_rate": 1.2015106443950641e-06, + "loss": 0.80329782, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 10491, + "time_per_iteration": 2.4234344959259033 + }, + { + "auxiliary_loss_clip": 0.01059627, + "auxiliary_loss_mlp": 0.01025282, + "balance_loss_clip": 1.0131228, + "balance_loss_mlp": 1.02003217, + "epoch": 0.6308131669923343, + "flos": 24717808097280.0, + "grad_norm": 1.9124335036745799, + "language_loss": 0.55642641, + "learning_rate": 1.2011642543530403e-06, + "loss": 0.57727545, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39648438, + "step": 10492, + "time_per_iteration": 5.271852731704712 + }, + { + "auxiliary_loss_clip": 0.01059349, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.01554251, + "balance_loss_mlp": 1.01912856, + "epoch": 0.6308732902450023, + "flos": 22564295481600.0, + "grad_norm": 2.275138326969966, + "language_loss": 0.64490581, + "learning_rate": 1.2008178928194092e-06, + "loss": 0.66578376, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 10493, + "time_per_iteration": 2.3970577716827393 + }, + { + "auxiliary_loss_clip": 0.01057922, + "auxiliary_loss_mlp": 0.01021226, + "balance_loss_clip": 1.00935864, + "balance_loss_mlp": 1.01888704, + "epoch": 0.6309334134976702, + "flos": 24643059143040.0, + "grad_norm": 1.3510956707077117, + "language_loss": 0.82764447, + "learning_rate": 1.2004715598065321e-06, + "loss": 0.848436, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 10494, + "time_per_iteration": 2.5279700756073 + }, + { + "auxiliary_loss_clip": 0.01060339, + "auxiliary_loss_mlp": 0.01025025, + "balance_loss_clip": 1.01257396, + "balance_loss_mlp": 1.01998603, + "epoch": 0.6309935367503382, + "flos": 41426332035840.0, + "grad_norm": 1.7610540767994167, + "language_loss": 0.79161817, + "learning_rate": 1.200125255326769e-06, + "loss": 0.81247175, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40429688, + "step": 10495, + "time_per_iteration": 2.5763704776763916 + }, + { + "auxiliary_loss_clip": 0.01059171, + "auxiliary_loss_mlp": 0.01024862, + "balance_loss_clip": 1.01231503, + "balance_loss_mlp": 1.01984763, + "epoch": 0.6310536600030061, + "flos": 15118124641920.0, + "grad_norm": 1.840774074659016, + "language_loss": 0.70689183, + "learning_rate": 1.1997789793924772e-06, + "loss": 0.72773218, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 10496, + "time_per_iteration": 2.3768091201782227 + }, + { + "auxiliary_loss_clip": 0.01060252, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.01603568, + "balance_loss_mlp": 1.01933861, + "epoch": 0.6311137832556741, + "flos": 15230719376640.0, + "grad_norm": 1.8603178514638123, + "language_loss": 0.80025423, + "learning_rate": 1.1994327320160151e-06, + "loss": 0.82114208, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 10497, + "time_per_iteration": 2.3993844985961914 + }, + { + "auxiliary_loss_clip": 0.01055797, + "auxiliary_loss_mlp": 0.01024779, + "balance_loss_clip": 1.01375842, + "balance_loss_mlp": 1.01837766, + "epoch": 0.6311739065083422, + "flos": 22017555590400.0, + "grad_norm": 2.0812039701796206, + "language_loss": 0.73818314, + "learning_rate": 1.1990865132097404e-06, + "loss": 0.75898898, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 10498, + "time_per_iteration": 2.429600715637207 + }, + { + "auxiliary_loss_clip": 0.01059273, + "auxiliary_loss_mlp": 0.01025726, + "balance_loss_clip": 1.01434195, + "balance_loss_mlp": 1.01913619, + "epoch": 0.6312340297610101, + "flos": 22709673849600.0, + "grad_norm": 1.9407392129071923, + "language_loss": 0.69450617, + "learning_rate": 1.1987403229860071e-06, + "loss": 0.71535617, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.40234375, + "step": 10499, + "time_per_iteration": 3.8734052181243896 + }, + { + "auxiliary_loss_clip": 0.01057219, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.01506805, + "balance_loss_mlp": 1.0195632, + "epoch": 0.6312941530136781, + "flos": 24278949907200.0, + "grad_norm": 1.7820524793201897, + "language_loss": 0.69863391, + "learning_rate": 1.1983941613571704e-06, + "loss": 0.71947789, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.375, + "step": 10500, + "time_per_iteration": 2.4452295303344727 + }, + { + "auxiliary_loss_clip": 0.01060273, + "auxiliary_loss_mlp": 0.01026416, + "balance_loss_clip": 1.0142684, + "balance_loss_mlp": 1.02038455, + "epoch": 0.631354276266346, + "flos": 21724878729600.0, + "grad_norm": 2.2708801766264677, + "language_loss": 0.74474752, + "learning_rate": 1.1980480283355849e-06, + "loss": 0.76561439, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 10501, + "time_per_iteration": 2.417647361755371 + }, + { + "auxiliary_loss_clip": 0.01058577, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.01456058, + "balance_loss_mlp": 1.01975298, + "epoch": 0.631414399519014, + "flos": 24023944270080.0, + "grad_norm": 1.6746699138418861, + "language_loss": 0.74982166, + "learning_rate": 1.197701923933602e-06, + "loss": 0.77067673, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38867188, + "step": 10502, + "time_per_iteration": 2.4984936714172363 + }, + { + "auxiliary_loss_clip": 0.01061988, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.01617956, + "balance_loss_mlp": 1.02085793, + "epoch": 0.6314745227716819, + "flos": 24314666094720.0, + "grad_norm": 1.922425958321701, + "language_loss": 0.85339499, + "learning_rate": 1.1973558481635738e-06, + "loss": 0.87430811, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 10503, + "time_per_iteration": 2.40749192237854 + }, + { + "auxiliary_loss_clip": 0.01060161, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01787782, + "balance_loss_mlp": 1.01969087, + "epoch": 0.6315346460243499, + "flos": 23365307871360.0, + "grad_norm": 1.6011439365362594, + "language_loss": 0.86432296, + "learning_rate": 1.1970098010378501e-06, + "loss": 0.88523132, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40429688, + "step": 10504, + "time_per_iteration": 2.4463770389556885 + }, + { + "auxiliary_loss_clip": 0.01061677, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01578927, + "balance_loss_mlp": 1.01949537, + "epoch": 0.6315947692770179, + "flos": 20259469566720.0, + "grad_norm": 1.5976194743726817, + "language_loss": 0.76572889, + "learning_rate": 1.1966637825687822e-06, + "loss": 0.78664339, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.421875, + "step": 10505, + "time_per_iteration": 2.43389630317688 + }, + { + "auxiliary_loss_clip": 0.01059509, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.01824284, + "balance_loss_mlp": 1.01876712, + "epoch": 0.6316548925296859, + "flos": 25264652722560.0, + "grad_norm": 2.07845157932319, + "language_loss": 0.81939411, + "learning_rate": 1.1963177927687167e-06, + "loss": 0.8403039, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40820312, + "step": 10506, + "time_per_iteration": 2.4566543102264404 + }, + { + "auxiliary_loss_clip": 0.0106072, + "auxiliary_loss_mlp": 0.01024139, + "balance_loss_clip": 1.01127613, + "balance_loss_mlp": 1.02071714, + "epoch": 0.6317150157823538, + "flos": 22929452058240.0, + "grad_norm": 1.7741667926530753, + "language_loss": 0.7454282, + "learning_rate": 1.195971831650002e-06, + "loss": 0.76627684, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3984375, + "step": 10507, + "time_per_iteration": 2.449716567993164 + }, + { + "auxiliary_loss_clip": 0.01062807, + "auxiliary_loss_mlp": 0.01031957, + "balance_loss_clip": 1.01860523, + "balance_loss_mlp": 1.02044892, + "epoch": 0.6317751390350218, + "flos": 22525995853440.0, + "grad_norm": 1.6320311761437438, + "language_loss": 0.67487347, + "learning_rate": 1.1956258992249847e-06, + "loss": 0.69582105, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42382812, + "step": 10508, + "time_per_iteration": 2.4530317783355713 + }, + { + "auxiliary_loss_clip": 0.01058666, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.015154, + "balance_loss_mlp": 1.01982224, + "epoch": 0.6318352622876897, + "flos": 23293631116800.0, + "grad_norm": 2.0764888978178395, + "language_loss": 0.70847237, + "learning_rate": 1.1952799955060094e-06, + "loss": 0.72933209, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38671875, + "step": 10509, + "time_per_iteration": 2.4463016986846924 + }, + { + "auxiliary_loss_clip": 0.01057667, + "auxiliary_loss_mlp": 0.01023714, + "balance_loss_clip": 1.01197207, + "balance_loss_mlp": 1.0186677, + "epoch": 0.6318953855403577, + "flos": 20703040790400.0, + "grad_norm": 1.5392766684249195, + "language_loss": 0.79344785, + "learning_rate": 1.194934120505421e-06, + "loss": 0.81426156, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 10510, + "time_per_iteration": 2.448500394821167 + }, + { + "auxiliary_loss_clip": 0.01059159, + "auxiliary_loss_mlp": 0.01025965, + "balance_loss_clip": 1.01337671, + "balance_loss_mlp": 1.01878619, + "epoch": 0.6319555087930258, + "flos": 22818952005120.0, + "grad_norm": 1.4801575389640662, + "language_loss": 0.70163685, + "learning_rate": 1.194588274235563e-06, + "loss": 0.72248811, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 10511, + "time_per_iteration": 2.4543745517730713 + }, + { + "auxiliary_loss_clip": 0.01057182, + "auxiliary_loss_mlp": 0.01023076, + "balance_loss_clip": 1.01155472, + "balance_loss_mlp": 1.0192945, + "epoch": 0.6320156320456937, + "flos": 19970004551040.0, + "grad_norm": 1.6119196099460047, + "language_loss": 0.80167717, + "learning_rate": 1.1942424567087787e-06, + "loss": 0.82247972, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 10512, + "time_per_iteration": 2.4172534942626953 + }, + { + "auxiliary_loss_clip": 0.0105941, + "auxiliary_loss_mlp": 0.0102362, + "balance_loss_clip": 1.01148438, + "balance_loss_mlp": 1.01925981, + "epoch": 0.6320757552983617, + "flos": 27012265338240.0, + "grad_norm": 1.7132193833649876, + "language_loss": 0.70818216, + "learning_rate": 1.1938966679374075e-06, + "loss": 0.72901243, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 10513, + "time_per_iteration": 2.4511406421661377 + }, + { + "auxiliary_loss_clip": 0.01061003, + "auxiliary_loss_mlp": 0.01027048, + "balance_loss_clip": 1.01390553, + "balance_loss_mlp": 1.02073693, + "epoch": 0.6321358785510296, + "flos": 23694818083200.0, + "grad_norm": 1.7926598850687812, + "language_loss": 0.66679043, + "learning_rate": 1.193550907933791e-06, + "loss": 0.68767095, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40234375, + "step": 10514, + "time_per_iteration": 2.465763568878174 + }, + { + "auxiliary_loss_clip": 0.01056111, + "auxiliary_loss_mlp": 0.01023147, + "balance_loss_clip": 1.01136935, + "balance_loss_mlp": 1.01809621, + "epoch": 0.6321960018036976, + "flos": 25994023269120.0, + "grad_norm": 1.7793498551112332, + "language_loss": 0.74908984, + "learning_rate": 1.1932051767102685e-06, + "loss": 0.76988232, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38085938, + "step": 10515, + "time_per_iteration": 2.4664409160614014 + }, + { + "auxiliary_loss_clip": 0.01056671, + "auxiliary_loss_mlp": 0.01020969, + "balance_loss_clip": 1.00980484, + "balance_loss_mlp": 1.01918912, + "epoch": 0.6322561250563655, + "flos": 22819894611840.0, + "grad_norm": 1.9826844157302297, + "language_loss": 0.89756572, + "learning_rate": 1.1928594742791774e-06, + "loss": 0.91834211, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.375, + "step": 10516, + "time_per_iteration": 2.4241793155670166 + }, + { + "auxiliary_loss_clip": 0.01062001, + "auxiliary_loss_mlp": 0.01026341, + "balance_loss_clip": 1.01203609, + "balance_loss_mlp": 1.0203135, + "epoch": 0.6323162483090335, + "flos": 18987443758080.0, + "grad_norm": 1.6698452326568038, + "language_loss": 0.81756455, + "learning_rate": 1.1925138006528552e-06, + "loss": 0.83844799, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.41796875, + "step": 10517, + "time_per_iteration": 2.3998305797576904 + }, + { + "auxiliary_loss_clip": 0.01059725, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.01411581, + "balance_loss_mlp": 1.02036142, + "epoch": 0.6323763715617015, + "flos": 19864147708800.0, + "grad_norm": 1.6730550679342668, + "language_loss": 0.77692986, + "learning_rate": 1.19216815584364e-06, + "loss": 0.79778814, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 10518, + "time_per_iteration": 2.494270086288452 + }, + { + "auxiliary_loss_clip": 0.01058097, + "auxiliary_loss_mlp": 0.01023915, + "balance_loss_clip": 1.01229167, + "balance_loss_mlp": 1.01907063, + "epoch": 0.6324364948143695, + "flos": 22781629895040.0, + "grad_norm": 1.4291366525947187, + "language_loss": 0.85109335, + "learning_rate": 1.1918225398638636e-06, + "loss": 0.87191343, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 10519, + "time_per_iteration": 2.462754487991333 + }, + { + "auxiliary_loss_clip": 0.01056497, + "auxiliary_loss_mlp": 0.01024598, + "balance_loss_clip": 1.01338589, + "balance_loss_mlp": 1.01878929, + "epoch": 0.6324966180670374, + "flos": 22234855092480.0, + "grad_norm": 1.7062364054114159, + "language_loss": 0.72028756, + "learning_rate": 1.1914769527258621e-06, + "loss": 0.74109852, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37695312, + "step": 10520, + "time_per_iteration": 3.8473827838897705 + }, + { + "auxiliary_loss_clip": 0.01056602, + "auxiliary_loss_mlp": 0.01023205, + "balance_loss_clip": 1.011976, + "balance_loss_mlp": 1.01851964, + "epoch": 0.6325567413197054, + "flos": 21688115201280.0, + "grad_norm": 1.6051602772486793, + "language_loss": 0.7157737, + "learning_rate": 1.1911313944419683e-06, + "loss": 0.73657179, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3828125, + "step": 10521, + "time_per_iteration": 2.4666836261749268 + }, + { + "auxiliary_loss_clip": 0.01059866, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.01505423, + "balance_loss_mlp": 1.01899242, + "epoch": 0.6326168645723733, + "flos": 19936138665600.0, + "grad_norm": 1.6323656100328647, + "language_loss": 0.72036213, + "learning_rate": 1.1907858650245154e-06, + "loss": 0.74124193, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 10522, + "time_per_iteration": 2.4249582290649414 + }, + { + "auxiliary_loss_clip": 0.01059588, + "auxiliary_loss_mlp": 0.01026247, + "balance_loss_clip": 1.01411784, + "balance_loss_mlp": 1.01935625, + "epoch": 0.6326769878250413, + "flos": 20229304285440.0, + "grad_norm": 1.8083444723198865, + "language_loss": 0.81531918, + "learning_rate": 1.1904403644858324e-06, + "loss": 0.83617759, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 10523, + "time_per_iteration": 2.413299560546875 + }, + { + "auxiliary_loss_clip": 0.01058496, + "auxiliary_loss_mlp": 0.01022414, + "balance_loss_clip": 1.01069546, + "balance_loss_mlp": 1.01893663, + "epoch": 0.6327371110777094, + "flos": 20774752456320.0, + "grad_norm": 2.073865220362537, + "language_loss": 0.73454887, + "learning_rate": 1.1900948928382506e-06, + "loss": 0.75535798, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 10524, + "time_per_iteration": 2.400155544281006 + }, + { + "auxiliary_loss_clip": 0.01064363, + "auxiliary_loss_mlp": 0.01030222, + "balance_loss_clip": 1.01591063, + "balance_loss_mlp": 1.01960945, + "epoch": 0.6327972343303773, + "flos": 30335228588160.0, + "grad_norm": 2.417593009117104, + "language_loss": 0.83944768, + "learning_rate": 1.1897494500940993e-06, + "loss": 0.86039352, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.44726562, + "step": 10525, + "time_per_iteration": 2.4762563705444336 + }, + { + "auxiliary_loss_clip": 0.01056008, + "auxiliary_loss_mlp": 0.01023105, + "balance_loss_clip": 1.01141071, + "balance_loss_mlp": 1.01825047, + "epoch": 0.6328573575830453, + "flos": 17091310752000.0, + "grad_norm": 1.930504428470611, + "language_loss": 0.66839093, + "learning_rate": 1.1894040362657052e-06, + "loss": 0.68918204, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37695312, + "step": 10526, + "time_per_iteration": 2.41268253326416 + }, + { + "auxiliary_loss_clip": 0.01062264, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.01557899, + "balance_loss_mlp": 1.02073097, + "epoch": 0.6329174808357132, + "flos": 25045956766080.0, + "grad_norm": 1.4499536056307496, + "language_loss": 0.73259687, + "learning_rate": 1.189058651365396e-06, + "loss": 0.75351113, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41601562, + "step": 10527, + "time_per_iteration": 2.4327008724212646 + }, + { + "auxiliary_loss_clip": 0.01057878, + "auxiliary_loss_mlp": 0.0102303, + "balance_loss_clip": 1.01151466, + "balance_loss_mlp": 1.01988244, + "epoch": 0.6329776040883812, + "flos": 16835886178560.0, + "grad_norm": 2.2818942664393247, + "language_loss": 0.71856111, + "learning_rate": 1.1887132954054975e-06, + "loss": 0.73937023, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 10528, + "time_per_iteration": 2.391496181488037 + }, + { + "auxiliary_loss_clip": 0.01057292, + "auxiliary_loss_mlp": 0.01023834, + "balance_loss_clip": 1.01109624, + "balance_loss_mlp": 1.01808095, + "epoch": 0.6330377273410491, + "flos": 13515855482880.0, + "grad_norm": 1.8240711777861944, + "language_loss": 0.6632753, + "learning_rate": 1.1883679683983354e-06, + "loss": 0.68408656, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39257812, + "step": 10529, + "time_per_iteration": 2.397110939025879 + }, + { + "auxiliary_loss_clip": 0.01060321, + "auxiliary_loss_mlp": 0.01025981, + "balance_loss_clip": 1.01221836, + "balance_loss_mlp": 1.02024925, + "epoch": 0.6330978505937171, + "flos": 21537884154240.0, + "grad_norm": 2.0210639635464243, + "language_loss": 0.74856842, + "learning_rate": 1.188022670356232e-06, + "loss": 0.76943147, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.40039062, + "step": 10530, + "time_per_iteration": 2.4584803581237793 + }, + { + "auxiliary_loss_clip": 0.01059361, + "auxiliary_loss_mlp": 0.01023992, + "balance_loss_clip": 1.01190972, + "balance_loss_mlp": 1.02015674, + "epoch": 0.6331579738463851, + "flos": 25008320453760.0, + "grad_norm": 1.46443240234979, + "language_loss": 0.73842049, + "learning_rate": 1.1876774012915108e-06, + "loss": 0.75925398, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 10531, + "time_per_iteration": 3.9227538108825684 + }, + { + "auxiliary_loss_clip": 0.01055233, + "auxiliary_loss_mlp": 0.01023414, + "balance_loss_clip": 1.01152253, + "balance_loss_mlp": 1.01789641, + "epoch": 0.6332180970990531, + "flos": 14975120246400.0, + "grad_norm": 3.7156739754382135, + "language_loss": 0.79582804, + "learning_rate": 1.1873321612164944e-06, + "loss": 0.81661451, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37304688, + "step": 10532, + "time_per_iteration": 3.8278069496154785 + }, + { + "auxiliary_loss_clip": 0.01057366, + "auxiliary_loss_mlp": 0.0102502, + "balance_loss_clip": 1.01441669, + "balance_loss_mlp": 1.0193553, + "epoch": 0.633278220351721, + "flos": 22705973245440.0, + "grad_norm": 1.959764339676544, + "language_loss": 0.74823707, + "learning_rate": 1.1869869501435023e-06, + "loss": 0.76906091, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.37890625, + "step": 10533, + "time_per_iteration": 2.4551730155944824 + }, + { + "auxiliary_loss_clip": 0.01062651, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.01758265, + "balance_loss_mlp": 1.01943922, + "epoch": 0.633338343604389, + "flos": 12602143624320.0, + "grad_norm": 2.1695755387334494, + "language_loss": 0.76834953, + "learning_rate": 1.1866417680848542e-06, + "loss": 0.789289, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43359375, + "step": 10534, + "time_per_iteration": 2.4251859188079834 + }, + { + "auxiliary_loss_clip": 0.01058462, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.01846409, + "balance_loss_mlp": 1.01954365, + "epoch": 0.6333984668570569, + "flos": 25958865663360.0, + "grad_norm": 2.3533880763410475, + "language_loss": 0.85681796, + "learning_rate": 1.1862966150528702e-06, + "loss": 0.87770844, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 10535, + "time_per_iteration": 2.467261791229248 + }, + { + "auxiliary_loss_clip": 0.01060116, + "auxiliary_loss_mlp": 0.01026611, + "balance_loss_clip": 1.01359367, + "balance_loss_mlp": 1.01931977, + "epoch": 0.6334585901097249, + "flos": 23658124377600.0, + "grad_norm": 1.8049927940608665, + "language_loss": 0.76305413, + "learning_rate": 1.1859514910598658e-06, + "loss": 0.78392136, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40820312, + "step": 10536, + "time_per_iteration": 2.4388935565948486 + }, + { + "auxiliary_loss_clip": 0.01059913, + "auxiliary_loss_mlp": 0.01021135, + "balance_loss_clip": 1.00974429, + "balance_loss_mlp": 1.0208509, + "epoch": 0.633518713362393, + "flos": 28759424106240.0, + "grad_norm": 1.7689284225966684, + "language_loss": 0.60314548, + "learning_rate": 1.185606396118159e-06, + "loss": 0.62395591, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.390625, + "step": 10537, + "time_per_iteration": 2.484130620956421 + }, + { + "auxiliary_loss_clip": 0.01007292, + "auxiliary_loss_mlp": 0.01005088, + "balance_loss_clip": 1.00421226, + "balance_loss_mlp": 1.00062251, + "epoch": 0.6335788366150609, + "flos": 70417264567680.0, + "grad_norm": 0.7769655216093113, + "language_loss": 0.52465498, + "learning_rate": 1.1852613302400648e-06, + "loss": 0.54477882, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06640625, + "step": 10538, + "time_per_iteration": 3.128065824508667 + }, + { + "auxiliary_loss_clip": 0.01060987, + "auxiliary_loss_mlp": 0.01025126, + "balance_loss_clip": 1.01159024, + "balance_loss_mlp": 1.01934373, + "epoch": 0.6336389598677289, + "flos": 23730953207040.0, + "grad_norm": 1.7233618695239579, + "language_loss": 0.83206844, + "learning_rate": 1.184916293437899e-06, + "loss": 0.85292959, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41601562, + "step": 10539, + "time_per_iteration": 3.8279902935028076 + }, + { + "auxiliary_loss_clip": 0.0105977, + "auxiliary_loss_mlp": 0.01020841, + "balance_loss_clip": 1.00787103, + "balance_loss_mlp": 1.01863682, + "epoch": 0.6336990831203968, + "flos": 29275440134400.0, + "grad_norm": 1.734400887746344, + "language_loss": 0.86820781, + "learning_rate": 1.1845712857239732e-06, + "loss": 0.88901395, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41210938, + "step": 10540, + "time_per_iteration": 2.5396602153778076 + }, + { + "auxiliary_loss_clip": 0.01056761, + "auxiliary_loss_mlp": 0.0102601, + "balance_loss_clip": 1.01509023, + "balance_loss_mlp": 1.01903725, + "epoch": 0.6337592063730648, + "flos": 29095532565120.0, + "grad_norm": 1.5478554276752252, + "language_loss": 0.72570181, + "learning_rate": 1.1842263071106005e-06, + "loss": 0.74652946, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37695312, + "step": 10541, + "time_per_iteration": 2.4877521991729736 + }, + { + "auxiliary_loss_clip": 0.0106384, + "auxiliary_loss_mlp": 0.01025582, + "balance_loss_clip": 1.0121119, + "balance_loss_mlp": 1.02145863, + "epoch": 0.6338193296257327, + "flos": 34705272556800.0, + "grad_norm": 2.0190711805909807, + "language_loss": 0.84393919, + "learning_rate": 1.1838813576100935e-06, + "loss": 0.86483347, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42382812, + "step": 10542, + "time_per_iteration": 2.5476906299591064 + }, + { + "auxiliary_loss_clip": 0.01058981, + "auxiliary_loss_mlp": 0.01026808, + "balance_loss_clip": 1.01373065, + "balance_loss_mlp": 1.01984739, + "epoch": 0.6338794528784008, + "flos": 16686737383680.0, + "grad_norm": 1.6349348084319721, + "language_loss": 0.7746079, + "learning_rate": 1.1835364372347604e-06, + "loss": 0.79546571, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.390625, + "step": 10543, + "time_per_iteration": 2.405791759490967 + }, + { + "auxiliary_loss_clip": 0.01055187, + "auxiliary_loss_mlp": 0.01026466, + "balance_loss_clip": 1.01567793, + "balance_loss_mlp": 1.01774597, + "epoch": 0.6339395761310687, + "flos": 22345494791040.0, + "grad_norm": 1.9918840034330627, + "language_loss": 0.73331237, + "learning_rate": 1.183191545996912e-06, + "loss": 0.75412893, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.375, + "step": 10544, + "time_per_iteration": 2.4708352088928223 + }, + { + "auxiliary_loss_clip": 0.0106201, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.01793337, + "balance_loss_mlp": 1.02064586, + "epoch": 0.6339996993837367, + "flos": 18550819895040.0, + "grad_norm": 1.9623401961228888, + "language_loss": 0.67942953, + "learning_rate": 1.1828466839088568e-06, + "loss": 0.70036215, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 10545, + "time_per_iteration": 2.4284543991088867 + }, + { + "auxiliary_loss_clip": 0.0105707, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01761425, + "balance_loss_mlp": 1.01879847, + "epoch": 0.6340598226364046, + "flos": 12968661744000.0, + "grad_norm": 7.434251245005019, + "language_loss": 0.71451998, + "learning_rate": 1.1825018509829007e-06, + "loss": 0.7353853, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3828125, + "step": 10546, + "time_per_iteration": 2.4260778427124023 + }, + { + "auxiliary_loss_clip": 0.01058322, + "auxiliary_loss_mlp": 0.01021951, + "balance_loss_clip": 1.01051855, + "balance_loss_mlp": 1.02080131, + "epoch": 0.6341199458890726, + "flos": 26686769932800.0, + "grad_norm": 1.2806059514202293, + "language_loss": 0.78159368, + "learning_rate": 1.182157047231351e-06, + "loss": 0.80239642, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.375, + "step": 10547, + "time_per_iteration": 2.490180253982544 + }, + { + "auxiliary_loss_clip": 0.01060044, + "auxiliary_loss_mlp": 0.01025956, + "balance_loss_clip": 1.01360607, + "balance_loss_mlp": 1.01887822, + "epoch": 0.6341800691417405, + "flos": 18733275993600.0, + "grad_norm": 2.0522733249895992, + "language_loss": 0.63695264, + "learning_rate": 1.1818122726665128e-06, + "loss": 0.65781265, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41210938, + "step": 10548, + "time_per_iteration": 2.4510388374328613 + }, + { + "auxiliary_loss_clip": 0.0105744, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.0141927, + "balance_loss_mlp": 1.01923645, + "epoch": 0.6342401923944085, + "flos": 26248260856320.0, + "grad_norm": 1.5854898667008464, + "language_loss": 0.61794287, + "learning_rate": 1.1814675273006902e-06, + "loss": 0.63877499, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38085938, + "step": 10549, + "time_per_iteration": 2.465970277786255 + }, + { + "auxiliary_loss_clip": 0.01059525, + "auxiliary_loss_mlp": 0.01025791, + "balance_loss_clip": 1.01428103, + "balance_loss_mlp": 1.01959181, + "epoch": 0.6343003156470765, + "flos": 24679787760000.0, + "grad_norm": 1.4496541707740862, + "language_loss": 0.73898685, + "learning_rate": 1.1811228111461855e-06, + "loss": 0.75984001, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3984375, + "step": 10550, + "time_per_iteration": 2.486898183822632 + }, + { + "auxiliary_loss_clip": 0.01007555, + "auxiliary_loss_mlp": 0.01003096, + "balance_loss_clip": 1.0022912, + "balance_loss_mlp": 1.00116646, + "epoch": 0.6343604388997445, + "flos": 69802164501120.0, + "grad_norm": 0.696789918111987, + "language_loss": 0.57730138, + "learning_rate": 1.180778124215301e-06, + "loss": 0.59740788, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.00805664, + "router_z_loss_mlp": 0.06396484, + "step": 10551, + "time_per_iteration": 3.088059902191162 + }, + { + "auxiliary_loss_clip": 0.01056463, + "auxiliary_loss_mlp": 0.01024052, + "balance_loss_clip": 1.01352024, + "balance_loss_mlp": 1.01817369, + "epoch": 0.6344205621524125, + "flos": 21981315732480.0, + "grad_norm": 1.788305205493527, + "language_loss": 0.67050076, + "learning_rate": 1.180433466520339e-06, + "loss": 0.69130594, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.3828125, + "step": 10552, + "time_per_iteration": 2.468315601348877 + }, + { + "auxiliary_loss_clip": 0.01058685, + "auxiliary_loss_mlp": 0.01021835, + "balance_loss_clip": 1.00960982, + "balance_loss_mlp": 1.0182488, + "epoch": 0.6344806854050804, + "flos": 20447825685120.0, + "grad_norm": 1.8089575841306778, + "language_loss": 0.81987232, + "learning_rate": 1.180088838073597e-06, + "loss": 0.8406775, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 10553, + "time_per_iteration": 2.4373950958251953 + }, + { + "auxiliary_loss_clip": 0.01059647, + "auxiliary_loss_mlp": 0.01026625, + "balance_loss_clip": 1.01423299, + "balance_loss_mlp": 1.01856899, + "epoch": 0.6345408086577484, + "flos": 40509163952640.0, + "grad_norm": 5.426983751089434, + "language_loss": 0.5473066, + "learning_rate": 1.179744238887376e-06, + "loss": 0.5681693, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.41015625, + "step": 10554, + "time_per_iteration": 2.6192822456359863 + }, + { + "auxiliary_loss_clip": 0.01060933, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.01365042, + "balance_loss_mlp": 1.0203886, + "epoch": 0.6346009319104163, + "flos": 21360245823360.0, + "grad_norm": 1.6007536201575197, + "language_loss": 0.70566684, + "learning_rate": 1.1793996689739729e-06, + "loss": 0.72653389, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 10555, + "time_per_iteration": 2.404686689376831 + }, + { + "auxiliary_loss_clip": 0.01007601, + "auxiliary_loss_mlp": 0.01001076, + "balance_loss_clip": 1.00014627, + "balance_loss_mlp": 1.00117111, + "epoch": 0.6346610551630844, + "flos": 71362433427840.0, + "grad_norm": 0.7736434882698277, + "language_loss": 0.55403435, + "learning_rate": 1.1790551283456855e-06, + "loss": 0.57412112, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06445312, + "step": 10556, + "time_per_iteration": 2.9986910820007324 + }, + { + "auxiliary_loss_clip": 0.0105868, + "auxiliary_loss_mlp": 0.01023656, + "balance_loss_clip": 1.01079357, + "balance_loss_mlp": 1.01880097, + "epoch": 0.6347211784157523, + "flos": 25410310381440.0, + "grad_norm": 2.2965682883826504, + "language_loss": 0.68153381, + "learning_rate": 1.1787106170148082e-06, + "loss": 0.70235717, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3984375, + "step": 10557, + "time_per_iteration": 2.4578864574432373 + }, + { + "auxiliary_loss_clip": 0.01057043, + "auxiliary_loss_mlp": 0.01023605, + "balance_loss_clip": 1.01122534, + "balance_loss_mlp": 1.01919937, + "epoch": 0.6347813016684203, + "flos": 15741812903040.0, + "grad_norm": 1.9798349468434384, + "language_loss": 0.70253563, + "learning_rate": 1.1783661349936363e-06, + "loss": 0.72334212, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.37890625, + "step": 10558, + "time_per_iteration": 2.3910725116729736 + }, + { + "auxiliary_loss_clip": 0.01057354, + "auxiliary_loss_mlp": 0.0102605, + "balance_loss_clip": 1.01381361, + "balance_loss_mlp": 1.01873302, + "epoch": 0.6348414249210882, + "flos": 21463868338560.0, + "grad_norm": 1.6133081956795394, + "language_loss": 0.78298801, + "learning_rate": 1.1780216822944647e-06, + "loss": 0.80382204, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38671875, + "step": 10559, + "time_per_iteration": 2.3838164806365967 + }, + { + "auxiliary_loss_clip": 0.01060958, + "auxiliary_loss_mlp": 0.01024746, + "balance_loss_clip": 1.01221693, + "balance_loss_mlp": 1.02048659, + "epoch": 0.6349015481737562, + "flos": 21651980077440.0, + "grad_norm": 1.585189893666159, + "language_loss": 0.68416142, + "learning_rate": 1.1776772589295836e-06, + "loss": 0.70501846, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40429688, + "step": 10560, + "time_per_iteration": 3.8635172843933105 + }, + { + "auxiliary_loss_clip": 0.01058835, + "auxiliary_loss_mlp": 0.01023959, + "balance_loss_clip": 1.01216304, + "balance_loss_mlp": 1.01955354, + "epoch": 0.6349616714264241, + "flos": 22194041846400.0, + "grad_norm": 2.044943982049011, + "language_loss": 0.77089661, + "learning_rate": 1.1773328649112858e-06, + "loss": 0.79172456, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 10561, + "time_per_iteration": 2.436141014099121 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01024003, + "balance_loss_clip": 1.01197433, + "balance_loss_mlp": 1.01891792, + "epoch": 0.6350217946790921, + "flos": 25409193217920.0, + "grad_norm": 3.6082874701096097, + "language_loss": 0.72087532, + "learning_rate": 1.176988500251863e-06, + "loss": 0.74170482, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40039062, + "step": 10562, + "time_per_iteration": 2.4501471519470215 + }, + { + "auxiliary_loss_clip": 0.010609, + "auxiliary_loss_mlp": 0.01026952, + "balance_loss_clip": 1.01372576, + "balance_loss_mlp": 1.01947117, + "epoch": 0.63508191793176, + "flos": 19717931468160.0, + "grad_norm": 3.334375073222039, + "language_loss": 0.80648863, + "learning_rate": 1.176644164963603e-06, + "loss": 0.82736719, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 10563, + "time_per_iteration": 2.4298336505889893 + }, + { + "auxiliary_loss_clip": 0.01060328, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.0146302, + "balance_loss_mlp": 1.02029991, + "epoch": 0.6351420411844281, + "flos": 18185942609280.0, + "grad_norm": 2.007330466560714, + "language_loss": 0.81082594, + "learning_rate": 1.1762998590587946e-06, + "loss": 0.83170682, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40039062, + "step": 10564, + "time_per_iteration": 2.4207777976989746 + }, + { + "auxiliary_loss_clip": 0.01059864, + "auxiliary_loss_mlp": 0.01023843, + "balance_loss_clip": 1.01140392, + "balance_loss_mlp": 1.02024841, + "epoch": 0.6352021644370961, + "flos": 33725190470400.0, + "grad_norm": 2.4791422062115283, + "language_loss": 0.72331649, + "learning_rate": 1.1759555825497253e-06, + "loss": 0.74415356, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39453125, + "step": 10565, + "time_per_iteration": 2.514857530593872 + }, + { + "auxiliary_loss_clip": 0.01058987, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01258945, + "balance_loss_mlp": 1.01941419, + "epoch": 0.635262287689764, + "flos": 20373774958080.0, + "grad_norm": 1.8346704183473568, + "language_loss": 0.68769789, + "learning_rate": 1.1756113354486826e-06, + "loss": 0.70854783, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.39648438, + "step": 10566, + "time_per_iteration": 2.4165842533111572 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01026084, + "balance_loss_clip": 1.0144074, + "balance_loss_mlp": 1.02021992, + "epoch": 0.635322410942432, + "flos": 27524231648640.0, + "grad_norm": 1.6436396919160292, + "language_loss": 0.69927359, + "learning_rate": 1.1752671177679495e-06, + "loss": 0.72011983, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3828125, + "step": 10567, + "time_per_iteration": 2.471011161804199 + }, + { + "auxiliary_loss_clip": 0.01055404, + "auxiliary_loss_mlp": 0.01024768, + "balance_loss_clip": 1.0134908, + "balance_loss_mlp": 1.01878476, + "epoch": 0.6353825341950999, + "flos": 21542527365120.0, + "grad_norm": 1.6221492893997076, + "language_loss": 0.84495926, + "learning_rate": 1.1749229295198117e-06, + "loss": 0.86576092, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.36523438, + "step": 10568, + "time_per_iteration": 2.4567198753356934 + }, + { + "auxiliary_loss_clip": 0.01059777, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01681602, + "balance_loss_mlp": 1.02002096, + "epoch": 0.635442657447768, + "flos": 31758393139200.0, + "grad_norm": 1.8261966607019806, + "language_loss": 0.84103394, + "learning_rate": 1.174578770716553e-06, + "loss": 0.86192614, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39648438, + "step": 10569, + "time_per_iteration": 2.4900872707366943 + }, + { + "auxiliary_loss_clip": 0.01057744, + "auxiliary_loss_mlp": 0.01023227, + "balance_loss_clip": 1.01112175, + "balance_loss_mlp": 1.01965308, + "epoch": 0.6355027807004359, + "flos": 19827803116800.0, + "grad_norm": 2.0411425623314328, + "language_loss": 0.69506592, + "learning_rate": 1.1742346413704542e-06, + "loss": 0.71587563, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38085938, + "step": 10570, + "time_per_iteration": 3.800788164138794 + }, + { + "auxiliary_loss_clip": 0.0105941, + "auxiliary_loss_mlp": 0.01024816, + "balance_loss_clip": 1.01210809, + "balance_loss_mlp": 1.01857269, + "epoch": 0.6355629039531039, + "flos": 30371084421120.0, + "grad_norm": 1.676647128129219, + "language_loss": 0.65651751, + "learning_rate": 1.1738905414937967e-06, + "loss": 0.6773597, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 10571, + "time_per_iteration": 3.9359257221221924 + }, + { + "auxiliary_loss_clip": 0.01062496, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.01226175, + "balance_loss_mlp": 1.02176321, + "epoch": 0.6356230272057718, + "flos": 17931076617600.0, + "grad_norm": 3.669160000912244, + "language_loss": 0.71422142, + "learning_rate": 1.1735464710988608e-06, + "loss": 0.73509884, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40820312, + "step": 10572, + "time_per_iteration": 2.422163248062134 + }, + { + "auxiliary_loss_clip": 0.01058463, + "auxiliary_loss_mlp": 0.01023703, + "balance_loss_clip": 1.01206255, + "balance_loss_mlp": 1.0198319, + "epoch": 0.6356831504584398, + "flos": 25374629105280.0, + "grad_norm": 1.4142766322316151, + "language_loss": 0.81163627, + "learning_rate": 1.1732024301979264e-06, + "loss": 0.8324579, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 10573, + "time_per_iteration": 2.476503849029541 + }, + { + "auxiliary_loss_clip": 0.01059828, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.01339328, + "balance_loss_mlp": 1.01961458, + "epoch": 0.6357432737111077, + "flos": 46498548735360.0, + "grad_norm": 1.6402307925723882, + "language_loss": 0.69190115, + "learning_rate": 1.1728584188032695e-06, + "loss": 0.71276033, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 10574, + "time_per_iteration": 2.648336410522461 + }, + { + "auxiliary_loss_clip": 0.01058096, + "auxiliary_loss_mlp": 0.01024748, + "balance_loss_clip": 1.01298189, + "balance_loss_mlp": 1.01925969, + "epoch": 0.6358033969637757, + "flos": 17273417736960.0, + "grad_norm": 2.4775958528325948, + "language_loss": 0.87072462, + "learning_rate": 1.1725144369271678e-06, + "loss": 0.89155304, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38867188, + "step": 10575, + "time_per_iteration": 2.364718198776245 + }, + { + "auxiliary_loss_clip": 0.01058554, + "auxiliary_loss_mlp": 0.01026341, + "balance_loss_clip": 1.01437831, + "balance_loss_mlp": 1.02040458, + "epoch": 0.6358635202164437, + "flos": 27124301491200.0, + "grad_norm": 3.6223041959252495, + "language_loss": 0.71828884, + "learning_rate": 1.1721704845818986e-06, + "loss": 0.73913777, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3828125, + "step": 10576, + "time_per_iteration": 2.4854257106781006 + }, + { + "auxiliary_loss_clip": 0.01058686, + "auxiliary_loss_mlp": 0.01027739, + "balance_loss_clip": 1.01525819, + "balance_loss_mlp": 1.01896501, + "epoch": 0.6359236434691117, + "flos": 27524022180480.0, + "grad_norm": 1.5328795759330944, + "language_loss": 0.69596541, + "learning_rate": 1.1718265617797341e-06, + "loss": 0.71682966, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39648438, + "step": 10577, + "time_per_iteration": 2.4398841857910156 + }, + { + "auxiliary_loss_clip": 0.0105997, + "auxiliary_loss_mlp": 0.01024531, + "balance_loss_clip": 1.01270592, + "balance_loss_mlp": 1.02054977, + "epoch": 0.6359837667217797, + "flos": 39346730501760.0, + "grad_norm": 1.7697278242688328, + "language_loss": 0.72197121, + "learning_rate": 1.17148266853295e-06, + "loss": 0.74281621, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 10578, + "time_per_iteration": 4.009287595748901 + }, + { + "auxiliary_loss_clip": 0.01007175, + "auxiliary_loss_mlp": 0.01001023, + "balance_loss_clip": 1.00022435, + "balance_loss_mlp": 1.00056219, + "epoch": 0.6360438899744476, + "flos": 56411017994880.0, + "grad_norm": 0.7027923506735537, + "language_loss": 0.5458796, + "learning_rate": 1.1711388048538182e-06, + "loss": 0.5659616, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.00799561, + "router_z_loss_mlp": 0.06640625, + "step": 10579, + "time_per_iteration": 3.1155190467834473 + }, + { + "auxiliary_loss_clip": 0.01059235, + "auxiliary_loss_mlp": 0.01022525, + "balance_loss_clip": 1.01007974, + "balance_loss_mlp": 1.01964641, + "epoch": 0.6361040132271156, + "flos": 24971940950400.0, + "grad_norm": 1.5544884145917406, + "language_loss": 0.7758531, + "learning_rate": 1.17079497075461e-06, + "loss": 0.79667068, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39453125, + "step": 10580, + "time_per_iteration": 2.4973936080932617 + }, + { + "auxiliary_loss_clip": 0.01058334, + "auxiliary_loss_mlp": 0.01025089, + "balance_loss_clip": 1.01292396, + "balance_loss_mlp": 1.01963329, + "epoch": 0.6361641364797835, + "flos": 23258054574720.0, + "grad_norm": 2.645780063104923, + "language_loss": 0.66456836, + "learning_rate": 1.1704511662475964e-06, + "loss": 0.68540257, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38671875, + "step": 10581, + "time_per_iteration": 2.4055335521698 + }, + { + "auxiliary_loss_clip": 0.01058139, + "auxiliary_loss_mlp": 0.01023587, + "balance_loss_clip": 1.01210105, + "balance_loss_mlp": 1.01882625, + "epoch": 0.6362242597324516, + "flos": 25993325041920.0, + "grad_norm": 1.3707031835441401, + "language_loss": 0.68321568, + "learning_rate": 1.1701073913450465e-06, + "loss": 0.7040329, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39257812, + "step": 10582, + "time_per_iteration": 2.492358922958374 + }, + { + "auxiliary_loss_clip": 0.01060307, + "auxiliary_loss_mlp": 0.01028066, + "balance_loss_clip": 1.01632965, + "balance_loss_mlp": 1.01985073, + "epoch": 0.6362843829851195, + "flos": 25702044635520.0, + "grad_norm": 1.913792261756428, + "language_loss": 0.84366226, + "learning_rate": 1.1697636460592301e-06, + "loss": 0.86454594, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40429688, + "step": 10583, + "time_per_iteration": 2.439176559448242 + }, + { + "auxiliary_loss_clip": 0.0105856, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01473856, + "balance_loss_mlp": 1.02007365, + "epoch": 0.6363445062377875, + "flos": 20521841500800.0, + "grad_norm": 2.410712271300886, + "language_loss": 0.75049871, + "learning_rate": 1.1694199304024125e-06, + "loss": 0.7713483, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38476562, + "step": 10584, + "time_per_iteration": 2.405939817428589 + }, + { + "auxiliary_loss_clip": 0.01059825, + "auxiliary_loss_mlp": 0.01026622, + "balance_loss_clip": 1.01328218, + "balance_loss_mlp": 1.02003932, + "epoch": 0.6364046294904554, + "flos": 19462786185600.0, + "grad_norm": 1.833710128752351, + "language_loss": 0.72933954, + "learning_rate": 1.1690762443868613e-06, + "loss": 0.75020403, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.39648438, + "step": 10585, + "time_per_iteration": 2.4444289207458496 + }, + { + "auxiliary_loss_clip": 0.01059187, + "auxiliary_loss_mlp": 0.0102365, + "balance_loss_clip": 1.01236093, + "balance_loss_mlp": 1.02013409, + "epoch": 0.6364647527431234, + "flos": 20994844867200.0, + "grad_norm": 1.7880782335462506, + "language_loss": 0.82817525, + "learning_rate": 1.1687325880248424e-06, + "loss": 0.84900355, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.390625, + "step": 10586, + "time_per_iteration": 2.4854836463928223 + }, + { + "auxiliary_loss_clip": 0.01058218, + "auxiliary_loss_mlp": 0.0102274, + "balance_loss_clip": 1.01096773, + "balance_loss_mlp": 1.0198406, + "epoch": 0.6365248759957913, + "flos": 25769741495040.0, + "grad_norm": 1.5447129166755142, + "language_loss": 0.72535127, + "learning_rate": 1.1683889613286183e-06, + "loss": 0.74616086, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3828125, + "step": 10587, + "time_per_iteration": 2.5020854473114014 + }, + { + "auxiliary_loss_clip": 0.01058751, + "auxiliary_loss_mlp": 0.01022685, + "balance_loss_clip": 1.01106191, + "balance_loss_mlp": 1.01983714, + "epoch": 0.6365849992484593, + "flos": 22454493655680.0, + "grad_norm": 1.9010023758410637, + "language_loss": 0.76963532, + "learning_rate": 1.1680453643104527e-06, + "loss": 0.79044968, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38867188, + "step": 10588, + "time_per_iteration": 2.4460647106170654 + }, + { + "auxiliary_loss_clip": 0.0105625, + "auxiliary_loss_mlp": 0.0102389, + "balance_loss_clip": 1.01147461, + "balance_loss_mlp": 1.01885748, + "epoch": 0.6366451225011273, + "flos": 19024696045440.0, + "grad_norm": 1.4618131881217789, + "language_loss": 0.79766273, + "learning_rate": 1.1677017969826093e-06, + "loss": 0.81846416, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.375, + "step": 10589, + "time_per_iteration": 2.452267646789551 + }, + { + "auxiliary_loss_clip": 0.01057148, + "auxiliary_loss_mlp": 0.01024511, + "balance_loss_clip": 1.012447, + "balance_loss_mlp": 1.01938581, + "epoch": 0.6367052457537953, + "flos": 25227225878400.0, + "grad_norm": 2.5516693731168965, + "language_loss": 0.67114478, + "learning_rate": 1.167358259357347e-06, + "loss": 0.69196135, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.37890625, + "step": 10590, + "time_per_iteration": 2.482564926147461 + }, + { + "auxiliary_loss_clip": 0.0106341, + "auxiliary_loss_mlp": 0.01028775, + "balance_loss_clip": 1.01616263, + "balance_loss_mlp": 1.02052248, + "epoch": 0.6367653690064633, + "flos": 19207431434880.0, + "grad_norm": 1.7298838590852108, + "language_loss": 0.78868759, + "learning_rate": 1.167014751446926e-06, + "loss": 0.80960941, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4296875, + "step": 10591, + "time_per_iteration": 2.4184000492095947 + }, + { + "auxiliary_loss_clip": 0.01057961, + "auxiliary_loss_mlp": 0.01019076, + "balance_loss_clip": 1.00821543, + "balance_loss_mlp": 1.02016687, + "epoch": 0.6368254922591312, + "flos": 23545774022400.0, + "grad_norm": 1.384728819709433, + "language_loss": 0.88727421, + "learning_rate": 1.1666712732636069e-06, + "loss": 0.90804458, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37695312, + "step": 10592, + "time_per_iteration": 2.466956615447998 + }, + { + "auxiliary_loss_clip": 0.01054787, + "auxiliary_loss_mlp": 0.01021903, + "balance_loss_clip": 1.01103735, + "balance_loss_mlp": 1.01821411, + "epoch": 0.6368856155117992, + "flos": 26466153851520.0, + "grad_norm": 1.3210802119373308, + "language_loss": 0.68588215, + "learning_rate": 1.166327824819646e-06, + "loss": 0.70664907, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.36523438, + "step": 10593, + "time_per_iteration": 2.477764368057251 + }, + { + "auxiliary_loss_clip": 0.01055191, + "auxiliary_loss_mlp": 0.01022797, + "balance_loss_clip": 1.01149607, + "balance_loss_mlp": 1.01798677, + "epoch": 0.6369457387644671, + "flos": 33691045294080.0, + "grad_norm": 1.8954580404613597, + "language_loss": 0.63925427, + "learning_rate": 1.1659844061273007e-06, + "loss": 0.66003418, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37304688, + "step": 10594, + "time_per_iteration": 2.5351719856262207 + }, + { + "auxiliary_loss_clip": 0.01007285, + "auxiliary_loss_mlp": 0.01001079, + "balance_loss_clip": 1.00026214, + "balance_loss_mlp": 1.00064111, + "epoch": 0.6370058620171352, + "flos": 70905140173440.0, + "grad_norm": 0.7872295913210553, + "language_loss": 0.54636514, + "learning_rate": 1.1656410171988259e-06, + "loss": 0.56644869, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.00817871, + "router_z_loss_mlp": 0.06640625, + "step": 10595, + "time_per_iteration": 3.175220251083374 + }, + { + "auxiliary_loss_clip": 0.01060106, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.01607251, + "balance_loss_mlp": 1.01993299, + "epoch": 0.6370659852698031, + "flos": 21140886551040.0, + "grad_norm": 1.5492496685776016, + "language_loss": 0.66750813, + "learning_rate": 1.1652976580464787e-06, + "loss": 0.68840289, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40234375, + "step": 10596, + "time_per_iteration": 2.426741123199463 + }, + { + "auxiliary_loss_clip": 0.01059495, + "auxiliary_loss_mlp": 0.01026826, + "balance_loss_clip": 1.01498866, + "balance_loss_mlp": 1.01950455, + "epoch": 0.6371261085224711, + "flos": 20192261466240.0, + "grad_norm": 2.4117212975927464, + "language_loss": 0.7868762, + "learning_rate": 1.164954328682509e-06, + "loss": 0.80773944, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40039062, + "step": 10597, + "time_per_iteration": 2.4337868690490723 + }, + { + "auxiliary_loss_clip": 0.01058997, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01687312, + "balance_loss_mlp": 1.02021575, + "epoch": 0.637186231775139, + "flos": 19682494571520.0, + "grad_norm": 1.822717690931111, + "language_loss": 0.74500859, + "learning_rate": 1.1646110291191724e-06, + "loss": 0.7658782, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.38671875, + "step": 10598, + "time_per_iteration": 2.4511489868164062 + }, + { + "auxiliary_loss_clip": 0.01057159, + "auxiliary_loss_mlp": 0.01024781, + "balance_loss_clip": 1.01309288, + "balance_loss_mlp": 1.0181973, + "epoch": 0.637246355027807, + "flos": 13070573602560.0, + "grad_norm": 2.11461837201156, + "language_loss": 0.68416798, + "learning_rate": 1.1642677593687184e-06, + "loss": 0.70498741, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 10599, + "time_per_iteration": 3.8949458599090576 + }, + { + "auxiliary_loss_clip": 0.01060334, + "auxiliary_loss_mlp": 0.01023443, + "balance_loss_clip": 1.01107478, + "balance_loss_mlp": 1.02003145, + "epoch": 0.6373064782804749, + "flos": 18221693708160.0, + "grad_norm": 4.350618528520823, + "language_loss": 0.80708873, + "learning_rate": 1.1639245194434e-06, + "loss": 0.82792652, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40429688, + "step": 10600, + "time_per_iteration": 2.3896517753601074 + }, + { + "auxiliary_loss_clip": 0.01057039, + "auxiliary_loss_mlp": 0.01026569, + "balance_loss_clip": 1.01480317, + "balance_loss_mlp": 1.01780379, + "epoch": 0.637366601533143, + "flos": 24497331661440.0, + "grad_norm": 1.4910693120905953, + "language_loss": 0.80003691, + "learning_rate": 1.163581309355464e-06, + "loss": 0.82087302, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 10601, + "time_per_iteration": 2.4262280464172363 + }, + { + "auxiliary_loss_clip": 0.01057926, + "auxiliary_loss_mlp": 0.01022995, + "balance_loss_clip": 1.01161659, + "balance_loss_mlp": 1.01901913, + "epoch": 0.6374267247858109, + "flos": 26357853214080.0, + "grad_norm": 2.7258777418166664, + "language_loss": 0.67388344, + "learning_rate": 1.163238129117159e-06, + "loss": 0.69469261, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 10602, + "time_per_iteration": 2.434478521347046 + }, + { + "auxiliary_loss_clip": 0.01055365, + "auxiliary_loss_mlp": 0.01023486, + "balance_loss_clip": 1.01230383, + "balance_loss_mlp": 1.01790357, + "epoch": 0.6374868480384789, + "flos": 20370807492480.0, + "grad_norm": 1.6841152474825078, + "language_loss": 0.79593873, + "learning_rate": 1.1628949787407338e-06, + "loss": 0.81672716, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.375, + "step": 10603, + "time_per_iteration": 2.406832456588745 + }, + { + "auxiliary_loss_clip": 0.01058633, + "auxiliary_loss_mlp": 0.01024281, + "balance_loss_clip": 1.01187706, + "balance_loss_mlp": 1.01946676, + "epoch": 0.6375469712911469, + "flos": 20995193980800.0, + "grad_norm": 1.8111831192557397, + "language_loss": 0.76476979, + "learning_rate": 1.1625518582384323e-06, + "loss": 0.78559899, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 10604, + "time_per_iteration": 2.381322145462036 + }, + { + "auxiliary_loss_clip": 0.01059525, + "auxiliary_loss_mlp": 0.01020846, + "balance_loss_clip": 1.00894916, + "balance_loss_mlp": 1.018713, + "epoch": 0.6376070945438148, + "flos": 19714824357120.0, + "grad_norm": 1.809910129574086, + "language_loss": 0.76776874, + "learning_rate": 1.1622087676225017e-06, + "loss": 0.78857249, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40820312, + "step": 10605, + "time_per_iteration": 2.3881263732910156 + }, + { + "auxiliary_loss_clip": 0.01058584, + "auxiliary_loss_mlp": 0.01022177, + "balance_loss_clip": 1.00952268, + "balance_loss_mlp": 1.01900744, + "epoch": 0.6376672177964828, + "flos": 21505694014080.0, + "grad_norm": 2.1687189375602784, + "language_loss": 0.66189611, + "learning_rate": 1.1618657069051847e-06, + "loss": 0.68270373, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39648438, + "step": 10606, + "time_per_iteration": 2.425529718399048 + }, + { + "auxiliary_loss_clip": 0.01055673, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.02069771, + "balance_loss_mlp": 1.01944661, + "epoch": 0.6377273410491507, + "flos": 18842868351360.0, + "grad_norm": 1.9177214317706393, + "language_loss": 0.75540441, + "learning_rate": 1.1615226760987252e-06, + "loss": 0.77627617, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.36328125, + "step": 10607, + "time_per_iteration": 2.4188292026519775 + }, + { + "auxiliary_loss_clip": 0.01058954, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.01394248, + "balance_loss_mlp": 1.01920938, + "epoch": 0.6377874643018188, + "flos": 53061138086400.0, + "grad_norm": 1.5078716113257178, + "language_loss": 0.65893346, + "learning_rate": 1.1611796752153633e-06, + "loss": 0.67978191, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3984375, + "step": 10608, + "time_per_iteration": 2.686077356338501 + }, + { + "auxiliary_loss_clip": 0.0100756, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99963981, + "balance_loss_mlp": 1.00100064, + "epoch": 0.6378475875544867, + "flos": 65131972640640.0, + "grad_norm": 0.7251989686629139, + "language_loss": 0.51842666, + "learning_rate": 1.1608367042673421e-06, + "loss": 0.53850669, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.00805664, + "router_z_loss_mlp": 0.06542969, + "step": 10609, + "time_per_iteration": 3.0303659439086914 + }, + { + "auxiliary_loss_clip": 0.01054601, + "auxiliary_loss_mlp": 0.01020383, + "balance_loss_clip": 1.01031625, + "balance_loss_mlp": 1.01843309, + "epoch": 0.6379077108071547, + "flos": 23001652483200.0, + "grad_norm": 1.5171634145478783, + "language_loss": 0.74555892, + "learning_rate": 1.1604937632669006e-06, + "loss": 0.76630872, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.36132812, + "step": 10610, + "time_per_iteration": 3.9177186489105225 + }, + { + "auxiliary_loss_clip": 0.01007239, + "auxiliary_loss_mlp": 0.01001152, + "balance_loss_clip": 1.00028205, + "balance_loss_mlp": 1.00064969, + "epoch": 0.6379678340598226, + "flos": 67598201237760.0, + "grad_norm": 0.8533788572696468, + "language_loss": 0.63076186, + "learning_rate": 1.1601508522262767e-06, + "loss": 0.65084577, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.00872803, + "router_z_loss_mlp": 0.06591797, + "step": 10611, + "time_per_iteration": 4.560175180435181 + }, + { + "auxiliary_loss_clip": 0.0105767, + "auxiliary_loss_mlp": 0.01023003, + "balance_loss_clip": 1.01117718, + "balance_loss_mlp": 1.01864409, + "epoch": 0.6380279573124906, + "flos": 29678756693760.0, + "grad_norm": 32.85793717885991, + "language_loss": 0.72108006, + "learning_rate": 1.1598079711577083e-06, + "loss": 0.74188679, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.390625, + "step": 10612, + "time_per_iteration": 2.439919948577881 + }, + { + "auxiliary_loss_clip": 0.0100728, + "auxiliary_loss_mlp": 0.01001239, + "balance_loss_clip": 1.00041103, + "balance_loss_mlp": 1.00055528, + "epoch": 0.6380880805651585, + "flos": 66480981730560.0, + "grad_norm": 0.711597800018794, + "language_loss": 0.57871723, + "learning_rate": 1.1594651200734333e-06, + "loss": 0.59880245, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.00830078, + "router_z_loss_mlp": 0.06738281, + "step": 10613, + "time_per_iteration": 2.9867701530456543 + }, + { + "auxiliary_loss_clip": 0.01058812, + "auxiliary_loss_mlp": 0.01022358, + "balance_loss_clip": 1.01071703, + "balance_loss_mlp": 1.0196209, + "epoch": 0.6381482038178266, + "flos": 23913863153280.0, + "grad_norm": 3.9794825964535376, + "language_loss": 0.75241917, + "learning_rate": 1.1591222989856847e-06, + "loss": 0.77323085, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 10614, + "time_per_iteration": 2.4264461994171143 + }, + { + "auxiliary_loss_clip": 0.01057598, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.0161097, + "balance_loss_mlp": 1.02001405, + "epoch": 0.6382083270704945, + "flos": 24241907088000.0, + "grad_norm": 2.551353811144796, + "language_loss": 0.79666948, + "learning_rate": 1.158779507906699e-06, + "loss": 0.81752217, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37695312, + "step": 10615, + "time_per_iteration": 2.461258888244629 + }, + { + "auxiliary_loss_clip": 0.0100683, + "auxiliary_loss_mlp": 0.01000516, + "balance_loss_clip": 0.99967557, + "balance_loss_mlp": 1.00031686, + "epoch": 0.6382684503231625, + "flos": 70771736401920.0, + "grad_norm": 0.6610898846075127, + "language_loss": 0.55604756, + "learning_rate": 1.1584367468487087e-06, + "loss": 0.57612103, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.00842285, + "router_z_loss_mlp": 0.06542969, + "step": 10616, + "time_per_iteration": 3.155568838119507 + }, + { + "auxiliary_loss_clip": 0.01058291, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_clip": 1.01165652, + "balance_loss_mlp": 1.01976311, + "epoch": 0.6383285735758305, + "flos": 16543907544960.0, + "grad_norm": 2.0114801005449072, + "language_loss": 0.84039563, + "learning_rate": 1.158094015823946e-06, + "loss": 0.86121082, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38476562, + "step": 10617, + "time_per_iteration": 2.380998134613037 + }, + { + "auxiliary_loss_clip": 0.01058922, + "auxiliary_loss_mlp": 0.01026743, + "balance_loss_clip": 1.0145483, + "balance_loss_mlp": 1.01910686, + "epoch": 0.6383886968284984, + "flos": 14426809344000.0, + "grad_norm": 2.1558277616683856, + "language_loss": 0.8757683, + "learning_rate": 1.1577513148446426e-06, + "loss": 0.89662492, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3984375, + "step": 10618, + "time_per_iteration": 3.810173749923706 + }, + { + "auxiliary_loss_clip": 0.01059787, + "auxiliary_loss_mlp": 0.01023839, + "balance_loss_clip": 1.0128417, + "balance_loss_mlp": 1.02023387, + "epoch": 0.6384488200811664, + "flos": 17928737556480.0, + "grad_norm": 1.734612003874151, + "language_loss": 0.73301673, + "learning_rate": 1.1574086439230273e-06, + "loss": 0.75385302, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.39453125, + "step": 10619, + "time_per_iteration": 2.365835189819336 + }, + { + "auxiliary_loss_clip": 0.01060713, + "auxiliary_loss_mlp": 0.01025178, + "balance_loss_clip": 1.01235151, + "balance_loss_mlp": 1.01950276, + "epoch": 0.6385089433338343, + "flos": 18514580037120.0, + "grad_norm": 2.05558441565159, + "language_loss": 0.76577383, + "learning_rate": 1.1570660030713315e-06, + "loss": 0.78663278, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41210938, + "step": 10620, + "time_per_iteration": 2.3925013542175293 + }, + { + "auxiliary_loss_clip": 0.01057037, + "auxiliary_loss_mlp": 0.01020085, + "balance_loss_clip": 1.00926721, + "balance_loss_mlp": 1.0195272, + "epoch": 0.6385690665865024, + "flos": 24752476944000.0, + "grad_norm": 2.5958138758777185, + "language_loss": 0.70705628, + "learning_rate": 1.1567233923017805e-06, + "loss": 0.72782749, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.375, + "step": 10621, + "time_per_iteration": 2.450477123260498 + }, + { + "auxiliary_loss_clip": 0.01052035, + "auxiliary_loss_mlp": 0.0102026, + "balance_loss_clip": 1.01026404, + "balance_loss_mlp": 1.01715565, + "epoch": 0.6386291898391703, + "flos": 20119537370880.0, + "grad_norm": 1.5491894601691159, + "language_loss": 0.77679396, + "learning_rate": 1.1563808116266032e-06, + "loss": 0.79751688, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.34765625, + "step": 10622, + "time_per_iteration": 2.404874563217163 + }, + { + "auxiliary_loss_clip": 0.01054887, + "auxiliary_loss_mlp": 0.01021091, + "balance_loss_clip": 1.01017129, + "balance_loss_mlp": 1.01808143, + "epoch": 0.6386893130918383, + "flos": 16866505307520.0, + "grad_norm": 1.7345632316985022, + "language_loss": 0.74449301, + "learning_rate": 1.1560382610580245e-06, + "loss": 0.76525277, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3671875, + "step": 10623, + "time_per_iteration": 2.3737518787384033 + }, + { + "auxiliary_loss_clip": 0.01054744, + "auxiliary_loss_mlp": 0.01019651, + "balance_loss_clip": 1.00971508, + "balance_loss_mlp": 1.01833868, + "epoch": 0.6387494363445062, + "flos": 22965168245760.0, + "grad_norm": 1.3852897087832547, + "language_loss": 0.76934588, + "learning_rate": 1.1556957406082694e-06, + "loss": 0.79008985, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.36328125, + "step": 10624, + "time_per_iteration": 2.4229860305786133 + }, + { + "auxiliary_loss_clip": 0.01056487, + "auxiliary_loss_mlp": 0.01025527, + "balance_loss_clip": 1.01414251, + "balance_loss_mlp": 1.01872826, + "epoch": 0.6388095595971742, + "flos": 22856588317440.0, + "grad_norm": 1.6995567826127604, + "language_loss": 0.80434769, + "learning_rate": 1.155353250289561e-06, + "loss": 0.82516789, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37890625, + "step": 10625, + "time_per_iteration": 2.4300460815429688 + }, + { + "auxiliary_loss_clip": 0.01058544, + "auxiliary_loss_mlp": 0.01026188, + "balance_loss_clip": 1.01410604, + "balance_loss_mlp": 1.01944041, + "epoch": 0.6388696828498421, + "flos": 17310565290240.0, + "grad_norm": 3.0274291511645206, + "language_loss": 0.73191094, + "learning_rate": 1.1550107901141228e-06, + "loss": 0.75275832, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 10626, + "time_per_iteration": 2.373952865600586 + }, + { + "auxiliary_loss_clip": 0.01059031, + "auxiliary_loss_mlp": 0.01024777, + "balance_loss_clip": 1.01204515, + "balance_loss_mlp": 1.01953888, + "epoch": 0.6389298061025102, + "flos": 17127690255360.0, + "grad_norm": 1.8059224515218077, + "language_loss": 0.72847128, + "learning_rate": 1.154668360094176e-06, + "loss": 0.74930942, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 10627, + "time_per_iteration": 2.3767263889312744 + }, + { + "auxiliary_loss_clip": 0.01060978, + "auxiliary_loss_mlp": 0.01023453, + "balance_loss_clip": 1.0109539, + "balance_loss_mlp": 1.02003062, + "epoch": 0.6389899293551781, + "flos": 27709690124160.0, + "grad_norm": 1.6925542062120167, + "language_loss": 0.66556239, + "learning_rate": 1.154325960241941e-06, + "loss": 0.68640667, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 10628, + "time_per_iteration": 2.4863040447235107 + }, + { + "auxiliary_loss_clip": 0.01054164, + "auxiliary_loss_mlp": 0.01019959, + "balance_loss_clip": 1.00978994, + "balance_loss_mlp": 1.01829863, + "epoch": 0.6390500526078461, + "flos": 21214623075840.0, + "grad_norm": 2.2626648572983883, + "language_loss": 0.67839795, + "learning_rate": 1.1539835905696365e-06, + "loss": 0.69913918, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.359375, + "step": 10629, + "time_per_iteration": 2.4606268405914307 + }, + { + "auxiliary_loss_clip": 0.01061296, + "auxiliary_loss_mlp": 0.01027467, + "balance_loss_clip": 1.01350141, + "balance_loss_mlp": 1.0198741, + "epoch": 0.6391101758605141, + "flos": 21579954209280.0, + "grad_norm": 1.9217639168874048, + "language_loss": 0.75109929, + "learning_rate": 1.1536412510894828e-06, + "loss": 0.77198696, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.4140625, + "step": 10630, + "time_per_iteration": 2.464754581451416 + }, + { + "auxiliary_loss_clip": 0.01056083, + "auxiliary_loss_mlp": 0.0102068, + "balance_loss_clip": 1.01017165, + "balance_loss_mlp": 1.01898897, + "epoch": 0.639170299113182, + "flos": 19899479871360.0, + "grad_norm": 1.5605850162557482, + "language_loss": 0.70635653, + "learning_rate": 1.1532989418136951e-06, + "loss": 0.72712409, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.37109375, + "step": 10631, + "time_per_iteration": 2.488166332244873 + }, + { + "auxiliary_loss_clip": 0.01006979, + "auxiliary_loss_mlp": 0.01001089, + "balance_loss_clip": 1.00017679, + "balance_loss_mlp": 1.00043547, + "epoch": 0.63923042236585, + "flos": 69874434748800.0, + "grad_norm": 0.7675764909635795, + "language_loss": 0.62942564, + "learning_rate": 1.1529566627544894e-06, + "loss": 0.64950633, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.06542969, + "step": 10632, + "time_per_iteration": 3.1103286743164062 + }, + { + "auxiliary_loss_clip": 0.01055223, + "auxiliary_loss_mlp": 0.01025571, + "balance_loss_clip": 1.01438355, + "balance_loss_mlp": 1.01811957, + "epoch": 0.639290545618518, + "flos": 22673713282560.0, + "grad_norm": 1.8487310716146466, + "language_loss": 0.67033017, + "learning_rate": 1.1526144139240832e-06, + "loss": 0.69113815, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37109375, + "step": 10633, + "time_per_iteration": 2.444506883621216 + }, + { + "auxiliary_loss_clip": 0.01057436, + "auxiliary_loss_mlp": 0.01022949, + "balance_loss_clip": 1.0113616, + "balance_loss_mlp": 1.01944709, + "epoch": 0.639350668871186, + "flos": 19828152230400.0, + "grad_norm": 1.9051587103365486, + "language_loss": 0.82364923, + "learning_rate": 1.152272195334687e-06, + "loss": 0.8444531, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37890625, + "step": 10634, + "time_per_iteration": 2.397249937057495 + }, + { + "auxiliary_loss_clip": 0.01057826, + "auxiliary_loss_mlp": 0.01022883, + "balance_loss_clip": 1.01072407, + "balance_loss_mlp": 1.01894403, + "epoch": 0.6394107921238539, + "flos": 20552425718400.0, + "grad_norm": 1.6753004192397598, + "language_loss": 0.75242686, + "learning_rate": 1.1519300069985165e-06, + "loss": 0.77323389, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38867188, + "step": 10635, + "time_per_iteration": 2.413337230682373 + }, + { + "auxiliary_loss_clip": 0.01058108, + "auxiliary_loss_mlp": 0.01022812, + "balance_loss_clip": 1.01087952, + "balance_loss_mlp": 1.01911151, + "epoch": 0.6394709153765219, + "flos": 25773826124160.0, + "grad_norm": 1.5735009771954214, + "language_loss": 0.71785939, + "learning_rate": 1.151587848927782e-06, + "loss": 0.73866856, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38867188, + "step": 10636, + "time_per_iteration": 2.421208143234253 + }, + { + "auxiliary_loss_clip": 0.01059203, + "auxiliary_loss_mlp": 0.0102509, + "balance_loss_clip": 1.01303828, + "balance_loss_mlp": 1.01905346, + "epoch": 0.6395310386291898, + "flos": 17529191424000.0, + "grad_norm": 2.031753485480657, + "language_loss": 0.74040353, + "learning_rate": 1.1512457211346963e-06, + "loss": 0.76124644, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40039062, + "step": 10637, + "time_per_iteration": 2.4098145961761475 + }, + { + "auxiliary_loss_clip": 0.01059095, + "auxiliary_loss_mlp": 0.01025597, + "balance_loss_clip": 1.01294291, + "balance_loss_mlp": 1.01878333, + "epoch": 0.6395911618818578, + "flos": 18587234309760.0, + "grad_norm": 1.7711725336798059, + "language_loss": 0.7742914, + "learning_rate": 1.1509036236314656e-06, + "loss": 0.79513836, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 10638, + "time_per_iteration": 2.3610777854919434 + }, + { + "auxiliary_loss_clip": 0.01056632, + "auxiliary_loss_mlp": 0.01022582, + "balance_loss_clip": 1.01018977, + "balance_loss_mlp": 1.0186826, + "epoch": 0.6396512851345257, + "flos": 28365289234560.0, + "grad_norm": 1.4801711789192313, + "language_loss": 0.81856686, + "learning_rate": 1.1505615564303016e-06, + "loss": 0.83935899, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.37890625, + "step": 10639, + "time_per_iteration": 4.049687623977661 + }, + { + "auxiliary_loss_clip": 0.01007146, + "auxiliary_loss_mlp": 0.01000596, + "balance_loss_clip": 0.9997912, + "balance_loss_mlp": 1.00060225, + "epoch": 0.6397114083871938, + "flos": 70720903595520.0, + "grad_norm": 0.8624243819936094, + "language_loss": 0.5903241, + "learning_rate": 1.1502195195434104e-06, + "loss": 0.61040151, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.00805664, + "router_z_loss_mlp": 0.06542969, + "step": 10640, + "time_per_iteration": 3.0050559043884277 + }, + { + "auxiliary_loss_clip": 0.01055884, + "auxiliary_loss_mlp": 0.01019906, + "balance_loss_clip": 1.00924277, + "balance_loss_mlp": 1.01878047, + "epoch": 0.6397715316398617, + "flos": 18141777872640.0, + "grad_norm": 1.9528962080441272, + "language_loss": 0.7863993, + "learning_rate": 1.1498775129829988e-06, + "loss": 0.80715722, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37109375, + "step": 10641, + "time_per_iteration": 2.362816095352173 + }, + { + "auxiliary_loss_clip": 0.01061394, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.01501656, + "balance_loss_mlp": 1.0197618, + "epoch": 0.6398316548925297, + "flos": 25738319404800.0, + "grad_norm": 1.9279478505187948, + "language_loss": 0.63257742, + "learning_rate": 1.149535536761271e-06, + "loss": 0.65346956, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41601562, + "step": 10642, + "time_per_iteration": 2.4461469650268555 + }, + { + "auxiliary_loss_clip": 0.01007084, + "auxiliary_loss_mlp": 0.01001773, + "balance_loss_clip": 1.00085521, + "balance_loss_mlp": 1.00058579, + "epoch": 0.6398917781451977, + "flos": 71211399419520.0, + "grad_norm": 0.9149337799113935, + "language_loss": 0.55965722, + "learning_rate": 1.1491935908904328e-06, + "loss": 0.57974577, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.06494141, + "step": 10643, + "time_per_iteration": 3.0160646438598633 + }, + { + "auxiliary_loss_clip": 0.01061148, + "auxiliary_loss_mlp": 0.01021162, + "balance_loss_clip": 1.00847208, + "balance_loss_mlp": 1.01980054, + "epoch": 0.6399519013978656, + "flos": 20520794160000.0, + "grad_norm": 2.8085319891676517, + "language_loss": 0.76668298, + "learning_rate": 1.1488516753826874e-06, + "loss": 0.7875061, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41210938, + "step": 10644, + "time_per_iteration": 2.4385364055633545 + }, + { + "auxiliary_loss_clip": 0.01059405, + "auxiliary_loss_mlp": 0.01021864, + "balance_loss_clip": 1.00969255, + "balance_loss_mlp": 1.02017641, + "epoch": 0.6400120246505336, + "flos": 24459730260480.0, + "grad_norm": 1.5552316039702494, + "language_loss": 0.82595593, + "learning_rate": 1.148509790250236e-06, + "loss": 0.84676862, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 10645, + "time_per_iteration": 2.4328572750091553 + }, + { + "auxiliary_loss_clip": 0.0106102, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.01651967, + "balance_loss_mlp": 1.01956153, + "epoch": 0.6400721479032015, + "flos": 28364835386880.0, + "grad_norm": 1.7920048199267409, + "language_loss": 0.73559213, + "learning_rate": 1.14816793550528e-06, + "loss": 0.75650138, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 10646, + "time_per_iteration": 2.480058193206787 + }, + { + "auxiliary_loss_clip": 0.01058656, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.01577425, + "balance_loss_mlp": 1.01883018, + "epoch": 0.6401322711558696, + "flos": 17815723885440.0, + "grad_norm": 2.0821922074629136, + "language_loss": 0.80117166, + "learning_rate": 1.1478261111600191e-06, + "loss": 0.82204258, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 10647, + "time_per_iteration": 2.352259874343872 + }, + { + "auxiliary_loss_clip": 0.01058009, + "auxiliary_loss_mlp": 0.01019659, + "balance_loss_clip": 1.00860226, + "balance_loss_mlp": 1.01981783, + "epoch": 0.6401923944085375, + "flos": 26029669633920.0, + "grad_norm": 1.7179585917598978, + "language_loss": 0.75489175, + "learning_rate": 1.1474843172266525e-06, + "loss": 0.77566844, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 10648, + "time_per_iteration": 2.457343578338623 + }, + { + "auxiliary_loss_clip": 0.01057755, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.01294804, + "balance_loss_mlp": 1.01837969, + "epoch": 0.6402525176612055, + "flos": 23585330459520.0, + "grad_norm": 2.043951486617398, + "language_loss": 0.78501248, + "learning_rate": 1.1471425537173764e-06, + "loss": 0.8058337, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39453125, + "step": 10649, + "time_per_iteration": 3.8698372840881348 + }, + { + "auxiliary_loss_clip": 0.01060088, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.0152024, + "balance_loss_mlp": 1.02065003, + "epoch": 0.6403126409138734, + "flos": 18040424595840.0, + "grad_norm": 3.0400329062221583, + "language_loss": 0.75393319, + "learning_rate": 1.1468008206443907e-06, + "loss": 0.77480924, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 10650, + "time_per_iteration": 3.844287633895874 + }, + { + "auxiliary_loss_clip": 0.01058836, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.0146687, + "balance_loss_mlp": 1.0196209, + "epoch": 0.6403727641665414, + "flos": 21978453000960.0, + "grad_norm": 1.89368623908487, + "language_loss": 0.71177256, + "learning_rate": 1.1464591180198872e-06, + "loss": 0.73263383, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39257812, + "step": 10651, + "time_per_iteration": 2.4127442836761475 + }, + { + "auxiliary_loss_clip": 0.01059233, + "auxiliary_loss_mlp": 0.01027235, + "balance_loss_clip": 1.01478374, + "balance_loss_mlp": 1.02036119, + "epoch": 0.6404328874192093, + "flos": 24894503821440.0, + "grad_norm": 2.545610577539072, + "language_loss": 0.6372751, + "learning_rate": 1.1461174458560634e-06, + "loss": 0.65813982, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38867188, + "step": 10652, + "time_per_iteration": 2.416428327560425 + }, + { + "auxiliary_loss_clip": 0.01058448, + "auxiliary_loss_mlp": 0.01022281, + "balance_loss_clip": 1.01083732, + "balance_loss_mlp": 1.01975262, + "epoch": 0.6404930106718774, + "flos": 17596399524480.0, + "grad_norm": 1.8620785532247748, + "language_loss": 0.79300505, + "learning_rate": 1.1457758041651104e-06, + "loss": 0.81381238, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 10653, + "time_per_iteration": 2.41611385345459 + }, + { + "auxiliary_loss_clip": 0.01063659, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01551342, + "balance_loss_mlp": 1.01945889, + "epoch": 0.6405531339245453, + "flos": 20156824569600.0, + "grad_norm": 2.4373019248266905, + "language_loss": 0.70710796, + "learning_rate": 1.1454341929592231e-06, + "loss": 0.728055, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.44140625, + "step": 10654, + "time_per_iteration": 2.4162051677703857 + }, + { + "auxiliary_loss_clip": 0.01060171, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.01709044, + "balance_loss_mlp": 1.01936316, + "epoch": 0.6406132571772133, + "flos": 21941270536320.0, + "grad_norm": 1.9871048239823108, + "language_loss": 0.72605318, + "learning_rate": 1.14509261225059e-06, + "loss": 0.74695873, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40820312, + "step": 10655, + "time_per_iteration": 2.3944714069366455 + }, + { + "auxiliary_loss_clip": 0.01060106, + "auxiliary_loss_mlp": 0.01024875, + "balance_loss_clip": 1.01225114, + "balance_loss_mlp": 1.01972055, + "epoch": 0.6406733804298813, + "flos": 28766720580480.0, + "grad_norm": 2.0639540713755893, + "language_loss": 0.82773864, + "learning_rate": 1.144751062051403e-06, + "loss": 0.84858841, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 10656, + "time_per_iteration": 2.465494394302368 + }, + { + "auxiliary_loss_clip": 0.01057597, + "auxiliary_loss_mlp": 0.01025696, + "balance_loss_clip": 1.01391256, + "balance_loss_mlp": 1.01825547, + "epoch": 0.6407335036825492, + "flos": 17456222949120.0, + "grad_norm": 2.7771403731217936, + "language_loss": 0.72209752, + "learning_rate": 1.1444095423738506e-06, + "loss": 0.74293041, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 10657, + "time_per_iteration": 3.837526798248291 + }, + { + "auxiliary_loss_clip": 0.0105981, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01896524, + "balance_loss_mlp": 1.02016044, + "epoch": 0.6407936269352172, + "flos": 22124250305280.0, + "grad_norm": 1.7665527930132534, + "language_loss": 0.77294922, + "learning_rate": 1.144068053230121e-06, + "loss": 0.79385155, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39648438, + "step": 10658, + "time_per_iteration": 2.4517405033111572 + }, + { + "auxiliary_loss_clip": 0.0105989, + "auxiliary_loss_mlp": 0.01023478, + "balance_loss_clip": 1.01050186, + "balance_loss_mlp": 1.01923621, + "epoch": 0.6408537501878852, + "flos": 23109569095680.0, + "grad_norm": 1.6549068134950302, + "language_loss": 0.76384211, + "learning_rate": 1.1437265946324002e-06, + "loss": 0.78467572, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 10659, + "time_per_iteration": 2.443232774734497 + }, + { + "auxiliary_loss_clip": 0.010575, + "auxiliary_loss_mlp": 0.01021135, + "balance_loss_clip": 1.00872517, + "balance_loss_mlp": 1.01906776, + "epoch": 0.6409138734405532, + "flos": 16471497651840.0, + "grad_norm": 1.6367869709323353, + "language_loss": 0.77977848, + "learning_rate": 1.1433851665928751e-06, + "loss": 0.80056489, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38476562, + "step": 10660, + "time_per_iteration": 2.3859710693359375 + }, + { + "auxiliary_loss_clip": 0.01064072, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.01603556, + "balance_loss_mlp": 1.02114046, + "epoch": 0.6409739966932211, + "flos": 22776986684160.0, + "grad_norm": 2.2564216323004374, + "language_loss": 0.68270719, + "learning_rate": 1.143043769123731e-06, + "loss": 0.70364958, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4296875, + "step": 10661, + "time_per_iteration": 2.4047248363494873 + }, + { + "auxiliary_loss_clip": 0.01058147, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.01748955, + "balance_loss_mlp": 1.02001297, + "epoch": 0.6410341199458891, + "flos": 25150975735680.0, + "grad_norm": 2.191805532264937, + "language_loss": 0.7610327, + "learning_rate": 1.1427024022371486e-06, + "loss": 0.7819078, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3828125, + "step": 10662, + "time_per_iteration": 2.4376702308654785 + }, + { + "auxiliary_loss_clip": 0.01058046, + "auxiliary_loss_mlp": 0.01024225, + "balance_loss_clip": 1.01197064, + "balance_loss_mlp": 1.01783109, + "epoch": 0.641094243198557, + "flos": 27045153705600.0, + "grad_norm": 1.7004572623534717, + "language_loss": 0.70047987, + "learning_rate": 1.142361065945313e-06, + "loss": 0.72130257, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 10663, + "time_per_iteration": 2.4708335399627686 + }, + { + "auxiliary_loss_clip": 0.0106018, + "auxiliary_loss_mlp": 0.01025799, + "balance_loss_clip": 1.01204777, + "balance_loss_mlp": 1.01896358, + "epoch": 0.641154366451225, + "flos": 25373372296320.0, + "grad_norm": 1.9794530159855146, + "language_loss": 0.67974561, + "learning_rate": 1.1420197602604052e-06, + "loss": 0.70060539, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.4140625, + "step": 10664, + "time_per_iteration": 2.433748245239258 + }, + { + "auxiliary_loss_clip": 0.01056825, + "auxiliary_loss_mlp": 0.01024571, + "balance_loss_clip": 1.01291251, + "balance_loss_mlp": 1.01900554, + "epoch": 0.6412144897038929, + "flos": 25152232544640.0, + "grad_norm": 1.7197412642522487, + "language_loss": 0.66518986, + "learning_rate": 1.1416784851946045e-06, + "loss": 0.6860038, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37890625, + "step": 10665, + "time_per_iteration": 2.469649076461792 + }, + { + "auxiliary_loss_clip": 0.01056656, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.01326585, + "balance_loss_mlp": 1.01884317, + "epoch": 0.641274612956561, + "flos": 23439637889280.0, + "grad_norm": 1.7536170923680574, + "language_loss": 0.64369363, + "learning_rate": 1.1413372407600907e-06, + "loss": 0.66450489, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37890625, + "step": 10666, + "time_per_iteration": 2.4796504974365234 + }, + { + "auxiliary_loss_clip": 0.01058869, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01710224, + "balance_loss_mlp": 1.01886225, + "epoch": 0.6413347362092289, + "flos": 19426476504960.0, + "grad_norm": 1.5463375191321824, + "language_loss": 0.71304154, + "learning_rate": 1.1409960269690433e-06, + "loss": 0.73392713, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40039062, + "step": 10667, + "time_per_iteration": 2.495270252227783 + }, + { + "auxiliary_loss_clip": 0.01064875, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.02013159, + "balance_loss_mlp": 1.02084112, + "epoch": 0.6413948594618969, + "flos": 17195771139840.0, + "grad_norm": 2.189299741434883, + "language_loss": 0.77729285, + "learning_rate": 1.1406548438336368e-06, + "loss": 0.79828215, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.44140625, + "step": 10668, + "time_per_iteration": 2.4253666400909424 + }, + { + "auxiliary_loss_clip": 0.01059533, + "auxiliary_loss_mlp": 0.01026892, + "balance_loss_clip": 1.01463127, + "balance_loss_mlp": 1.0202775, + "epoch": 0.6414549827145648, + "flos": 22268790800640.0, + "grad_norm": 1.8204950865307021, + "language_loss": 0.67790747, + "learning_rate": 1.1403136913660488e-06, + "loss": 0.69877172, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39257812, + "step": 10669, + "time_per_iteration": 2.4667108058929443 + }, + { + "auxiliary_loss_clip": 0.01064393, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.01808178, + "balance_loss_mlp": 1.02180052, + "epoch": 0.6415151059672328, + "flos": 19639342264320.0, + "grad_norm": 1.648120169059299, + "language_loss": 0.78649557, + "learning_rate": 1.139972569578453e-06, + "loss": 0.80745959, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.42578125, + "step": 10670, + "time_per_iteration": 2.451131582260132 + }, + { + "auxiliary_loss_clip": 0.01062857, + "auxiliary_loss_mlp": 0.0102616, + "balance_loss_clip": 1.01277828, + "balance_loss_mlp": 1.02050996, + "epoch": 0.6415752292199008, + "flos": 14864969306880.0, + "grad_norm": 2.6218396065560072, + "language_loss": 0.88567853, + "learning_rate": 1.1396314784830257e-06, + "loss": 0.90656877, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.42382812, + "step": 10671, + "time_per_iteration": 2.4205033779144287 + }, + { + "auxiliary_loss_clip": 0.01058281, + "auxiliary_loss_mlp": 0.01025606, + "balance_loss_clip": 1.01359582, + "balance_loss_mlp": 1.01989603, + "epoch": 0.6416353524725688, + "flos": 13734725996160.0, + "grad_norm": 1.664286812934029, + "language_loss": 0.6939013, + "learning_rate": 1.1392904180919363e-06, + "loss": 0.71474016, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 10672, + "time_per_iteration": 2.394742965698242 + }, + { + "auxiliary_loss_clip": 0.01060777, + "auxiliary_loss_mlp": 0.01026117, + "balance_loss_clip": 1.011729, + "balance_loss_mlp": 1.01817346, + "epoch": 0.6416954757252368, + "flos": 24533780987520.0, + "grad_norm": 2.098902197889091, + "language_loss": 0.73974472, + "learning_rate": 1.1389493884173584e-06, + "loss": 0.76061368, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.42578125, + "step": 10673, + "time_per_iteration": 2.416768789291382 + }, + { + "auxiliary_loss_clip": 0.01060332, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.012182, + "balance_loss_mlp": 1.02012694, + "epoch": 0.6417555989779047, + "flos": 27708747517440.0, + "grad_norm": 1.9388939963238903, + "language_loss": 0.69216597, + "learning_rate": 1.1386083894714622e-06, + "loss": 0.71302563, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.40234375, + "step": 10674, + "time_per_iteration": 2.431015968322754 + }, + { + "auxiliary_loss_clip": 0.01062662, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.01240766, + "balance_loss_mlp": 1.02032292, + "epoch": 0.6418157222305727, + "flos": 20555637563520.0, + "grad_norm": 1.7365355616016371, + "language_loss": 0.66046458, + "learning_rate": 1.1382674212664167e-06, + "loss": 0.68134725, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 10675, + "time_per_iteration": 2.4062929153442383 + }, + { + "auxiliary_loss_clip": 0.01060319, + "auxiliary_loss_mlp": 0.01026871, + "balance_loss_clip": 1.01500952, + "balance_loss_mlp": 1.02023983, + "epoch": 0.6418758454832406, + "flos": 22600430605440.0, + "grad_norm": 1.6488689797512455, + "language_loss": 0.66162777, + "learning_rate": 1.1379264838143902e-06, + "loss": 0.68249959, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40039062, + "step": 10676, + "time_per_iteration": 2.406886339187622 + }, + { + "auxiliary_loss_clip": 0.01060923, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.01241481, + "balance_loss_mlp": 1.02030087, + "epoch": 0.6419359687359086, + "flos": 27374035512960.0, + "grad_norm": 2.1077786381161765, + "language_loss": 0.75716752, + "learning_rate": 1.1375855771275503e-06, + "loss": 0.77803659, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.40625, + "step": 10677, + "time_per_iteration": 2.4556820392608643 + }, + { + "auxiliary_loss_clip": 0.0100739, + "auxiliary_loss_mlp": 0.01000851, + "balance_loss_clip": 0.99990946, + "balance_loss_mlp": 1.00079775, + "epoch": 0.6419960919885765, + "flos": 67248791550720.0, + "grad_norm": 0.8296376149473075, + "language_loss": 0.60761428, + "learning_rate": 1.1372447012180624e-06, + "loss": 0.62769675, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.06591797, + "step": 10678, + "time_per_iteration": 3.09366774559021 + }, + { + "auxiliary_loss_clip": 0.01058791, + "auxiliary_loss_mlp": 0.01030102, + "balance_loss_clip": 1.01818705, + "balance_loss_mlp": 1.02033377, + "epoch": 0.6420562152412446, + "flos": 19900841414400.0, + "grad_norm": 2.14506917358082, + "language_loss": 0.67154038, + "learning_rate": 1.1369038560980912e-06, + "loss": 0.6924293, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 10679, + "time_per_iteration": 3.861982583999634 + }, + { + "auxiliary_loss_clip": 0.01061865, + "auxiliary_loss_mlp": 0.01025082, + "balance_loss_clip": 1.01249909, + "balance_loss_mlp": 1.02098286, + "epoch": 0.6421163384939125, + "flos": 24789031004160.0, + "grad_norm": 1.996703738035103, + "language_loss": 0.73454678, + "learning_rate": 1.136563041779802e-06, + "loss": 0.75541627, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 10680, + "time_per_iteration": 2.4935553073883057 + }, + { + "auxiliary_loss_clip": 0.01058459, + "auxiliary_loss_mlp": 0.01023189, + "balance_loss_clip": 1.01147103, + "balance_loss_mlp": 1.01903319, + "epoch": 0.6421764617465805, + "flos": 25591649316480.0, + "grad_norm": 4.317214509913957, + "language_loss": 0.68431574, + "learning_rate": 1.1362222582753567e-06, + "loss": 0.70513219, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 10681, + "time_per_iteration": 2.4130303859710693 + }, + { + "auxiliary_loss_clip": 0.01057615, + "auxiliary_loss_mlp": 0.01022245, + "balance_loss_clip": 1.01037765, + "balance_loss_mlp": 1.0187186, + "epoch": 0.6422365849992484, + "flos": 14133923015040.0, + "grad_norm": 1.817979692263653, + "language_loss": 0.74695867, + "learning_rate": 1.1358815055969174e-06, + "loss": 0.7677573, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38867188, + "step": 10682, + "time_per_iteration": 2.3986332416534424 + }, + { + "auxiliary_loss_clip": 0.0105547, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.01543701, + "balance_loss_mlp": 1.01809096, + "epoch": 0.6422967082519164, + "flos": 22382781989760.0, + "grad_norm": 1.4699819978785824, + "language_loss": 0.79104185, + "learning_rate": 1.1355407837566433e-06, + "loss": 0.81186783, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37304688, + "step": 10683, + "time_per_iteration": 2.4159412384033203 + }, + { + "auxiliary_loss_clip": 0.01060079, + "auxiliary_loss_mlp": 0.01028803, + "balance_loss_clip": 1.01556492, + "balance_loss_mlp": 1.01925969, + "epoch": 0.6423568315045844, + "flos": 14647041400320.0, + "grad_norm": 1.8239890938354462, + "language_loss": 0.67031854, + "learning_rate": 1.1352000927666966e-06, + "loss": 0.69120741, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40820312, + "step": 10684, + "time_per_iteration": 2.3822028636932373 + }, + { + "auxiliary_loss_clip": 0.01058323, + "auxiliary_loss_mlp": 0.01023604, + "balance_loss_clip": 1.01091373, + "balance_loss_mlp": 1.01901937, + "epoch": 0.6424169547572524, + "flos": 26832706882560.0, + "grad_norm": 2.0584375670435446, + "language_loss": 0.71005678, + "learning_rate": 1.1348594326392324e-06, + "loss": 0.73087603, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 10685, + "time_per_iteration": 2.4529240131378174 + }, + { + "auxiliary_loss_clip": 0.01057913, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.0153079, + "balance_loss_mlp": 1.01974344, + "epoch": 0.6424770780099204, + "flos": 22706427093120.0, + "grad_norm": 1.7660828840500555, + "language_loss": 0.75369, + "learning_rate": 1.1345188033864107e-06, + "loss": 0.77453917, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 10686, + "time_per_iteration": 2.438201904296875 + }, + { + "auxiliary_loss_clip": 0.01061838, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.01936221, + "balance_loss_mlp": 1.02027988, + "epoch": 0.6425372012625883, + "flos": 28468422990720.0, + "grad_norm": 2.264924160100509, + "language_loss": 0.68509269, + "learning_rate": 1.1341782050203859e-06, + "loss": 0.70603991, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41601562, + "step": 10687, + "time_per_iteration": 2.4600448608398438 + }, + { + "auxiliary_loss_clip": 0.01060974, + "auxiliary_loss_mlp": 0.0102353, + "balance_loss_clip": 1.0111382, + "balance_loss_mlp": 1.02108574, + "epoch": 0.6425973245152563, + "flos": 29350398556800.0, + "grad_norm": 1.9028292480334343, + "language_loss": 0.66564608, + "learning_rate": 1.1338376375533153e-06, + "loss": 0.68649107, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 10688, + "time_per_iteration": 2.4834766387939453 + }, + { + "auxiliary_loss_clip": 0.01059765, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.01328182, + "balance_loss_mlp": 1.0193975, + "epoch": 0.6426574477679242, + "flos": 16429602153600.0, + "grad_norm": 1.7971067230476259, + "language_loss": 0.7302556, + "learning_rate": 1.1334971009973492e-06, + "loss": 0.75111151, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40429688, + "step": 10689, + "time_per_iteration": 3.8264548778533936 + }, + { + "auxiliary_loss_clip": 0.01058441, + "auxiliary_loss_mlp": 0.01023487, + "balance_loss_clip": 1.01164341, + "balance_loss_mlp": 1.01916552, + "epoch": 0.6427175710205922, + "flos": 21834820200960.0, + "grad_norm": 1.874615322853189, + "language_loss": 0.71856678, + "learning_rate": 1.1331565953646443e-06, + "loss": 0.73938602, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 10690, + "time_per_iteration": 3.8153579235076904 + }, + { + "auxiliary_loss_clip": 0.0105674, + "auxiliary_loss_mlp": 0.0102436, + "balance_loss_clip": 1.01192045, + "balance_loss_mlp": 1.01814985, + "epoch": 0.6427776942732601, + "flos": 17785628426880.0, + "grad_norm": 1.58986417423364, + "language_loss": 0.77494049, + "learning_rate": 1.1328161206673512e-06, + "loss": 0.79575157, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.38671875, + "step": 10691, + "time_per_iteration": 2.3906002044677734 + }, + { + "auxiliary_loss_clip": 0.01060749, + "auxiliary_loss_mlp": 0.01025613, + "balance_loss_clip": 1.01236844, + "balance_loss_mlp": 1.01946211, + "epoch": 0.6428378175259282, + "flos": 15084991895040.0, + "grad_norm": 1.7609377407359947, + "language_loss": 0.74443901, + "learning_rate": 1.1324756769176183e-06, + "loss": 0.7653026, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41210938, + "step": 10692, + "time_per_iteration": 2.367703437805176 + }, + { + "auxiliary_loss_clip": 0.01058251, + "auxiliary_loss_mlp": 0.01024847, + "balance_loss_clip": 1.01203775, + "balance_loss_mlp": 1.01955116, + "epoch": 0.6428979407785961, + "flos": 23840650298880.0, + "grad_norm": 1.9898920547118457, + "language_loss": 0.61291486, + "learning_rate": 1.1321352641275978e-06, + "loss": 0.63374585, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.38671875, + "step": 10693, + "time_per_iteration": 2.418081521987915 + }, + { + "auxiliary_loss_clip": 0.01060126, + "auxiliary_loss_mlp": 0.01022446, + "balance_loss_clip": 1.00905323, + "balance_loss_mlp": 1.01847053, + "epoch": 0.6429580640312641, + "flos": 32925469800960.0, + "grad_norm": 1.748474047611073, + "language_loss": 0.59761071, + "learning_rate": 1.1317948823094376e-06, + "loss": 0.61843646, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41796875, + "step": 10694, + "time_per_iteration": 2.46952223777771 + }, + { + "auxiliary_loss_clip": 0.01061447, + "auxiliary_loss_mlp": 0.01027569, + "balance_loss_clip": 1.01521873, + "balance_loss_mlp": 1.02091205, + "epoch": 0.643018187283932, + "flos": 21067324583040.0, + "grad_norm": 1.4793991069298256, + "language_loss": 0.79816502, + "learning_rate": 1.1314545314752844e-06, + "loss": 0.8190552, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40429688, + "step": 10695, + "time_per_iteration": 2.396190881729126 + }, + { + "auxiliary_loss_clip": 0.01060785, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.0171361, + "balance_loss_mlp": 1.02008903, + "epoch": 0.6430783105366, + "flos": 26723428727040.0, + "grad_norm": 2.513119247288457, + "language_loss": 0.73704159, + "learning_rate": 1.1311142116372843e-06, + "loss": 0.75794953, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 10696, + "time_per_iteration": 2.456329107284546 + }, + { + "auxiliary_loss_clip": 0.01057447, + "auxiliary_loss_mlp": 0.01019626, + "balance_loss_clip": 1.00769341, + "balance_loss_mlp": 1.0194484, + "epoch": 0.643138433789268, + "flos": 23695690867200.0, + "grad_norm": 1.528588310676385, + "language_loss": 0.75539917, + "learning_rate": 1.1307739228075838e-06, + "loss": 0.7761699, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37890625, + "step": 10697, + "time_per_iteration": 3.873155117034912 + }, + { + "auxiliary_loss_clip": 0.01057988, + "auxiliary_loss_mlp": 0.01026487, + "balance_loss_clip": 1.01489437, + "balance_loss_mlp": 1.01897883, + "epoch": 0.643198557041936, + "flos": 34200812188800.0, + "grad_norm": 2.3365686384593585, + "language_loss": 0.65868807, + "learning_rate": 1.1304336649983257e-06, + "loss": 0.67953283, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 10698, + "time_per_iteration": 2.5300722122192383 + }, + { + "auxiliary_loss_clip": 0.01007615, + "auxiliary_loss_mlp": 0.01001929, + "balance_loss_clip": 1.00102317, + "balance_loss_mlp": 1.0012399, + "epoch": 0.643258680294604, + "flos": 67623059992320.0, + "grad_norm": 0.8414547431412333, + "language_loss": 0.63395846, + "learning_rate": 1.1300934382216536e-06, + "loss": 0.65405393, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06347656, + "step": 10699, + "time_per_iteration": 3.0326898097991943 + }, + { + "auxiliary_loss_clip": 0.01058568, + "auxiliary_loss_mlp": 0.01021356, + "balance_loss_clip": 1.01041269, + "balance_loss_mlp": 1.02016163, + "epoch": 0.6433188035472719, + "flos": 25184981266560.0, + "grad_norm": 1.657798438853893, + "language_loss": 0.69389212, + "learning_rate": 1.129753242489708e-06, + "loss": 0.7146914, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38476562, + "step": 10700, + "time_per_iteration": 2.427807331085205 + }, + { + "auxiliary_loss_clip": 0.01060133, + "auxiliary_loss_mlp": 0.01022467, + "balance_loss_clip": 1.00996172, + "balance_loss_mlp": 1.02005696, + "epoch": 0.6433789267999399, + "flos": 24972394798080.0, + "grad_norm": 1.8104119419966371, + "language_loss": 0.73536515, + "learning_rate": 1.1294130778146325e-06, + "loss": 0.75619113, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40039062, + "step": 10701, + "time_per_iteration": 2.4760842323303223 + }, + { + "auxiliary_loss_clip": 0.01060037, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.0130744, + "balance_loss_mlp": 1.01842666, + "epoch": 0.6434390500526078, + "flos": 17565082168320.0, + "grad_norm": 2.1241586049598755, + "language_loss": 0.81755519, + "learning_rate": 1.129072944208563e-06, + "loss": 0.83842576, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.41601562, + "step": 10702, + "time_per_iteration": 2.378952980041504 + }, + { + "auxiliary_loss_clip": 0.01057555, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.01498652, + "balance_loss_mlp": 1.01938689, + "epoch": 0.6434991733052758, + "flos": 20842728606720.0, + "grad_norm": 1.9756698817009204, + "language_loss": 0.7416628, + "learning_rate": 1.1287328416836408e-06, + "loss": 0.76250255, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 10703, + "time_per_iteration": 2.402233839035034 + }, + { + "auxiliary_loss_clip": 0.01007234, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.00099683, + "balance_loss_mlp": 1.00068808, + "epoch": 0.6435592965579437, + "flos": 66091210778880.0, + "grad_norm": 0.6564666511866133, + "language_loss": 0.54598528, + "learning_rate": 1.1283927702520013e-06, + "loss": 0.56607723, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.06542969, + "step": 10704, + "time_per_iteration": 3.0541574954986572 + }, + { + "auxiliary_loss_clip": 0.01058036, + "auxiliary_loss_mlp": 0.01019448, + "balance_loss_clip": 1.00863588, + "balance_loss_mlp": 1.0185945, + "epoch": 0.6436194198106118, + "flos": 23767716735360.0, + "grad_norm": 1.6200584715381823, + "language_loss": 0.77232695, + "learning_rate": 1.1280527299257835e-06, + "loss": 0.79310173, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.39453125, + "step": 10705, + "time_per_iteration": 2.4520156383514404 + }, + { + "auxiliary_loss_clip": 0.01058488, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.01701117, + "balance_loss_mlp": 1.01896381, + "epoch": 0.6436795430632797, + "flos": 20229269374080.0, + "grad_norm": 2.7418741317005053, + "language_loss": 0.78896368, + "learning_rate": 1.1277127207171201e-06, + "loss": 0.80983424, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39453125, + "step": 10706, + "time_per_iteration": 2.3776750564575195 + }, + { + "auxiliary_loss_clip": 0.01057355, + "auxiliary_loss_mlp": 0.0102341, + "balance_loss_clip": 1.01093435, + "balance_loss_mlp": 1.01833069, + "epoch": 0.6437396663159477, + "flos": 20300841394560.0, + "grad_norm": 2.607422204487501, + "language_loss": 0.80039608, + "learning_rate": 1.127372742638145e-06, + "loss": 0.82120377, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 10707, + "time_per_iteration": 2.3964855670928955 + }, + { + "auxiliary_loss_clip": 0.0105795, + "auxiliary_loss_mlp": 0.01025725, + "balance_loss_clip": 1.01336861, + "balance_loss_mlp": 1.01876605, + "epoch": 0.6437997895686156, + "flos": 23877448738560.0, + "grad_norm": 2.020991527681803, + "language_loss": 0.85575688, + "learning_rate": 1.1270327957009937e-06, + "loss": 0.87659353, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39257812, + "step": 10708, + "time_per_iteration": 2.417663097381592 + }, + { + "auxiliary_loss_clip": 0.01065538, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.01535058, + "balance_loss_mlp": 1.0221982, + "epoch": 0.6438599128212836, + "flos": 18988281630720.0, + "grad_norm": 9.915388689269797, + "language_loss": 0.65415263, + "learning_rate": 1.126692879917795e-06, + "loss": 0.6750989, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.43359375, + "step": 10709, + "time_per_iteration": 2.4161293506622314 + }, + { + "auxiliary_loss_clip": 0.01057051, + "auxiliary_loss_mlp": 0.01026166, + "balance_loss_clip": 1.01435196, + "balance_loss_mlp": 1.01830912, + "epoch": 0.6439200360739517, + "flos": 24095236999680.0, + "grad_norm": 1.8346335941142005, + "language_loss": 0.80303723, + "learning_rate": 1.1263529953006816e-06, + "loss": 0.82386935, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38671875, + "step": 10710, + "time_per_iteration": 2.3984463214874268 + }, + { + "auxiliary_loss_clip": 0.0105701, + "auxiliary_loss_mlp": 0.01022289, + "balance_loss_clip": 1.010988, + "balance_loss_mlp": 1.01790202, + "epoch": 0.6439801593266196, + "flos": 31900873864320.0, + "grad_norm": 1.6491627793362091, + "language_loss": 0.66114819, + "learning_rate": 1.1260131418617826e-06, + "loss": 0.68194115, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.390625, + "step": 10711, + "time_per_iteration": 2.4800827503204346 + }, + { + "auxiliary_loss_clip": 0.01060248, + "auxiliary_loss_mlp": 0.01024847, + "balance_loss_clip": 1.01259232, + "balance_loss_mlp": 1.02089572, + "epoch": 0.6440402825792876, + "flos": 27124650604800.0, + "grad_norm": 1.7856638902672943, + "language_loss": 0.78872693, + "learning_rate": 1.1256733196132264e-06, + "loss": 0.80957788, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39257812, + "step": 10712, + "time_per_iteration": 2.447319984436035 + }, + { + "auxiliary_loss_clip": 0.01060381, + "auxiliary_loss_mlp": 0.01025475, + "balance_loss_clip": 1.0128746, + "balance_loss_mlp": 1.01941776, + "epoch": 0.6441004058319555, + "flos": 20666661287040.0, + "grad_norm": 1.6179851974556119, + "language_loss": 0.82626414, + "learning_rate": 1.1253335285671393e-06, + "loss": 0.84712267, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40820312, + "step": 10713, + "time_per_iteration": 2.3922464847564697 + }, + { + "auxiliary_loss_clip": 0.01057577, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.01863337, + "balance_loss_mlp": 1.02031136, + "epoch": 0.6441605290846235, + "flos": 26024956600320.0, + "grad_norm": 1.4039171677183582, + "language_loss": 0.78172332, + "learning_rate": 1.1249937687356497e-06, + "loss": 0.80259711, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37304688, + "step": 10714, + "time_per_iteration": 2.441710948944092 + }, + { + "auxiliary_loss_clip": 0.01056053, + "auxiliary_loss_mlp": 0.01022026, + "balance_loss_clip": 1.01116014, + "balance_loss_mlp": 1.01859021, + "epoch": 0.6442206523372914, + "flos": 24898344071040.0, + "grad_norm": 6.089144652753937, + "language_loss": 0.8148886, + "learning_rate": 1.1246540401308818e-06, + "loss": 0.8356694, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.375, + "step": 10715, + "time_per_iteration": 2.472975254058838 + }, + { + "auxiliary_loss_clip": 0.01056906, + "auxiliary_loss_mlp": 0.01026364, + "balance_loss_clip": 1.01414466, + "balance_loss_mlp": 1.01711118, + "epoch": 0.6442807755899594, + "flos": 25155130187520.0, + "grad_norm": 1.8293641044849347, + "language_loss": 0.81934845, + "learning_rate": 1.1243143427649596e-06, + "loss": 0.84018117, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3984375, + "step": 10716, + "time_per_iteration": 2.4180095195770264 + }, + { + "auxiliary_loss_clip": 0.01060251, + "auxiliary_loss_mlp": 0.01025173, + "balance_loss_clip": 1.01175618, + "balance_loss_mlp": 1.01943362, + "epoch": 0.6443408988426274, + "flos": 27343276738560.0, + "grad_norm": 2.2174835739410215, + "language_loss": 0.78299862, + "learning_rate": 1.1239746766500048e-06, + "loss": 0.80385292, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41015625, + "step": 10717, + "time_per_iteration": 2.4890658855438232 + }, + { + "auxiliary_loss_clip": 0.01058526, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.01632261, + "balance_loss_mlp": 1.02001846, + "epoch": 0.6444010220952954, + "flos": 27927094360320.0, + "grad_norm": 1.7701877431768358, + "language_loss": 0.68578339, + "learning_rate": 1.123635041798142e-06, + "loss": 0.70665812, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38476562, + "step": 10718, + "time_per_iteration": 2.440753936767578 + }, + { + "auxiliary_loss_clip": 0.01007499, + "auxiliary_loss_mlp": 0.01000252, + "balance_loss_clip": 0.99940574, + "balance_loss_mlp": 1.0007776, + "epoch": 0.6444611453479633, + "flos": 71212167469440.0, + "grad_norm": 0.7705128891995746, + "language_loss": 0.49666423, + "learning_rate": 1.123295438221489e-06, + "loss": 0.51674169, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.00848389, + "router_z_loss_mlp": 0.06738281, + "step": 10719, + "time_per_iteration": 4.612186431884766 + }, + { + "auxiliary_loss_clip": 0.01062005, + "auxiliary_loss_mlp": 0.01026568, + "balance_loss_clip": 1.01391363, + "balance_loss_mlp": 1.02093458, + "epoch": 0.6445212686006313, + "flos": 22704192766080.0, + "grad_norm": 1.6732996418971795, + "language_loss": 0.77794433, + "learning_rate": 1.1229558659321674e-06, + "loss": 0.79883003, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 10720, + "time_per_iteration": 2.4062206745147705 + }, + { + "auxiliary_loss_clip": 0.01059446, + "auxiliary_loss_mlp": 0.01027248, + "balance_loss_clip": 1.01509452, + "balance_loss_mlp": 1.01943719, + "epoch": 0.6445813918532992, + "flos": 21177754813440.0, + "grad_norm": 1.7217160996168188, + "language_loss": 0.76866686, + "learning_rate": 1.1226163249422955e-06, + "loss": 0.78953373, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3984375, + "step": 10721, + "time_per_iteration": 2.3859145641326904 + }, + { + "auxiliary_loss_clip": 0.01058624, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.01395392, + "balance_loss_mlp": 1.01860905, + "epoch": 0.6446415151059672, + "flos": 25190741640960.0, + "grad_norm": 1.9216229794182322, + "language_loss": 0.73470151, + "learning_rate": 1.1222768152639887e-06, + "loss": 0.75555873, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40039062, + "step": 10722, + "time_per_iteration": 2.4377055168151855 + }, + { + "auxiliary_loss_clip": 0.01058117, + "auxiliary_loss_mlp": 0.01020591, + "balance_loss_clip": 1.00880754, + "balance_loss_mlp": 1.01998115, + "epoch": 0.6447016383586353, + "flos": 25301032225920.0, + "grad_norm": 8.247711852209012, + "language_loss": 0.78201407, + "learning_rate": 1.1219373369093652e-06, + "loss": 0.80280113, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38085938, + "step": 10723, + "time_per_iteration": 2.4255928993225098 + }, + { + "auxiliary_loss_clip": 0.01061226, + "auxiliary_loss_mlp": 0.01029059, + "balance_loss_clip": 1.01616669, + "balance_loss_mlp": 1.02024519, + "epoch": 0.6447617616113032, + "flos": 27702079447680.0, + "grad_norm": 1.6629447426783972, + "language_loss": 0.58584929, + "learning_rate": 1.121597889890539e-06, + "loss": 0.60675216, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 10724, + "time_per_iteration": 2.474330186843872 + }, + { + "auxiliary_loss_clip": 0.01058839, + "auxiliary_loss_mlp": 0.01025145, + "balance_loss_clip": 1.01411223, + "balance_loss_mlp": 1.02049565, + "epoch": 0.6448218848639712, + "flos": 23037997075200.0, + "grad_norm": 1.6957165717373714, + "language_loss": 0.83548027, + "learning_rate": 1.1212584742196258e-06, + "loss": 0.85632002, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 10725, + "time_per_iteration": 2.402435541152954 + }, + { + "auxiliary_loss_clip": 0.01058952, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.0163281, + "balance_loss_mlp": 1.01930285, + "epoch": 0.6448820081166391, + "flos": 24495027511680.0, + "grad_norm": 1.772012276017699, + "language_loss": 0.8233974, + "learning_rate": 1.120919089908736e-06, + "loss": 0.84427154, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39648438, + "step": 10726, + "time_per_iteration": 2.4632811546325684 + }, + { + "auxiliary_loss_clip": 0.01060955, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01516366, + "balance_loss_mlp": 1.01998782, + "epoch": 0.6449421313693071, + "flos": 22418183975040.0, + "grad_norm": 1.7429882605195839, + "language_loss": 0.81427383, + "learning_rate": 1.1205797369699835e-06, + "loss": 0.83515996, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 10727, + "time_per_iteration": 2.407719373703003 + }, + { + "auxiliary_loss_clip": 0.01062896, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.01512027, + "balance_loss_mlp": 1.01915002, + "epoch": 0.645002254621975, + "flos": 20224800720000.0, + "grad_norm": 1.9386015204435336, + "language_loss": 0.79412699, + "learning_rate": 1.1202404154154773e-06, + "loss": 0.8150425, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4375, + "step": 10728, + "time_per_iteration": 2.4079718589782715 + }, + { + "auxiliary_loss_clip": 0.01059894, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.01519799, + "balance_loss_mlp": 1.01951289, + "epoch": 0.645062377874643, + "flos": 27854195708160.0, + "grad_norm": 1.6568942517679146, + "language_loss": 0.74310392, + "learning_rate": 1.1199011252573284e-06, + "loss": 0.76398838, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40429688, + "step": 10729, + "time_per_iteration": 3.9488227367401123 + }, + { + "auxiliary_loss_clip": 0.01060725, + "auxiliary_loss_mlp": 0.01027602, + "balance_loss_clip": 1.01438189, + "balance_loss_mlp": 1.02003336, + "epoch": 0.645122501127311, + "flos": 25300333998720.0, + "grad_norm": 1.5847718589479356, + "language_loss": 0.6645093, + "learning_rate": 1.1195618665076434e-06, + "loss": 0.68539262, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40625, + "step": 10730, + "time_per_iteration": 2.512704610824585 + }, + { + "auxiliary_loss_clip": 0.01059412, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.01518631, + "balance_loss_mlp": 1.01851845, + "epoch": 0.645182624379979, + "flos": 18806349202560.0, + "grad_norm": 1.460952526390692, + "language_loss": 0.63289982, + "learning_rate": 1.1192226391785315e-06, + "loss": 0.65377891, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41015625, + "step": 10731, + "time_per_iteration": 2.425337314605713 + }, + { + "auxiliary_loss_clip": 0.0105811, + "auxiliary_loss_mlp": 0.01028577, + "balance_loss_clip": 1.01555955, + "balance_loss_mlp": 1.01913023, + "epoch": 0.6452427476326469, + "flos": 18331216243200.0, + "grad_norm": 1.460531679056676, + "language_loss": 0.67849207, + "learning_rate": 1.118883443282098e-06, + "loss": 0.69935894, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.390625, + "step": 10732, + "time_per_iteration": 2.423727512359619 + }, + { + "auxiliary_loss_clip": 0.01057048, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.01704752, + "balance_loss_mlp": 1.01929307, + "epoch": 0.6453028708853149, + "flos": 22783619842560.0, + "grad_norm": 1.7329005561442643, + "language_loss": 0.75517845, + "learning_rate": 1.1185442788304477e-06, + "loss": 0.77603245, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37695312, + "step": 10733, + "time_per_iteration": 2.4079771041870117 + }, + { + "auxiliary_loss_clip": 0.01061817, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.01732063, + "balance_loss_mlp": 1.0204308, + "epoch": 0.6453629941379828, + "flos": 23945005952640.0, + "grad_norm": 2.2387084380124103, + "language_loss": 0.66398644, + "learning_rate": 1.118205145835684e-06, + "loss": 0.68490636, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 10734, + "time_per_iteration": 2.4401044845581055 + }, + { + "auxiliary_loss_clip": 0.01058296, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.01537848, + "balance_loss_mlp": 1.02089143, + "epoch": 0.6454231173906508, + "flos": 17675407664640.0, + "grad_norm": 2.0477620430529564, + "language_loss": 0.73694676, + "learning_rate": 1.1178660443099124e-06, + "loss": 0.75780118, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.375, + "step": 10735, + "time_per_iteration": 2.438272714614868 + }, + { + "auxiliary_loss_clip": 0.01058493, + "auxiliary_loss_mlp": 0.01024177, + "balance_loss_clip": 1.01201773, + "balance_loss_mlp": 1.01944339, + "epoch": 0.6454832406433189, + "flos": 23291710992000.0, + "grad_norm": 4.504835338387322, + "language_loss": 0.7494036, + "learning_rate": 1.1175269742652313e-06, + "loss": 0.77023035, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 10736, + "time_per_iteration": 3.9737565517425537 + }, + { + "auxiliary_loss_clip": 0.01061039, + "auxiliary_loss_mlp": 0.01024802, + "balance_loss_clip": 1.01201081, + "balance_loss_mlp": 1.01981676, + "epoch": 0.6455433638959868, + "flos": 20156161253760.0, + "grad_norm": 2.4272032995042667, + "language_loss": 0.70311034, + "learning_rate": 1.117187935713742e-06, + "loss": 0.72396874, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41210938, + "step": 10737, + "time_per_iteration": 2.4350972175598145 + }, + { + "auxiliary_loss_clip": 0.01007984, + "auxiliary_loss_mlp": 0.010028, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00147367, + "epoch": 0.6456034871486548, + "flos": 66529510387200.0, + "grad_norm": 0.776223266051742, + "language_loss": 0.58489293, + "learning_rate": 1.1168489286675455e-06, + "loss": 0.60500079, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.00830078, + "router_z_loss_mlp": 0.06542969, + "step": 10738, + "time_per_iteration": 3.116466522216797 + }, + { + "auxiliary_loss_clip": 0.01058285, + "auxiliary_loss_mlp": 0.01023285, + "balance_loss_clip": 1.01130438, + "balance_loss_mlp": 1.01972842, + "epoch": 0.6456636104013227, + "flos": 24204969002880.0, + "grad_norm": 1.7192253059982823, + "language_loss": 0.74642813, + "learning_rate": 1.1165099531387379e-06, + "loss": 0.76724386, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38476562, + "step": 10739, + "time_per_iteration": 2.4169859886169434 + }, + { + "auxiliary_loss_clip": 0.01062218, + "auxiliary_loss_mlp": 0.01023615, + "balance_loss_clip": 1.01049018, + "balance_loss_mlp": 1.02046108, + "epoch": 0.6457237336539907, + "flos": 23622931860480.0, + "grad_norm": 2.1866241806196745, + "language_loss": 0.71525705, + "learning_rate": 1.116171009139418e-06, + "loss": 0.7361154, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41796875, + "step": 10740, + "time_per_iteration": 2.405059814453125 + }, + { + "auxiliary_loss_clip": 0.0100724, + "auxiliary_loss_mlp": 0.0100104, + "balance_loss_clip": 1.00011063, + "balance_loss_mlp": 1.00097704, + "epoch": 0.6457838569066586, + "flos": 65842454275200.0, + "grad_norm": 1.2999038651679522, + "language_loss": 0.55402178, + "learning_rate": 1.1158320966816806e-06, + "loss": 0.57410461, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.0625, + "step": 10741, + "time_per_iteration": 3.090751886367798 + }, + { + "auxiliary_loss_clip": 0.01057188, + "auxiliary_loss_mlp": 0.01026396, + "balance_loss_clip": 1.01483238, + "balance_loss_mlp": 1.01900768, + "epoch": 0.6458439801593266, + "flos": 22380896776320.0, + "grad_norm": 1.5230233393096009, + "language_loss": 0.8026005, + "learning_rate": 1.1154932157776228e-06, + "loss": 0.82343638, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3828125, + "step": 10742, + "time_per_iteration": 2.400771141052246 + }, + { + "auxiliary_loss_clip": 0.01056195, + "auxiliary_loss_mlp": 0.01023654, + "balance_loss_clip": 1.01166189, + "balance_loss_mlp": 1.01898992, + "epoch": 0.6459041034119946, + "flos": 24788123308800.0, + "grad_norm": 2.006347246132335, + "language_loss": 0.81715477, + "learning_rate": 1.1151543664393354e-06, + "loss": 0.83795321, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37109375, + "step": 10743, + "time_per_iteration": 2.4524827003479004 + }, + { + "auxiliary_loss_clip": 0.01057437, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.01417053, + "balance_loss_mlp": 1.01871479, + "epoch": 0.6459642266646626, + "flos": 18324583084800.0, + "grad_norm": 2.1045359084953823, + "language_loss": 0.71930718, + "learning_rate": 1.1148155486789134e-06, + "loss": 0.74013919, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 10744, + "time_per_iteration": 2.349677562713623 + }, + { + "auxiliary_loss_clip": 0.01057607, + "auxiliary_loss_mlp": 0.0102425, + "balance_loss_clip": 1.01222754, + "balance_loss_mlp": 1.01825035, + "epoch": 0.6460243499173305, + "flos": 43579670094720.0, + "grad_norm": 1.681432533230374, + "language_loss": 0.66362715, + "learning_rate": 1.1144767625084477e-06, + "loss": 0.68444574, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 10745, + "time_per_iteration": 2.6226656436920166 + }, + { + "auxiliary_loss_clip": 0.01058451, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.01691151, + "balance_loss_mlp": 1.01935744, + "epoch": 0.6460844731699985, + "flos": 19243042888320.0, + "grad_norm": 2.0362273295777755, + "language_loss": 0.73724157, + "learning_rate": 1.1141380079400282e-06, + "loss": 0.75811809, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 10746, + "time_per_iteration": 2.3609185218811035 + }, + { + "auxiliary_loss_clip": 0.01060881, + "auxiliary_loss_mlp": 0.01025278, + "balance_loss_clip": 1.0133872, + "balance_loss_mlp": 1.02054262, + "epoch": 0.6461445964226664, + "flos": 27452135957760.0, + "grad_norm": 1.3926674455656094, + "language_loss": 0.66802347, + "learning_rate": 1.1137992849857437e-06, + "loss": 0.68888509, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40234375, + "step": 10747, + "time_per_iteration": 2.472407579421997 + }, + { + "auxiliary_loss_clip": 0.0105725, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.0167284, + "balance_loss_mlp": 1.01896918, + "epoch": 0.6462047196753344, + "flos": 20294662083840.0, + "grad_norm": 1.5256170315896045, + "language_loss": 0.6714313, + "learning_rate": 1.1134605936576841e-06, + "loss": 0.69229198, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3828125, + "step": 10748, + "time_per_iteration": 2.4116933345794678 + }, + { + "auxiliary_loss_clip": 0.01062058, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.01706076, + "balance_loss_mlp": 1.02036428, + "epoch": 0.6462648429280025, + "flos": 22017241388160.0, + "grad_norm": 1.7568458837759513, + "language_loss": 0.76134765, + "learning_rate": 1.1131219339679355e-06, + "loss": 0.78227299, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41796875, + "step": 10749, + "time_per_iteration": 2.425565481185913 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.0102477, + "balance_loss_clip": 1.01287913, + "balance_loss_mlp": 1.01938939, + "epoch": 0.6463249661806704, + "flos": 27779935512960.0, + "grad_norm": 1.4861647513655087, + "language_loss": 0.76890659, + "learning_rate": 1.1127833059285837e-06, + "loss": 0.78974402, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 10750, + "time_per_iteration": 2.461064100265503 + }, + { + "auxiliary_loss_clip": 0.01061334, + "auxiliary_loss_mlp": 0.01025371, + "balance_loss_clip": 1.01252604, + "balance_loss_mlp": 1.01998711, + "epoch": 0.6463850894333384, + "flos": 22049606085120.0, + "grad_norm": 2.144767780716493, + "language_loss": 0.68237567, + "learning_rate": 1.1124447095517132e-06, + "loss": 0.70324272, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 10751, + "time_per_iteration": 2.423534631729126 + }, + { + "auxiliary_loss_clip": 0.01058418, + "auxiliary_loss_mlp": 0.01024433, + "balance_loss_clip": 1.01207733, + "balance_loss_mlp": 1.01846564, + "epoch": 0.6464452126860063, + "flos": 21105170363520.0, + "grad_norm": 1.803223206177529, + "language_loss": 0.71564627, + "learning_rate": 1.1121061448494082e-06, + "loss": 0.73647475, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40039062, + "step": 10752, + "time_per_iteration": 2.3984375 + }, + { + "auxiliary_loss_clip": 0.010596, + "auxiliary_loss_mlp": 0.01026565, + "balance_loss_clip": 1.01225948, + "balance_loss_mlp": 1.01951551, + "epoch": 0.6465053359386743, + "flos": 16027298023680.0, + "grad_norm": 1.6097438454980397, + "language_loss": 0.77822369, + "learning_rate": 1.111767611833751e-06, + "loss": 0.79908538, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.40039062, + "step": 10753, + "time_per_iteration": 2.3679893016815186 + }, + { + "auxiliary_loss_clip": 0.01058807, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.01995385, + "balance_loss_mlp": 1.01968622, + "epoch": 0.6465654591913422, + "flos": 23397707479680.0, + "grad_norm": 1.547744457707381, + "language_loss": 0.85043085, + "learning_rate": 1.111429110516822e-06, + "loss": 0.87134099, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.390625, + "step": 10754, + "time_per_iteration": 2.3965883255004883 + }, + { + "auxiliary_loss_clip": 0.01057724, + "auxiliary_loss_mlp": 0.01027595, + "balance_loss_clip": 1.01517379, + "balance_loss_mlp": 1.0186342, + "epoch": 0.6466255824440102, + "flos": 15376377035520.0, + "grad_norm": 1.9896311764077086, + "language_loss": 0.59708393, + "learning_rate": 1.1110906409107042e-06, + "loss": 0.61793709, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 10755, + "time_per_iteration": 2.3908159732818604 + }, + { + "auxiliary_loss_clip": 0.01058462, + "auxiliary_loss_mlp": 0.01024025, + "balance_loss_clip": 1.01181173, + "balance_loss_mlp": 1.01958978, + "epoch": 0.6466857056966782, + "flos": 16251928911360.0, + "grad_norm": 2.15801475098609, + "language_loss": 0.67833358, + "learning_rate": 1.1107522030274733e-06, + "loss": 0.69915843, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38867188, + "step": 10756, + "time_per_iteration": 2.3551363945007324 + }, + { + "auxiliary_loss_clip": 0.01062489, + "auxiliary_loss_mlp": 0.01029708, + "balance_loss_clip": 1.0167917, + "balance_loss_mlp": 1.02042127, + "epoch": 0.6467458289493462, + "flos": 21177196231680.0, + "grad_norm": 3.3893094903144094, + "language_loss": 0.74307531, + "learning_rate": 1.110413796879209e-06, + "loss": 0.76399726, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.421875, + "step": 10757, + "time_per_iteration": 2.391309976577759 + }, + { + "auxiliary_loss_clip": 0.01057399, + "auxiliary_loss_mlp": 0.01027302, + "balance_loss_clip": 1.01491046, + "balance_loss_mlp": 1.01937938, + "epoch": 0.6468059522020141, + "flos": 17967316475520.0, + "grad_norm": 1.4877522871940356, + "language_loss": 0.74410379, + "learning_rate": 1.1100754224779879e-06, + "loss": 0.76495081, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.37890625, + "step": 10758, + "time_per_iteration": 3.837752103805542 + }, + { + "auxiliary_loss_clip": 0.01058639, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01606774, + "balance_loss_mlp": 1.01909494, + "epoch": 0.6468660754546821, + "flos": 17889320764800.0, + "grad_norm": 2.029331444063699, + "language_loss": 0.6776709, + "learning_rate": 1.1097370798358871e-06, + "loss": 0.69855928, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.39453125, + "step": 10759, + "time_per_iteration": 2.397392749786377 + }, + { + "auxiliary_loss_clip": 0.01061733, + "auxiliary_loss_mlp": 0.01026199, + "balance_loss_clip": 1.01335382, + "balance_loss_mlp": 1.02017808, + "epoch": 0.64692619870735, + "flos": 22599906935040.0, + "grad_norm": 2.957673803441603, + "language_loss": 0.75494909, + "learning_rate": 1.1093987689649784e-06, + "loss": 0.77582836, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41601562, + "step": 10760, + "time_per_iteration": 2.4047811031341553 + }, + { + "auxiliary_loss_clip": 0.01057098, + "auxiliary_loss_mlp": 0.01023864, + "balance_loss_clip": 1.0118773, + "balance_loss_mlp": 1.01824355, + "epoch": 0.646986321960018, + "flos": 49598940867840.0, + "grad_norm": 1.5103157644277634, + "language_loss": 0.65412408, + "learning_rate": 1.1090604898773377e-06, + "loss": 0.67493367, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38867188, + "step": 10761, + "time_per_iteration": 2.665527582168579 + }, + { + "auxiliary_loss_clip": 0.01060504, + "auxiliary_loss_mlp": 0.01023368, + "balance_loss_clip": 1.01002824, + "balance_loss_mlp": 1.01939476, + "epoch": 0.6470464452126861, + "flos": 21907369739520.0, + "grad_norm": 2.002397632079946, + "language_loss": 0.69571948, + "learning_rate": 1.1087222425850362e-06, + "loss": 0.71655816, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41015625, + "step": 10762, + "time_per_iteration": 2.4245190620422363 + }, + { + "auxiliary_loss_clip": 0.01060664, + "auxiliary_loss_mlp": 0.01027049, + "balance_loss_clip": 1.0136975, + "balance_loss_mlp": 1.01928616, + "epoch": 0.647106568465354, + "flos": 18105363457920.0, + "grad_norm": 1.8832454087691133, + "language_loss": 0.82648921, + "learning_rate": 1.1083840271001452e-06, + "loss": 0.84736633, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 10763, + "time_per_iteration": 2.398571729660034 + }, + { + "auxiliary_loss_clip": 0.01007524, + "auxiliary_loss_mlp": 0.01000885, + "balance_loss_clip": 0.99993098, + "balance_loss_mlp": 1.00107479, + "epoch": 0.647166691718022, + "flos": 69476773829760.0, + "grad_norm": 0.7117260030428895, + "language_loss": 0.57686615, + "learning_rate": 1.1080458434347337e-06, + "loss": 0.59695029, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.06445312, + "step": 10764, + "time_per_iteration": 3.22666335105896 + }, + { + "auxiliary_loss_clip": 0.01057187, + "auxiliary_loss_mlp": 0.01025553, + "balance_loss_clip": 1.01284575, + "balance_loss_mlp": 1.01831794, + "epoch": 0.6472268149706899, + "flos": 34093733448960.0, + "grad_norm": 1.9827486489092954, + "language_loss": 0.60619116, + "learning_rate": 1.107707691600873e-06, + "loss": 0.62701857, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38867188, + "step": 10765, + "time_per_iteration": 2.531791925430298 + }, + { + "auxiliary_loss_clip": 0.01057957, + "auxiliary_loss_mlp": 0.01021763, + "balance_loss_clip": 1.00995517, + "balance_loss_mlp": 1.01918662, + "epoch": 0.6472869382233579, + "flos": 28109969395200.0, + "grad_norm": 1.9397081000460596, + "language_loss": 0.62887013, + "learning_rate": 1.1073695716106293e-06, + "loss": 0.64966732, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38867188, + "step": 10766, + "time_per_iteration": 2.4513356685638428 + }, + { + "auxiliary_loss_clip": 0.01059059, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.01282024, + "balance_loss_mlp": 1.01914883, + "epoch": 0.6473470614760258, + "flos": 22491047715840.0, + "grad_norm": 1.8159015729127852, + "language_loss": 0.76122069, + "learning_rate": 1.1070314834760693e-06, + "loss": 0.78206551, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3984375, + "step": 10767, + "time_per_iteration": 2.464421272277832 + }, + { + "auxiliary_loss_clip": 0.01057539, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.015589, + "balance_loss_mlp": 1.01922727, + "epoch": 0.6474071847286939, + "flos": 14538042535680.0, + "grad_norm": 2.1740246808299117, + "language_loss": 0.77683854, + "learning_rate": 1.1066934272092588e-06, + "loss": 0.79769313, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.3828125, + "step": 10768, + "time_per_iteration": 2.398916244506836 + }, + { + "auxiliary_loss_clip": 0.01058876, + "auxiliary_loss_mlp": 0.01019657, + "balance_loss_clip": 1.00820732, + "balance_loss_mlp": 1.01960683, + "epoch": 0.6474673079813618, + "flos": 24097052390400.0, + "grad_norm": 1.7253836105905518, + "language_loss": 0.71421432, + "learning_rate": 1.106355402822262e-06, + "loss": 0.73499966, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39257812, + "step": 10769, + "time_per_iteration": 3.882082223892212 + }, + { + "auxiliary_loss_clip": 0.01055987, + "auxiliary_loss_mlp": 0.01024932, + "balance_loss_clip": 1.01335645, + "balance_loss_mlp": 1.01926374, + "epoch": 0.6475274312340298, + "flos": 14975294803200.0, + "grad_norm": 2.047972742836389, + "language_loss": 0.72530305, + "learning_rate": 1.106017410327142e-06, + "loss": 0.74611223, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3671875, + "step": 10770, + "time_per_iteration": 2.374680757522583 + }, + { + "auxiliary_loss_clip": 0.01060905, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01509809, + "balance_loss_mlp": 1.01925588, + "epoch": 0.6475875544866977, + "flos": 25044176286720.0, + "grad_norm": 1.5817430093172882, + "language_loss": 0.73688793, + "learning_rate": 1.1056794497359604e-06, + "loss": 0.75779164, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.41796875, + "step": 10771, + "time_per_iteration": 2.430501699447632 + }, + { + "auxiliary_loss_clip": 0.01059264, + "auxiliary_loss_mlp": 0.01028374, + "balance_loss_clip": 1.01577377, + "balance_loss_mlp": 1.02064216, + "epoch": 0.6476476777393657, + "flos": 16471218360960.0, + "grad_norm": 2.940044765678426, + "language_loss": 0.73604083, + "learning_rate": 1.1053415210607803e-06, + "loss": 0.75691724, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.38671875, + "step": 10772, + "time_per_iteration": 2.378213405609131 + }, + { + "auxiliary_loss_clip": 0.01057202, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.01508117, + "balance_loss_mlp": 1.01870179, + "epoch": 0.6477078009920336, + "flos": 25811078411520.0, + "grad_norm": 1.518830606026181, + "language_loss": 0.82936728, + "learning_rate": 1.1050036243136587e-06, + "loss": 0.8502003, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38476562, + "step": 10773, + "time_per_iteration": 2.4222240447998047 + }, + { + "auxiliary_loss_clip": 0.0105397, + "auxiliary_loss_mlp": 0.01024, + "balance_loss_clip": 1.01227617, + "balance_loss_mlp": 1.01694167, + "epoch": 0.6477679242447016, + "flos": 17675163285120.0, + "grad_norm": 1.7926291914823196, + "language_loss": 0.78294241, + "learning_rate": 1.104665759506656e-06, + "loss": 0.80372214, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37109375, + "step": 10774, + "time_per_iteration": 2.3941547870635986 + }, + { + "auxiliary_loss_clip": 0.01059083, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.01804101, + "balance_loss_mlp": 1.01893628, + "epoch": 0.6478280474973696, + "flos": 21031259281920.0, + "grad_norm": 3.5585735027873966, + "language_loss": 0.68900883, + "learning_rate": 1.1043279266518285e-06, + "loss": 0.70990849, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 10775, + "time_per_iteration": 2.370875835418701 + }, + { + "auxiliary_loss_clip": 0.01060729, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.01682734, + "balance_loss_mlp": 1.02023411, + "epoch": 0.6478881707500376, + "flos": 21615844953600.0, + "grad_norm": 2.045905052943998, + "language_loss": 0.65658057, + "learning_rate": 1.103990125761235e-06, + "loss": 0.67748666, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40429688, + "step": 10776, + "time_per_iteration": 3.779775857925415 + }, + { + "auxiliary_loss_clip": 0.0105878, + "auxiliary_loss_mlp": 0.01022781, + "balance_loss_clip": 1.00952435, + "balance_loss_mlp": 1.01817012, + "epoch": 0.6479482940027056, + "flos": 18441576650880.0, + "grad_norm": 2.0699495855480787, + "language_loss": 0.79160237, + "learning_rate": 1.1036523568469276e-06, + "loss": 0.81241798, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40625, + "step": 10777, + "time_per_iteration": 2.3761472702026367 + }, + { + "auxiliary_loss_clip": 0.01060657, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.01377988, + "balance_loss_mlp": 1.02071428, + "epoch": 0.6480084172553735, + "flos": 22162968869760.0, + "grad_norm": 1.8108079960138417, + "language_loss": 0.7373904, + "learning_rate": 1.1033146199209627e-06, + "loss": 0.75826651, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40039062, + "step": 10778, + "time_per_iteration": 2.4375979900360107 + }, + { + "auxiliary_loss_clip": 0.01056563, + "auxiliary_loss_mlp": 0.01025474, + "balance_loss_clip": 1.01317132, + "balance_loss_mlp": 1.01811397, + "epoch": 0.6480685405080415, + "flos": 24315085031040.0, + "grad_norm": 1.7966812131937098, + "language_loss": 0.78017247, + "learning_rate": 1.1029769149953922e-06, + "loss": 0.80099285, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3828125, + "step": 10779, + "time_per_iteration": 2.402998208999634 + }, + { + "auxiliary_loss_clip": 0.01007997, + "auxiliary_loss_mlp": 0.01000814, + "balance_loss_clip": 0.99982446, + "balance_loss_mlp": 1.001647, + "epoch": 0.6481286637607094, + "flos": 59888017630080.0, + "grad_norm": 0.72190256139572, + "language_loss": 0.59388125, + "learning_rate": 1.1026392420822684e-06, + "loss": 0.61396933, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.06347656, + "step": 10780, + "time_per_iteration": 3.0596604347229004 + }, + { + "auxiliary_loss_clip": 0.01056194, + "auxiliary_loss_mlp": 0.01025019, + "balance_loss_clip": 1.01299655, + "balance_loss_mlp": 1.01877594, + "epoch": 0.6481887870133775, + "flos": 25482999565440.0, + "grad_norm": 1.9194036089819286, + "language_loss": 0.7934984, + "learning_rate": 1.1023016011936417e-06, + "loss": 0.81431061, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.375, + "step": 10781, + "time_per_iteration": 2.4512617588043213 + }, + { + "auxiliary_loss_clip": 0.01060757, + "auxiliary_loss_mlp": 0.01032104, + "balance_loss_clip": 1.01836491, + "balance_loss_mlp": 1.01988447, + "epoch": 0.6482489102660454, + "flos": 19929400773120.0, + "grad_norm": 1.945505588830395, + "language_loss": 0.78533745, + "learning_rate": 1.1019639923415618e-06, + "loss": 0.80626607, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.40820312, + "step": 10782, + "time_per_iteration": 2.416837692260742 + }, + { + "auxiliary_loss_clip": 0.0105912, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.0146606, + "balance_loss_mlp": 1.01945078, + "epoch": 0.6483090335187134, + "flos": 26978259807360.0, + "grad_norm": 2.046391279900785, + "language_loss": 0.63687229, + "learning_rate": 1.1016264155380768e-06, + "loss": 0.65773755, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39648438, + "step": 10783, + "time_per_iteration": 2.443535327911377 + }, + { + "auxiliary_loss_clip": 0.01061218, + "auxiliary_loss_mlp": 0.01027228, + "balance_loss_clip": 1.01305354, + "balance_loss_mlp": 1.02013159, + "epoch": 0.6483691567713813, + "flos": 25076925008640.0, + "grad_norm": 1.7531947642390202, + "language_loss": 0.88736308, + "learning_rate": 1.1012888707952335e-06, + "loss": 0.90824753, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.41210938, + "step": 10784, + "time_per_iteration": 2.442423105239868 + }, + { + "auxiliary_loss_clip": 0.01057455, + "auxiliary_loss_mlp": 0.01023272, + "balance_loss_clip": 1.01111889, + "balance_loss_mlp": 1.0190742, + "epoch": 0.6484292800240493, + "flos": 16105084266240.0, + "grad_norm": 2.7008203870896086, + "language_loss": 0.73285633, + "learning_rate": 1.1009513581250795e-06, + "loss": 0.75366354, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38476562, + "step": 10785, + "time_per_iteration": 2.3761377334594727 + }, + { + "auxiliary_loss_clip": 0.01056569, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.01482892, + "balance_loss_mlp": 1.01898086, + "epoch": 0.6484894032767172, + "flos": 28839130473600.0, + "grad_norm": 1.4875817149334645, + "language_loss": 0.69240689, + "learning_rate": 1.1006138775396588e-06, + "loss": 0.71323967, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.375, + "step": 10786, + "time_per_iteration": 2.5856356620788574 + }, + { + "auxiliary_loss_clip": 0.01056853, + "auxiliary_loss_mlp": 0.01022432, + "balance_loss_clip": 1.01009369, + "balance_loss_mlp": 1.01716208, + "epoch": 0.6485495265293852, + "flos": 30225740964480.0, + "grad_norm": 2.4240987472957225, + "language_loss": 0.71176147, + "learning_rate": 1.1002764290510151e-06, + "loss": 0.73255432, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39648438, + "step": 10787, + "time_per_iteration": 2.4723284244537354 + }, + { + "auxiliary_loss_clip": 0.01060575, + "auxiliary_loss_mlp": 0.01026816, + "balance_loss_clip": 1.01348794, + "balance_loss_mlp": 1.01910949, + "epoch": 0.6486096497820532, + "flos": 20081202831360.0, + "grad_norm": 2.0858185711733337, + "language_loss": 0.84204221, + "learning_rate": 1.0999390126711907e-06, + "loss": 0.86291611, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 10788, + "time_per_iteration": 2.406313896179199 + }, + { + "auxiliary_loss_clip": 0.01060725, + "auxiliary_loss_mlp": 0.01027354, + "balance_loss_clip": 1.0142585, + "balance_loss_mlp": 1.02080619, + "epoch": 0.6486697730347212, + "flos": 17128109191680.0, + "grad_norm": 1.863389602275105, + "language_loss": 0.77149796, + "learning_rate": 1.0996016284122293e-06, + "loss": 0.79237872, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.3984375, + "step": 10789, + "time_per_iteration": 2.3680622577667236 + }, + { + "auxiliary_loss_clip": 0.01054938, + "auxiliary_loss_mlp": 0.0102163, + "balance_loss_clip": 1.01005483, + "balance_loss_mlp": 1.0180099, + "epoch": 0.6487298962873892, + "flos": 38910351018240.0, + "grad_norm": 1.676564464814078, + "language_loss": 0.75879681, + "learning_rate": 1.0992642762861682e-06, + "loss": 0.77956247, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.36914062, + "step": 10790, + "time_per_iteration": 2.5759963989257812 + }, + { + "auxiliary_loss_clip": 0.01058734, + "auxiliary_loss_mlp": 0.01026301, + "balance_loss_clip": 1.01467824, + "balance_loss_mlp": 1.02009571, + "epoch": 0.6487900195400571, + "flos": 11947033272960.0, + "grad_norm": 2.0221946839722165, + "language_loss": 0.6034897, + "learning_rate": 1.0989269563050487e-06, + "loss": 0.62434006, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 10791, + "time_per_iteration": 2.3730356693267822 + }, + { + "auxiliary_loss_clip": 0.01058948, + "auxiliary_loss_mlp": 0.01026266, + "balance_loss_clip": 1.01407123, + "balance_loss_mlp": 1.01921439, + "epoch": 0.6488501427927251, + "flos": 22343400109440.0, + "grad_norm": 3.2977291642543554, + "language_loss": 0.85992032, + "learning_rate": 1.0985896684809076e-06, + "loss": 0.88077247, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3984375, + "step": 10792, + "time_per_iteration": 2.411586046218872 + }, + { + "auxiliary_loss_clip": 0.01061934, + "auxiliary_loss_mlp": 0.01025834, + "balance_loss_clip": 1.01324558, + "balance_loss_mlp": 1.02083409, + "epoch": 0.648910266045393, + "flos": 22235204206080.0, + "grad_norm": 1.7984059023430259, + "language_loss": 0.77539855, + "learning_rate": 1.0982524128257842e-06, + "loss": 0.79627627, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 10793, + "time_per_iteration": 2.404999017715454 + }, + { + "auxiliary_loss_clip": 0.01060795, + "auxiliary_loss_mlp": 0.01023055, + "balance_loss_clip": 1.01036501, + "balance_loss_mlp": 1.0198667, + "epoch": 0.6489703892980611, + "flos": 25300089619200.0, + "grad_norm": 1.875281444892567, + "language_loss": 0.74468213, + "learning_rate": 1.0979151893517108e-06, + "loss": 0.76552063, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 10794, + "time_per_iteration": 2.4172208309173584 + }, + { + "auxiliary_loss_clip": 0.01058924, + "auxiliary_loss_mlp": 0.01026812, + "balance_loss_clip": 1.01526117, + "balance_loss_mlp": 1.01858306, + "epoch": 0.649030512550729, + "flos": 24570753984000.0, + "grad_norm": 1.9706142106206428, + "language_loss": 0.69612634, + "learning_rate": 1.097577998070725e-06, + "loss": 0.71698368, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40234375, + "step": 10795, + "time_per_iteration": 2.4283218383789062 + }, + { + "auxiliary_loss_clip": 0.01059893, + "auxiliary_loss_mlp": 0.0102619, + "balance_loss_clip": 1.01257038, + "balance_loss_mlp": 1.01996708, + "epoch": 0.649090635803397, + "flos": 26243652556800.0, + "grad_norm": 1.7849212533993468, + "language_loss": 0.5407238, + "learning_rate": 1.0972408389948586e-06, + "loss": 0.56158465, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.3984375, + "step": 10796, + "time_per_iteration": 2.476618528366089 + }, + { + "auxiliary_loss_clip": 0.01059432, + "auxiliary_loss_mlp": 0.01021725, + "balance_loss_clip": 1.0099889, + "balance_loss_mlp": 1.02002573, + "epoch": 0.6491507590560649, + "flos": 24936189851520.0, + "grad_norm": 1.47748376360614, + "language_loss": 0.77718312, + "learning_rate": 1.0969037121361448e-06, + "loss": 0.79799473, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 10797, + "time_per_iteration": 2.4053847789764404 + }, + { + "auxiliary_loss_clip": 0.01056553, + "auxiliary_loss_mlp": 0.01021745, + "balance_loss_clip": 1.01026535, + "balance_loss_mlp": 1.01961207, + "epoch": 0.6492108823087329, + "flos": 19498781664000.0, + "grad_norm": 2.3271573881148244, + "language_loss": 0.70933545, + "learning_rate": 1.0965666175066144e-06, + "loss": 0.73011839, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36914062, + "step": 10798, + "time_per_iteration": 3.79569411277771 + }, + { + "auxiliary_loss_clip": 0.01057188, + "auxiliary_loss_mlp": 0.01024004, + "balance_loss_clip": 1.01245296, + "balance_loss_mlp": 1.01834381, + "epoch": 0.6492710055614008, + "flos": 19718280581760.0, + "grad_norm": 1.7791530998066005, + "language_loss": 0.77015436, + "learning_rate": 1.0962295551182976e-06, + "loss": 0.79096627, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38867188, + "step": 10799, + "time_per_iteration": 2.3676726818084717 + }, + { + "auxiliary_loss_clip": 0.01059465, + "auxiliary_loss_mlp": 0.01024988, + "balance_loss_clip": 1.01204181, + "balance_loss_mlp": 1.01846039, + "epoch": 0.6493311288140688, + "flos": 24315853080960.0, + "grad_norm": 2.094432534978481, + "language_loss": 0.71689415, + "learning_rate": 1.095892524983223e-06, + "loss": 0.73773873, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 10800, + "time_per_iteration": 2.4034547805786133 + }, + { + "auxiliary_loss_clip": 0.01007961, + "auxiliary_loss_mlp": 0.0100237, + "balance_loss_clip": 1.00159538, + "balance_loss_mlp": 1.0013423, + "epoch": 0.6493912520667368, + "flos": 70931465205120.0, + "grad_norm": 0.8076725104934388, + "language_loss": 0.60224122, + "learning_rate": 1.0955555271134182e-06, + "loss": 0.62234455, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.00775146, + "router_z_loss_mlp": 0.06640625, + "step": 10801, + "time_per_iteration": 3.1519765853881836 + }, + { + "auxiliary_loss_clip": 0.01063745, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.01362586, + "balance_loss_mlp": 1.02133811, + "epoch": 0.6494513753194048, + "flos": 25336608768000.0, + "grad_norm": 2.1778201622143594, + "language_loss": 0.78015268, + "learning_rate": 1.0952185615209107e-06, + "loss": 0.80106425, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.42382812, + "step": 10802, + "time_per_iteration": 2.4190165996551514 + }, + { + "auxiliary_loss_clip": 0.01058754, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.0178318, + "balance_loss_mlp": 1.01866686, + "epoch": 0.6495114985720728, + "flos": 24680800189440.0, + "grad_norm": 1.6918513734149712, + "language_loss": 0.75020796, + "learning_rate": 1.0948816282177253e-06, + "loss": 0.77110821, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.40039062, + "step": 10803, + "time_per_iteration": 2.445716142654419 + }, + { + "auxiliary_loss_clip": 0.0105793, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.01644301, + "balance_loss_mlp": 1.01823401, + "epoch": 0.6495716218247407, + "flos": 23650269321600.0, + "grad_norm": 6.4609781069006, + "language_loss": 0.70075256, + "learning_rate": 1.0945447272158863e-06, + "loss": 0.72162271, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39648438, + "step": 10804, + "time_per_iteration": 2.434551954269409 + }, + { + "auxiliary_loss_clip": 0.01059967, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.01544428, + "balance_loss_mlp": 1.02032435, + "epoch": 0.6496317450774087, + "flos": 22345075854720.0, + "grad_norm": 2.021257022832899, + "language_loss": 0.72922873, + "learning_rate": 1.0942078585274162e-06, + "loss": 0.75011271, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.39648438, + "step": 10805, + "time_per_iteration": 2.393517017364502 + }, + { + "auxiliary_loss_clip": 0.01058057, + "auxiliary_loss_mlp": 0.01024991, + "balance_loss_clip": 1.01279628, + "balance_loss_mlp": 1.01769698, + "epoch": 0.6496918683300766, + "flos": 30517335573120.0, + "grad_norm": 2.046618788025742, + "language_loss": 0.65316886, + "learning_rate": 1.0938710221643392e-06, + "loss": 0.67399931, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40429688, + "step": 10806, + "time_per_iteration": 2.470839023590088 + }, + { + "auxiliary_loss_clip": 0.01059946, + "auxiliary_loss_mlp": 0.01023366, + "balance_loss_clip": 1.01048505, + "balance_loss_mlp": 1.01873636, + "epoch": 0.6497519915827447, + "flos": 12458161710720.0, + "grad_norm": 2.396218738245077, + "language_loss": 0.79390103, + "learning_rate": 1.0935342181386729e-06, + "loss": 0.81473416, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41210938, + "step": 10807, + "time_per_iteration": 2.4018001556396484 + }, + { + "auxiliary_loss_clip": 0.01008003, + "auxiliary_loss_mlp": 0.01001909, + "balance_loss_clip": 1.00106883, + "balance_loss_mlp": 1.00155425, + "epoch": 0.6498121148354126, + "flos": 69090075077760.0, + "grad_norm": 0.7892082616403024, + "language_loss": 0.58919418, + "learning_rate": 1.0931974464624394e-06, + "loss": 0.60929328, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.00842285, + "router_z_loss_mlp": 0.06445312, + "step": 10808, + "time_per_iteration": 6.06263279914856 + }, + { + "auxiliary_loss_clip": 0.01057469, + "auxiliary_loss_mlp": 0.0102286, + "balance_loss_clip": 1.01070046, + "balance_loss_mlp": 1.01928353, + "epoch": 0.6498722380880806, + "flos": 36895827991680.0, + "grad_norm": 1.6557075163856299, + "language_loss": 0.6336326, + "learning_rate": 1.0928607071476559e-06, + "loss": 0.65443593, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3828125, + "step": 10809, + "time_per_iteration": 2.5262978076934814 + }, + { + "auxiliary_loss_clip": 0.01057232, + "auxiliary_loss_mlp": 0.01024702, + "balance_loss_clip": 1.01319242, + "balance_loss_mlp": 1.01982605, + "epoch": 0.6499323613407485, + "flos": 29016629159040.0, + "grad_norm": 2.040495088482402, + "language_loss": 0.8155092, + "learning_rate": 1.0925240002063418e-06, + "loss": 0.83632851, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.375, + "step": 10810, + "time_per_iteration": 2.465278387069702 + }, + { + "auxiliary_loss_clip": 0.01057172, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.01579094, + "balance_loss_mlp": 1.01871133, + "epoch": 0.6499924845934165, + "flos": 20118245650560.0, + "grad_norm": 1.6000995150405755, + "language_loss": 0.71657073, + "learning_rate": 1.09218732565051e-06, + "loss": 0.73741412, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38476562, + "step": 10811, + "time_per_iteration": 2.4172542095184326 + }, + { + "auxiliary_loss_clip": 0.01057762, + "auxiliary_loss_mlp": 0.01023919, + "balance_loss_clip": 1.01240969, + "balance_loss_mlp": 1.02090359, + "epoch": 0.6500526078460844, + "flos": 24420313468800.0, + "grad_norm": 1.8997363997139003, + "language_loss": 0.77932632, + "learning_rate": 1.0918506834921787e-06, + "loss": 0.80014312, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36914062, + "step": 10812, + "time_per_iteration": 2.522979974746704 + }, + { + "auxiliary_loss_clip": 0.01058692, + "auxiliary_loss_mlp": 0.01023856, + "balance_loss_clip": 1.01146972, + "balance_loss_mlp": 1.01878023, + "epoch": 0.6501127310987524, + "flos": 23329905886080.0, + "grad_norm": 1.8019524408074237, + "language_loss": 0.8525672, + "learning_rate": 1.0915140737433607e-06, + "loss": 0.87339264, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 10813, + "time_per_iteration": 2.401533365249634 + }, + { + "auxiliary_loss_clip": 0.01059105, + "auxiliary_loss_mlp": 0.01026435, + "balance_loss_clip": 1.01444817, + "balance_loss_mlp": 1.01994038, + "epoch": 0.6501728543514204, + "flos": 18696826667520.0, + "grad_norm": 1.5855080775705488, + "language_loss": 0.80200469, + "learning_rate": 1.0911774964160674e-06, + "loss": 0.82286012, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 10814, + "time_per_iteration": 2.417670488357544 + }, + { + "auxiliary_loss_clip": 0.0106087, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.0145812, + "balance_loss_mlp": 1.01995051, + "epoch": 0.6502329776040884, + "flos": 44198191474560.0, + "grad_norm": 2.1068649994302286, + "language_loss": 0.65531623, + "learning_rate": 1.090840951522312e-06, + "loss": 0.67620039, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 10815, + "time_per_iteration": 2.5953209400177 + }, + { + "auxiliary_loss_clip": 0.01060072, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.0144459, + "balance_loss_mlp": 1.01954603, + "epoch": 0.6502931008567564, + "flos": 14573863457280.0, + "grad_norm": 2.0518160260476352, + "language_loss": 0.75356382, + "learning_rate": 1.0905044390741043e-06, + "loss": 0.77445471, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.40625, + "step": 10816, + "time_per_iteration": 3.782099723815918 + }, + { + "auxiliary_loss_clip": 0.01057071, + "auxiliary_loss_mlp": 0.01023611, + "balance_loss_clip": 1.01179194, + "balance_loss_mlp": 1.0187068, + "epoch": 0.6503532241094243, + "flos": 21394006974720.0, + "grad_norm": 1.7535607593960325, + "language_loss": 0.73288345, + "learning_rate": 1.090167959083454e-06, + "loss": 0.7536903, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 10817, + "time_per_iteration": 2.4371542930603027 + }, + { + "auxiliary_loss_clip": 0.0105714, + "auxiliary_loss_mlp": 0.01024026, + "balance_loss_clip": 1.0115273, + "balance_loss_mlp": 1.01815307, + "epoch": 0.6504133473620923, + "flos": 74738256209280.0, + "grad_norm": 1.3608176620786636, + "language_loss": 0.71897531, + "learning_rate": 1.0898315115623678e-06, + "loss": 0.73978698, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 10818, + "time_per_iteration": 2.828923463821411 + }, + { + "auxiliary_loss_clip": 0.01061475, + "auxiliary_loss_mlp": 0.01023984, + "balance_loss_clip": 1.01122832, + "balance_loss_mlp": 1.02035975, + "epoch": 0.6504734706147602, + "flos": 19712415473280.0, + "grad_norm": 1.7790175317189596, + "language_loss": 0.64604342, + "learning_rate": 1.0894950965228547e-06, + "loss": 0.66689801, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 10819, + "time_per_iteration": 2.387799024581909 + }, + { + "auxiliary_loss_clip": 0.01060396, + "auxiliary_loss_mlp": 0.01026317, + "balance_loss_clip": 1.01335311, + "balance_loss_mlp": 1.01992464, + "epoch": 0.6505335938674283, + "flos": 25555688749440.0, + "grad_norm": 1.9213560149469828, + "language_loss": 0.83762175, + "learning_rate": 1.0891587139769195e-06, + "loss": 0.85848886, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40429688, + "step": 10820, + "time_per_iteration": 2.444873332977295 + }, + { + "auxiliary_loss_clip": 0.01062226, + "auxiliary_loss_mlp": 0.01023812, + "balance_loss_clip": 1.01076508, + "balance_loss_mlp": 1.0201323, + "epoch": 0.6505937171200962, + "flos": 17820471830400.0, + "grad_norm": 3.506995455443073, + "language_loss": 0.77641964, + "learning_rate": 1.0888223639365666e-06, + "loss": 0.79727995, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.421875, + "step": 10821, + "time_per_iteration": 2.4256794452667236 + }, + { + "auxiliary_loss_clip": 0.01056247, + "auxiliary_loss_mlp": 0.01021473, + "balance_loss_clip": 1.01072669, + "balance_loss_mlp": 1.01900387, + "epoch": 0.6506538403727642, + "flos": 20667080223360.0, + "grad_norm": 1.4610193041935169, + "language_loss": 0.7999329, + "learning_rate": 1.0884860464137991e-06, + "loss": 0.82071006, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37109375, + "step": 10822, + "time_per_iteration": 2.451772928237915 + }, + { + "auxiliary_loss_clip": 0.01057913, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.01440942, + "balance_loss_mlp": 1.01861286, + "epoch": 0.6507139636254321, + "flos": 11720831374080.0, + "grad_norm": 2.5757292305682244, + "language_loss": 0.87072295, + "learning_rate": 1.0881497614206215e-06, + "loss": 0.89156914, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39257812, + "step": 10823, + "time_per_iteration": 2.3958284854888916 + }, + { + "auxiliary_loss_clip": 0.01059314, + "auxiliary_loss_mlp": 0.01023782, + "balance_loss_clip": 1.01167035, + "balance_loss_mlp": 1.01959848, + "epoch": 0.6507740868781001, + "flos": 26760506457600.0, + "grad_norm": 1.9837853195836797, + "language_loss": 0.66643232, + "learning_rate": 1.0878135089690316e-06, + "loss": 0.68726325, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 10824, + "time_per_iteration": 2.4602396488189697 + }, + { + "auxiliary_loss_clip": 0.01060027, + "auxiliary_loss_mlp": 0.01026031, + "balance_loss_clip": 1.01385355, + "balance_loss_mlp": 1.01899588, + "epoch": 0.650834210130768, + "flos": 16470799424640.0, + "grad_norm": 2.473069591713258, + "language_loss": 0.87458366, + "learning_rate": 1.0874772890710322e-06, + "loss": 0.89544421, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41015625, + "step": 10825, + "time_per_iteration": 2.4010820388793945 + }, + { + "auxiliary_loss_clip": 0.0106031, + "auxiliary_loss_mlp": 0.01022365, + "balance_loss_clip": 1.00792241, + "balance_loss_mlp": 1.01906002, + "epoch": 0.650894333383436, + "flos": 17127725166720.0, + "grad_norm": 2.1131823299999373, + "language_loss": 0.73069763, + "learning_rate": 1.087141101738621e-06, + "loss": 0.75152445, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.41210938, + "step": 10826, + "time_per_iteration": 2.3859803676605225 + }, + { + "auxiliary_loss_clip": 0.01057014, + "auxiliary_loss_mlp": 0.01022185, + "balance_loss_clip": 1.01018071, + "balance_loss_mlp": 1.0192095, + "epoch": 0.650954456636104, + "flos": 18733241082240.0, + "grad_norm": 2.4599249710577618, + "language_loss": 0.69182414, + "learning_rate": 1.0868049469837956e-06, + "loss": 0.71261615, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37890625, + "step": 10827, + "time_per_iteration": 2.3725028038024902 + }, + { + "auxiliary_loss_clip": 0.01057691, + "auxiliary_loss_mlp": 0.01024467, + "balance_loss_clip": 1.01288533, + "balance_loss_mlp": 1.01842511, + "epoch": 0.651014579888772, + "flos": 24527287474560.0, + "grad_norm": 3.4948538250817824, + "language_loss": 0.76997292, + "learning_rate": 1.0864688248185526e-06, + "loss": 0.79079449, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 10828, + "time_per_iteration": 2.4810562133789062 + }, + { + "auxiliary_loss_clip": 0.01058481, + "auxiliary_loss_mlp": 0.01022297, + "balance_loss_clip": 1.01059628, + "balance_loss_mlp": 1.01965189, + "epoch": 0.65107470314144, + "flos": 24059939748480.0, + "grad_norm": 1.8162276381233096, + "language_loss": 0.89294457, + "learning_rate": 1.0861327352548865e-06, + "loss": 0.91375238, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38867188, + "step": 10829, + "time_per_iteration": 2.4252307415008545 + }, + { + "auxiliary_loss_clip": 0.01007754, + "auxiliary_loss_mlp": 0.0100132, + "balance_loss_clip": 1.00039613, + "balance_loss_mlp": 1.00125599, + "epoch": 0.6511348263941079, + "flos": 72477139317120.0, + "grad_norm": 0.6367540119542259, + "language_loss": 0.5533638, + "learning_rate": 1.0857966783047943e-06, + "loss": 0.5734545, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06445312, + "step": 10830, + "time_per_iteration": 3.1916120052337646 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.01557016, + "balance_loss_mlp": 1.01923895, + "epoch": 0.6511949496467759, + "flos": 23366564680320.0, + "grad_norm": 1.8707026872221164, + "language_loss": 0.74875879, + "learning_rate": 1.085460653980265e-06, + "loss": 0.76964581, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40625, + "step": 10831, + "time_per_iteration": 2.4537649154663086 + }, + { + "auxiliary_loss_clip": 0.01007741, + "auxiliary_loss_mlp": 0.01000646, + "balance_loss_clip": 0.99982393, + "balance_loss_mlp": 1.00123298, + "epoch": 0.6512550728994438, + "flos": 67329824549760.0, + "grad_norm": 0.6461081182502488, + "language_loss": 0.51052743, + "learning_rate": 1.0851246622932935e-06, + "loss": 0.53061134, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.00823975, + "router_z_loss_mlp": 0.06494141, + "step": 10832, + "time_per_iteration": 3.1772005558013916 + }, + { + "auxiliary_loss_clip": 0.01060166, + "auxiliary_loss_mlp": 0.01025147, + "balance_loss_clip": 1.01215935, + "balance_loss_mlp": 1.01920581, + "epoch": 0.6513151961521119, + "flos": 21140642171520.0, + "grad_norm": 1.9925538330519939, + "language_loss": 0.83045912, + "learning_rate": 1.0847887032558696e-06, + "loss": 0.85131216, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 10833, + "time_per_iteration": 2.3915698528289795 + }, + { + "auxiliary_loss_clip": 0.01060307, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.0148201, + "balance_loss_mlp": 1.02001917, + "epoch": 0.6513753194047798, + "flos": 15157925458560.0, + "grad_norm": 2.052617125755609, + "language_loss": 0.83695698, + "learning_rate": 1.0844527768799825e-06, + "loss": 0.85783315, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 10834, + "time_per_iteration": 2.4085702896118164 + }, + { + "auxiliary_loss_clip": 0.01059687, + "auxiliary_loss_mlp": 0.01024302, + "balance_loss_clip": 1.01251197, + "balance_loss_mlp": 1.01931441, + "epoch": 0.6514354426574478, + "flos": 30225322028160.0, + "grad_norm": 1.4831602949997633, + "language_loss": 0.77192825, + "learning_rate": 1.08411688317762e-06, + "loss": 0.79276818, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40429688, + "step": 10835, + "time_per_iteration": 2.4637410640716553 + }, + { + "auxiliary_loss_clip": 0.01061483, + "auxiliary_loss_mlp": 0.01023218, + "balance_loss_clip": 1.01051617, + "balance_loss_mlp": 1.02145493, + "epoch": 0.6514955659101157, + "flos": 24204480243840.0, + "grad_norm": 1.328369677228315, + "language_loss": 0.77279484, + "learning_rate": 1.0837810221607705e-06, + "loss": 0.79364187, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 10836, + "time_per_iteration": 2.433237075805664 + }, + { + "auxiliary_loss_clip": 0.01058717, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.0154382, + "balance_loss_mlp": 1.01927352, + "epoch": 0.6515556891627837, + "flos": 12377163623040.0, + "grad_norm": 2.189600445478586, + "language_loss": 0.82340586, + "learning_rate": 1.0834451938414199e-06, + "loss": 0.84427023, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 10837, + "time_per_iteration": 3.8115594387054443 + }, + { + "auxiliary_loss_clip": 0.01057541, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.01639891, + "balance_loss_mlp": 1.01985121, + "epoch": 0.6516158124154516, + "flos": 49599359804160.0, + "grad_norm": 1.8520506206436131, + "language_loss": 0.58615083, + "learning_rate": 1.0831093982315526e-06, + "loss": 0.60701996, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.37695312, + "step": 10838, + "time_per_iteration": 2.6508569717407227 + }, + { + "auxiliary_loss_clip": 0.01007173, + "auxiliary_loss_mlp": 0.01000729, + "balance_loss_clip": 0.99977523, + "balance_loss_mlp": 1.00063729, + "epoch": 0.6516759356681197, + "flos": 59699731334400.0, + "grad_norm": 0.7253394179750844, + "language_loss": 0.60911345, + "learning_rate": 1.0827736353431517e-06, + "loss": 0.62919247, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.06542969, + "step": 10839, + "time_per_iteration": 3.11209774017334 + }, + { + "auxiliary_loss_clip": 0.01058621, + "auxiliary_loss_mlp": 0.01021457, + "balance_loss_clip": 1.01012588, + "balance_loss_mlp": 1.01992536, + "epoch": 0.6517360589207876, + "flos": 37449305775360.0, + "grad_norm": 1.8668022672269662, + "language_loss": 0.67381096, + "learning_rate": 1.0824379051882016e-06, + "loss": 0.69461167, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38671875, + "step": 10840, + "time_per_iteration": 2.5356640815734863 + }, + { + "auxiliary_loss_clip": 0.01057973, + "auxiliary_loss_mlp": 0.01023501, + "balance_loss_clip": 1.01216459, + "balance_loss_mlp": 1.0187645, + "epoch": 0.6517961821734556, + "flos": 25373721409920.0, + "grad_norm": 3.367843701345369, + "language_loss": 0.73754299, + "learning_rate": 1.082102207778681e-06, + "loss": 0.75835776, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39257812, + "step": 10841, + "time_per_iteration": 2.4564714431762695 + }, + { + "auxiliary_loss_clip": 0.01059098, + "auxiliary_loss_mlp": 0.01023429, + "balance_loss_clip": 1.01155579, + "balance_loss_mlp": 1.01936162, + "epoch": 0.6518563054261236, + "flos": 28765743062400.0, + "grad_norm": 1.6533636907597342, + "language_loss": 0.62787044, + "learning_rate": 1.0817665431265722e-06, + "loss": 0.64869571, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 10842, + "time_per_iteration": 2.461843490600586 + }, + { + "auxiliary_loss_clip": 0.01007324, + "auxiliary_loss_mlp": 0.01001095, + "balance_loss_clip": 1.00018907, + "balance_loss_mlp": 1.00087428, + "epoch": 0.6519164286787915, + "flos": 68921725034880.0, + "grad_norm": 0.8091699732654762, + "language_loss": 0.56125188, + "learning_rate": 1.0814309112438544e-06, + "loss": 0.58133614, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06445312, + "step": 10843, + "time_per_iteration": 2.9060652256011963 + }, + { + "auxiliary_loss_clip": 0.01059549, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01439524, + "balance_loss_mlp": 1.01836514, + "epoch": 0.6519765519314595, + "flos": 20441087792640.0, + "grad_norm": 1.806177238922764, + "language_loss": 0.82581139, + "learning_rate": 1.0810953121425028e-06, + "loss": 0.84668481, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41210938, + "step": 10844, + "time_per_iteration": 2.45100736618042 + }, + { + "auxiliary_loss_clip": 0.01058643, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.01407075, + "balance_loss_mlp": 1.01908231, + "epoch": 0.6520366751841274, + "flos": 28401703649280.0, + "grad_norm": 2.38383645000062, + "language_loss": 0.601013, + "learning_rate": 1.0807597458344967e-06, + "loss": 0.62186587, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39453125, + "step": 10845, + "time_per_iteration": 2.465003490447998 + }, + { + "auxiliary_loss_clip": 0.01058142, + "auxiliary_loss_mlp": 0.01023924, + "balance_loss_clip": 1.01134109, + "balance_loss_mlp": 1.01951694, + "epoch": 0.6520967984367955, + "flos": 22272316848000.0, + "grad_norm": 1.6712908688126096, + "language_loss": 0.79193485, + "learning_rate": 1.0804242123318101e-06, + "loss": 0.81275553, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.38671875, + "step": 10846, + "time_per_iteration": 2.488028049468994 + }, + { + "auxiliary_loss_clip": 0.01007384, + "auxiliary_loss_mlp": 0.01001885, + "balance_loss_clip": 1.00097859, + "balance_loss_mlp": 1.00093973, + "epoch": 0.6521569216894634, + "flos": 68913309530880.0, + "grad_norm": 0.7082630623055411, + "language_loss": 0.60238129, + "learning_rate": 1.0800887116464194e-06, + "loss": 0.62247396, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06445312, + "step": 10847, + "time_per_iteration": 3.2238411903381348 + }, + { + "auxiliary_loss_clip": 0.01061391, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.02160811, + "balance_loss_mlp": 1.02020633, + "epoch": 0.6522170449421314, + "flos": 29129293716480.0, + "grad_norm": 1.6597914912630993, + "language_loss": 0.80926013, + "learning_rate": 1.0797532437902946e-06, + "loss": 0.8302182, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41210938, + "step": 10848, + "time_per_iteration": 3.8933751583099365 + }, + { + "auxiliary_loss_clip": 0.01057544, + "auxiliary_loss_mlp": 0.01025927, + "balance_loss_clip": 1.01402378, + "balance_loss_mlp": 1.0185039, + "epoch": 0.6522771681947993, + "flos": 26650704631680.0, + "grad_norm": 1.978428153404655, + "language_loss": 0.75001639, + "learning_rate": 1.0794178087754102e-06, + "loss": 0.77085108, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 10849, + "time_per_iteration": 2.4333343505859375 + }, + { + "auxiliary_loss_clip": 0.01057105, + "auxiliary_loss_mlp": 0.01023519, + "balance_loss_clip": 1.01157463, + "balance_loss_mlp": 1.0187676, + "epoch": 0.6523372914474673, + "flos": 25738563784320.0, + "grad_norm": 1.453810262743245, + "language_loss": 0.68017399, + "learning_rate": 1.079082406613736e-06, + "loss": 0.70098025, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 10850, + "time_per_iteration": 2.421346426010132 + }, + { + "auxiliary_loss_clip": 0.01056499, + "auxiliary_loss_mlp": 0.01021633, + "balance_loss_clip": 1.01001632, + "balance_loss_mlp": 1.01905215, + "epoch": 0.6523974147001352, + "flos": 24826178557440.0, + "grad_norm": 1.8510447388569582, + "language_loss": 0.66536987, + "learning_rate": 1.078747037317242e-06, + "loss": 0.68615121, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 10851, + "time_per_iteration": 2.422250747680664 + }, + { + "auxiliary_loss_clip": 0.01059655, + "auxiliary_loss_mlp": 0.01020892, + "balance_loss_clip": 1.00935841, + "balance_loss_mlp": 1.01894724, + "epoch": 0.6524575379528033, + "flos": 26316586120320.0, + "grad_norm": 2.2623910546242043, + "language_loss": 0.7399503, + "learning_rate": 1.0784117008978958e-06, + "loss": 0.76075578, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40625, + "step": 10852, + "time_per_iteration": 2.429995059967041 + }, + { + "auxiliary_loss_clip": 0.01060746, + "auxiliary_loss_mlp": 0.01023862, + "balance_loss_clip": 1.01054621, + "balance_loss_mlp": 1.01980639, + "epoch": 0.6525176612054712, + "flos": 19493300580480.0, + "grad_norm": 1.8127193671773902, + "language_loss": 0.66624337, + "learning_rate": 1.078076397367666e-06, + "loss": 0.68708944, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41015625, + "step": 10853, + "time_per_iteration": 2.3745832443237305 + }, + { + "auxiliary_loss_clip": 0.01006889, + "auxiliary_loss_mlp": 0.01003143, + "balance_loss_clip": 1.00223684, + "balance_loss_mlp": 1.00049841, + "epoch": 0.6525777844581392, + "flos": 71703534211200.0, + "grad_norm": 0.7328365793560861, + "language_loss": 0.63015974, + "learning_rate": 1.0777411267385183e-06, + "loss": 0.65026003, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06396484, + "step": 10854, + "time_per_iteration": 3.1656787395477295 + }, + { + "auxiliary_loss_clip": 0.01059725, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.01542878, + "balance_loss_mlp": 1.01934934, + "epoch": 0.6526379077108072, + "flos": 26651856706560.0, + "grad_norm": 1.701549405188786, + "language_loss": 0.77345073, + "learning_rate": 1.0774058890224175e-06, + "loss": 0.79433346, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40429688, + "step": 10855, + "time_per_iteration": 3.9499459266662598 + }, + { + "auxiliary_loss_clip": 0.01055126, + "auxiliary_loss_mlp": 0.01018715, + "balance_loss_clip": 1.00719953, + "balance_loss_mlp": 1.01710391, + "epoch": 0.6526980309634751, + "flos": 22819266207360.0, + "grad_norm": 1.8242153942903927, + "language_loss": 0.79060626, + "learning_rate": 1.0770706842313262e-06, + "loss": 0.81134468, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 10856, + "time_per_iteration": 2.423534870147705 + }, + { + "auxiliary_loss_clip": 0.01057399, + "auxiliary_loss_mlp": 0.01024111, + "balance_loss_clip": 1.01113498, + "balance_loss_mlp": 1.01856709, + "epoch": 0.6527581542161431, + "flos": 28363822957440.0, + "grad_norm": 3.025843627578128, + "language_loss": 0.73615754, + "learning_rate": 1.07673551237721e-06, + "loss": 0.75697261, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.38867188, + "step": 10857, + "time_per_iteration": 2.4779293537139893 + }, + { + "auxiliary_loss_clip": 0.01058706, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.01688313, + "balance_loss_mlp": 1.0194962, + "epoch": 0.652818277468811, + "flos": 18368224151040.0, + "grad_norm": 2.292718158748062, + "language_loss": 0.62678373, + "learning_rate": 1.0764003734720275e-06, + "loss": 0.64765424, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.390625, + "step": 10858, + "time_per_iteration": 2.345427989959717 + }, + { + "auxiliary_loss_clip": 0.01056251, + "auxiliary_loss_mlp": 0.01028136, + "balance_loss_clip": 1.0166446, + "balance_loss_mlp": 1.0184443, + "epoch": 0.6528784007214791, + "flos": 18035327537280.0, + "grad_norm": 1.6837601121168504, + "language_loss": 0.78479975, + "learning_rate": 1.0760652675277393e-06, + "loss": 0.80564368, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 10859, + "time_per_iteration": 2.3870327472686768 + }, + { + "auxiliary_loss_clip": 0.01060862, + "auxiliary_loss_mlp": 0.01023181, + "balance_loss_clip": 1.01041389, + "balance_loss_mlp": 1.01953983, + "epoch": 0.652938523974147, + "flos": 22380931687680.0, + "grad_norm": 2.599982869870114, + "language_loss": 0.67834413, + "learning_rate": 1.0757301945563064e-06, + "loss": 0.6991846, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 10860, + "time_per_iteration": 2.418459177017212 + }, + { + "auxiliary_loss_clip": 0.01060374, + "auxiliary_loss_mlp": 0.01023969, + "balance_loss_clip": 1.0114994, + "balance_loss_mlp": 1.01885009, + "epoch": 0.652998647226815, + "flos": 16763092260480.0, + "grad_norm": 2.0194063595103717, + "language_loss": 0.66695642, + "learning_rate": 1.075395154569684e-06, + "loss": 0.68779993, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.4140625, + "step": 10861, + "time_per_iteration": 2.3988428115844727 + }, + { + "auxiliary_loss_clip": 0.01060503, + "auxiliary_loss_mlp": 0.01023705, + "balance_loss_clip": 1.01121211, + "balance_loss_mlp": 1.0205518, + "epoch": 0.6530587704794829, + "flos": 35771065764480.0, + "grad_norm": 1.8793455292816075, + "language_loss": 0.64962035, + "learning_rate": 1.0750601475798307e-06, + "loss": 0.67046249, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3984375, + "step": 10862, + "time_per_iteration": 2.510821580886841 + }, + { + "auxiliary_loss_clip": 0.01057022, + "auxiliary_loss_mlp": 0.01023965, + "balance_loss_clip": 1.01259875, + "balance_loss_mlp": 1.01807499, + "epoch": 0.6531188937321509, + "flos": 19315173490560.0, + "grad_norm": 1.5543017529551204, + "language_loss": 0.77301931, + "learning_rate": 1.0747251735987009e-06, + "loss": 0.7938292, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.390625, + "step": 10863, + "time_per_iteration": 2.3981056213378906 + }, + { + "auxiliary_loss_clip": 0.01057637, + "auxiliary_loss_mlp": 0.01023092, + "balance_loss_clip": 1.01176667, + "balance_loss_mlp": 1.01884878, + "epoch": 0.6531790169848188, + "flos": 22892653618560.0, + "grad_norm": 1.6812550739920165, + "language_loss": 0.74766421, + "learning_rate": 1.074390232638251e-06, + "loss": 0.76847154, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38867188, + "step": 10864, + "time_per_iteration": 2.439741849899292 + }, + { + "auxiliary_loss_clip": 0.0105731, + "auxiliary_loss_mlp": 0.01024926, + "balance_loss_clip": 1.01403666, + "balance_loss_mlp": 1.01920986, + "epoch": 0.6532391402374869, + "flos": 29562426443520.0, + "grad_norm": 1.7083489512943086, + "language_loss": 0.8615641, + "learning_rate": 1.0740553247104315e-06, + "loss": 0.88238645, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.38085938, + "step": 10865, + "time_per_iteration": 2.467099666595459 + }, + { + "auxiliary_loss_clip": 0.01062459, + "auxiliary_loss_mlp": 0.01026851, + "balance_loss_clip": 1.01435781, + "balance_loss_mlp": 1.02069831, + "epoch": 0.6532992634901548, + "flos": 23104541859840.0, + "grad_norm": 1.8291336216886538, + "language_loss": 0.81330729, + "learning_rate": 1.0737204498271958e-06, + "loss": 0.83420038, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41796875, + "step": 10866, + "time_per_iteration": 2.4122583866119385 + }, + { + "auxiliary_loss_clip": 0.01056344, + "auxiliary_loss_mlp": 0.01022601, + "balance_loss_clip": 1.01145458, + "balance_loss_mlp": 1.01900232, + "epoch": 0.6533593867428228, + "flos": 26066153871360.0, + "grad_norm": 1.4454887599013782, + "language_loss": 0.82206666, + "learning_rate": 1.0733856080004952e-06, + "loss": 0.84285611, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37304688, + "step": 10867, + "time_per_iteration": 2.5612285137176514 + }, + { + "auxiliary_loss_clip": 0.01057968, + "auxiliary_loss_mlp": 0.01025663, + "balance_loss_clip": 1.01340806, + "balance_loss_mlp": 1.01823056, + "epoch": 0.6534195099954908, + "flos": 21211481053440.0, + "grad_norm": 1.7555137119190583, + "language_loss": 0.81091738, + "learning_rate": 1.0730507992422784e-06, + "loss": 0.83175367, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3984375, + "step": 10868, + "time_per_iteration": 2.4489614963531494 + }, + { + "auxiliary_loss_clip": 0.01060907, + "auxiliary_loss_mlp": 0.01025712, + "balance_loss_clip": 1.01262283, + "balance_loss_mlp": 1.01989472, + "epoch": 0.6534796332481587, + "flos": 19645556486400.0, + "grad_norm": 2.2046191535343436, + "language_loss": 0.8029902, + "learning_rate": 1.0727160235644932e-06, + "loss": 0.82385635, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 10869, + "time_per_iteration": 2.404510498046875 + }, + { + "auxiliary_loss_clip": 0.01061047, + "auxiliary_loss_mlp": 0.0102594, + "balance_loss_clip": 1.01363754, + "balance_loss_mlp": 1.02095366, + "epoch": 0.6535397565008267, + "flos": 24021395740800.0, + "grad_norm": 2.586316979451736, + "language_loss": 0.72091055, + "learning_rate": 1.0723812809790898e-06, + "loss": 0.7417804, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 10870, + "time_per_iteration": 2.429699659347534 + }, + { + "auxiliary_loss_clip": 0.01058545, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.02119029, + "balance_loss_mlp": 1.01969707, + "epoch": 0.6535998797534947, + "flos": 24601757137920.0, + "grad_norm": 2.268564431921739, + "language_loss": 0.74508917, + "learning_rate": 1.0720465714980106e-06, + "loss": 0.76600999, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38867188, + "step": 10871, + "time_per_iteration": 2.438281297683716 + }, + { + "auxiliary_loss_clip": 0.01055642, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_clip": 1.01335537, + "balance_loss_mlp": 1.01893151, + "epoch": 0.6536600030061627, + "flos": 23363143367040.0, + "grad_norm": 1.5439505980410755, + "language_loss": 0.55692428, + "learning_rate": 1.0717118951332032e-06, + "loss": 0.57773197, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3671875, + "step": 10872, + "time_per_iteration": 2.4171202182769775 + }, + { + "auxiliary_loss_clip": 0.01057131, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.01557982, + "balance_loss_mlp": 1.01900661, + "epoch": 0.6537201262588306, + "flos": 23877344004480.0, + "grad_norm": 1.6436138870765644, + "language_loss": 0.74300152, + "learning_rate": 1.0713772518966102e-06, + "loss": 0.76384753, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38085938, + "step": 10873, + "time_per_iteration": 2.404817581176758 + }, + { + "auxiliary_loss_clip": 0.01059241, + "auxiliary_loss_mlp": 0.01028874, + "balance_loss_clip": 1.01647592, + "balance_loss_mlp": 1.0197475, + "epoch": 0.6537802495114986, + "flos": 24353559216000.0, + "grad_norm": 1.700131765527546, + "language_loss": 0.75292075, + "learning_rate": 1.0710426418001746e-06, + "loss": 0.77380192, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 10874, + "time_per_iteration": 2.4504101276397705 + }, + { + "auxiliary_loss_clip": 0.01058846, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01400971, + "balance_loss_mlp": 1.02019608, + "epoch": 0.6538403727641665, + "flos": 27995768737920.0, + "grad_norm": 1.7332866043836717, + "language_loss": 0.81481314, + "learning_rate": 1.0707080648558374e-06, + "loss": 0.83566606, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38671875, + "step": 10875, + "time_per_iteration": 2.4325015544891357 + }, + { + "auxiliary_loss_clip": 0.01057154, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.02199483, + "balance_loss_mlp": 1.01900816, + "epoch": 0.6539004960168345, + "flos": 27562356720000.0, + "grad_norm": 1.6390980552146706, + "language_loss": 0.69937944, + "learning_rate": 1.0703735210755383e-06, + "loss": 0.72028697, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 10876, + "time_per_iteration": 2.4403188228607178 + }, + { + "auxiliary_loss_clip": 0.0105791, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01938212, + "balance_loss_mlp": 1.01921463, + "epoch": 0.6539606192695024, + "flos": 14529419429760.0, + "grad_norm": 2.7556305231644442, + "language_loss": 0.79236233, + "learning_rate": 1.0700390104712184e-06, + "loss": 0.81325519, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 10877, + "time_per_iteration": 3.838050365447998 + }, + { + "auxiliary_loss_clip": 0.01058383, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.01264763, + "balance_loss_mlp": 1.01939869, + "epoch": 0.6540207425221705, + "flos": 21615286371840.0, + "grad_norm": 2.937319586796875, + "language_loss": 0.80071485, + "learning_rate": 1.0697045330548127e-06, + "loss": 0.82153904, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 10878, + "time_per_iteration": 2.403205156326294 + }, + { + "auxiliary_loss_clip": 0.01057553, + "auxiliary_loss_mlp": 0.01022522, + "balance_loss_clip": 1.00933754, + "balance_loss_mlp": 1.01892769, + "epoch": 0.6540808657748384, + "flos": 17668215924480.0, + "grad_norm": 1.8521017011674692, + "language_loss": 0.84146672, + "learning_rate": 1.06937008883826e-06, + "loss": 0.86226749, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.38671875, + "step": 10879, + "time_per_iteration": 2.3930230140686035 + }, + { + "auxiliary_loss_clip": 0.01059516, + "auxiliary_loss_mlp": 0.01025827, + "balance_loss_clip": 1.01331592, + "balance_loss_mlp": 1.01869082, + "epoch": 0.6541409890275064, + "flos": 14537414131200.0, + "grad_norm": 2.7979095315507445, + "language_loss": 0.72130835, + "learning_rate": 1.069035677833494e-06, + "loss": 0.74216175, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 10880, + "time_per_iteration": 2.350275993347168 + }, + { + "auxiliary_loss_clip": 0.01056672, + "auxiliary_loss_mlp": 0.01025116, + "balance_loss_clip": 1.01333785, + "balance_loss_mlp": 1.01932263, + "epoch": 0.6542011122801744, + "flos": 17164349049600.0, + "grad_norm": 1.9214576862289319, + "language_loss": 0.84094512, + "learning_rate": 1.0687013000524513e-06, + "loss": 0.861763, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.37304688, + "step": 10881, + "time_per_iteration": 2.4385287761688232 + }, + { + "auxiliary_loss_clip": 0.01058915, + "auxiliary_loss_mlp": 0.01024089, + "balance_loss_clip": 1.01182795, + "balance_loss_mlp": 1.01891279, + "epoch": 0.6542612355328423, + "flos": 18185628407040.0, + "grad_norm": 1.9787091623137727, + "language_loss": 0.78014416, + "learning_rate": 1.0683669555070624e-06, + "loss": 0.80097419, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40039062, + "step": 10882, + "time_per_iteration": 2.3813652992248535 + }, + { + "auxiliary_loss_clip": 0.01061951, + "auxiliary_loss_mlp": 0.01026007, + "balance_loss_clip": 1.01325798, + "balance_loss_mlp": 1.02112603, + "epoch": 0.6543213587855103, + "flos": 19791423613440.0, + "grad_norm": 2.340792974238044, + "language_loss": 0.8236109, + "learning_rate": 1.068032644209261e-06, + "loss": 0.84449053, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 10883, + "time_per_iteration": 2.407172203063965 + }, + { + "auxiliary_loss_clip": 0.01059959, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.01562226, + "balance_loss_mlp": 1.01984692, + "epoch": 0.6543814820381783, + "flos": 21104053200000.0, + "grad_norm": 3.0145302898835253, + "language_loss": 0.8484512, + "learning_rate": 1.0676983661709774e-06, + "loss": 0.86933225, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40039062, + "step": 10884, + "time_per_iteration": 2.4175610542297363 + }, + { + "auxiliary_loss_clip": 0.01059097, + "auxiliary_loss_mlp": 0.01025017, + "balance_loss_clip": 1.01235723, + "balance_loss_mlp": 1.02029359, + "epoch": 0.6544416052908463, + "flos": 20192994604800.0, + "grad_norm": 2.29807429277068, + "language_loss": 0.79830003, + "learning_rate": 1.067364121404141e-06, + "loss": 0.81914127, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38867188, + "step": 10885, + "time_per_iteration": 2.4269251823425293 + }, + { + "auxiliary_loss_clip": 0.01057136, + "auxiliary_loss_mlp": 0.01020703, + "balance_loss_clip": 1.00915718, + "balance_loss_mlp": 1.01939511, + "epoch": 0.6545017285435142, + "flos": 23367123262080.0, + "grad_norm": 1.8837328637555038, + "language_loss": 0.72339261, + "learning_rate": 1.067029909920679e-06, + "loss": 0.74417102, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 10886, + "time_per_iteration": 2.4089465141296387 + }, + { + "auxiliary_loss_clip": 0.01056561, + "auxiliary_loss_mlp": 0.01020278, + "balance_loss_clip": 1.00940657, + "balance_loss_mlp": 1.01844859, + "epoch": 0.6545618517961822, + "flos": 19133729821440.0, + "grad_norm": 1.7220781573138189, + "language_loss": 0.77907562, + "learning_rate": 1.0666957317325215e-06, + "loss": 0.79984403, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.38085938, + "step": 10887, + "time_per_iteration": 3.830592632293701 + }, + { + "auxiliary_loss_clip": 0.01058413, + "auxiliary_loss_mlp": 0.01024294, + "balance_loss_clip": 1.01203918, + "balance_loss_mlp": 1.02014661, + "epoch": 0.6546219750488501, + "flos": 14937763224960.0, + "grad_norm": 1.953748509359008, + "language_loss": 0.78964114, + "learning_rate": 1.0663615868515913e-06, + "loss": 0.8104682, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3828125, + "step": 10888, + "time_per_iteration": 3.800351858139038 + }, + { + "auxiliary_loss_clip": 0.01056622, + "auxiliary_loss_mlp": 0.01023921, + "balance_loss_clip": 1.01293027, + "balance_loss_mlp": 1.01916587, + "epoch": 0.6546820983015181, + "flos": 36319027553280.0, + "grad_norm": 1.7612312560425945, + "language_loss": 0.66350263, + "learning_rate": 1.066027475289814e-06, + "loss": 0.68430805, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.375, + "step": 10889, + "time_per_iteration": 2.499765157699585 + }, + { + "auxiliary_loss_clip": 0.01057596, + "auxiliary_loss_mlp": 0.01026088, + "balance_loss_clip": 1.01410747, + "balance_loss_mlp": 1.0187006, + "epoch": 0.654742221554186, + "flos": 20410433752320.0, + "grad_norm": 1.5825490856605133, + "language_loss": 0.80105281, + "learning_rate": 1.0656933970591145e-06, + "loss": 0.82188964, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38867188, + "step": 10890, + "time_per_iteration": 2.391158103942871 + }, + { + "auxiliary_loss_clip": 0.01057785, + "auxiliary_loss_mlp": 0.01021797, + "balance_loss_clip": 1.00978088, + "balance_loss_mlp": 1.01816177, + "epoch": 0.6548023448068541, + "flos": 24862488238080.0, + "grad_norm": 1.986083535996682, + "language_loss": 0.65051663, + "learning_rate": 1.0653593521714144e-06, + "loss": 0.67131245, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39648438, + "step": 10891, + "time_per_iteration": 2.4314041137695312 + }, + { + "auxiliary_loss_clip": 0.01057065, + "auxiliary_loss_mlp": 0.01026137, + "balance_loss_clip": 1.0140729, + "balance_loss_mlp": 1.01842117, + "epoch": 0.654862468059522, + "flos": 21426685873920.0, + "grad_norm": 1.7277951073324296, + "language_loss": 0.79336417, + "learning_rate": 1.0650253406386347e-06, + "loss": 0.81419623, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38671875, + "step": 10892, + "time_per_iteration": 2.4528965950012207 + }, + { + "auxiliary_loss_clip": 0.01061339, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.01493049, + "balance_loss_mlp": 1.02022302, + "epoch": 0.65492259131219, + "flos": 26576653904640.0, + "grad_norm": 3.5599121052871108, + "language_loss": 0.7783457, + "learning_rate": 1.0646913624726947e-06, + "loss": 0.799236, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 10893, + "time_per_iteration": 2.4770731925964355 + }, + { + "auxiliary_loss_clip": 0.01061744, + "auxiliary_loss_mlp": 0.01025838, + "balance_loss_clip": 1.01217699, + "balance_loss_mlp": 1.01987278, + "epoch": 0.6549827145648579, + "flos": 21500422398720.0, + "grad_norm": 1.7478429682174867, + "language_loss": 0.70221925, + "learning_rate": 1.0643574176855158e-06, + "loss": 0.72309506, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.41796875, + "step": 10894, + "time_per_iteration": 3.825401782989502 + }, + { + "auxiliary_loss_clip": 0.01060709, + "auxiliary_loss_mlp": 0.01028781, + "balance_loss_clip": 1.01600754, + "balance_loss_mlp": 1.01917946, + "epoch": 0.6550428378175259, + "flos": 22593378510720.0, + "grad_norm": 2.885043695888063, + "language_loss": 0.62443078, + "learning_rate": 1.0640235062890121e-06, + "loss": 0.64532566, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 10895, + "time_per_iteration": 2.36613130569458 + }, + { + "auxiliary_loss_clip": 0.01057045, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.01471591, + "balance_loss_mlp": 1.0176394, + "epoch": 0.655102961070194, + "flos": 12822969173760.0, + "grad_norm": 2.763973029118269, + "language_loss": 0.78665191, + "learning_rate": 1.0636896282951028e-06, + "loss": 0.80749011, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 10896, + "time_per_iteration": 2.3871471881866455 + }, + { + "auxiliary_loss_clip": 0.01060152, + "auxiliary_loss_mlp": 0.01026558, + "balance_loss_clip": 1.01448846, + "balance_loss_mlp": 1.02068067, + "epoch": 0.6551630843228619, + "flos": 24789903788160.0, + "grad_norm": 1.8357189483593843, + "language_loss": 0.70427954, + "learning_rate": 1.0633557837157016e-06, + "loss": 0.72514659, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 10897, + "time_per_iteration": 2.4527370929718018 + }, + { + "auxiliary_loss_clip": 0.01059173, + "auxiliary_loss_mlp": 0.01025956, + "balance_loss_clip": 1.01370716, + "balance_loss_mlp": 1.01815784, + "epoch": 0.6552232075755299, + "flos": 16723605646080.0, + "grad_norm": 1.621675466706627, + "language_loss": 0.75813669, + "learning_rate": 1.0630219725627245e-06, + "loss": 0.778988, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41015625, + "step": 10898, + "time_per_iteration": 2.380887985229492 + }, + { + "auxiliary_loss_clip": 0.01059138, + "auxiliary_loss_mlp": 0.01025969, + "balance_loss_clip": 1.01408386, + "balance_loss_mlp": 1.01988983, + "epoch": 0.6552833308281978, + "flos": 22015425997440.0, + "grad_norm": 2.5815426549116576, + "language_loss": 0.74044228, + "learning_rate": 1.0626881948480813e-06, + "loss": 0.76129329, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 10899, + "time_per_iteration": 2.366683006286621 + }, + { + "auxiliary_loss_clip": 0.010579, + "auxiliary_loss_mlp": 0.01025891, + "balance_loss_clip": 1.01367176, + "balance_loss_mlp": 1.01925182, + "epoch": 0.6553434540808658, + "flos": 24862243858560.0, + "grad_norm": 2.3354995233999225, + "language_loss": 0.5604949, + "learning_rate": 1.0623544505836863e-06, + "loss": 0.5813328, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38671875, + "step": 10900, + "time_per_iteration": 2.444258451461792 + }, + { + "auxiliary_loss_clip": 0.01060001, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.01327133, + "balance_loss_mlp": 1.02031291, + "epoch": 0.6554035773335337, + "flos": 23219964414720.0, + "grad_norm": 2.2658582184447713, + "language_loss": 0.76987982, + "learning_rate": 1.0620207397814492e-06, + "loss": 0.79074937, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.39648438, + "step": 10901, + "time_per_iteration": 2.388127088546753 + }, + { + "auxiliary_loss_clip": 0.01057898, + "auxiliary_loss_mlp": 0.01021583, + "balance_loss_clip": 1.00944138, + "balance_loss_mlp": 1.01903975, + "epoch": 0.6554637005862017, + "flos": 22782502679040.0, + "grad_norm": 2.4076778492842252, + "language_loss": 0.73370826, + "learning_rate": 1.0616870624532789e-06, + "loss": 0.75450307, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38867188, + "step": 10902, + "time_per_iteration": 2.4230289459228516 + }, + { + "auxiliary_loss_clip": 0.01060147, + "auxiliary_loss_mlp": 0.01024675, + "balance_loss_clip": 1.01243186, + "balance_loss_mlp": 1.02123141, + "epoch": 0.6555238238388696, + "flos": 21506147861760.0, + "grad_norm": 1.6827654750227377, + "language_loss": 0.67788124, + "learning_rate": 1.0613534186110838e-06, + "loss": 0.6987294, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38867188, + "step": 10903, + "time_per_iteration": 2.4541754722595215 + }, + { + "auxiliary_loss_clip": 0.0106004, + "auxiliary_loss_mlp": 0.01026421, + "balance_loss_clip": 1.01311755, + "balance_loss_mlp": 1.01886129, + "epoch": 0.6555839470915377, + "flos": 30518138534400.0, + "grad_norm": 1.683449729798239, + "language_loss": 0.66682565, + "learning_rate": 1.0610198082667706e-06, + "loss": 0.6876902, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41015625, + "step": 10904, + "time_per_iteration": 2.4868462085723877 + }, + { + "auxiliary_loss_clip": 0.01058226, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.01453269, + "balance_loss_mlp": 1.01849723, + "epoch": 0.6556440703442056, + "flos": 24641837245440.0, + "grad_norm": 1.7238152533951356, + "language_loss": 0.90404445, + "learning_rate": 1.0606862314322454e-06, + "loss": 0.92491013, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.39648438, + "step": 10905, + "time_per_iteration": 2.433056592941284 + }, + { + "auxiliary_loss_clip": 0.01057259, + "auxiliary_loss_mlp": 0.01027472, + "balance_loss_clip": 1.01604009, + "balance_loss_mlp": 1.01917624, + "epoch": 0.6557041935968736, + "flos": 23731337232000.0, + "grad_norm": 1.7517487801384106, + "language_loss": 0.75481933, + "learning_rate": 1.060352688119411e-06, + "loss": 0.77566665, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38085938, + "step": 10906, + "time_per_iteration": 2.455878734588623 + }, + { + "auxiliary_loss_clip": 0.01061473, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.0192951, + "balance_loss_mlp": 1.02077389, + "epoch": 0.6557643168495415, + "flos": 11102135437440.0, + "grad_norm": 19.877295953241934, + "language_loss": 0.73786604, + "learning_rate": 1.0600191783401732e-06, + "loss": 0.75880313, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 10907, + "time_per_iteration": 2.4324028491973877 + }, + { + "auxiliary_loss_clip": 0.01057574, + "auxiliary_loss_mlp": 0.01026194, + "balance_loss_clip": 1.01349247, + "balance_loss_mlp": 1.01761854, + "epoch": 0.6558244401022095, + "flos": 30189710574720.0, + "grad_norm": 1.453446591272727, + "language_loss": 0.71945715, + "learning_rate": 1.0596857021064333e-06, + "loss": 0.74029481, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 10908, + "time_per_iteration": 2.4815971851348877 + }, + { + "auxiliary_loss_clip": 0.01059423, + "auxiliary_loss_mlp": 0.01026705, + "balance_loss_clip": 1.01481998, + "balance_loss_mlp": 1.020226, + "epoch": 0.6558845633548775, + "flos": 17930099099520.0, + "grad_norm": 2.1123974960709435, + "language_loss": 0.81124371, + "learning_rate": 1.0593522594300917e-06, + "loss": 0.83210504, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.39257812, + "step": 10909, + "time_per_iteration": 2.375415563583374 + }, + { + "auxiliary_loss_clip": 0.01059202, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.01566458, + "balance_loss_mlp": 1.01869917, + "epoch": 0.6559446866075455, + "flos": 21903180376320.0, + "grad_norm": 2.0220516239436326, + "language_loss": 0.63718182, + "learning_rate": 1.0590188503230475e-06, + "loss": 0.65806389, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.40625, + "step": 10910, + "time_per_iteration": 2.4473583698272705 + }, + { + "auxiliary_loss_clip": 0.0106293, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.01282454, + "balance_loss_mlp": 1.02085412, + "epoch": 0.6560048098602135, + "flos": 14127359679360.0, + "grad_norm": 2.4020543226969893, + "language_loss": 0.75763673, + "learning_rate": 1.0586854747972015e-06, + "loss": 0.77854413, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.421875, + "step": 10911, + "time_per_iteration": 2.3588924407958984 + }, + { + "auxiliary_loss_clip": 0.01053981, + "auxiliary_loss_mlp": 0.01024344, + "balance_loss_clip": 1.0135076, + "balance_loss_mlp": 1.01732254, + "epoch": 0.6560649331128814, + "flos": 18806558670720.0, + "grad_norm": 1.7310603082191003, + "language_loss": 0.75459504, + "learning_rate": 1.0583521328644485e-06, + "loss": 0.77537835, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3671875, + "step": 10912, + "time_per_iteration": 2.3889739513397217 + }, + { + "auxiliary_loss_clip": 0.01062855, + "auxiliary_loss_mlp": 0.0102321, + "balance_loss_clip": 1.00997818, + "balance_loss_mlp": 1.02044487, + "epoch": 0.6561250563655494, + "flos": 17052731832960.0, + "grad_norm": 1.4987631443364997, + "language_loss": 0.77250767, + "learning_rate": 1.058018824536686e-06, + "loss": 0.79336834, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42382812, + "step": 10913, + "time_per_iteration": 2.398407459259033 + }, + { + "auxiliary_loss_clip": 0.01054226, + "auxiliary_loss_mlp": 0.0102108, + "balance_loss_clip": 1.00979066, + "balance_loss_mlp": 1.01794767, + "epoch": 0.6561851796182173, + "flos": 22636565729280.0, + "grad_norm": 2.8450668649028708, + "language_loss": 0.71001577, + "learning_rate": 1.0576855498258087e-06, + "loss": 0.7307688, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36328125, + "step": 10914, + "time_per_iteration": 2.3974366188049316 + }, + { + "auxiliary_loss_clip": 0.01057847, + "auxiliary_loss_mlp": 0.01024468, + "balance_loss_clip": 1.01181984, + "balance_loss_mlp": 1.01913023, + "epoch": 0.6562453028708853, + "flos": 19238364766080.0, + "grad_norm": 1.7226865833104414, + "language_loss": 0.74163359, + "learning_rate": 1.05735230874371e-06, + "loss": 0.76245677, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38671875, + "step": 10915, + "time_per_iteration": 2.4053759574890137 + }, + { + "auxiliary_loss_clip": 0.01055794, + "auxiliary_loss_mlp": 0.01022889, + "balance_loss_clip": 1.01044357, + "balance_loss_mlp": 1.01797593, + "epoch": 0.6563054261235532, + "flos": 23800290900480.0, + "grad_norm": 1.5707541682334614, + "language_loss": 0.79409254, + "learning_rate": 1.057019101302282e-06, + "loss": 0.8148793, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.37890625, + "step": 10916, + "time_per_iteration": 2.401923179626465 + }, + { + "auxiliary_loss_clip": 0.01057678, + "auxiliary_loss_mlp": 0.01026277, + "balance_loss_clip": 1.01469564, + "balance_loss_mlp": 1.0187273, + "epoch": 0.6563655493762213, + "flos": 19239167727360.0, + "grad_norm": 1.6793711661589301, + "language_loss": 0.76431274, + "learning_rate": 1.0566859275134174e-06, + "loss": 0.78515226, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.390625, + "step": 10917, + "time_per_iteration": 3.8155148029327393 + }, + { + "auxiliary_loss_clip": 0.01056446, + "auxiliary_loss_mlp": 0.01024517, + "balance_loss_clip": 1.01258993, + "balance_loss_mlp": 1.01834357, + "epoch": 0.6564256726288892, + "flos": 25555269813120.0, + "grad_norm": 1.8065387755907982, + "language_loss": 0.69925374, + "learning_rate": 1.0563527873890063e-06, + "loss": 0.72006333, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38085938, + "step": 10918, + "time_per_iteration": 2.478621006011963 + }, + { + "auxiliary_loss_clip": 0.01055179, + "auxiliary_loss_mlp": 0.01026414, + "balance_loss_clip": 1.01510072, + "balance_loss_mlp": 1.01854956, + "epoch": 0.6564857958815572, + "flos": 22199522929920.0, + "grad_norm": 1.5038834658702898, + "language_loss": 0.7638514, + "learning_rate": 1.0560196809409356e-06, + "loss": 0.78466731, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3671875, + "step": 10919, + "time_per_iteration": 2.395294427871704 + }, + { + "auxiliary_loss_clip": 0.01007156, + "auxiliary_loss_mlp": 0.00999722, + "balance_loss_clip": 0.99880964, + "balance_loss_mlp": 1.00068521, + "epoch": 0.6565459191342251, + "flos": 58120470627840.0, + "grad_norm": 0.7154413670427671, + "language_loss": 0.53086138, + "learning_rate": 1.0556866081810948e-06, + "loss": 0.55093014, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.06445312, + "step": 10920, + "time_per_iteration": 3.0891292095184326 + }, + { + "auxiliary_loss_clip": 0.01058719, + "auxiliary_loss_mlp": 0.01024666, + "balance_loss_clip": 1.01224494, + "balance_loss_mlp": 1.02021456, + "epoch": 0.6566060423868931, + "flos": 30808336688640.0, + "grad_norm": 1.4281903031918286, + "language_loss": 0.65376091, + "learning_rate": 1.05535356912137e-06, + "loss": 0.67459476, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38476562, + "step": 10921, + "time_per_iteration": 2.5638792514801025 + }, + { + "auxiliary_loss_clip": 0.01055896, + "auxiliary_loss_mlp": 0.010236, + "balance_loss_clip": 1.01083231, + "balance_loss_mlp": 1.01760173, + "epoch": 0.6566661656395612, + "flos": 23366320300800.0, + "grad_norm": 1.7990919316421796, + "language_loss": 0.81039608, + "learning_rate": 1.0550205637736462e-06, + "loss": 0.831191, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3828125, + "step": 10922, + "time_per_iteration": 2.461700916290283 + }, + { + "auxiliary_loss_clip": 0.01058373, + "auxiliary_loss_mlp": 0.01025536, + "balance_loss_clip": 1.01289403, + "balance_loss_mlp": 1.01907563, + "epoch": 0.6567262888922291, + "flos": 25734514066560.0, + "grad_norm": 2.4038546892416846, + "language_loss": 0.72063613, + "learning_rate": 1.054687592149807e-06, + "loss": 0.74147522, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39257812, + "step": 10923, + "time_per_iteration": 2.448513984680176 + }, + { + "auxiliary_loss_clip": 0.01060235, + "auxiliary_loss_mlp": 0.010237, + "balance_loss_clip": 1.01085532, + "balance_loss_mlp": 1.02031112, + "epoch": 0.6567864121448971, + "flos": 17122907399040.0, + "grad_norm": 1.8506895027030794, + "language_loss": 0.80350953, + "learning_rate": 1.054354654261737e-06, + "loss": 0.82434887, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.3984375, + "step": 10924, + "time_per_iteration": 2.4143130779266357 + }, + { + "auxiliary_loss_clip": 0.01055813, + "auxiliary_loss_mlp": 0.01022115, + "balance_loss_clip": 1.01066494, + "balance_loss_mlp": 1.01813781, + "epoch": 0.656846535397565, + "flos": 22418218886400.0, + "grad_norm": 1.5597704290456715, + "language_loss": 0.72093552, + "learning_rate": 1.0540217501213166e-06, + "loss": 0.74171472, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37695312, + "step": 10925, + "time_per_iteration": 2.5070865154266357 + }, + { + "auxiliary_loss_clip": 0.01059681, + "auxiliary_loss_mlp": 0.01022343, + "balance_loss_clip": 1.01053524, + "balance_loss_mlp": 1.01960158, + "epoch": 0.656906658650233, + "flos": 17703792466560.0, + "grad_norm": 2.233226499404924, + "language_loss": 0.7253406, + "learning_rate": 1.0536888797404268e-06, + "loss": 0.74616086, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40039062, + "step": 10926, + "time_per_iteration": 2.395404577255249 + }, + { + "auxiliary_loss_clip": 0.01058596, + "auxiliary_loss_mlp": 0.01026008, + "balance_loss_clip": 1.01232898, + "balance_loss_mlp": 1.01824164, + "epoch": 0.6569667819029009, + "flos": 21174193854720.0, + "grad_norm": 1.7558803578909232, + "language_loss": 0.7264899, + "learning_rate": 1.0533560431309458e-06, + "loss": 0.74733597, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.40429688, + "step": 10927, + "time_per_iteration": 4.029839277267456 + }, + { + "auxiliary_loss_clip": 0.0100711, + "auxiliary_loss_mlp": 0.00999458, + "balance_loss_clip": 0.99851024, + "balance_loss_mlp": 1.00083899, + "epoch": 0.6570269051555689, + "flos": 68758330538880.0, + "grad_norm": 0.7410425042972925, + "language_loss": 0.64447713, + "learning_rate": 1.0530232403047541e-06, + "loss": 0.66454279, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.0625, + "step": 10928, + "time_per_iteration": 3.031466245651245 + }, + { + "auxiliary_loss_clip": 0.01058791, + "auxiliary_loss_mlp": 0.01024931, + "balance_loss_clip": 1.01286721, + "balance_loss_mlp": 1.01938295, + "epoch": 0.6570870284082369, + "flos": 26318192042880.0, + "grad_norm": 1.726355568132069, + "language_loss": 0.77849329, + "learning_rate": 1.0526904712737254e-06, + "loss": 0.79933047, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 10929, + "time_per_iteration": 2.469968795776367 + }, + { + "auxiliary_loss_clip": 0.010578, + "auxiliary_loss_mlp": 0.0102354, + "balance_loss_clip": 1.010993, + "balance_loss_mlp": 1.01889932, + "epoch": 0.6571471516609049, + "flos": 26173616636160.0, + "grad_norm": 1.9220121741610947, + "language_loss": 0.64994478, + "learning_rate": 1.0523577360497383e-06, + "loss": 0.67075819, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 10930, + "time_per_iteration": 2.4295570850372314 + }, + { + "auxiliary_loss_clip": 0.01059355, + "auxiliary_loss_mlp": 0.01027735, + "balance_loss_clip": 1.01483059, + "balance_loss_mlp": 1.01899612, + "epoch": 0.6572072749135728, + "flos": 20375206323840.0, + "grad_norm": 1.616865633146839, + "language_loss": 0.70667922, + "learning_rate": 1.0520250346446654e-06, + "loss": 0.72755003, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40429688, + "step": 10931, + "time_per_iteration": 2.4255692958831787 + }, + { + "auxiliary_loss_clip": 0.0105999, + "auxiliary_loss_mlp": 0.01021694, + "balance_loss_clip": 1.00977933, + "balance_loss_mlp": 1.01989961, + "epoch": 0.6572673981662408, + "flos": 17127794989440.0, + "grad_norm": 1.9450710630981851, + "language_loss": 0.68734968, + "learning_rate": 1.0516923670703808e-06, + "loss": 0.70816654, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40039062, + "step": 10932, + "time_per_iteration": 2.3942387104034424 + }, + { + "auxiliary_loss_clip": 0.01055822, + "auxiliary_loss_mlp": 0.01025612, + "balance_loss_clip": 1.01374531, + "balance_loss_mlp": 1.01778388, + "epoch": 0.6573275214189087, + "flos": 41273692104960.0, + "grad_norm": 1.6553563433525593, + "language_loss": 0.80201751, + "learning_rate": 1.051359733338756e-06, + "loss": 0.82283187, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38085938, + "step": 10933, + "time_per_iteration": 2.563951253890991 + }, + { + "auxiliary_loss_clip": 0.01058647, + "auxiliary_loss_mlp": 0.01024755, + "balance_loss_clip": 1.01224995, + "balance_loss_mlp": 1.01986837, + "epoch": 0.6573876446715767, + "flos": 22889127571200.0, + "grad_norm": 1.6509968428450872, + "language_loss": 0.73511279, + "learning_rate": 1.0510271334616616e-06, + "loss": 0.75594676, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 10934, + "time_per_iteration": 3.8526976108551025 + }, + { + "auxiliary_loss_clip": 0.01057778, + "auxiliary_loss_mlp": 0.01024767, + "balance_loss_clip": 1.01323938, + "balance_loss_mlp": 1.0193522, + "epoch": 0.6574477679242448, + "flos": 44016468514560.0, + "grad_norm": 2.0887059543068704, + "language_loss": 0.76230752, + "learning_rate": 1.0506945674509693e-06, + "loss": 0.78313303, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38476562, + "step": 10935, + "time_per_iteration": 2.607213258743286 + }, + { + "auxiliary_loss_clip": 0.01058705, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.01782894, + "balance_loss_mlp": 1.01887417, + "epoch": 0.6575078911769127, + "flos": 24570369959040.0, + "grad_norm": 1.568985640039447, + "language_loss": 0.78510988, + "learning_rate": 1.0503620353185443e-06, + "loss": 0.80600727, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.3984375, + "step": 10936, + "time_per_iteration": 2.453864097595215 + }, + { + "auxiliary_loss_clip": 0.01053825, + "auxiliary_loss_mlp": 0.01023331, + "balance_loss_clip": 1.01181531, + "balance_loss_mlp": 1.01729631, + "epoch": 0.6575680144295807, + "flos": 20922958644480.0, + "grad_norm": 2.0507257741208624, + "language_loss": 0.69348562, + "learning_rate": 1.0500295370762565e-06, + "loss": 0.71425724, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36523438, + "step": 10937, + "time_per_iteration": 2.4096384048461914 + }, + { + "auxiliary_loss_clip": 0.01058048, + "auxiliary_loss_mlp": 0.01023401, + "balance_loss_clip": 1.01134932, + "balance_loss_mlp": 1.01906013, + "epoch": 0.6576281376822486, + "flos": 10924881131520.0, + "grad_norm": 2.270132290930518, + "language_loss": 0.68454754, + "learning_rate": 1.0496970727359707e-06, + "loss": 0.70536202, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.390625, + "step": 10938, + "time_per_iteration": 2.3724775314331055 + }, + { + "auxiliary_loss_clip": 0.01059651, + "auxiliary_loss_mlp": 0.01024535, + "balance_loss_clip": 1.01210713, + "balance_loss_mlp": 1.01986241, + "epoch": 0.6576882609349166, + "flos": 19280539555200.0, + "grad_norm": 1.9853586936004282, + "language_loss": 0.72135687, + "learning_rate": 1.049364642309552e-06, + "loss": 0.74219871, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.3984375, + "step": 10939, + "time_per_iteration": 2.4174630641937256 + }, + { + "auxiliary_loss_clip": 0.01060912, + "auxiliary_loss_mlp": 0.01024637, + "balance_loss_clip": 1.01218545, + "balance_loss_mlp": 1.01972425, + "epoch": 0.6577483841875845, + "flos": 20219773484160.0, + "grad_norm": 2.240725925714913, + "language_loss": 0.77537882, + "learning_rate": 1.049032245808863e-06, + "loss": 0.79623437, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.41210938, + "step": 10940, + "time_per_iteration": 2.3565642833709717 + }, + { + "auxiliary_loss_clip": 0.01061555, + "auxiliary_loss_mlp": 0.0102517, + "balance_loss_clip": 1.01249838, + "balance_loss_mlp": 1.01943684, + "epoch": 0.6578085074402525, + "flos": 34749646761600.0, + "grad_norm": 1.943234579101433, + "language_loss": 0.69011092, + "learning_rate": 1.0486998832457676e-06, + "loss": 0.71097809, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 10941, + "time_per_iteration": 2.5417191982269287 + }, + { + "auxiliary_loss_clip": 0.01056483, + "auxiliary_loss_mlp": 0.01020964, + "balance_loss_clip": 1.0091269, + "balance_loss_mlp": 1.01818871, + "epoch": 0.6578686306929205, + "flos": 23470047550080.0, + "grad_norm": 1.7822820078495782, + "language_loss": 0.68227971, + "learning_rate": 1.0483675546321267e-06, + "loss": 0.70305413, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 10942, + "time_per_iteration": 2.4121994972229004 + }, + { + "auxiliary_loss_clip": 0.0106154, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.01264489, + "balance_loss_mlp": 1.01928425, + "epoch": 0.6579287539455885, + "flos": 18076105872000.0, + "grad_norm": 1.8207814435824865, + "language_loss": 0.71226621, + "learning_rate": 1.0480352599798e-06, + "loss": 0.73314428, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.421875, + "step": 10943, + "time_per_iteration": 2.4016785621643066 + }, + { + "auxiliary_loss_clip": 0.01056018, + "auxiliary_loss_mlp": 0.01026021, + "balance_loss_clip": 1.01441646, + "balance_loss_mlp": 1.01817513, + "epoch": 0.6579888771982564, + "flos": 28660025865600.0, + "grad_norm": 1.7293120329113196, + "language_loss": 0.59459579, + "learning_rate": 1.047702999300645e-06, + "loss": 0.61541617, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37890625, + "step": 10944, + "time_per_iteration": 2.468083620071411 + }, + { + "auxiliary_loss_clip": 0.01062062, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.01740956, + "balance_loss_mlp": 1.01978564, + "epoch": 0.6580490004509244, + "flos": 25045363272960.0, + "grad_norm": 1.4519074426476017, + "language_loss": 0.72457004, + "learning_rate": 1.0473707726065217e-06, + "loss": 0.74549472, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.42382812, + "step": 10945, + "time_per_iteration": 2.4620537757873535 + }, + { + "auxiliary_loss_clip": 0.01057634, + "auxiliary_loss_mlp": 0.01023578, + "balance_loss_clip": 1.01219976, + "balance_loss_mlp": 1.01938534, + "epoch": 0.6581091237035923, + "flos": 43507085644800.0, + "grad_norm": 1.513405593369303, + "language_loss": 0.70410633, + "learning_rate": 1.0470385799092841e-06, + "loss": 0.72491848, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3828125, + "step": 10946, + "time_per_iteration": 2.5930285453796387 + }, + { + "auxiliary_loss_clip": 0.01059024, + "auxiliary_loss_mlp": 0.01024526, + "balance_loss_clip": 1.01286125, + "balance_loss_mlp": 1.01976502, + "epoch": 0.6581692469562603, + "flos": 22414413548160.0, + "grad_norm": 1.7828027907083728, + "language_loss": 0.8200891, + "learning_rate": 1.0467064212207888e-06, + "loss": 0.84092462, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39257812, + "step": 10947, + "time_per_iteration": 2.4680259227752686 + }, + { + "auxiliary_loss_clip": 0.01058136, + "auxiliary_loss_mlp": 0.01021053, + "balance_loss_clip": 1.01002645, + "balance_loss_mlp": 1.0184617, + "epoch": 0.6582293702089284, + "flos": 24858717811200.0, + "grad_norm": 1.6370794753061932, + "language_loss": 0.77125096, + "learning_rate": 1.04637429655289e-06, + "loss": 0.79204285, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.39648438, + "step": 10948, + "time_per_iteration": 2.4166243076324463 + }, + { + "auxiliary_loss_clip": 0.0105774, + "auxiliary_loss_mlp": 0.01023791, + "balance_loss_clip": 1.01144123, + "balance_loss_mlp": 1.01950836, + "epoch": 0.6582894934615963, + "flos": 23038555656960.0, + "grad_norm": 1.661278352632479, + "language_loss": 0.79145849, + "learning_rate": 1.0460422059174376e-06, + "loss": 0.81227386, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.3828125, + "step": 10949, + "time_per_iteration": 2.423811435699463 + }, + { + "auxiliary_loss_clip": 0.01006892, + "auxiliary_loss_mlp": 0.01001214, + "balance_loss_clip": 1.00028455, + "balance_loss_mlp": 1.0005759, + "epoch": 0.6583496167142643, + "flos": 72548432046720.0, + "grad_norm": 0.7397905944170835, + "language_loss": 0.61777776, + "learning_rate": 1.045710149326286e-06, + "loss": 0.63785887, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06347656, + "step": 10950, + "time_per_iteration": 3.107018232345581 + }, + { + "auxiliary_loss_clip": 0.01055086, + "auxiliary_loss_mlp": 0.01022034, + "balance_loss_clip": 1.01044655, + "balance_loss_mlp": 1.01744437, + "epoch": 0.6584097399669322, + "flos": 13078009722240.0, + "grad_norm": 4.169635982845391, + "language_loss": 0.6735974, + "learning_rate": 1.0453781267912838e-06, + "loss": 0.6943686, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37695312, + "step": 10951, + "time_per_iteration": 2.391807794570923 + }, + { + "auxiliary_loss_clip": 0.01057825, + "auxiliary_loss_mlp": 0.01024452, + "balance_loss_clip": 1.01233411, + "balance_loss_mlp": 1.01980901, + "epoch": 0.6584698632196002, + "flos": 28691936714880.0, + "grad_norm": 1.4014987482421897, + "language_loss": 0.7663641, + "learning_rate": 1.0450461383242821e-06, + "loss": 0.78718686, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38085938, + "step": 10952, + "time_per_iteration": 2.4505505561828613 + }, + { + "auxiliary_loss_clip": 0.01058118, + "auxiliary_loss_mlp": 0.01021691, + "balance_loss_clip": 1.00990105, + "balance_loss_mlp": 1.02014661, + "epoch": 0.6585299864722681, + "flos": 14318403972480.0, + "grad_norm": 1.7644818555995745, + "language_loss": 0.73740828, + "learning_rate": 1.044714183937126e-06, + "loss": 0.75820637, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 10953, + "time_per_iteration": 2.3768742084503174 + }, + { + "auxiliary_loss_clip": 0.01057908, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01452267, + "balance_loss_mlp": 1.01891851, + "epoch": 0.6585901097249361, + "flos": 26796676492800.0, + "grad_norm": 1.8184344618197934, + "language_loss": 0.78600788, + "learning_rate": 1.0443822636416637e-06, + "loss": 0.80685806, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 10954, + "time_per_iteration": 2.441042900085449 + }, + { + "auxiliary_loss_clip": 0.0105947, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.01545143, + "balance_loss_mlp": 1.01958346, + "epoch": 0.658650232977604, + "flos": 18732158830080.0, + "grad_norm": 3.0353766349991402, + "language_loss": 0.69826567, + "learning_rate": 1.0440503774497406e-06, + "loss": 0.7191388, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 10955, + "time_per_iteration": 2.418853282928467 + }, + { + "auxiliary_loss_clip": 0.01056216, + "auxiliary_loss_mlp": 0.01024372, + "balance_loss_clip": 1.01400065, + "balance_loss_mlp": 1.01845455, + "epoch": 0.6587103562302721, + "flos": 24752302387200.0, + "grad_norm": 3.200543889910529, + "language_loss": 0.80237448, + "learning_rate": 1.0437185253732006e-06, + "loss": 0.82318044, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.37695312, + "step": 10956, + "time_per_iteration": 3.8643760681152344 + }, + { + "auxiliary_loss_clip": 0.01059033, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01646769, + "balance_loss_mlp": 1.01967132, + "epoch": 0.65877047948294, + "flos": 22345040943360.0, + "grad_norm": 2.012954391425153, + "language_loss": 0.81143379, + "learning_rate": 1.0433867074238856e-06, + "loss": 0.83231002, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 10957, + "time_per_iteration": 2.3839330673217773 + }, + { + "auxiliary_loss_clip": 0.01055826, + "auxiliary_loss_mlp": 0.01026034, + "balance_loss_clip": 1.0155139, + "balance_loss_mlp": 1.01911402, + "epoch": 0.658830602735608, + "flos": 45178971788160.0, + "grad_norm": 1.6008439027840027, + "language_loss": 0.75396204, + "learning_rate": 1.0430549236136399e-06, + "loss": 0.77478063, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.3671875, + "step": 10958, + "time_per_iteration": 2.618417263031006 + }, + { + "auxiliary_loss_clip": 0.01055902, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.01565182, + "balance_loss_mlp": 1.01857066, + "epoch": 0.6588907259882759, + "flos": 19900597034880.0, + "grad_norm": 1.483434898033206, + "language_loss": 0.74851274, + "learning_rate": 1.0427231739543009e-06, + "loss": 0.76934206, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37304688, + "step": 10959, + "time_per_iteration": 2.3792831897735596 + }, + { + "auxiliary_loss_clip": 0.01059384, + "auxiliary_loss_mlp": 0.01022922, + "balance_loss_clip": 1.01159704, + "balance_loss_mlp": 1.02039313, + "epoch": 0.6589508492409439, + "flos": 24132628932480.0, + "grad_norm": 1.557571213640023, + "language_loss": 0.64776516, + "learning_rate": 1.0423914584577102e-06, + "loss": 0.66858822, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.390625, + "step": 10960, + "time_per_iteration": 2.413191080093384 + }, + { + "auxiliary_loss_clip": 0.01060511, + "auxiliary_loss_mlp": 0.01026537, + "balance_loss_clip": 1.01351905, + "balance_loss_mlp": 1.01974583, + "epoch": 0.659010972493612, + "flos": 18221938087680.0, + "grad_norm": 2.3449464568631666, + "language_loss": 0.8216188, + "learning_rate": 1.0420597771357042e-06, + "loss": 0.8424893, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 10961, + "time_per_iteration": 2.3649723529815674 + }, + { + "auxiliary_loss_clip": 0.01058063, + "auxiliary_loss_mlp": 0.01024428, + "balance_loss_clip": 1.01248884, + "balance_loss_mlp": 1.01949239, + "epoch": 0.6590710957462799, + "flos": 27598771134720.0, + "grad_norm": 1.9307233773269643, + "language_loss": 0.73476297, + "learning_rate": 1.041728130000122e-06, + "loss": 0.75558794, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 10962, + "time_per_iteration": 2.4750735759735107 + }, + { + "auxiliary_loss_clip": 0.01058599, + "auxiliary_loss_mlp": 0.01022045, + "balance_loss_clip": 1.01077414, + "balance_loss_mlp": 1.01935327, + "epoch": 0.6591312189989479, + "flos": 20301923646720.0, + "grad_norm": 2.541872789702627, + "language_loss": 0.79457086, + "learning_rate": 1.0413965170627976e-06, + "loss": 0.8153773, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.390625, + "step": 10963, + "time_per_iteration": 2.4192757606506348 + }, + { + "auxiliary_loss_clip": 0.01055259, + "auxiliary_loss_mlp": 0.01024431, + "balance_loss_clip": 1.01335609, + "balance_loss_mlp": 1.01822269, + "epoch": 0.6591913422516158, + "flos": 12312120026880.0, + "grad_norm": 1.6244794630692816, + "language_loss": 0.78401786, + "learning_rate": 1.0410649383355648e-06, + "loss": 0.80481482, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37109375, + "step": 10964, + "time_per_iteration": 2.375831127166748 + }, + { + "auxiliary_loss_clip": 0.01057944, + "auxiliary_loss_mlp": 0.01023549, + "balance_loss_clip": 1.01225364, + "balance_loss_mlp": 1.01856363, + "epoch": 0.6592514655042838, + "flos": 25883418481920.0, + "grad_norm": 1.6339378356020835, + "language_loss": 0.743312, + "learning_rate": 1.0407333938302589e-06, + "loss": 0.7641269, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.39453125, + "step": 10965, + "time_per_iteration": 2.4430787563323975 + }, + { + "auxiliary_loss_clip": 0.0105968, + "auxiliary_loss_mlp": 0.01025189, + "balance_loss_clip": 1.01351237, + "balance_loss_mlp": 1.01928687, + "epoch": 0.6593115887569517, + "flos": 14062769930880.0, + "grad_norm": 1.9390088773718026, + "language_loss": 0.73038477, + "learning_rate": 1.0404018835587095e-06, + "loss": 0.7512334, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40234375, + "step": 10966, + "time_per_iteration": 3.7821204662323 + }, + { + "auxiliary_loss_clip": 0.01058037, + "auxiliary_loss_mlp": 0.0102435, + "balance_loss_clip": 1.01198161, + "balance_loss_mlp": 1.01855755, + "epoch": 0.6593717120096197, + "flos": 24716760756480.0, + "grad_norm": 3.4641364799563923, + "language_loss": 0.76704317, + "learning_rate": 1.040070407532749e-06, + "loss": 0.78786707, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 10967, + "time_per_iteration": 3.868217945098877 + }, + { + "auxiliary_loss_clip": 0.0100718, + "auxiliary_loss_mlp": 0.01001435, + "balance_loss_clip": 1.00054049, + "balance_loss_mlp": 1.00069666, + "epoch": 0.6594318352622877, + "flos": 55554772101120.0, + "grad_norm": 0.6990764595375883, + "language_loss": 0.48481038, + "learning_rate": 1.0397389657642058e-06, + "loss": 0.50489652, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.00891113, + "router_z_loss_mlp": 0.06445312, + "step": 10968, + "time_per_iteration": 3.0813934803009033 + }, + { + "auxiliary_loss_clip": 0.01057228, + "auxiliary_loss_mlp": 0.01023143, + "balance_loss_clip": 1.01123977, + "balance_loss_mlp": 1.01877141, + "epoch": 0.6594919585149557, + "flos": 17455978569600.0, + "grad_norm": 1.7900992516609608, + "language_loss": 0.79059899, + "learning_rate": 1.0394075582649102e-06, + "loss": 0.81140268, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38476562, + "step": 10969, + "time_per_iteration": 2.385155439376831 + }, + { + "auxiliary_loss_clip": 0.0105738, + "auxiliary_loss_mlp": 0.01021587, + "balance_loss_clip": 1.0100956, + "balance_loss_mlp": 1.01879978, + "epoch": 0.6595520817676236, + "flos": 18222252289920.0, + "grad_norm": 1.9029668950773477, + "language_loss": 0.65320492, + "learning_rate": 1.0390761850466864e-06, + "loss": 0.67399454, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38476562, + "step": 10970, + "time_per_iteration": 2.3685436248779297 + }, + { + "auxiliary_loss_clip": 0.01056217, + "auxiliary_loss_mlp": 0.01022706, + "balance_loss_clip": 1.01122046, + "balance_loss_mlp": 1.01857984, + "epoch": 0.6596122050202916, + "flos": 22198685057280.0, + "grad_norm": 1.9712563810842962, + "language_loss": 0.76381117, + "learning_rate": 1.0387448461213626e-06, + "loss": 0.78460044, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.375, + "step": 10971, + "time_per_iteration": 2.3956093788146973 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01025665, + "balance_loss_clip": 1.0137918, + "balance_loss_mlp": 1.01947713, + "epoch": 0.6596723282729595, + "flos": 14172955781760.0, + "grad_norm": 2.3009928626561984, + "language_loss": 0.6962719, + "learning_rate": 1.0384135415007627e-06, + "loss": 0.7171278, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 10972, + "time_per_iteration": 2.3844361305236816 + }, + { + "auxiliary_loss_clip": 0.01055552, + "auxiliary_loss_mlp": 0.01022245, + "balance_loss_clip": 1.01074767, + "balance_loss_mlp": 1.01785421, + "epoch": 0.6597324515256275, + "flos": 30551934597120.0, + "grad_norm": 1.7370997128303827, + "language_loss": 0.55662364, + "learning_rate": 1.0380822711967097e-06, + "loss": 0.57740164, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37695312, + "step": 10973, + "time_per_iteration": 2.4503438472747803 + }, + { + "auxiliary_loss_clip": 0.01060898, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.01552963, + "balance_loss_mlp": 1.01941156, + "epoch": 0.6597925747782956, + "flos": 17638888515840.0, + "grad_norm": 1.7714618918133018, + "language_loss": 0.78534329, + "learning_rate": 1.0377510352210256e-06, + "loss": 0.80622792, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.4140625, + "step": 10974, + "time_per_iteration": 3.8076107501983643 + }, + { + "auxiliary_loss_clip": 0.01058613, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.01550841, + "balance_loss_mlp": 1.01868808, + "epoch": 0.6598526980309635, + "flos": 22818044309760.0, + "grad_norm": 2.1458378024780376, + "language_loss": 0.68645525, + "learning_rate": 1.0374198335855334e-06, + "loss": 0.70732075, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.3984375, + "step": 10975, + "time_per_iteration": 2.448878526687622 + }, + { + "auxiliary_loss_clip": 0.01056974, + "auxiliary_loss_mlp": 0.01020346, + "balance_loss_clip": 1.00909901, + "balance_loss_mlp": 1.0179652, + "epoch": 0.6599128212836315, + "flos": 21067010380800.0, + "grad_norm": 1.5445536468104377, + "language_loss": 0.70304477, + "learning_rate": 1.0370886663020498e-06, + "loss": 0.723818, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.390625, + "step": 10976, + "time_per_iteration": 2.411987066268921 + }, + { + "auxiliary_loss_clip": 0.01053995, + "auxiliary_loss_mlp": 0.01024861, + "balance_loss_clip": 1.01321983, + "balance_loss_mlp": 1.01703238, + "epoch": 0.6599729445362994, + "flos": 22162445199360.0, + "grad_norm": 1.7018367707688544, + "language_loss": 0.67775548, + "learning_rate": 1.0367575333823953e-06, + "loss": 0.69854403, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37109375, + "step": 10977, + "time_per_iteration": 2.3995540142059326 + }, + { + "auxiliary_loss_clip": 0.01061028, + "auxiliary_loss_mlp": 0.01025506, + "balance_loss_clip": 1.01240504, + "balance_loss_mlp": 1.01990366, + "epoch": 0.6600330677889674, + "flos": 18149109258240.0, + "grad_norm": 2.1399493025497067, + "language_loss": 0.81559432, + "learning_rate": 1.0364264348383868e-06, + "loss": 0.83645964, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 10978, + "time_per_iteration": 2.3549869060516357 + }, + { + "auxiliary_loss_clip": 0.01058484, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.01396012, + "balance_loss_mlp": 1.01821935, + "epoch": 0.6600931910416353, + "flos": 18149144169600.0, + "grad_norm": 2.0174414858514558, + "language_loss": 0.67337239, + "learning_rate": 1.0360953706818402e-06, + "loss": 0.69421971, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 10979, + "time_per_iteration": 2.3835062980651855 + }, + { + "auxiliary_loss_clip": 0.01057268, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.01264644, + "balance_loss_mlp": 1.01819921, + "epoch": 0.6601533142943034, + "flos": 17419773623040.0, + "grad_norm": 3.3053004870588336, + "language_loss": 0.73608816, + "learning_rate": 1.0357643409245703e-06, + "loss": 0.75691414, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 10980, + "time_per_iteration": 2.351909637451172 + }, + { + "auxiliary_loss_clip": 0.01054819, + "auxiliary_loss_mlp": 0.01023796, + "balance_loss_clip": 1.01294792, + "balance_loss_mlp": 1.01852214, + "epoch": 0.6602134375469713, + "flos": 28218339855360.0, + "grad_norm": 1.610222361466647, + "language_loss": 0.72733641, + "learning_rate": 1.0354333455783901e-06, + "loss": 0.74812257, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36328125, + "step": 10981, + "time_per_iteration": 2.4702982902526855 + }, + { + "auxiliary_loss_clip": 0.01058539, + "auxiliary_loss_mlp": 0.01028808, + "balance_loss_clip": 1.01684499, + "balance_loss_mlp": 1.01888525, + "epoch": 0.6602735607996393, + "flos": 29416943341440.0, + "grad_norm": 2.3022813327822607, + "language_loss": 0.68344975, + "learning_rate": 1.0351023846551141e-06, + "loss": 0.70432323, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3984375, + "step": 10982, + "time_per_iteration": 2.5381920337677 + }, + { + "auxiliary_loss_clip": 0.01057273, + "auxiliary_loss_mlp": 0.0102532, + "balance_loss_clip": 1.01381612, + "balance_loss_mlp": 1.01973653, + "epoch": 0.6603336840523072, + "flos": 18587059752960.0, + "grad_norm": 1.4576555897320254, + "language_loss": 0.69447935, + "learning_rate": 1.0347714581665504e-06, + "loss": 0.71530527, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.375, + "step": 10983, + "time_per_iteration": 2.3892574310302734 + }, + { + "auxiliary_loss_clip": 0.01057289, + "auxiliary_loss_mlp": 0.0102405, + "balance_loss_clip": 1.01206303, + "balance_loss_mlp": 1.01835632, + "epoch": 0.6603938073049752, + "flos": 33253478824320.0, + "grad_norm": 1.6348964244541355, + "language_loss": 0.70140445, + "learning_rate": 1.0344405661245117e-06, + "loss": 0.72221786, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 10984, + "time_per_iteration": 2.49653697013855 + }, + { + "auxiliary_loss_clip": 0.01055713, + "auxiliary_loss_mlp": 0.01023678, + "balance_loss_clip": 1.01177502, + "balance_loss_mlp": 1.01878583, + "epoch": 0.6604539305576431, + "flos": 17383324296960.0, + "grad_norm": 1.5232575186573707, + "language_loss": 0.83864546, + "learning_rate": 1.0341097085408041e-06, + "loss": 0.85943937, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.36914062, + "step": 10985, + "time_per_iteration": 2.374582290649414 + }, + { + "auxiliary_loss_clip": 0.01059873, + "auxiliary_loss_mlp": 0.01024014, + "balance_loss_clip": 1.01181889, + "balance_loss_mlp": 1.01972365, + "epoch": 0.6605140538103111, + "flos": 21250094883840.0, + "grad_norm": 2.190964232413839, + "language_loss": 0.74267507, + "learning_rate": 1.0337788854272385e-06, + "loss": 0.76351392, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40039062, + "step": 10986, + "time_per_iteration": 2.4121594429016113 + }, + { + "auxiliary_loss_clip": 0.01056378, + "auxiliary_loss_mlp": 0.01022362, + "balance_loss_clip": 1.01126981, + "balance_loss_mlp": 1.01901615, + "epoch": 0.6605741770629792, + "flos": 13880837502720.0, + "grad_norm": 1.652183864571212, + "language_loss": 0.79207301, + "learning_rate": 1.033448096795617e-06, + "loss": 0.81286043, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.375, + "step": 10987, + "time_per_iteration": 2.40476393699646 + }, + { + "auxiliary_loss_clip": 0.01007244, + "auxiliary_loss_mlp": 0.01000245, + "balance_loss_clip": 0.99928522, + "balance_loss_mlp": 1.00091815, + "epoch": 0.6606343003156471, + "flos": 69312436727040.0, + "grad_norm": 0.8175739862456629, + "language_loss": 0.54146898, + "learning_rate": 1.0331173426577477e-06, + "loss": 0.56154394, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.06347656, + "step": 10988, + "time_per_iteration": 3.1221582889556885 + }, + { + "auxiliary_loss_clip": 0.01058065, + "auxiliary_loss_mlp": 0.01024155, + "balance_loss_clip": 1.01368284, + "balance_loss_mlp": 1.02074707, + "epoch": 0.6606944235683151, + "flos": 27271146136320.0, + "grad_norm": 1.4774163728081104, + "language_loss": 0.77327788, + "learning_rate": 1.0327866230254336e-06, + "loss": 0.79410017, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.37304688, + "step": 10989, + "time_per_iteration": 2.460942268371582 + }, + { + "auxiliary_loss_clip": 0.01059905, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01544213, + "balance_loss_mlp": 1.02120113, + "epoch": 0.660754546820983, + "flos": 13771943372160.0, + "grad_norm": 1.99969441480549, + "language_loss": 0.77644324, + "learning_rate": 1.0324559379104766e-06, + "loss": 0.79731512, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38671875, + "step": 10990, + "time_per_iteration": 2.3554177284240723 + }, + { + "auxiliary_loss_clip": 0.0106058, + "auxiliary_loss_mlp": 0.01022107, + "balance_loss_clip": 1.00994194, + "balance_loss_mlp": 1.02025354, + "epoch": 0.660814670073651, + "flos": 15704316236160.0, + "grad_norm": 2.161506276892878, + "language_loss": 0.60593772, + "learning_rate": 1.0321252873246774e-06, + "loss": 0.62676454, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40234375, + "step": 10991, + "time_per_iteration": 2.375490427017212 + }, + { + "auxiliary_loss_clip": 0.0106141, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.01777911, + "balance_loss_mlp": 1.02044582, + "epoch": 0.6608747933263189, + "flos": 20848977740160.0, + "grad_norm": 3.0621232191443504, + "language_loss": 0.68427086, + "learning_rate": 1.0317946712798388e-06, + "loss": 0.70518434, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41015625, + "step": 10992, + "time_per_iteration": 2.3776631355285645 + }, + { + "auxiliary_loss_clip": 0.01057078, + "auxiliary_loss_mlp": 0.01025226, + "balance_loss_clip": 1.0131743, + "balance_loss_mlp": 1.0174948, + "epoch": 0.660934916578987, + "flos": 20631049833600.0, + "grad_norm": 1.7885527292076884, + "language_loss": 0.87542164, + "learning_rate": 1.0314640897877574e-06, + "loss": 0.89624465, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 10993, + "time_per_iteration": 2.4251699447631836 + }, + { + "auxiliary_loss_clip": 0.01061568, + "auxiliary_loss_mlp": 0.0102648, + "balance_loss_clip": 1.01280618, + "balance_loss_mlp": 1.01932418, + "epoch": 0.6609950398316549, + "flos": 25112571373440.0, + "grad_norm": 1.8319763622311662, + "language_loss": 0.66879398, + "learning_rate": 1.0311335428602302e-06, + "loss": 0.68967444, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.42382812, + "step": 10994, + "time_per_iteration": 2.4169225692749023 + }, + { + "auxiliary_loss_clip": 0.01060355, + "auxiliary_loss_mlp": 0.01021944, + "balance_loss_clip": 1.00967741, + "balance_loss_mlp": 1.02030516, + "epoch": 0.6610551630843229, + "flos": 18660202784640.0, + "grad_norm": 1.7858264302287015, + "language_loss": 0.7667405, + "learning_rate": 1.0308030305090553e-06, + "loss": 0.7875635, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40039062, + "step": 10995, + "time_per_iteration": 3.8443076610565186 + }, + { + "auxiliary_loss_clip": 0.01058336, + "auxiliary_loss_mlp": 0.01026156, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 1.0185498, + "epoch": 0.6611152863369908, + "flos": 23257077056640.0, + "grad_norm": 1.7987510178792754, + "language_loss": 0.74260211, + "learning_rate": 1.0304725527460271e-06, + "loss": 0.76344705, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3984375, + "step": 10996, + "time_per_iteration": 2.386129856109619 + }, + { + "auxiliary_loss_clip": 0.01058857, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.01659131, + "balance_loss_mlp": 1.01999891, + "epoch": 0.6611754095896588, + "flos": 22158744595200.0, + "grad_norm": 1.6065550636008947, + "language_loss": 0.68897402, + "learning_rate": 1.0301421095829402e-06, + "loss": 0.70984912, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38867188, + "step": 10997, + "time_per_iteration": 2.40219783782959 + }, + { + "auxiliary_loss_clip": 0.01060167, + "auxiliary_loss_mlp": 0.01024981, + "balance_loss_clip": 1.01313186, + "balance_loss_mlp": 1.01982582, + "epoch": 0.6612355328423267, + "flos": 13990360037760.0, + "grad_norm": 2.1827920561357583, + "language_loss": 0.68458855, + "learning_rate": 1.0298117010315853e-06, + "loss": 0.70544004, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40429688, + "step": 10998, + "time_per_iteration": 2.3775830268859863 + }, + { + "auxiliary_loss_clip": 0.01007277, + "auxiliary_loss_mlp": 0.01004032, + "balance_loss_clip": 1.00310242, + "balance_loss_mlp": 1.00081396, + "epoch": 0.6612956560949947, + "flos": 61450660529280.0, + "grad_norm": 0.650009147856226, + "language_loss": 0.51170826, + "learning_rate": 1.0294813271037569e-06, + "loss": 0.53182137, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06445312, + "step": 10999, + "time_per_iteration": 3.0343198776245117 + }, + { + "auxiliary_loss_clip": 0.01060582, + "auxiliary_loss_mlp": 0.01024945, + "balance_loss_clip": 1.01272583, + "balance_loss_mlp": 1.01957536, + "epoch": 0.6613557793476627, + "flos": 21615565662720.0, + "grad_norm": 2.3865161892761866, + "language_loss": 0.70668572, + "learning_rate": 1.0291509878112416e-06, + "loss": 0.72754103, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41015625, + "step": 11000, + "time_per_iteration": 2.402019739151001 + }, + { + "auxiliary_loss_clip": 0.01054989, + "auxiliary_loss_mlp": 0.01025485, + "balance_loss_clip": 1.01455939, + "balance_loss_mlp": 1.01796103, + "epoch": 0.6614159026003307, + "flos": 34018740115200.0, + "grad_norm": 1.887544857297183, + "language_loss": 0.64866477, + "learning_rate": 1.0288206831658314e-06, + "loss": 0.66946948, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 11001, + "time_per_iteration": 2.516291618347168 + }, + { + "auxiliary_loss_clip": 0.01057578, + "auxiliary_loss_mlp": 0.01020782, + "balance_loss_clip": 1.00933218, + "balance_loss_mlp": 1.01893973, + "epoch": 0.6614760258529987, + "flos": 24096144695040.0, + "grad_norm": 1.971714145348519, + "language_loss": 0.7322135, + "learning_rate": 1.0284904131793127e-06, + "loss": 0.7529971, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 11002, + "time_per_iteration": 2.4269535541534424 + }, + { + "auxiliary_loss_clip": 0.01056683, + "auxiliary_loss_mlp": 0.01025152, + "balance_loss_clip": 1.01360655, + "balance_loss_mlp": 1.01836848, + "epoch": 0.6615361491056666, + "flos": 14902884910080.0, + "grad_norm": 2.004800543545157, + "language_loss": 0.78931707, + "learning_rate": 1.0281601778634722e-06, + "loss": 0.81013542, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3828125, + "step": 11003, + "time_per_iteration": 2.3838484287261963 + }, + { + "auxiliary_loss_clip": 0.01059936, + "auxiliary_loss_mlp": 0.01027636, + "balance_loss_clip": 1.01389098, + "balance_loss_mlp": 1.01944888, + "epoch": 0.6615962723583346, + "flos": 15303967142400.0, + "grad_norm": 1.9535972536575064, + "language_loss": 0.68584257, + "learning_rate": 1.0278299772300943e-06, + "loss": 0.70671827, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.40429688, + "step": 11004, + "time_per_iteration": 2.4028642177581787 + }, + { + "auxiliary_loss_clip": 0.0106096, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.01602077, + "balance_loss_mlp": 1.0195471, + "epoch": 0.6616563956110025, + "flos": 18731635159680.0, + "grad_norm": 2.2290251770534844, + "language_loss": 0.69660056, + "learning_rate": 1.0274998112909642e-06, + "loss": 0.71750307, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 11005, + "time_per_iteration": 2.410698652267456 + }, + { + "auxiliary_loss_clip": 0.01057326, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.01704097, + "balance_loss_mlp": 1.01968002, + "epoch": 0.6617165188636706, + "flos": 24494015082240.0, + "grad_norm": 1.6517813074331775, + "language_loss": 0.78030294, + "learning_rate": 1.0271696800578646e-06, + "loss": 0.80116403, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.37695312, + "step": 11006, + "time_per_iteration": 3.9357218742370605 + }, + { + "auxiliary_loss_clip": 0.01058948, + "auxiliary_loss_mlp": 0.01025814, + "balance_loss_clip": 1.01314235, + "balance_loss_mlp": 1.01987946, + "epoch": 0.6617766421163385, + "flos": 22378662449280.0, + "grad_norm": 1.5090601312738154, + "language_loss": 0.76090348, + "learning_rate": 1.0268395835425767e-06, + "loss": 0.78175116, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 11007, + "time_per_iteration": 2.430600166320801 + }, + { + "auxiliary_loss_clip": 0.01057355, + "auxiliary_loss_mlp": 0.0102208, + "balance_loss_clip": 1.01079082, + "balance_loss_mlp": 1.0195148, + "epoch": 0.6618367653690065, + "flos": 20849361765120.0, + "grad_norm": 1.8012899484677165, + "language_loss": 0.78301352, + "learning_rate": 1.0265095217568806e-06, + "loss": 0.80380785, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 11008, + "time_per_iteration": 2.4036383628845215 + }, + { + "auxiliary_loss_clip": 0.01060523, + "auxiliary_loss_mlp": 0.01024861, + "balance_loss_clip": 1.01165271, + "balance_loss_mlp": 1.01981831, + "epoch": 0.6618968886216744, + "flos": 17711368231680.0, + "grad_norm": 1.7445885584386343, + "language_loss": 0.81745297, + "learning_rate": 1.0261794947125556e-06, + "loss": 0.83830684, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40625, + "step": 11009, + "time_per_iteration": 2.3677022457122803 + }, + { + "auxiliary_loss_clip": 0.01007299, + "auxiliary_loss_mlp": 0.01000963, + "balance_loss_clip": 0.99999714, + "balance_loss_mlp": 1.00092554, + "epoch": 0.6619570118743424, + "flos": 67032155364480.0, + "grad_norm": 0.9831397037770547, + "language_loss": 0.67355716, + "learning_rate": 1.02584950242138e-06, + "loss": 0.69363987, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.06347656, + "step": 11010, + "time_per_iteration": 3.0101115703582764 + }, + { + "auxiliary_loss_clip": 0.01057715, + "auxiliary_loss_mlp": 0.01026836, + "balance_loss_clip": 1.01430106, + "balance_loss_mlp": 1.01841879, + "epoch": 0.6620171351270103, + "flos": 18659923493760.0, + "grad_norm": 1.4833165302811906, + "language_loss": 0.71192843, + "learning_rate": 1.0255195448951287e-06, + "loss": 0.7327739, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 11011, + "time_per_iteration": 2.388044595718384 + }, + { + "auxiliary_loss_clip": 0.01058916, + "auxiliary_loss_mlp": 0.01028812, + "balance_loss_clip": 1.01702213, + "balance_loss_mlp": 1.02045321, + "epoch": 0.6620772583796783, + "flos": 24169357549440.0, + "grad_norm": 1.6412617465812267, + "language_loss": 0.72775519, + "learning_rate": 1.0251896221455787e-06, + "loss": 0.74863243, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38476562, + "step": 11012, + "time_per_iteration": 2.4349045753479004 + }, + { + "auxiliary_loss_clip": 0.01054584, + "auxiliary_loss_mlp": 0.0102203, + "balance_loss_clip": 1.0108428, + "balance_loss_mlp": 1.01886368, + "epoch": 0.6621373816323463, + "flos": 23622408190080.0, + "grad_norm": 1.6723159538923709, + "language_loss": 0.79281318, + "learning_rate": 1.0248597341845039e-06, + "loss": 0.81357932, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.35742188, + "step": 11013, + "time_per_iteration": 3.850212335586548 + }, + { + "auxiliary_loss_clip": 0.01058505, + "auxiliary_loss_mlp": 0.01022177, + "balance_loss_clip": 1.01082218, + "balance_loss_mlp": 1.01965117, + "epoch": 0.6621975048850143, + "flos": 18441227537280.0, + "grad_norm": 2.182041163205455, + "language_loss": 0.70093727, + "learning_rate": 1.0245298810236764e-06, + "loss": 0.72174406, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38867188, + "step": 11014, + "time_per_iteration": 2.4155139923095703 + }, + { + "auxiliary_loss_clip": 0.01055829, + "auxiliary_loss_mlp": 0.0102392, + "balance_loss_clip": 1.01339984, + "balance_loss_mlp": 1.01924241, + "epoch": 0.6622576281376823, + "flos": 14063014310400.0, + "grad_norm": 1.9279714347846344, + "language_loss": 0.77026546, + "learning_rate": 1.0242000626748679e-06, + "loss": 0.79106295, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.3671875, + "step": 11015, + "time_per_iteration": 2.379804849624634 + }, + { + "auxiliary_loss_clip": 0.01056085, + "auxiliary_loss_mlp": 0.01021618, + "balance_loss_clip": 1.00974464, + "balance_loss_mlp": 1.01877582, + "epoch": 0.6623177513903502, + "flos": 17018028074880.0, + "grad_norm": 1.996175344967133, + "language_loss": 0.82111436, + "learning_rate": 1.0238702791498506e-06, + "loss": 0.84189141, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.37304688, + "step": 11016, + "time_per_iteration": 2.400703191757202 + }, + { + "auxiliary_loss_clip": 0.01057408, + "auxiliary_loss_mlp": 0.01024362, + "balance_loss_clip": 1.01272678, + "balance_loss_mlp": 1.01833963, + "epoch": 0.6623778746430182, + "flos": 17270170980480.0, + "grad_norm": 2.0113152901058746, + "language_loss": 0.69823325, + "learning_rate": 1.0235405304603904e-06, + "loss": 0.719051, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.390625, + "step": 11017, + "time_per_iteration": 2.3585972785949707 + }, + { + "auxiliary_loss_clip": 0.01058093, + "auxiliary_loss_mlp": 0.01028424, + "balance_loss_clip": 1.01640165, + "balance_loss_mlp": 1.01913643, + "epoch": 0.6624379978956861, + "flos": 48791016028800.0, + "grad_norm": 1.7358011508070814, + "language_loss": 0.77215326, + "learning_rate": 1.023210816618258e-06, + "loss": 0.79301846, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 11018, + "time_per_iteration": 2.6414568424224854 + }, + { + "auxiliary_loss_clip": 0.01057919, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.01674485, + "balance_loss_mlp": 1.01888919, + "epoch": 0.6624981211483542, + "flos": 18951448279680.0, + "grad_norm": 2.067026279700051, + "language_loss": 0.83669829, + "learning_rate": 1.0228811376352187e-06, + "loss": 0.85757059, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 11019, + "time_per_iteration": 2.424246311187744 + }, + { + "auxiliary_loss_clip": 0.01056594, + "auxiliary_loss_mlp": 0.01022507, + "balance_loss_clip": 1.01114607, + "balance_loss_mlp": 1.0195626, + "epoch": 0.6625582444010221, + "flos": 23255506045440.0, + "grad_norm": 1.7445590708474232, + "language_loss": 0.72005463, + "learning_rate": 1.022551493523038e-06, + "loss": 0.74084562, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37109375, + "step": 11020, + "time_per_iteration": 2.4296133518218994 + }, + { + "auxiliary_loss_clip": 0.01060112, + "auxiliary_loss_mlp": 0.01023324, + "balance_loss_clip": 1.00977635, + "balance_loss_mlp": 1.01833737, + "epoch": 0.6626183676536901, + "flos": 21393832417920.0, + "grad_norm": 1.7357368231950026, + "language_loss": 0.72529566, + "learning_rate": 1.0222218842934799e-06, + "loss": 0.74612999, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41796875, + "step": 11021, + "time_per_iteration": 2.3923070430755615 + }, + { + "auxiliary_loss_clip": 0.01059138, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.01446295, + "balance_loss_mlp": 1.01993036, + "epoch": 0.662678490906358, + "flos": 14570511966720.0, + "grad_norm": 2.1819114965242847, + "language_loss": 0.83548862, + "learning_rate": 1.0218923099583082e-06, + "loss": 0.85634464, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39257812, + "step": 11022, + "time_per_iteration": 2.404301404953003 + }, + { + "auxiliary_loss_clip": 0.01062716, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.01064587, + "balance_loss_mlp": 1.01968598, + "epoch": 0.662738614159026, + "flos": 15991581836160.0, + "grad_norm": 2.969917077530105, + "language_loss": 0.72588444, + "learning_rate": 1.0215627705292844e-06, + "loss": 0.74675852, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.4296875, + "step": 11023, + "time_per_iteration": 2.3857150077819824 + }, + { + "auxiliary_loss_clip": 0.01056861, + "auxiliary_loss_mlp": 0.01023806, + "balance_loss_clip": 1.0122304, + "balance_loss_mlp": 1.01813614, + "epoch": 0.6627987374116939, + "flos": 19535335724160.0, + "grad_norm": 1.6687667116212686, + "language_loss": 0.7465775, + "learning_rate": 1.021233266018167e-06, + "loss": 0.76738417, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 11024, + "time_per_iteration": 2.4272422790527344 + }, + { + "auxiliary_loss_clip": 0.01058659, + "auxiliary_loss_mlp": 0.01025853, + "balance_loss_clip": 1.0140816, + "balance_loss_mlp": 1.01941621, + "epoch": 0.662858860664362, + "flos": 15702012086400.0, + "grad_norm": 2.069437942418118, + "language_loss": 0.69589877, + "learning_rate": 1.0209037964367177e-06, + "loss": 0.71674395, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.390625, + "step": 11025, + "time_per_iteration": 2.418407678604126 + }, + { + "auxiliary_loss_clip": 0.0105909, + "auxiliary_loss_mlp": 0.01027075, + "balance_loss_clip": 1.01500511, + "balance_loss_mlp": 1.01971686, + "epoch": 0.6629189839170299, + "flos": 20153333433600.0, + "grad_norm": 1.635840744208188, + "language_loss": 0.79106772, + "learning_rate": 1.0205743617966932e-06, + "loss": 0.8119294, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 11026, + "time_per_iteration": 2.437089443206787 + }, + { + "auxiliary_loss_clip": 0.01057637, + "auxiliary_loss_mlp": 0.01022184, + "balance_loss_clip": 1.01088834, + "balance_loss_mlp": 1.01999533, + "epoch": 0.6629791071696979, + "flos": 20914579918080.0, + "grad_norm": 1.6239033650983654, + "language_loss": 0.75949097, + "learning_rate": 1.0202449621098505e-06, + "loss": 0.78028923, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.375, + "step": 11027, + "time_per_iteration": 2.4286820888519287 + }, + { + "auxiliary_loss_clip": 0.0105888, + "auxiliary_loss_mlp": 0.010242, + "balance_loss_clip": 1.01177275, + "balance_loss_mlp": 1.01896834, + "epoch": 0.6630392304223659, + "flos": 20845940451840.0, + "grad_norm": 1.9635197573991352, + "language_loss": 0.69967705, + "learning_rate": 1.0199155973879442e-06, + "loss": 0.7205078, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 11028, + "time_per_iteration": 2.4495835304260254 + }, + { + "auxiliary_loss_clip": 0.0105726, + "auxiliary_loss_mlp": 0.01023481, + "balance_loss_clip": 1.01266253, + "balance_loss_mlp": 1.01863122, + "epoch": 0.6630993536750338, + "flos": 20994775044480.0, + "grad_norm": 2.4919067799084638, + "language_loss": 0.79645002, + "learning_rate": 1.0195862676427297e-06, + "loss": 0.8172574, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.38671875, + "step": 11029, + "time_per_iteration": 2.391010046005249 + }, + { + "auxiliary_loss_clip": 0.01060126, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01427901, + "balance_loss_mlp": 1.0192163, + "epoch": 0.6631594769277018, + "flos": 18258073211520.0, + "grad_norm": 2.1377643821167447, + "language_loss": 0.76186156, + "learning_rate": 1.0192569728859593e-06, + "loss": 0.78273606, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40820312, + "step": 11030, + "time_per_iteration": 2.3787331581115723 + }, + { + "auxiliary_loss_clip": 0.01061473, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.01593494, + "balance_loss_mlp": 1.01999807, + "epoch": 0.6632196001803697, + "flos": 17819564135040.0, + "grad_norm": 5.608188585904313, + "language_loss": 0.75587165, + "learning_rate": 1.018927713129385e-06, + "loss": 0.77677417, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 11031, + "time_per_iteration": 2.3773133754730225 + }, + { + "auxiliary_loss_clip": 0.01060514, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.01832676, + "balance_loss_mlp": 1.0203681, + "epoch": 0.6632797234330378, + "flos": 12669561192960.0, + "grad_norm": 2.4915111698107455, + "language_loss": 0.75187856, + "learning_rate": 1.0185984883847561e-06, + "loss": 0.77279401, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 11032, + "time_per_iteration": 2.349684953689575 + }, + { + "auxiliary_loss_clip": 0.01057373, + "auxiliary_loss_mlp": 0.01025315, + "balance_loss_clip": 1.01359105, + "balance_loss_mlp": 1.01857162, + "epoch": 0.6633398466857057, + "flos": 23583654714240.0, + "grad_norm": 1.6585388325621342, + "language_loss": 0.80805588, + "learning_rate": 1.018269298663824e-06, + "loss": 0.82888269, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 11033, + "time_per_iteration": 2.451970338821411 + }, + { + "auxiliary_loss_clip": 0.01058359, + "auxiliary_loss_mlp": 0.0102513, + "balance_loss_clip": 1.01238668, + "balance_loss_mlp": 1.01817584, + "epoch": 0.6633999699383737, + "flos": 20630630897280.0, + "grad_norm": 4.047362577632364, + "language_loss": 0.8040784, + "learning_rate": 1.017940143978334e-06, + "loss": 0.82491326, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 11034, + "time_per_iteration": 2.4219424724578857 + }, + { + "auxiliary_loss_clip": 0.01057099, + "auxiliary_loss_mlp": 0.010244, + "balance_loss_clip": 1.01276541, + "balance_loss_mlp": 1.01889157, + "epoch": 0.6634600931910416, + "flos": 21796066725120.0, + "grad_norm": 1.5783357242347107, + "language_loss": 0.75328761, + "learning_rate": 1.0176110243400348e-06, + "loss": 0.77410257, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 11035, + "time_per_iteration": 3.9137275218963623 + }, + { + "auxiliary_loss_clip": 0.01058326, + "auxiliary_loss_mlp": 0.01021541, + "balance_loss_clip": 1.00955486, + "balance_loss_mlp": 1.01804137, + "epoch": 0.6635202164437096, + "flos": 18731914450560.0, + "grad_norm": 1.7327132362728197, + "language_loss": 0.82643163, + "learning_rate": 1.0172819397606714e-06, + "loss": 0.84723032, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 11036, + "time_per_iteration": 2.3827342987060547 + }, + { + "auxiliary_loss_clip": 0.01007549, + "auxiliary_loss_mlp": 0.0100287, + "balance_loss_clip": 1.00195777, + "balance_loss_mlp": 1.00104082, + "epoch": 0.6635803396963775, + "flos": 60219482878080.0, + "grad_norm": 0.7729945078028613, + "language_loss": 0.52315354, + "learning_rate": 1.0169528902519874e-06, + "loss": 0.54325771, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.06494141, + "step": 11037, + "time_per_iteration": 3.0279970169067383 + }, + { + "auxiliary_loss_clip": 0.01061183, + "auxiliary_loss_mlp": 0.01026376, + "balance_loss_clip": 1.01375735, + "balance_loss_mlp": 1.02089214, + "epoch": 0.6636404629490456, + "flos": 29165812865280.0, + "grad_norm": 1.6753042697394167, + "language_loss": 0.78756499, + "learning_rate": 1.016623875825726e-06, + "loss": 0.80844063, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 11038, + "time_per_iteration": 2.463728904724121 + }, + { + "auxiliary_loss_clip": 0.01059771, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.01789284, + "balance_loss_mlp": 1.0203166, + "epoch": 0.6637005862017135, + "flos": 38906231477760.0, + "grad_norm": 2.622791105088234, + "language_loss": 0.63630974, + "learning_rate": 1.0162948964936284e-06, + "loss": 0.65722024, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.39453125, + "step": 11039, + "time_per_iteration": 2.521787405014038 + }, + { + "auxiliary_loss_clip": 0.01059165, + "auxiliary_loss_mlp": 0.01023399, + "balance_loss_clip": 1.01114988, + "balance_loss_mlp": 1.01868796, + "epoch": 0.6637607094543815, + "flos": 22782258299520.0, + "grad_norm": 1.7089914012518355, + "language_loss": 0.64206314, + "learning_rate": 1.0159659522674374e-06, + "loss": 0.66288877, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 11040, + "time_per_iteration": 2.485130786895752 + }, + { + "auxiliary_loss_clip": 0.01056191, + "auxiliary_loss_mlp": 0.01026517, + "balance_loss_clip": 1.01514411, + "balance_loss_mlp": 1.01755667, + "epoch": 0.6638208327070495, + "flos": 18113113779840.0, + "grad_norm": 1.7682329019204057, + "language_loss": 0.71925557, + "learning_rate": 1.0156370431588882e-06, + "loss": 0.74008262, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 11041, + "time_per_iteration": 2.3964970111846924 + }, + { + "auxiliary_loss_clip": 0.01058027, + "auxiliary_loss_mlp": 0.01028874, + "balance_loss_clip": 1.01630354, + "balance_loss_mlp": 1.01936579, + "epoch": 0.6638809559597174, + "flos": 29423576499840.0, + "grad_norm": 3.4128284318197486, + "language_loss": 0.61542237, + "learning_rate": 1.015308169179722e-06, + "loss": 0.63629138, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.38671875, + "step": 11042, + "time_per_iteration": 2.5104501247406006 + }, + { + "auxiliary_loss_clip": 0.01061083, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.01302814, + "balance_loss_mlp": 1.01870942, + "epoch": 0.6639410792123854, + "flos": 28071495210240.0, + "grad_norm": 1.7998598520618503, + "language_loss": 0.68536836, + "learning_rate": 1.0149793303416738e-06, + "loss": 0.70624471, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42382812, + "step": 11043, + "time_per_iteration": 2.554661273956299 + }, + { + "auxiliary_loss_clip": 0.01055621, + "auxiliary_loss_mlp": 0.01021376, + "balance_loss_clip": 1.01071322, + "balance_loss_mlp": 1.01928771, + "epoch": 0.6640012024650533, + "flos": 25555199990400.0, + "grad_norm": 1.5652082281316388, + "language_loss": 0.8378787, + "learning_rate": 1.01465052665648e-06, + "loss": 0.85864866, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.36328125, + "step": 11044, + "time_per_iteration": 2.473503589630127 + }, + { + "auxiliary_loss_clip": 0.01058035, + "auxiliary_loss_mlp": 0.01022523, + "balance_loss_clip": 1.01021504, + "balance_loss_mlp": 1.01862693, + "epoch": 0.6640613257177214, + "flos": 14866051559040.0, + "grad_norm": 2.2686508589186336, + "language_loss": 0.7001974, + "learning_rate": 1.0143217581358733e-06, + "loss": 0.721003, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 11045, + "time_per_iteration": 2.4730963706970215 + }, + { + "auxiliary_loss_clip": 0.01060376, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.01413703, + "balance_loss_mlp": 1.02008295, + "epoch": 0.6641214489703893, + "flos": 23219999326080.0, + "grad_norm": 1.3929752205254622, + "language_loss": 0.64491713, + "learning_rate": 1.0139930247915894e-06, + "loss": 0.66577643, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40234375, + "step": 11046, + "time_per_iteration": 5.324596166610718 + }, + { + "auxiliary_loss_clip": 0.01055889, + "auxiliary_loss_mlp": 0.01020588, + "balance_loss_clip": 1.00883412, + "balance_loss_mlp": 1.01848483, + "epoch": 0.6641815722230573, + "flos": 37741109852160.0, + "grad_norm": 1.41062688316459, + "language_loss": 0.71412969, + "learning_rate": 1.0136643266353564e-06, + "loss": 0.73489451, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.375, + "step": 11047, + "time_per_iteration": 2.5774528980255127 + }, + { + "auxiliary_loss_clip": 0.01059263, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.01364958, + "balance_loss_mlp": 1.01973224, + "epoch": 0.6642416954757252, + "flos": 17930168922240.0, + "grad_norm": 1.708220447580711, + "language_loss": 0.774517, + "learning_rate": 1.013335663678907e-06, + "loss": 0.79536486, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.39648438, + "step": 11048, + "time_per_iteration": 2.372279405593872 + }, + { + "auxiliary_loss_clip": 0.0100723, + "auxiliary_loss_mlp": 0.01002612, + "balance_loss_clip": 1.0016582, + "balance_loss_mlp": 1.00092816, + "epoch": 0.6643018187283932, + "flos": 51992829394560.0, + "grad_norm": 0.7543744002824945, + "language_loss": 0.5503701, + "learning_rate": 1.0130070359339693e-06, + "loss": 0.57046849, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.06298828, + "step": 11049, + "time_per_iteration": 3.0738790035247803 + }, + { + "auxiliary_loss_clip": 0.01057292, + "auxiliary_loss_mlp": 0.01023759, + "balance_loss_clip": 1.01199937, + "balance_loss_mlp": 1.01861358, + "epoch": 0.6643619419810611, + "flos": 30225356939520.0, + "grad_norm": 1.8878016583390176, + "language_loss": 0.6411171, + "learning_rate": 1.012678443412273e-06, + "loss": 0.6619277, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 11050, + "time_per_iteration": 2.5053091049194336 + }, + { + "auxiliary_loss_clip": 0.01056272, + "auxiliary_loss_mlp": 0.01024514, + "balance_loss_clip": 1.01310599, + "balance_loss_mlp": 1.01929522, + "epoch": 0.6644220652337292, + "flos": 22965028600320.0, + "grad_norm": 1.8810207313500562, + "language_loss": 0.79117572, + "learning_rate": 1.0123498861255417e-06, + "loss": 0.81198364, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37109375, + "step": 11051, + "time_per_iteration": 2.417876958847046 + }, + { + "auxiliary_loss_clip": 0.01058393, + "auxiliary_loss_mlp": 0.01026643, + "balance_loss_clip": 1.01504374, + "balance_loss_mlp": 1.02038944, + "epoch": 0.6644821884863971, + "flos": 21141165841920.0, + "grad_norm": 1.7333151146818808, + "language_loss": 0.85752386, + "learning_rate": 1.0120213640855035e-06, + "loss": 0.87837422, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37890625, + "step": 11052, + "time_per_iteration": 3.8582603931427 + }, + { + "auxiliary_loss_clip": 0.01059697, + "auxiliary_loss_mlp": 0.01026446, + "balance_loss_clip": 1.01328528, + "balance_loss_mlp": 1.01898694, + "epoch": 0.6645423117390651, + "flos": 20191807618560.0, + "grad_norm": 2.2095419612916762, + "language_loss": 0.74847627, + "learning_rate": 1.011692877303882e-06, + "loss": 0.76933771, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40625, + "step": 11053, + "time_per_iteration": 2.3945770263671875 + }, + { + "auxiliary_loss_clip": 0.0105579, + "auxiliary_loss_mlp": 0.01021587, + "balance_loss_clip": 1.01005363, + "balance_loss_mlp": 1.01810741, + "epoch": 0.6646024349917331, + "flos": 24350836129920.0, + "grad_norm": 1.7134339859208543, + "language_loss": 0.79378736, + "learning_rate": 1.011364425792398e-06, + "loss": 0.81456113, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37695312, + "step": 11054, + "time_per_iteration": 2.499497652053833 + }, + { + "auxiliary_loss_clip": 0.01055981, + "auxiliary_loss_mlp": 0.01022626, + "balance_loss_clip": 1.01167607, + "balance_loss_mlp": 1.01810396, + "epoch": 0.664662558244401, + "flos": 18805720798080.0, + "grad_norm": 1.7025683462906338, + "language_loss": 0.74371171, + "learning_rate": 1.0110360095627755e-06, + "loss": 0.76449776, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37890625, + "step": 11055, + "time_per_iteration": 2.3765504360198975 + }, + { + "auxiliary_loss_clip": 0.01057293, + "auxiliary_loss_mlp": 0.010257, + "balance_loss_clip": 1.01335001, + "balance_loss_mlp": 1.01878142, + "epoch": 0.664722681497069, + "flos": 18951797393280.0, + "grad_norm": 2.397862716391816, + "language_loss": 0.73040795, + "learning_rate": 1.0107076286267329e-06, + "loss": 0.75123787, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38476562, + "step": 11056, + "time_per_iteration": 2.4335198402404785 + }, + { + "auxiliary_loss_clip": 0.01056001, + "auxiliary_loss_mlp": 0.01021113, + "balance_loss_clip": 1.00943637, + "balance_loss_mlp": 1.01761842, + "epoch": 0.6647828047497369, + "flos": 19570318773120.0, + "grad_norm": 1.9386031602274618, + "language_loss": 0.69710207, + "learning_rate": 1.0103792829959919e-06, + "loss": 0.71787322, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38476562, + "step": 11057, + "time_per_iteration": 2.440624475479126 + }, + { + "auxiliary_loss_clip": 0.01058312, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.01468825, + "balance_loss_mlp": 1.01944792, + "epoch": 0.664842928002405, + "flos": 23148322571520.0, + "grad_norm": 1.6264740022069413, + "language_loss": 0.80429053, + "learning_rate": 1.0100509726822671e-06, + "loss": 0.82513511, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38867188, + "step": 11058, + "time_per_iteration": 2.47061824798584 + }, + { + "auxiliary_loss_clip": 0.01007279, + "auxiliary_loss_mlp": 0.01002158, + "balance_loss_clip": 1.00125813, + "balance_loss_mlp": 1.00091505, + "epoch": 0.6649030512550729, + "flos": 65241844289280.0, + "grad_norm": 0.795611666474792, + "language_loss": 0.62600023, + "learning_rate": 1.0097226976972776e-06, + "loss": 0.6460945, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.06347656, + "step": 11059, + "time_per_iteration": 2.9552884101867676 + }, + { + "auxiliary_loss_clip": 0.01057159, + "auxiliary_loss_mlp": 0.01024908, + "balance_loss_clip": 1.01343441, + "balance_loss_mlp": 1.01944911, + "epoch": 0.6649631745077409, + "flos": 20193727743360.0, + "grad_norm": 1.341698721192763, + "language_loss": 0.78909332, + "learning_rate": 1.0093944580527374e-06, + "loss": 0.80991399, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37695312, + "step": 11060, + "time_per_iteration": 2.4762237071990967 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01027433, + "balance_loss_clip": 1.0149045, + "balance_loss_mlp": 1.01896632, + "epoch": 0.6650232977604088, + "flos": 17237596815360.0, + "grad_norm": 1.6411229580990072, + "language_loss": 0.78619295, + "learning_rate": 1.0090662537603612e-06, + "loss": 0.80704933, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 11061, + "time_per_iteration": 2.397801399230957 + }, + { + "auxiliary_loss_clip": 0.01056228, + "auxiliary_loss_mlp": 0.01019269, + "balance_loss_clip": 1.00854599, + "balance_loss_mlp": 1.01919723, + "epoch": 0.6650834210130768, + "flos": 10006316593920.0, + "grad_norm": 2.1888375613925852, + "language_loss": 0.56698132, + "learning_rate": 1.0087380848318603e-06, + "loss": 0.58773625, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37109375, + "step": 11062, + "time_per_iteration": 2.4087862968444824 + }, + { + "auxiliary_loss_clip": 0.01059444, + "auxiliary_loss_mlp": 0.01021858, + "balance_loss_clip": 1.00862002, + "balance_loss_mlp": 1.0181756, + "epoch": 0.6651435442657447, + "flos": 10451319183360.0, + "grad_norm": 1.7951204502142917, + "language_loss": 0.75753319, + "learning_rate": 1.0084099512789493e-06, + "loss": 0.77834624, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41210938, + "step": 11063, + "time_per_iteration": 2.4048495292663574 + }, + { + "auxiliary_loss_clip": 0.01057537, + "auxiliary_loss_mlp": 0.01025355, + "balance_loss_clip": 1.01401806, + "balance_loss_mlp": 1.01899004, + "epoch": 0.6652036675184128, + "flos": 22343190641280.0, + "grad_norm": 1.309117761391568, + "language_loss": 0.70651913, + "learning_rate": 1.0080818531133343e-06, + "loss": 0.72734809, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38476562, + "step": 11064, + "time_per_iteration": 2.5021142959594727 + }, + { + "auxiliary_loss_clip": 0.01007201, + "auxiliary_loss_mlp": 0.01003174, + "balance_loss_clip": 1.00224447, + "balance_loss_mlp": 1.0007441, + "epoch": 0.6652637907710807, + "flos": 52906995100800.0, + "grad_norm": 1.116516180712424, + "language_loss": 0.63045317, + "learning_rate": 1.0077537903467276e-06, + "loss": 0.65055686, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06445312, + "step": 11065, + "time_per_iteration": 3.000415325164795 + }, + { + "auxiliary_loss_clip": 0.0105598, + "auxiliary_loss_mlp": 0.01024324, + "balance_loss_clip": 1.01302934, + "balance_loss_mlp": 1.01802671, + "epoch": 0.6653239140237487, + "flos": 23103738898560.0, + "grad_norm": 1.8751890300624912, + "language_loss": 0.62919456, + "learning_rate": 1.007425762990835e-06, + "loss": 0.64999759, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 11066, + "time_per_iteration": 2.435241937637329 + }, + { + "auxiliary_loss_clip": 0.010575, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.0135237, + "balance_loss_mlp": 1.01844776, + "epoch": 0.6653840372764167, + "flos": 25958167436160.0, + "grad_norm": 1.4758666250657237, + "language_loss": 0.70146132, + "learning_rate": 1.0070977710573654e-06, + "loss": 0.72230232, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.390625, + "step": 11067, + "time_per_iteration": 2.4658799171447754 + }, + { + "auxiliary_loss_clip": 0.01007266, + "auxiliary_loss_mlp": 0.01001052, + "balance_loss_clip": 1.00012851, + "balance_loss_mlp": 1.0010345, + "epoch": 0.6654441605290846, + "flos": 66039051340800.0, + "grad_norm": 0.9194380451742943, + "language_loss": 0.72124165, + "learning_rate": 1.0067698145580213e-06, + "loss": 0.74132484, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06225586, + "step": 11068, + "time_per_iteration": 3.0611417293548584 + }, + { + "auxiliary_loss_clip": 0.01006977, + "auxiliary_loss_mlp": 0.01000238, + "balance_loss_clip": 0.99932623, + "balance_loss_mlp": 1.00063992, + "epoch": 0.6655042837817526, + "flos": 65192371159680.0, + "grad_norm": 0.706137232446137, + "language_loss": 0.57800388, + "learning_rate": 1.0064418935045066e-06, + "loss": 0.59807599, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.06347656, + "step": 11069, + "time_per_iteration": 3.2283718585968018 + }, + { + "auxiliary_loss_clip": 0.01007195, + "auxiliary_loss_mlp": 0.01001279, + "balance_loss_clip": 1.00035512, + "balance_loss_mlp": 1.00074542, + "epoch": 0.6655644070344205, + "flos": 69005411297280.0, + "grad_norm": 0.7845353934849226, + "language_loss": 0.63607502, + "learning_rate": 1.0061140079085268e-06, + "loss": 0.65615976, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06445312, + "step": 11070, + "time_per_iteration": 3.125208854675293 + }, + { + "auxiliary_loss_clip": 0.01055544, + "auxiliary_loss_mlp": 0.01023041, + "balance_loss_clip": 1.01209116, + "balance_loss_mlp": 1.01704168, + "epoch": 0.6656245302870886, + "flos": 36314209785600.0, + "grad_norm": 2.0966842943318684, + "language_loss": 0.5895564, + "learning_rate": 1.0057861577817801e-06, + "loss": 0.61034226, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38476562, + "step": 11071, + "time_per_iteration": 2.5643959045410156 + }, + { + "auxiliary_loss_clip": 0.01059119, + "auxiliary_loss_mlp": 0.01025657, + "balance_loss_clip": 1.01513088, + "balance_loss_mlp": 1.02020872, + "epoch": 0.6656846535397565, + "flos": 21793867309440.0, + "grad_norm": 1.5957159723382721, + "language_loss": 0.72796911, + "learning_rate": 1.0054583431359686e-06, + "loss": 0.74881685, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.38867188, + "step": 11072, + "time_per_iteration": 2.414482593536377 + }, + { + "auxiliary_loss_clip": 0.01058054, + "auxiliary_loss_mlp": 0.01024851, + "balance_loss_clip": 1.01301956, + "balance_loss_mlp": 1.01967835, + "epoch": 0.6657447767924245, + "flos": 37486104215040.0, + "grad_norm": 1.9369635852247333, + "language_loss": 0.70627749, + "learning_rate": 1.0051305639827898e-06, + "loss": 0.72710657, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38476562, + "step": 11073, + "time_per_iteration": 2.570723533630371 + }, + { + "auxiliary_loss_clip": 0.01060092, + "auxiliary_loss_mlp": 0.01025625, + "balance_loss_clip": 1.01345944, + "balance_loss_mlp": 1.0203414, + "epoch": 0.6658049000450924, + "flos": 16836724051200.0, + "grad_norm": 1.8747211080160762, + "language_loss": 0.56456089, + "learning_rate": 1.0048028203339435e-06, + "loss": 0.58541805, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3984375, + "step": 11074, + "time_per_iteration": 2.374842882156372 + }, + { + "auxiliary_loss_clip": 0.01057532, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.0160687, + "balance_loss_mlp": 1.01997852, + "epoch": 0.6658650232977604, + "flos": 33509566713600.0, + "grad_norm": 1.4605558882542755, + "language_loss": 0.72410405, + "learning_rate": 1.0044751122011233e-06, + "loss": 0.74495411, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37695312, + "step": 11075, + "time_per_iteration": 3.949430465698242 + }, + { + "auxiliary_loss_clip": 0.01055436, + "auxiliary_loss_mlp": 0.0102275, + "balance_loss_clip": 1.01204515, + "balance_loss_mlp": 1.01846635, + "epoch": 0.6659251465504283, + "flos": 37704800171520.0, + "grad_norm": 1.3740118461852557, + "language_loss": 0.65736485, + "learning_rate": 1.0041474395960263e-06, + "loss": 0.67814672, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36914062, + "step": 11076, + "time_per_iteration": 2.545232057571411 + }, + { + "auxiliary_loss_clip": 0.01057109, + "auxiliary_loss_mlp": 0.01024691, + "balance_loss_clip": 1.01243019, + "balance_loss_mlp": 1.01904774, + "epoch": 0.6659852698030964, + "flos": 24892444051200.0, + "grad_norm": 2.3843020916612843, + "language_loss": 0.74842024, + "learning_rate": 1.0038198025303452e-06, + "loss": 0.76923823, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38085938, + "step": 11077, + "time_per_iteration": 2.473160982131958 + }, + { + "auxiliary_loss_clip": 0.01056188, + "auxiliary_loss_mlp": 0.01023476, + "balance_loss_clip": 1.01201439, + "balance_loss_mlp": 1.01831067, + "epoch": 0.6660453930557643, + "flos": 24351674002560.0, + "grad_norm": 1.646876824097136, + "language_loss": 0.66880977, + "learning_rate": 1.0034922010157734e-06, + "loss": 0.68960643, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37890625, + "step": 11078, + "time_per_iteration": 2.4399359226226807 + }, + { + "auxiliary_loss_clip": 0.01058035, + "auxiliary_loss_mlp": 0.01023314, + "balance_loss_clip": 1.01172066, + "balance_loss_mlp": 1.01833463, + "epoch": 0.6661055163084323, + "flos": 10597046664960.0, + "grad_norm": 1.9679094385829197, + "language_loss": 0.7008484, + "learning_rate": 1.0031646350640005e-06, + "loss": 0.72166193, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39648438, + "step": 11079, + "time_per_iteration": 2.424578905105591 + }, + { + "auxiliary_loss_clip": 0.01054448, + "auxiliary_loss_mlp": 0.01023854, + "balance_loss_clip": 1.0131247, + "balance_loss_mlp": 1.01797318, + "epoch": 0.6661656395611003, + "flos": 24056448612480.0, + "grad_norm": 2.4629401797461967, + "language_loss": 0.80339742, + "learning_rate": 1.0028371046867191e-06, + "loss": 0.82418042, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36523438, + "step": 11080, + "time_per_iteration": 2.485008716583252 + }, + { + "auxiliary_loss_clip": 0.01053729, + "auxiliary_loss_mlp": 0.01022763, + "balance_loss_clip": 1.01248145, + "balance_loss_mlp": 1.01797712, + "epoch": 0.6662257628137682, + "flos": 23035169255040.0, + "grad_norm": 1.712730422164716, + "language_loss": 0.78232408, + "learning_rate": 1.002509609895615e-06, + "loss": 0.80308896, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.35742188, + "step": 11081, + "time_per_iteration": 2.428873062133789 + }, + { + "auxiliary_loss_clip": 0.01057879, + "auxiliary_loss_mlp": 0.01025657, + "balance_loss_clip": 1.01373625, + "balance_loss_mlp": 1.01894248, + "epoch": 0.6662858860664362, + "flos": 24753279905280.0, + "grad_norm": 1.672367854776746, + "language_loss": 0.79558396, + "learning_rate": 1.002182150702378e-06, + "loss": 0.81641924, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 11082, + "time_per_iteration": 2.4716217517852783 + }, + { + "auxiliary_loss_clip": 0.01057024, + "auxiliary_loss_mlp": 0.0102042, + "balance_loss_clip": 1.00958347, + "balance_loss_mlp": 1.01964343, + "epoch": 0.6663460093191041, + "flos": 20008094711040.0, + "grad_norm": 2.574177990413811, + "language_loss": 0.8383764, + "learning_rate": 1.001854727118693e-06, + "loss": 0.85915083, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.375, + "step": 11083, + "time_per_iteration": 2.460783004760742 + }, + { + "auxiliary_loss_clip": 0.01061179, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.01570952, + "balance_loss_mlp": 1.02063775, + "epoch": 0.6664061325717722, + "flos": 17820436919040.0, + "grad_norm": 2.175497457491615, + "language_loss": 0.85637379, + "learning_rate": 1.0015273391562456e-06, + "loss": 0.8772651, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40625, + "step": 11084, + "time_per_iteration": 2.4695167541503906 + }, + { + "auxiliary_loss_clip": 0.0105807, + "auxiliary_loss_mlp": 0.01023593, + "balance_loss_clip": 1.01125479, + "balance_loss_mlp": 1.01865113, + "epoch": 0.6664662558244401, + "flos": 18075931315200.0, + "grad_norm": 2.620930557470334, + "language_loss": 0.73549163, + "learning_rate": 1.0011999868267188e-06, + "loss": 0.75630832, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39257812, + "step": 11085, + "time_per_iteration": 3.896737813949585 + }, + { + "auxiliary_loss_clip": 0.01055541, + "auxiliary_loss_mlp": 0.01022254, + "balance_loss_clip": 1.00990403, + "balance_loss_mlp": 1.0174439, + "epoch": 0.6665263790771081, + "flos": 21573286139520.0, + "grad_norm": 2.0229600463669164, + "language_loss": 0.80351877, + "learning_rate": 1.0008726701417946e-06, + "loss": 0.82429671, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38085938, + "step": 11086, + "time_per_iteration": 2.421529769897461 + }, + { + "auxiliary_loss_clip": 0.01057275, + "auxiliary_loss_mlp": 0.0102386, + "balance_loss_clip": 1.01196337, + "balance_loss_mlp": 1.01876402, + "epoch": 0.666586502329776, + "flos": 24205492673280.0, + "grad_norm": 2.080412517735765, + "language_loss": 0.74966228, + "learning_rate": 1.0005453891131562e-06, + "loss": 0.77047366, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38476562, + "step": 11087, + "time_per_iteration": 2.4108424186706543 + }, + { + "auxiliary_loss_clip": 0.01056713, + "auxiliary_loss_mlp": 0.0102361, + "balance_loss_clip": 1.01181459, + "balance_loss_mlp": 1.01852655, + "epoch": 0.666646625582444, + "flos": 22199418195840.0, + "grad_norm": 1.5759697559382875, + "language_loss": 0.64956892, + "learning_rate": 1.0002181437524804e-06, + "loss": 0.67037213, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3828125, + "step": 11088, + "time_per_iteration": 2.4564099311828613 + }, + { + "auxiliary_loss_clip": 0.01057733, + "auxiliary_loss_mlp": 0.01021896, + "balance_loss_clip": 1.01063061, + "balance_loss_mlp": 1.01901126, + "epoch": 0.6667067488351119, + "flos": 18258945995520.0, + "grad_norm": 2.0014099570453365, + "language_loss": 0.63700318, + "learning_rate": 9.998909340714484e-07, + "loss": 0.65779954, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38671875, + "step": 11089, + "time_per_iteration": 2.381699800491333 + }, + { + "auxiliary_loss_clip": 0.0105661, + "auxiliary_loss_mlp": 0.01023499, + "balance_loss_clip": 1.0117507, + "balance_loss_mlp": 1.01838923, + "epoch": 0.66676687208778, + "flos": 17235641779200.0, + "grad_norm": 2.324024104041711, + "language_loss": 0.8097418, + "learning_rate": 9.995637600817359e-07, + "loss": 0.83054286, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3828125, + "step": 11090, + "time_per_iteration": 2.3810107707977295 + }, + { + "auxiliary_loss_clip": 0.0105764, + "auxiliary_loss_mlp": 0.01025927, + "balance_loss_clip": 1.01349366, + "balance_loss_mlp": 1.01834977, + "epoch": 0.6668269953404479, + "flos": 19751273683200.0, + "grad_norm": 2.109625537623564, + "language_loss": 0.77819496, + "learning_rate": 9.992366217950197e-07, + "loss": 0.79903066, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39257812, + "step": 11091, + "time_per_iteration": 2.3978869915008545 + }, + { + "auxiliary_loss_clip": 0.01055472, + "auxiliary_loss_mlp": 0.01023052, + "balance_loss_clip": 1.01201332, + "balance_loss_mlp": 1.01765227, + "epoch": 0.6668871185931159, + "flos": 20557383131520.0, + "grad_norm": 1.6672031856987257, + "language_loss": 0.792642, + "learning_rate": 9.989095192229734e-07, + "loss": 0.81342721, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37890625, + "step": 11092, + "time_per_iteration": 3.8872549533843994 + }, + { + "auxiliary_loss_clip": 0.01006637, + "auxiliary_loss_mlp": 0.01001593, + "balance_loss_clip": 1.00069916, + "balance_loss_mlp": 1.0003593, + "epoch": 0.6669472418457839, + "flos": 58085452667520.0, + "grad_norm": 0.8289146655563763, + "language_loss": 0.57758528, + "learning_rate": 9.985824523772718e-07, + "loss": 0.59766757, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.00891113, + "router_z_loss_mlp": 0.0625, + "step": 11093, + "time_per_iteration": 3.101414918899536 + }, + { + "auxiliary_loss_clip": 0.01056722, + "auxiliary_loss_mlp": 0.01022491, + "balance_loss_clip": 1.01192307, + "balance_loss_mlp": 1.01990747, + "epoch": 0.6670073650984518, + "flos": 26063989367040.0, + "grad_norm": 1.6346851588817362, + "language_loss": 0.76599598, + "learning_rate": 9.982554212695869e-07, + "loss": 0.78678817, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36914062, + "step": 11094, + "time_per_iteration": 2.445249557495117 + }, + { + "auxiliary_loss_clip": 0.01057461, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.01383746, + "balance_loss_mlp": 1.0187254, + "epoch": 0.6670674883511198, + "flos": 32415458526720.0, + "grad_norm": 1.78111219443242, + "language_loss": 0.7352773, + "learning_rate": 9.97928425911589e-07, + "loss": 0.75610811, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38671875, + "step": 11095, + "time_per_iteration": 2.501195192337036 + }, + { + "auxiliary_loss_clip": 0.01056847, + "auxiliary_loss_mlp": 0.01024213, + "balance_loss_clip": 1.01273358, + "balance_loss_mlp": 1.01952684, + "epoch": 0.6671276116037878, + "flos": 18036898548480.0, + "grad_norm": 2.359261509087308, + "language_loss": 0.73446411, + "learning_rate": 9.97601466314947e-07, + "loss": 0.75527471, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37304688, + "step": 11096, + "time_per_iteration": 2.389089345932007 + }, + { + "auxiliary_loss_clip": 0.01054432, + "auxiliary_loss_mlp": 0.01024335, + "balance_loss_clip": 1.01383913, + "balance_loss_mlp": 1.01848006, + "epoch": 0.6671877348564558, + "flos": 23765971167360.0, + "grad_norm": 1.8418585265502245, + "language_loss": 0.6474213, + "learning_rate": 9.97274542491332e-07, + "loss": 0.66820896, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.359375, + "step": 11097, + "time_per_iteration": 2.4778668880462646 + }, + { + "auxiliary_loss_clip": 0.01056991, + "auxiliary_loss_mlp": 0.01026162, + "balance_loss_clip": 1.01427042, + "balance_loss_mlp": 1.01791453, + "epoch": 0.6672478581091237, + "flos": 20917442649600.0, + "grad_norm": 2.442652393610128, + "language_loss": 0.75684178, + "learning_rate": 9.969476544524086e-07, + "loss": 0.77767336, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 11098, + "time_per_iteration": 2.4255211353302 + }, + { + "auxiliary_loss_clip": 0.0105752, + "auxiliary_loss_mlp": 0.01020538, + "balance_loss_clip": 1.00936258, + "balance_loss_mlp": 1.01886296, + "epoch": 0.6673079813617917, + "flos": 27854544821760.0, + "grad_norm": 1.5217030491359838, + "language_loss": 0.73948789, + "learning_rate": 9.96620802209842e-07, + "loss": 0.76026851, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.38671875, + "step": 11099, + "time_per_iteration": 2.4535090923309326 + }, + { + "auxiliary_loss_clip": 0.0105543, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.01604009, + "balance_loss_mlp": 1.01836491, + "epoch": 0.6673681046144596, + "flos": 21615775130880.0, + "grad_norm": 2.3585521404664362, + "language_loss": 0.78341782, + "learning_rate": 9.96293985775299e-07, + "loss": 0.80424619, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37109375, + "step": 11100, + "time_per_iteration": 2.4484057426452637 + }, + { + "auxiliary_loss_clip": 0.0105568, + "auxiliary_loss_mlp": 0.01024044, + "balance_loss_clip": 1.01269579, + "balance_loss_mlp": 1.01883841, + "epoch": 0.6674282278671276, + "flos": 20888743645440.0, + "grad_norm": 1.6865739033830702, + "language_loss": 0.59955895, + "learning_rate": 9.95967205160442e-07, + "loss": 0.6203562, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36914062, + "step": 11101, + "time_per_iteration": 2.4275617599487305 + }, + { + "auxiliary_loss_clip": 0.0105944, + "auxiliary_loss_mlp": 0.01024169, + "balance_loss_clip": 1.01158047, + "balance_loss_mlp": 1.0181179, + "epoch": 0.6674883511197955, + "flos": 23623036594560.0, + "grad_norm": 2.1062690571955303, + "language_loss": 0.71187437, + "learning_rate": 9.956404603769327e-07, + "loss": 0.73271048, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.4140625, + "step": 11102, + "time_per_iteration": 2.4160361289978027 + }, + { + "auxiliary_loss_clip": 0.01056805, + "auxiliary_loss_mlp": 0.01022767, + "balance_loss_clip": 1.01120424, + "balance_loss_mlp": 1.01847637, + "epoch": 0.6675484743724636, + "flos": 19608653312640.0, + "grad_norm": 1.888688366820038, + "language_loss": 0.84585142, + "learning_rate": 9.953137514364308e-07, + "loss": 0.86664718, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3828125, + "step": 11103, + "time_per_iteration": 2.4329113960266113 + }, + { + "auxiliary_loss_clip": 0.01056736, + "auxiliary_loss_mlp": 0.01019625, + "balance_loss_clip": 1.00893164, + "balance_loss_mlp": 1.01808429, + "epoch": 0.6676085976251315, + "flos": 14318578529280.0, + "grad_norm": 1.7794766065061352, + "language_loss": 0.77929556, + "learning_rate": 9.949870783505985e-07, + "loss": 0.80005908, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.38671875, + "step": 11104, + "time_per_iteration": 2.4238386154174805 + }, + { + "auxiliary_loss_clip": 0.01058539, + "auxiliary_loss_mlp": 0.01023926, + "balance_loss_clip": 1.01254725, + "balance_loss_mlp": 1.01989412, + "epoch": 0.6676687208777995, + "flos": 38103159317760.0, + "grad_norm": 1.7790665718318046, + "language_loss": 0.58008224, + "learning_rate": 9.946604411310906e-07, + "loss": 0.60090691, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 11105, + "time_per_iteration": 2.593398332595825 + }, + { + "auxiliary_loss_clip": 0.01059344, + "auxiliary_loss_mlp": 0.01025496, + "balance_loss_clip": 1.01293135, + "balance_loss_mlp": 1.01895189, + "epoch": 0.6677288441304675, + "flos": 23980617406080.0, + "grad_norm": 4.995834047283355, + "language_loss": 0.73646843, + "learning_rate": 9.943338397895662e-07, + "loss": 0.75731683, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 11106, + "time_per_iteration": 2.489358901977539 + }, + { + "auxiliary_loss_clip": 0.01055604, + "auxiliary_loss_mlp": 0.01021823, + "balance_loss_clip": 1.01086211, + "balance_loss_mlp": 1.018049, + "epoch": 0.6677889673831354, + "flos": 24169532106240.0, + "grad_norm": 1.6984922416228212, + "language_loss": 0.7673679, + "learning_rate": 9.940072743376801e-07, + "loss": 0.78814214, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.375, + "step": 11107, + "time_per_iteration": 2.4386682510375977 + }, + { + "auxiliary_loss_clip": 0.01055082, + "auxiliary_loss_mlp": 0.01022047, + "balance_loss_clip": 1.01161075, + "balance_loss_mlp": 1.01882148, + "epoch": 0.6678490906358034, + "flos": 22308556705920.0, + "grad_norm": 2.3999023397210313, + "language_loss": 0.89994848, + "learning_rate": 9.936807447870869e-07, + "loss": 0.92071974, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.36328125, + "step": 11108, + "time_per_iteration": 2.4389472007751465 + }, + { + "auxiliary_loss_clip": 0.0105423, + "auxiliary_loss_mlp": 0.01020554, + "balance_loss_clip": 1.01017082, + "balance_loss_mlp": 1.01753092, + "epoch": 0.6679092138884714, + "flos": 36897399002880.0, + "grad_norm": 2.127276202340324, + "language_loss": 0.82815689, + "learning_rate": 9.933542511494387e-07, + "loss": 0.84890473, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.3671875, + "step": 11109, + "time_per_iteration": 2.5145137310028076 + }, + { + "auxiliary_loss_clip": 0.01055864, + "auxiliary_loss_mlp": 0.01024971, + "balance_loss_clip": 1.01305604, + "balance_loss_mlp": 1.01763952, + "epoch": 0.6679693371411394, + "flos": 18149318726400.0, + "grad_norm": 1.8053607288096516, + "language_loss": 0.69402951, + "learning_rate": 9.930277934363884e-07, + "loss": 0.71483791, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 11110, + "time_per_iteration": 2.3906304836273193 + }, + { + "auxiliary_loss_clip": 0.01055642, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.01369143, + "balance_loss_mlp": 1.01675737, + "epoch": 0.6680294603938073, + "flos": 27196955763840.0, + "grad_norm": 1.5012183755901523, + "language_loss": 0.73423839, + "learning_rate": 9.927013716595859e-07, + "loss": 0.75506186, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.38867188, + "step": 11111, + "time_per_iteration": 2.4273202419281006 + }, + { + "auxiliary_loss_clip": 0.01054107, + "auxiliary_loss_mlp": 0.01023119, + "balance_loss_clip": 1.01239598, + "balance_loss_mlp": 1.01775718, + "epoch": 0.6680895836464753, + "flos": 21724250325120.0, + "grad_norm": 1.8682562816311399, + "language_loss": 0.65295249, + "learning_rate": 9.923749858306806e-07, + "loss": 0.67372477, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36328125, + "step": 11112, + "time_per_iteration": 2.4216606616973877 + }, + { + "auxiliary_loss_clip": 0.01055707, + "auxiliary_loss_mlp": 0.01023399, + "balance_loss_clip": 1.01233089, + "balance_loss_mlp": 1.01820326, + "epoch": 0.6681497068991432, + "flos": 19645451752320.0, + "grad_norm": 1.9590111721571772, + "language_loss": 0.63075584, + "learning_rate": 9.920486359613198e-07, + "loss": 0.65154684, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 11113, + "time_per_iteration": 2.364003896713257 + }, + { + "auxiliary_loss_clip": 0.01056806, + "auxiliary_loss_mlp": 0.01021816, + "balance_loss_clip": 1.01139724, + "balance_loss_mlp": 1.01953447, + "epoch": 0.6682098301518112, + "flos": 17418237523200.0, + "grad_norm": 1.789083695681391, + "language_loss": 0.66112077, + "learning_rate": 9.917223220631506e-07, + "loss": 0.681907, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.37304688, + "step": 11114, + "time_per_iteration": 2.3834335803985596 + }, + { + "auxiliary_loss_clip": 0.01056154, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.01926494, + "balance_loss_mlp": 1.01773643, + "epoch": 0.6682699534044791, + "flos": 22597986810240.0, + "grad_norm": 1.8110588925614255, + "language_loss": 0.78444278, + "learning_rate": 9.91396044147818e-07, + "loss": 0.8053093, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38476562, + "step": 11115, + "time_per_iteration": 3.9326984882354736 + }, + { + "auxiliary_loss_clip": 0.0105516, + "auxiliary_loss_mlp": 0.01020095, + "balance_loss_clip": 1.00838256, + "balance_loss_mlp": 1.01711512, + "epoch": 0.6683300766571472, + "flos": 24862523149440.0, + "grad_norm": 1.708543099553696, + "language_loss": 0.79920459, + "learning_rate": 9.910698022269655e-07, + "loss": 0.81995708, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37890625, + "step": 11116, + "time_per_iteration": 2.4313366413116455 + }, + { + "auxiliary_loss_clip": 0.01058364, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.01772952, + "balance_loss_mlp": 1.01968956, + "epoch": 0.6683901999098151, + "flos": 27125383743360.0, + "grad_norm": 1.7346012054037117, + "language_loss": 0.79082084, + "learning_rate": 9.907435963122372e-07, + "loss": 0.81170762, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.38671875, + "step": 11117, + "time_per_iteration": 2.4588441848754883 + }, + { + "auxiliary_loss_clip": 0.0105828, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.01300621, + "balance_loss_mlp": 1.01899028, + "epoch": 0.6684503231624831, + "flos": 20738023839360.0, + "grad_norm": 1.5972527556004288, + "language_loss": 0.65247035, + "learning_rate": 9.904174264152738e-07, + "loss": 0.67331237, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39257812, + "step": 11118, + "time_per_iteration": 2.4316418170928955 + }, + { + "auxiliary_loss_clip": 0.01054805, + "auxiliary_loss_mlp": 0.01021793, + "balance_loss_clip": 1.01172602, + "balance_loss_mlp": 1.01838338, + "epoch": 0.668510446415151, + "flos": 21761118587520.0, + "grad_norm": 1.4980784352205287, + "language_loss": 0.68782771, + "learning_rate": 9.900912925477157e-07, + "loss": 0.70859373, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.36328125, + "step": 11119, + "time_per_iteration": 2.5307564735412598 + }, + { + "auxiliary_loss_clip": 0.01056752, + "auxiliary_loss_mlp": 0.01024264, + "balance_loss_clip": 1.01310039, + "balance_loss_mlp": 1.01909256, + "epoch": 0.668570569667819, + "flos": 30189920042880.0, + "grad_norm": 1.940014767368923, + "language_loss": 0.674101, + "learning_rate": 9.897651947212007e-07, + "loss": 0.69491118, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 11120, + "time_per_iteration": 2.5024354457855225 + }, + { + "auxiliary_loss_clip": 0.01055991, + "auxiliary_loss_mlp": 0.01021894, + "balance_loss_clip": 1.01045609, + "balance_loss_mlp": 1.01873124, + "epoch": 0.668630692920487, + "flos": 24169497194880.0, + "grad_norm": 1.9485931084282258, + "language_loss": 0.71900159, + "learning_rate": 9.894391329473685e-07, + "loss": 0.73978049, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37304688, + "step": 11121, + "time_per_iteration": 2.4329776763916016 + }, + { + "auxiliary_loss_clip": 0.01059276, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.0161463, + "balance_loss_mlp": 1.02028322, + "epoch": 0.668690816173155, + "flos": 17456188037760.0, + "grad_norm": 3.654211844406422, + "language_loss": 0.69663095, + "learning_rate": 9.891131072378532e-07, + "loss": 0.71750587, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 11122, + "time_per_iteration": 2.4106481075286865 + }, + { + "auxiliary_loss_clip": 0.01056518, + "auxiliary_loss_mlp": 0.01024718, + "balance_loss_clip": 1.01239741, + "balance_loss_mlp": 1.01887584, + "epoch": 0.668750939425823, + "flos": 25004061267840.0, + "grad_norm": 1.9137485267831342, + "language_loss": 0.71320742, + "learning_rate": 9.88787117604291e-07, + "loss": 0.73401976, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.37695312, + "step": 11123, + "time_per_iteration": 2.4454691410064697 + }, + { + "auxiliary_loss_clip": 0.01059721, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.01756859, + "balance_loss_mlp": 1.0197289, + "epoch": 0.6688110626784909, + "flos": 24095655936000.0, + "grad_norm": 2.025543993676266, + "language_loss": 0.79248393, + "learning_rate": 9.884611640583158e-07, + "loss": 0.81337833, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 11124, + "time_per_iteration": 2.469949960708618 + }, + { + "auxiliary_loss_clip": 0.01057692, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01570511, + "balance_loss_mlp": 1.01938105, + "epoch": 0.6688711859311589, + "flos": 21758535146880.0, + "grad_norm": 1.6952895411181441, + "language_loss": 0.74399364, + "learning_rate": 9.881352466115596e-07, + "loss": 0.76484334, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3828125, + "step": 11125, + "time_per_iteration": 3.849921226501465 + }, + { + "auxiliary_loss_clip": 0.01061199, + "auxiliary_loss_mlp": 0.01026389, + "balance_loss_clip": 1.01509428, + "balance_loss_mlp": 1.02172673, + "epoch": 0.6689313091838268, + "flos": 22928544362880.0, + "grad_norm": 2.287699703476547, + "language_loss": 0.6686942, + "learning_rate": 9.878093652756528e-07, + "loss": 0.68957013, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.39453125, + "step": 11126, + "time_per_iteration": 2.4119958877563477 + }, + { + "auxiliary_loss_clip": 0.01056013, + "auxiliary_loss_mlp": 0.01021779, + "balance_loss_clip": 1.01019216, + "balance_loss_mlp": 1.01740122, + "epoch": 0.6689914324364948, + "flos": 20885112864000.0, + "grad_norm": 1.5938834383979041, + "language_loss": 0.70902503, + "learning_rate": 9.874835200622266e-07, + "loss": 0.72980297, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 11127, + "time_per_iteration": 2.4424333572387695 + }, + { + "auxiliary_loss_clip": 0.01059127, + "auxiliary_loss_mlp": 0.0102592, + "balance_loss_clip": 1.0128665, + "balance_loss_mlp": 1.0180856, + "epoch": 0.6690515556891627, + "flos": 22747100693760.0, + "grad_norm": 1.924695833611523, + "language_loss": 0.80226696, + "learning_rate": 9.871577109829101e-07, + "loss": 0.82311743, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 11128, + "time_per_iteration": 2.420313596725464 + }, + { + "auxiliary_loss_clip": 0.01057654, + "auxiliary_loss_mlp": 0.01023946, + "balance_loss_clip": 1.01255524, + "balance_loss_mlp": 1.01898885, + "epoch": 0.6691116789418308, + "flos": 23330324822400.0, + "grad_norm": 1.7335019709255788, + "language_loss": 0.80601954, + "learning_rate": 9.868319380493283e-07, + "loss": 0.82683551, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 11129, + "time_per_iteration": 2.4179728031158447 + }, + { + "auxiliary_loss_clip": 0.01054174, + "auxiliary_loss_mlp": 0.01021408, + "balance_loss_clip": 1.01102507, + "balance_loss_mlp": 1.01875877, + "epoch": 0.6691718021944987, + "flos": 32445798364800.0, + "grad_norm": 1.5161106091424141, + "language_loss": 0.70467359, + "learning_rate": 9.865062012731088e-07, + "loss": 0.72542942, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.35351562, + "step": 11130, + "time_per_iteration": 2.4743690490722656 + }, + { + "auxiliary_loss_clip": 0.01054839, + "auxiliary_loss_mlp": 0.01025219, + "balance_loss_clip": 1.01411486, + "balance_loss_mlp": 1.01850545, + "epoch": 0.6692319254471667, + "flos": 23730499359360.0, + "grad_norm": 1.9212033773091404, + "language_loss": 0.69921732, + "learning_rate": 9.86180500665876e-07, + "loss": 0.72001791, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36328125, + "step": 11131, + "time_per_iteration": 2.4292144775390625 + }, + { + "auxiliary_loss_clip": 0.01056877, + "auxiliary_loss_mlp": 0.01026625, + "balance_loss_clip": 1.01364279, + "balance_loss_mlp": 1.01843596, + "epoch": 0.6692920486998346, + "flos": 14427053723520.0, + "grad_norm": 2.2980230952659966, + "language_loss": 0.65447485, + "learning_rate": 9.858548362392534e-07, + "loss": 0.6753099, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.3828125, + "step": 11132, + "time_per_iteration": 3.8044779300689697 + }, + { + "auxiliary_loss_clip": 0.01055945, + "auxiliary_loss_mlp": 0.01025852, + "balance_loss_clip": 1.01341295, + "balance_loss_mlp": 1.01671803, + "epoch": 0.6693521719525026, + "flos": 21506392241280.0, + "grad_norm": 1.9036899582056344, + "language_loss": 0.74691427, + "learning_rate": 9.855292080048622e-07, + "loss": 0.7677322, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39257812, + "step": 11133, + "time_per_iteration": 2.4648427963256836 + }, + { + "auxiliary_loss_clip": 0.01059834, + "auxiliary_loss_mlp": 0.01027652, + "balance_loss_clip": 1.01394892, + "balance_loss_mlp": 1.01942539, + "epoch": 0.6694122952051706, + "flos": 25405876638720.0, + "grad_norm": 1.9928796645014237, + "language_loss": 0.66618538, + "learning_rate": 9.852036159743255e-07, + "loss": 0.68706024, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.40234375, + "step": 11134, + "time_per_iteration": 2.437077522277832 + }, + { + "auxiliary_loss_clip": 0.01062318, + "auxiliary_loss_mlp": 0.01024306, + "balance_loss_clip": 1.0109781, + "balance_loss_mlp": 1.01982379, + "epoch": 0.6694724184578386, + "flos": 25660672807680.0, + "grad_norm": 1.9353126292226035, + "language_loss": 0.7703383, + "learning_rate": 9.8487806015926e-07, + "loss": 0.79120457, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42578125, + "step": 11135, + "time_per_iteration": 2.433809757232666 + }, + { + "auxiliary_loss_clip": 0.01055612, + "auxiliary_loss_mlp": 0.01022068, + "balance_loss_clip": 1.01088572, + "balance_loss_mlp": 1.01886797, + "epoch": 0.6695325417105066, + "flos": 17708435677440.0, + "grad_norm": 1.6576373336186772, + "language_loss": 0.7410118, + "learning_rate": 9.84552540571286e-07, + "loss": 0.76178861, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.3671875, + "step": 11136, + "time_per_iteration": 2.402554512023926 + }, + { + "auxiliary_loss_clip": 0.01060542, + "auxiliary_loss_mlp": 0.01022273, + "balance_loss_clip": 1.00942183, + "balance_loss_mlp": 1.02027392, + "epoch": 0.6695926649631745, + "flos": 24458962210560.0, + "grad_norm": 1.6912377966464982, + "language_loss": 0.62188935, + "learning_rate": 9.84227057222019e-07, + "loss": 0.64271748, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 11137, + "time_per_iteration": 2.467311143875122 + }, + { + "auxiliary_loss_clip": 0.01056805, + "auxiliary_loss_mlp": 0.01024007, + "balance_loss_clip": 1.01216388, + "balance_loss_mlp": 1.01833153, + "epoch": 0.6696527882158425, + "flos": 24278984818560.0, + "grad_norm": 1.991472314271042, + "language_loss": 0.6613344, + "learning_rate": 9.83901610123077e-07, + "loss": 0.6821425, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38476562, + "step": 11138, + "time_per_iteration": 2.438432216644287 + }, + { + "auxiliary_loss_clip": 0.01054306, + "auxiliary_loss_mlp": 0.01023015, + "balance_loss_clip": 1.01188135, + "balance_loss_mlp": 1.01785636, + "epoch": 0.6697129114685104, + "flos": 23001652483200.0, + "grad_norm": 1.607940641461544, + "language_loss": 0.72316217, + "learning_rate": 9.835761992860711e-07, + "loss": 0.74393535, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36523438, + "step": 11139, + "time_per_iteration": 2.455749273300171 + }, + { + "auxiliary_loss_clip": 0.01054985, + "auxiliary_loss_mlp": 0.01021289, + "balance_loss_clip": 1.01128769, + "balance_loss_mlp": 1.01874936, + "epoch": 0.6697730347211784, + "flos": 22637019576960.0, + "grad_norm": 1.6738154194444925, + "language_loss": 0.7021932, + "learning_rate": 9.832508247226172e-07, + "loss": 0.72295588, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.36328125, + "step": 11140, + "time_per_iteration": 2.4095962047576904 + }, + { + "auxiliary_loss_clip": 0.01057313, + "auxiliary_loss_mlp": 0.01021802, + "balance_loss_clip": 1.00952911, + "balance_loss_mlp": 1.01752079, + "epoch": 0.6698331579738463, + "flos": 28875963824640.0, + "grad_norm": 1.906472517500616, + "language_loss": 0.75805557, + "learning_rate": 9.829254864443258e-07, + "loss": 0.77884674, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 11141, + "time_per_iteration": 2.4910356998443604 + }, + { + "auxiliary_loss_clip": 0.01057617, + "auxiliary_loss_mlp": 0.01024608, + "balance_loss_clip": 1.01251435, + "balance_loss_mlp": 1.01867867, + "epoch": 0.6698932812265144, + "flos": 24205946520960.0, + "grad_norm": 1.8289369414798458, + "language_loss": 0.81996167, + "learning_rate": 9.826001844628075e-07, + "loss": 0.84078389, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 11142, + "time_per_iteration": 2.418278217315674 + }, + { + "auxiliary_loss_clip": 0.0105537, + "auxiliary_loss_mlp": 0.01026034, + "balance_loss_clip": 1.01448858, + "balance_loss_mlp": 1.01781058, + "epoch": 0.6699534044791823, + "flos": 22089197433600.0, + "grad_norm": 1.3768585078329838, + "language_loss": 0.69954979, + "learning_rate": 9.822749187896716e-07, + "loss": 0.72036386, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.375, + "step": 11143, + "time_per_iteration": 2.464150905609131 + }, + { + "auxiliary_loss_clip": 0.01053709, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.01514673, + "balance_loss_mlp": 1.01773, + "epoch": 0.6700135277318503, + "flos": 25191195488640.0, + "grad_norm": 1.6133640412271968, + "language_loss": 0.69910622, + "learning_rate": 9.819496894365254e-07, + "loss": 0.71989906, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.359375, + "step": 11144, + "time_per_iteration": 2.475053310394287 + }, + { + "auxiliary_loss_clip": 0.01055828, + "auxiliary_loss_mlp": 0.01023836, + "balance_loss_clip": 1.01277328, + "balance_loss_mlp": 1.0176878, + "epoch": 0.6700736509845182, + "flos": 23439079307520.0, + "grad_norm": 3.7322778160636605, + "language_loss": 0.74524206, + "learning_rate": 9.816244964149773e-07, + "loss": 0.76603866, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.3828125, + "step": 11145, + "time_per_iteration": 2.4766297340393066 + }, + { + "auxiliary_loss_clip": 0.0105896, + "auxiliary_loss_mlp": 0.01023626, + "balance_loss_clip": 1.01129961, + "balance_loss_mlp": 1.01887083, + "epoch": 0.6701337742371862, + "flos": 24388786644480.0, + "grad_norm": 1.661991678905502, + "language_loss": 0.70657074, + "learning_rate": 9.812993397366301e-07, + "loss": 0.72739661, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 11146, + "time_per_iteration": 2.5188214778900146 + }, + { + "auxiliary_loss_clip": 0.01052544, + "auxiliary_loss_mlp": 0.01021601, + "balance_loss_clip": 1.0105499, + "balance_loss_mlp": 1.01684546, + "epoch": 0.6701938974898543, + "flos": 14792768881920.0, + "grad_norm": 2.3924354770010154, + "language_loss": 0.78264964, + "learning_rate": 9.809742194130895e-07, + "loss": 0.8033911, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.35546875, + "step": 11147, + "time_per_iteration": 2.4073593616485596 + }, + { + "auxiliary_loss_clip": 0.01058427, + "auxiliary_loss_mlp": 0.01021696, + "balance_loss_clip": 1.01031792, + "balance_loss_mlp": 1.02018642, + "epoch": 0.6702540207425222, + "flos": 20153054142720.0, + "grad_norm": 1.6512445331119014, + "language_loss": 0.70124811, + "learning_rate": 9.806491354559579e-07, + "loss": 0.72204936, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 11148, + "time_per_iteration": 2.4007086753845215 + }, + { + "auxiliary_loss_clip": 0.01058063, + "auxiliary_loss_mlp": 0.01023785, + "balance_loss_clip": 1.01253796, + "balance_loss_mlp": 1.01831007, + "epoch": 0.6703141439951902, + "flos": 21213121887360.0, + "grad_norm": 1.7982035822464497, + "language_loss": 0.69905758, + "learning_rate": 9.803240878768366e-07, + "loss": 0.71987605, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3984375, + "step": 11149, + "time_per_iteration": 2.4467084407806396 + }, + { + "auxiliary_loss_clip": 0.01060016, + "auxiliary_loss_mlp": 0.01022468, + "balance_loss_clip": 1.01106584, + "balance_loss_mlp": 1.01968646, + "epoch": 0.6703742672478581, + "flos": 23111419397760.0, + "grad_norm": 1.7221843212280783, + "language_loss": 0.7387259, + "learning_rate": 9.799990766873246e-07, + "loss": 0.75955069, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.40429688, + "step": 11150, + "time_per_iteration": 2.441962480545044 + }, + { + "auxiliary_loss_clip": 0.0105655, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.0119977, + "balance_loss_mlp": 1.01823592, + "epoch": 0.6704343905005261, + "flos": 22527811244160.0, + "grad_norm": 1.6648616659789177, + "language_loss": 0.78244311, + "learning_rate": 9.796741018990237e-07, + "loss": 0.80324674, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 11151, + "time_per_iteration": 2.4520792961120605 + }, + { + "auxiliary_loss_clip": 0.01007194, + "auxiliary_loss_mlp": 0.0100113, + "balance_loss_clip": 1.00018215, + "balance_loss_mlp": 1.00095773, + "epoch": 0.670494513753194, + "flos": 64789473536640.0, + "grad_norm": 0.7878340049913638, + "language_loss": 0.55473101, + "learning_rate": 9.79349163523528e-07, + "loss": 0.5748142, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.0625, + "step": 11152, + "time_per_iteration": 3.131005048751831 + }, + { + "auxiliary_loss_clip": 0.01056891, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.01526046, + "balance_loss_mlp": 1.01788044, + "epoch": 0.670554637005862, + "flos": 23510441859840.0, + "grad_norm": 1.70467597623901, + "language_loss": 0.8086974, + "learning_rate": 9.790242615724358e-07, + "loss": 0.82953584, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.390625, + "step": 11153, + "time_per_iteration": 2.4689650535583496 + }, + { + "auxiliary_loss_clip": 0.01057267, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.01277423, + "balance_loss_mlp": 1.01842213, + "epoch": 0.67061476025853, + "flos": 19462402160640.0, + "grad_norm": 1.619367612447367, + "language_loss": 0.76247287, + "learning_rate": 9.78699396057341e-07, + "loss": 0.78328967, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38867188, + "step": 11154, + "time_per_iteration": 3.9434902667999268 + }, + { + "auxiliary_loss_clip": 0.01060597, + "auxiliary_loss_mlp": 0.01027867, + "balance_loss_clip": 1.0158205, + "balance_loss_mlp": 1.02075791, + "epoch": 0.670674883511198, + "flos": 20518978769280.0, + "grad_norm": 1.6914532816722505, + "language_loss": 0.7632395, + "learning_rate": 9.783745669898388e-07, + "loss": 0.78412414, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 11155, + "time_per_iteration": 2.4175002574920654 + }, + { + "auxiliary_loss_clip": 0.01058705, + "auxiliary_loss_mlp": 0.01025197, + "balance_loss_clip": 1.01313889, + "balance_loss_mlp": 1.02034998, + "epoch": 0.6707350067638659, + "flos": 25482790097280.0, + "grad_norm": 2.056922796635523, + "language_loss": 0.8192777, + "learning_rate": 9.78049774381519e-07, + "loss": 0.84011674, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38476562, + "step": 11156, + "time_per_iteration": 2.447269916534424 + }, + { + "auxiliary_loss_clip": 0.01058365, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.01668632, + "balance_loss_mlp": 1.01945591, + "epoch": 0.6707951300165339, + "flos": 22272351759360.0, + "grad_norm": 2.1457188697129728, + "language_loss": 0.74697316, + "learning_rate": 9.777250182439746e-07, + "loss": 0.76784003, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38867188, + "step": 11157, + "time_per_iteration": 2.3975648880004883 + }, + { + "auxiliary_loss_clip": 0.01059999, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.01668501, + "balance_loss_mlp": 1.02017498, + "epoch": 0.6708552532692018, + "flos": 23983549960320.0, + "grad_norm": 1.8477841280904184, + "language_loss": 0.74201924, + "learning_rate": 9.774002985887957e-07, + "loss": 0.76290965, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 11158, + "time_per_iteration": 2.4521069526672363 + }, + { + "auxiliary_loss_clip": 0.01059961, + "auxiliary_loss_mlp": 0.01025414, + "balance_loss_clip": 1.01334429, + "balance_loss_mlp": 1.01975918, + "epoch": 0.6709153765218698, + "flos": 24936329496960.0, + "grad_norm": 2.207163252123411, + "language_loss": 0.81742078, + "learning_rate": 9.770756154275681e-07, + "loss": 0.83827454, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40234375, + "step": 11159, + "time_per_iteration": 2.4467265605926514 + }, + { + "auxiliary_loss_clip": 0.01057517, + "auxiliary_loss_mlp": 0.01023151, + "balance_loss_clip": 1.01165307, + "balance_loss_mlp": 1.01913846, + "epoch": 0.6709754997745379, + "flos": 17529261246720.0, + "grad_norm": 1.5694957291978544, + "language_loss": 0.7678225, + "learning_rate": 9.767509687718811e-07, + "loss": 0.78862923, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 11160, + "time_per_iteration": 2.4835658073425293 + }, + { + "auxiliary_loss_clip": 0.01056483, + "auxiliary_loss_mlp": 0.01020816, + "balance_loss_clip": 1.00954509, + "balance_loss_mlp": 1.01846147, + "epoch": 0.6710356230272058, + "flos": 22089790926720.0, + "grad_norm": 2.1164076357326045, + "language_loss": 0.73382914, + "learning_rate": 9.764263586333195e-07, + "loss": 0.75460213, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38085938, + "step": 11161, + "time_per_iteration": 2.4597277641296387 + }, + { + "auxiliary_loss_clip": 0.01059887, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01583886, + "balance_loss_mlp": 1.01906633, + "epoch": 0.6710957462798738, + "flos": 24205318116480.0, + "grad_norm": 1.6429299819401615, + "language_loss": 0.68134898, + "learning_rate": 9.761017850234695e-07, + "loss": 0.70223814, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40820312, + "step": 11162, + "time_per_iteration": 2.4962713718414307 + }, + { + "auxiliary_loss_clip": 0.01057331, + "auxiliary_loss_mlp": 0.0102617, + "balance_loss_clip": 1.01494694, + "balance_loss_mlp": 1.01879513, + "epoch": 0.6711558695325417, + "flos": 19093091132160.0, + "grad_norm": 1.6902121403231303, + "language_loss": 0.6983341, + "learning_rate": 9.757772479539116e-07, + "loss": 0.71916914, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38476562, + "step": 11163, + "time_per_iteration": 2.3840994834899902 + }, + { + "auxiliary_loss_clip": 0.0105705, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.01473713, + "balance_loss_mlp": 1.02033854, + "epoch": 0.6712159927852097, + "flos": 25556666267520.0, + "grad_norm": 1.7970358144934453, + "language_loss": 0.75560343, + "learning_rate": 9.754527474362296e-07, + "loss": 0.7764377, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3671875, + "step": 11164, + "time_per_iteration": 3.950084686279297 + }, + { + "auxiliary_loss_clip": 0.01056687, + "auxiliary_loss_mlp": 0.01023508, + "balance_loss_clip": 1.0124867, + "balance_loss_mlp": 1.01800334, + "epoch": 0.6712761160378776, + "flos": 22227942643200.0, + "grad_norm": 2.5014241724433117, + "language_loss": 0.65495175, + "learning_rate": 9.751282834820039e-07, + "loss": 0.67575371, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38671875, + "step": 11165, + "time_per_iteration": 2.3958675861358643 + }, + { + "auxiliary_loss_clip": 0.01056886, + "auxiliary_loss_mlp": 0.0102492, + "balance_loss_clip": 1.01323724, + "balance_loss_mlp": 1.0186013, + "epoch": 0.6713362392905456, + "flos": 22454423832960.0, + "grad_norm": 2.021520936680037, + "language_loss": 0.71552843, + "learning_rate": 9.74803856102813e-07, + "loss": 0.73634648, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3828125, + "step": 11166, + "time_per_iteration": 2.4355595111846924 + }, + { + "auxiliary_loss_clip": 0.01055952, + "auxiliary_loss_mlp": 0.01021834, + "balance_loss_clip": 1.01089609, + "balance_loss_mlp": 1.01932192, + "epoch": 0.6713963625432136, + "flos": 25629006337920.0, + "grad_norm": 1.8273414276440947, + "language_loss": 0.77798951, + "learning_rate": 9.74479465310235e-07, + "loss": 0.79876733, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3671875, + "step": 11167, + "time_per_iteration": 2.4626271724700928 + }, + { + "auxiliary_loss_clip": 0.01057399, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.01727414, + "balance_loss_mlp": 1.01866901, + "epoch": 0.6714564857958816, + "flos": 35005036423680.0, + "grad_norm": 1.7096125870970196, + "language_loss": 0.68492055, + "learning_rate": 9.741551111158485e-07, + "loss": 0.70578289, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38671875, + "step": 11168, + "time_per_iteration": 2.54671311378479 + }, + { + "auxiliary_loss_clip": 0.01059318, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.01560211, + "balance_loss_mlp": 1.01917982, + "epoch": 0.6715166090485495, + "flos": 26278914896640.0, + "grad_norm": 3.0459189258151693, + "language_loss": 0.73164821, + "learning_rate": 9.738307935312257e-07, + "loss": 0.75252903, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40039062, + "step": 11169, + "time_per_iteration": 2.4263410568237305 + }, + { + "auxiliary_loss_clip": 0.01060123, + "auxiliary_loss_mlp": 0.01025714, + "balance_loss_clip": 1.01337576, + "balance_loss_mlp": 1.01985145, + "epoch": 0.6715767323012175, + "flos": 15923256572160.0, + "grad_norm": 2.150967429739817, + "language_loss": 0.75470865, + "learning_rate": 9.735065125679432e-07, + "loss": 0.77556705, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 11170, + "time_per_iteration": 2.453855514526367 + }, + { + "auxiliary_loss_clip": 0.01059086, + "auxiliary_loss_mlp": 0.01022, + "balance_loss_clip": 1.00965643, + "balance_loss_mlp": 1.01820993, + "epoch": 0.6716368555538854, + "flos": 17490542682240.0, + "grad_norm": 1.9824236901199768, + "language_loss": 0.74183309, + "learning_rate": 9.731822682375717e-07, + "loss": 0.76264393, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41015625, + "step": 11171, + "time_per_iteration": 3.8153717517852783 + }, + { + "auxiliary_loss_clip": 0.01058461, + "auxiliary_loss_mlp": 0.01030132, + "balance_loss_clip": 1.01779997, + "balance_loss_mlp": 1.0189817, + "epoch": 0.6716969788065534, + "flos": 16760648465280.0, + "grad_norm": 1.5577459956330526, + "language_loss": 0.86444682, + "learning_rate": 9.728580605516854e-07, + "loss": 0.8853327, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 11172, + "time_per_iteration": 2.3981587886810303 + }, + { + "auxiliary_loss_clip": 0.01056736, + "auxiliary_loss_mlp": 0.01027762, + "balance_loss_clip": 1.01580548, + "balance_loss_mlp": 1.01766992, + "epoch": 0.6717571020592215, + "flos": 22708731242880.0, + "grad_norm": 1.4976258223346426, + "language_loss": 0.63744378, + "learning_rate": 9.72533889521852e-07, + "loss": 0.65828872, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 11173, + "time_per_iteration": 2.406010150909424 + }, + { + "auxiliary_loss_clip": 0.01060888, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.01633358, + "balance_loss_mlp": 1.01883698, + "epoch": 0.6718172253118894, + "flos": 18733101436800.0, + "grad_norm": 2.695686293913985, + "language_loss": 0.71701771, + "learning_rate": 9.722097551596404e-07, + "loss": 0.73792213, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.421875, + "step": 11174, + "time_per_iteration": 2.3601198196411133 + }, + { + "auxiliary_loss_clip": 0.01055932, + "auxiliary_loss_mlp": 0.01022288, + "balance_loss_clip": 1.01083207, + "balance_loss_mlp": 1.01837182, + "epoch": 0.6718773485645574, + "flos": 15631627052160.0, + "grad_norm": 4.319005887721141, + "language_loss": 0.68457401, + "learning_rate": 9.718856574766205e-07, + "loss": 0.70535618, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37695312, + "step": 11175, + "time_per_iteration": 2.370774984359741 + }, + { + "auxiliary_loss_clip": 0.01057088, + "auxiliary_loss_mlp": 0.01022742, + "balance_loss_clip": 1.01148295, + "balance_loss_mlp": 1.01914239, + "epoch": 0.6719374718172253, + "flos": 19353752409600.0, + "grad_norm": 1.7910762971511556, + "language_loss": 0.73406589, + "learning_rate": 9.71561596484355e-07, + "loss": 0.75486422, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 11176, + "time_per_iteration": 2.406846761703491 + }, + { + "auxiliary_loss_clip": 0.01056044, + "auxiliary_loss_mlp": 0.01022243, + "balance_loss_clip": 1.00978565, + "balance_loss_mlp": 1.01725972, + "epoch": 0.6719975950698933, + "flos": 21980233480320.0, + "grad_norm": 1.6882082774143334, + "language_loss": 0.73005193, + "learning_rate": 9.712375721944117e-07, + "loss": 0.75083482, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.38867188, + "step": 11177, + "time_per_iteration": 2.392808675765991 + }, + { + "auxiliary_loss_clip": 0.01057616, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.01533234, + "balance_loss_mlp": 1.01899719, + "epoch": 0.6720577183225612, + "flos": 25226911676160.0, + "grad_norm": 2.1961273045798415, + "language_loss": 0.76622647, + "learning_rate": 9.709135846183531e-07, + "loss": 0.7870881, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.38671875, + "step": 11178, + "time_per_iteration": 2.415851354598999 + }, + { + "auxiliary_loss_clip": 0.01058513, + "auxiliary_loss_mlp": 0.01024397, + "balance_loss_clip": 1.01257169, + "balance_loss_mlp": 1.01925659, + "epoch": 0.6721178415752292, + "flos": 16944954865920.0, + "grad_norm": 1.6439754274603333, + "language_loss": 0.69865781, + "learning_rate": 9.705896337677418e-07, + "loss": 0.71948689, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 11179, + "time_per_iteration": 2.360980987548828 + }, + { + "auxiliary_loss_clip": 0.01057031, + "auxiliary_loss_mlp": 0.0102175, + "balance_loss_clip": 1.01054394, + "balance_loss_mlp": 1.01905406, + "epoch": 0.6721779648278972, + "flos": 21540362860800.0, + "grad_norm": 1.4890524734884922, + "language_loss": 0.7406345, + "learning_rate": 9.702657196541372e-07, + "loss": 0.76142228, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37890625, + "step": 11180, + "time_per_iteration": 2.4137940406799316 + }, + { + "auxiliary_loss_clip": 0.01059026, + "auxiliary_loss_mlp": 0.0102398, + "balance_loss_clip": 1.01249385, + "balance_loss_mlp": 1.02017975, + "epoch": 0.6722380880805652, + "flos": 22604235943680.0, + "grad_norm": 1.405979724751444, + "language_loss": 0.7521984, + "learning_rate": 9.699418422891014e-07, + "loss": 0.77302849, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38867188, + "step": 11181, + "time_per_iteration": 2.43453049659729 + }, + { + "auxiliary_loss_clip": 0.01058715, + "auxiliary_loss_mlp": 0.0102485, + "balance_loss_clip": 1.01314378, + "balance_loss_mlp": 1.01844192, + "epoch": 0.6722982113332331, + "flos": 15924338824320.0, + "grad_norm": 2.0658933607677485, + "language_loss": 0.74648356, + "learning_rate": 9.696180016841917e-07, + "loss": 0.7673192, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40234375, + "step": 11182, + "time_per_iteration": 2.3851075172424316 + }, + { + "auxiliary_loss_clip": 0.01055816, + "auxiliary_loss_mlp": 0.01023754, + "balance_loss_clip": 1.01216698, + "balance_loss_mlp": 1.01928544, + "epoch": 0.6723583345859011, + "flos": 20595089266560.0, + "grad_norm": 1.786287738112594, + "language_loss": 0.69744778, + "learning_rate": 9.692941978509649e-07, + "loss": 0.71824348, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.36523438, + "step": 11183, + "time_per_iteration": 2.4015769958496094 + }, + { + "auxiliary_loss_clip": 0.01057892, + "auxiliary_loss_mlp": 0.01027418, + "balance_loss_clip": 1.0156759, + "balance_loss_mlp": 1.01885951, + "epoch": 0.672418457838569, + "flos": 21724773995520.0, + "grad_norm": 2.6382045547684667, + "language_loss": 0.77819836, + "learning_rate": 9.68970430800976e-07, + "loss": 0.79905152, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.390625, + "step": 11184, + "time_per_iteration": 2.496405601501465 + }, + { + "auxiliary_loss_clip": 0.0106421, + "auxiliary_loss_mlp": 0.01026968, + "balance_loss_clip": 1.0130918, + "balance_loss_mlp": 1.02125931, + "epoch": 0.672478581091237, + "flos": 21469314510720.0, + "grad_norm": 1.7635660259948536, + "language_loss": 0.66180241, + "learning_rate": 9.68646700545782e-07, + "loss": 0.68271416, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.4296875, + "step": 11185, + "time_per_iteration": 2.412283182144165 + }, + { + "auxiliary_loss_clip": 0.010578, + "auxiliary_loss_mlp": 0.01023539, + "balance_loss_clip": 1.01116562, + "balance_loss_mlp": 1.01850438, + "epoch": 0.6725387043439051, + "flos": 30845449330560.0, + "grad_norm": 1.7431804725622502, + "language_loss": 0.80737507, + "learning_rate": 9.683230070969328e-07, + "loss": 0.82818848, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39257812, + "step": 11186, + "time_per_iteration": 2.5166494846343994 + }, + { + "auxiliary_loss_clip": 0.01057751, + "auxiliary_loss_mlp": 0.01022694, + "balance_loss_clip": 1.01167274, + "balance_loss_mlp": 1.01995969, + "epoch": 0.672598827596573, + "flos": 24054947424000.0, + "grad_norm": 1.4990099890441146, + "language_loss": 0.80414325, + "learning_rate": 9.679993504659823e-07, + "loss": 0.82494771, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37695312, + "step": 11187, + "time_per_iteration": 2.4583332538604736 + }, + { + "auxiliary_loss_clip": 0.01059726, + "auxiliary_loss_mlp": 0.01023596, + "balance_loss_clip": 1.01150846, + "balance_loss_mlp": 1.01881528, + "epoch": 0.672658950849241, + "flos": 21870780768000.0, + "grad_norm": 25.417291033719813, + "language_loss": 0.7068913, + "learning_rate": 9.676757306644805e-07, + "loss": 0.72772455, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 11188, + "time_per_iteration": 2.430915594100952 + }, + { + "auxiliary_loss_clip": 0.01056404, + "auxiliary_loss_mlp": 0.01022932, + "balance_loss_clip": 1.01192284, + "balance_loss_mlp": 1.01963425, + "epoch": 0.6727190741019089, + "flos": 23220976844160.0, + "grad_norm": 1.9250177780696562, + "language_loss": 0.75822955, + "learning_rate": 9.673521477039763e-07, + "loss": 0.77902293, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3671875, + "step": 11189, + "time_per_iteration": 2.394350290298462 + }, + { + "auxiliary_loss_clip": 0.01059604, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.01570249, + "balance_loss_mlp": 1.01858294, + "epoch": 0.6727791973545769, + "flos": 15777703647360.0, + "grad_norm": 1.8877407707912421, + "language_loss": 0.75168115, + "learning_rate": 9.670286015960178e-07, + "loss": 0.77256894, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41015625, + "step": 11190, + "time_per_iteration": 2.362993001937866 + }, + { + "auxiliary_loss_clip": 0.01056923, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.01577342, + "balance_loss_mlp": 1.01844883, + "epoch": 0.6728393206072448, + "flos": 21248838074880.0, + "grad_norm": 1.480663247433822, + "language_loss": 0.77123863, + "learning_rate": 9.667050923521504e-07, + "loss": 0.79208952, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38476562, + "step": 11191, + "time_per_iteration": 2.3816514015197754 + }, + { + "auxiliary_loss_clip": 0.01055905, + "auxiliary_loss_mlp": 0.01020997, + "balance_loss_clip": 1.009076, + "balance_loss_mlp": 1.01832747, + "epoch": 0.6728994438599128, + "flos": 32121943793280.0, + "grad_norm": 1.7628604847242564, + "language_loss": 0.86301446, + "learning_rate": 9.66381619983922e-07, + "loss": 0.88378346, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.375, + "step": 11192, + "time_per_iteration": 2.479154109954834 + }, + { + "auxiliary_loss_clip": 0.01060537, + "auxiliary_loss_mlp": 0.01026157, + "balance_loss_clip": 1.0128293, + "balance_loss_mlp": 1.01997161, + "epoch": 0.6729595671125808, + "flos": 23111244840960.0, + "grad_norm": 2.2628836562745716, + "language_loss": 0.75478011, + "learning_rate": 9.660581845028732e-07, + "loss": 0.7756471, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40625, + "step": 11193, + "time_per_iteration": 3.857584238052368 + }, + { + "auxiliary_loss_clip": 0.01059046, + "auxiliary_loss_mlp": 0.01026402, + "balance_loss_clip": 1.01414156, + "balance_loss_mlp": 1.01964068, + "epoch": 0.6730196903652488, + "flos": 14610522251520.0, + "grad_norm": 1.6683129773731877, + "language_loss": 0.78789693, + "learning_rate": 9.65734785920549e-07, + "loss": 0.8087514, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39453125, + "step": 11194, + "time_per_iteration": 2.38741135597229 + }, + { + "auxiliary_loss_clip": 0.01058219, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.01423025, + "balance_loss_mlp": 1.01861215, + "epoch": 0.6730798136179167, + "flos": 21104856161280.0, + "grad_norm": 2.5527992736087923, + "language_loss": 0.90035158, + "learning_rate": 9.654114242484899e-07, + "loss": 0.92119372, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.39453125, + "step": 11195, + "time_per_iteration": 2.3794100284576416 + }, + { + "auxiliary_loss_clip": 0.01057008, + "auxiliary_loss_mlp": 0.01022039, + "balance_loss_clip": 1.00992131, + "balance_loss_mlp": 1.01856816, + "epoch": 0.6731399368705847, + "flos": 28984997600640.0, + "grad_norm": 1.819433553834312, + "language_loss": 0.72282624, + "learning_rate": 9.650880994982358e-07, + "loss": 0.7436167, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38476562, + "step": 11196, + "time_per_iteration": 2.4629929065704346 + }, + { + "auxiliary_loss_clip": 0.01007274, + "auxiliary_loss_mlp": 0.01001482, + "balance_loss_clip": 1.00051081, + "balance_loss_mlp": 1.00099182, + "epoch": 0.6732000601232526, + "flos": 64740386298240.0, + "grad_norm": 1.0905096893200659, + "language_loss": 0.56561458, + "learning_rate": 9.647648116813245e-07, + "loss": 0.58570218, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.0625, + "step": 11197, + "time_per_iteration": 2.9768314361572266 + }, + { + "auxiliary_loss_clip": 0.01056552, + "auxiliary_loss_mlp": 0.01022842, + "balance_loss_clip": 1.01121902, + "balance_loss_mlp": 1.01919556, + "epoch": 0.6732601833759206, + "flos": 17200693641600.0, + "grad_norm": 13.030278253183559, + "language_loss": 0.75094509, + "learning_rate": 9.64441560809295e-07, + "loss": 0.77173901, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37304688, + "step": 11198, + "time_per_iteration": 2.38499116897583 + }, + { + "auxiliary_loss_clip": 0.01057165, + "auxiliary_loss_mlp": 0.01025451, + "balance_loss_clip": 1.01334524, + "balance_loss_mlp": 1.01780438, + "epoch": 0.6733203066285887, + "flos": 18657933546240.0, + "grad_norm": 2.394244380279217, + "language_loss": 0.81614393, + "learning_rate": 9.64118346893682e-07, + "loss": 0.83697009, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 11199, + "time_per_iteration": 2.4373958110809326 + }, + { + "auxiliary_loss_clip": 0.01057682, + "auxiliary_loss_mlp": 0.01025343, + "balance_loss_clip": 1.013309, + "balance_loss_mlp": 1.0184381, + "epoch": 0.6733804298812566, + "flos": 35807864204160.0, + "grad_norm": 1.9452248489859054, + "language_loss": 0.7113173, + "learning_rate": 9.63795169946021e-07, + "loss": 0.73214751, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39257812, + "step": 11200, + "time_per_iteration": 2.541478395462036 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01025341, + "balance_loss_clip": 1.01324713, + "balance_loss_mlp": 1.02038479, + "epoch": 0.6734405531339246, + "flos": 61636714364160.0, + "grad_norm": 1.8153059387933712, + "language_loss": 0.6451416, + "learning_rate": 9.63472029977844e-07, + "loss": 0.66599387, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 11201, + "time_per_iteration": 2.7867822647094727 + }, + { + "auxiliary_loss_clip": 0.01059624, + "auxiliary_loss_mlp": 0.01024921, + "balance_loss_clip": 1.01223695, + "balance_loss_mlp": 1.0191524, + "epoch": 0.6735006763865925, + "flos": 20521282919040.0, + "grad_norm": 1.857050425722388, + "language_loss": 0.63120341, + "learning_rate": 9.631489270006855e-07, + "loss": 0.65204889, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 11202, + "time_per_iteration": 2.4356908798217773 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01022427, + "balance_loss_clip": 1.01122165, + "balance_loss_mlp": 1.01914573, + "epoch": 0.6735607996392605, + "flos": 13917985056000.0, + "grad_norm": 1.7480704283508721, + "language_loss": 0.75175339, + "learning_rate": 9.628258610260742e-07, + "loss": 0.77255881, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38867188, + "step": 11203, + "time_per_iteration": 2.390599250793457 + }, + { + "auxiliary_loss_clip": 0.01061918, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.01472676, + "balance_loss_mlp": 1.01972079, + "epoch": 0.6736209228919284, + "flos": 18806244468480.0, + "grad_norm": 1.6080066895022636, + "language_loss": 0.67480934, + "learning_rate": 9.625028320655387e-07, + "loss": 0.69571507, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.421875, + "step": 11204, + "time_per_iteration": 5.220855712890625 + }, + { + "auxiliary_loss_clip": 0.01062172, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.01504815, + "balance_loss_mlp": 1.02087045, + "epoch": 0.6736810461445965, + "flos": 20372169035520.0, + "grad_norm": 1.493308959163068, + "language_loss": 0.74282658, + "learning_rate": 9.621798401306095e-07, + "loss": 0.76372957, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 11205, + "time_per_iteration": 2.4068195819854736 + }, + { + "auxiliary_loss_clip": 0.01055261, + "auxiliary_loss_mlp": 0.01025528, + "balance_loss_clip": 1.01439345, + "balance_loss_mlp": 1.01742554, + "epoch": 0.6737411693972644, + "flos": 30006242046720.0, + "grad_norm": 1.577345296582584, + "language_loss": 0.71485221, + "learning_rate": 9.618568852328123e-07, + "loss": 0.73566008, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 11206, + "time_per_iteration": 2.5101194381713867 + }, + { + "auxiliary_loss_clip": 0.01062805, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.01315892, + "balance_loss_mlp": 1.02030659, + "epoch": 0.6738012926499324, + "flos": 25446166214400.0, + "grad_norm": 1.9795074599013325, + "language_loss": 0.67460322, + "learning_rate": 9.615339673836724e-07, + "loss": 0.69549441, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42578125, + "step": 11207, + "time_per_iteration": 2.415846347808838 + }, + { + "auxiliary_loss_clip": 0.01058476, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.01665926, + "balance_loss_mlp": 1.02027774, + "epoch": 0.6738614159026003, + "flos": 20775834708480.0, + "grad_norm": 2.1429244871661792, + "language_loss": 0.73266327, + "learning_rate": 9.612110865947133e-07, + "loss": 0.75353205, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 11208, + "time_per_iteration": 2.380352020263672 + }, + { + "auxiliary_loss_clip": 0.01059177, + "auxiliary_loss_mlp": 0.01025299, + "balance_loss_clip": 1.01207316, + "balance_loss_mlp": 1.0192945, + "epoch": 0.6739215391552683, + "flos": 19566059587200.0, + "grad_norm": 2.0321786697570907, + "language_loss": 0.75438076, + "learning_rate": 9.608882428774595e-07, + "loss": 0.77522552, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.3984375, + "step": 11209, + "time_per_iteration": 2.3514606952667236 + }, + { + "auxiliary_loss_clip": 0.01053023, + "auxiliary_loss_mlp": 0.01021388, + "balance_loss_clip": 1.01068306, + "balance_loss_mlp": 1.01696682, + "epoch": 0.6739816624079362, + "flos": 24387075987840.0, + "grad_norm": 1.7432581578184088, + "language_loss": 0.66391951, + "learning_rate": 9.605654362434302e-07, + "loss": 0.68466359, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.359375, + "step": 11210, + "time_per_iteration": 2.4023056030273438 + }, + { + "auxiliary_loss_clip": 0.01053775, + "auxiliary_loss_mlp": 0.01020339, + "balance_loss_clip": 1.00946712, + "balance_loss_mlp": 1.01658583, + "epoch": 0.6740417856606042, + "flos": 22527078105600.0, + "grad_norm": 2.073500778380215, + "language_loss": 0.80042672, + "learning_rate": 9.602426667041475e-07, + "loss": 0.82116783, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37109375, + "step": 11211, + "time_per_iteration": 3.815119504928589 + }, + { + "auxiliary_loss_clip": 0.01056694, + "auxiliary_loss_mlp": 0.01026113, + "balance_loss_clip": 1.01412058, + "balance_loss_mlp": 1.01917434, + "epoch": 0.6741019089132723, + "flos": 25774279971840.0, + "grad_norm": 2.298562164970679, + "language_loss": 0.76412451, + "learning_rate": 9.599199342711293e-07, + "loss": 0.78495258, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.375, + "step": 11212, + "time_per_iteration": 2.4513723850250244 + }, + { + "auxiliary_loss_clip": 0.01057542, + "auxiliary_loss_mlp": 0.01020959, + "balance_loss_clip": 1.00885916, + "balance_loss_mlp": 1.01899648, + "epoch": 0.6741620321659402, + "flos": 21104611781760.0, + "grad_norm": 1.7433247925922988, + "language_loss": 0.78240961, + "learning_rate": 9.595972389558932e-07, + "loss": 0.80319464, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38671875, + "step": 11213, + "time_per_iteration": 2.442798137664795 + }, + { + "auxiliary_loss_clip": 0.01060598, + "auxiliary_loss_mlp": 0.01025045, + "balance_loss_clip": 1.01286197, + "balance_loss_mlp": 1.01951301, + "epoch": 0.6742221554186082, + "flos": 20739385382400.0, + "grad_norm": 1.7812311454833685, + "language_loss": 0.67563117, + "learning_rate": 9.592745807699548e-07, + "loss": 0.69648761, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.41015625, + "step": 11214, + "time_per_iteration": 2.411428689956665 + }, + { + "auxiliary_loss_clip": 0.01056766, + "auxiliary_loss_mlp": 0.01026258, + "balance_loss_clip": 1.01462913, + "balance_loss_mlp": 1.01806366, + "epoch": 0.6742822786712761, + "flos": 37772776321920.0, + "grad_norm": 3.290409058757227, + "language_loss": 0.72771513, + "learning_rate": 9.589519597248304e-07, + "loss": 0.74854541, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 11215, + "time_per_iteration": 2.52140212059021 + }, + { + "auxiliary_loss_clip": 0.0105734, + "auxiliary_loss_mlp": 0.01023619, + "balance_loss_clip": 1.01199055, + "balance_loss_mlp": 1.01833546, + "epoch": 0.6743424019239441, + "flos": 37262520668160.0, + "grad_norm": 5.465702034139275, + "language_loss": 0.66967744, + "learning_rate": 9.586293758320326e-07, + "loss": 0.69048703, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 11216, + "time_per_iteration": 2.544454336166382 + }, + { + "auxiliary_loss_clip": 0.01007287, + "auxiliary_loss_mlp": 0.01002081, + "balance_loss_clip": 1.00108588, + "balance_loss_mlp": 1.00102878, + "epoch": 0.674402525176612, + "flos": 65994011953920.0, + "grad_norm": 0.6782075860493014, + "language_loss": 0.562868, + "learning_rate": 9.583068291030736e-07, + "loss": 0.58296168, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.0625, + "step": 11217, + "time_per_iteration": 3.1303963661193848 + }, + { + "auxiliary_loss_clip": 0.01056453, + "auxiliary_loss_mlp": 0.01024893, + "balance_loss_clip": 1.01304984, + "balance_loss_mlp": 1.0185256, + "epoch": 0.67446264842928, + "flos": 26460218920320.0, + "grad_norm": 2.1361458651744667, + "language_loss": 0.74604583, + "learning_rate": 9.57984319549464e-07, + "loss": 0.76685929, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 11218, + "time_per_iteration": 2.4617183208465576 + }, + { + "auxiliary_loss_clip": 0.01057657, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.0196104, + "balance_loss_mlp": 1.01879311, + "epoch": 0.674522771681948, + "flos": 23731267409280.0, + "grad_norm": 1.9988691294013068, + "language_loss": 0.66161001, + "learning_rate": 9.576618471827143e-07, + "loss": 0.68251598, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.38867188, + "step": 11219, + "time_per_iteration": 2.450237512588501 + }, + { + "auxiliary_loss_clip": 0.01058869, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.01414752, + "balance_loss_mlp": 1.01860857, + "epoch": 0.674582894934616, + "flos": 24753175171200.0, + "grad_norm": 2.1078342896044586, + "language_loss": 0.74830103, + "learning_rate": 9.573394120143318e-07, + "loss": 0.76915187, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11220, + "time_per_iteration": 2.485271453857422 + }, + { + "auxiliary_loss_clip": 0.01058989, + "auxiliary_loss_mlp": 0.01024271, + "balance_loss_clip": 1.01111007, + "balance_loss_mlp": 1.01934183, + "epoch": 0.6746430181872839, + "flos": 24825480330240.0, + "grad_norm": 1.540670467882481, + "language_loss": 0.70234901, + "learning_rate": 9.570170140558226e-07, + "loss": 0.72318161, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.39648438, + "step": 11221, + "time_per_iteration": 2.467562437057495 + }, + { + "auxiliary_loss_clip": 0.01057834, + "auxiliary_loss_mlp": 0.01024185, + "balance_loss_clip": 1.01246715, + "balance_loss_mlp": 1.01979804, + "epoch": 0.6747031414399519, + "flos": 16872544972800.0, + "grad_norm": 1.723907387861653, + "language_loss": 0.58208436, + "learning_rate": 9.56694653318695e-07, + "loss": 0.60290456, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38085938, + "step": 11222, + "time_per_iteration": 2.4259519577026367 + }, + { + "auxiliary_loss_clip": 0.01057896, + "auxiliary_loss_mlp": 0.01026018, + "balance_loss_clip": 1.01310802, + "balance_loss_mlp": 1.01897931, + "epoch": 0.6747632646926198, + "flos": 22783794399360.0, + "grad_norm": 1.7580088267167857, + "language_loss": 0.76818949, + "learning_rate": 9.563723298144499e-07, + "loss": 0.78902864, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.38867188, + "step": 11223, + "time_per_iteration": 2.4308125972747803 + }, + { + "auxiliary_loss_clip": 0.01060512, + "auxiliary_loss_mlp": 0.01023193, + "balance_loss_clip": 1.01089597, + "balance_loss_mlp": 1.02047753, + "epoch": 0.6748233879452878, + "flos": 20045102618880.0, + "grad_norm": 1.8638009933921749, + "language_loss": 0.75400078, + "learning_rate": 9.56050043554593e-07, + "loss": 0.77483785, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 11224, + "time_per_iteration": 2.446411371231079 + }, + { + "auxiliary_loss_clip": 0.01053856, + "auxiliary_loss_mlp": 0.01021031, + "balance_loss_clip": 1.0104816, + "balance_loss_mlp": 1.01776767, + "epoch": 0.6748835111979558, + "flos": 23001722305920.0, + "grad_norm": 1.5653319581514318, + "language_loss": 0.63301289, + "learning_rate": 9.557277945506235e-07, + "loss": 0.65376174, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36132812, + "step": 11225, + "time_per_iteration": 2.4427261352539062 + }, + { + "auxiliary_loss_clip": 0.01054066, + "auxiliary_loss_mlp": 0.01023081, + "balance_loss_clip": 1.01199448, + "balance_loss_mlp": 1.01794457, + "epoch": 0.6749436344506238, + "flos": 12196662560640.0, + "grad_norm": 1.8771649265655552, + "language_loss": 0.64475691, + "learning_rate": 9.554055828140443e-07, + "loss": 0.66552842, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.36132812, + "step": 11226, + "time_per_iteration": 2.438894748687744 + }, + { + "auxiliary_loss_clip": 0.01060774, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.01514995, + "balance_loss_mlp": 1.02066159, + "epoch": 0.6750037577032918, + "flos": 11872947634560.0, + "grad_norm": 2.163564360002674, + "language_loss": 0.71601719, + "learning_rate": 9.550834083563516e-07, + "loss": 0.73690236, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 11227, + "time_per_iteration": 2.3888943195343018 + }, + { + "auxiliary_loss_clip": 0.01055879, + "auxiliary_loss_mlp": 0.01020481, + "balance_loss_clip": 1.00881684, + "balance_loss_mlp": 1.01820445, + "epoch": 0.6750638809559597, + "flos": 17018656479360.0, + "grad_norm": 2.2064872636732957, + "language_loss": 0.71439362, + "learning_rate": 9.54761271189045e-07, + "loss": 0.73515725, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.375, + "step": 11228, + "time_per_iteration": 2.4256081581115723 + }, + { + "auxiliary_loss_clip": 0.01006928, + "auxiliary_loss_mlp": 0.01000844, + "balance_loss_clip": 0.99984229, + "balance_loss_mlp": 1.00072002, + "epoch": 0.6751240042086277, + "flos": 70947384785280.0, + "grad_norm": 0.7567684519062312, + "language_loss": 0.53301436, + "learning_rate": 9.544391713236198e-07, + "loss": 0.55309212, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.06201172, + "step": 11229, + "time_per_iteration": 3.0850484371185303 + }, + { + "auxiliary_loss_clip": 0.01057698, + "auxiliary_loss_mlp": 0.01023987, + "balance_loss_clip": 1.01183391, + "balance_loss_mlp": 1.01881325, + "epoch": 0.6751841274612956, + "flos": 22674027484800.0, + "grad_norm": 1.6840089228558963, + "language_loss": 0.80265301, + "learning_rate": 9.54117108771571e-07, + "loss": 0.82346988, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38867188, + "step": 11230, + "time_per_iteration": 2.434100866317749 + }, + { + "auxiliary_loss_clip": 0.01054963, + "auxiliary_loss_mlp": 0.01020327, + "balance_loss_clip": 1.00932384, + "balance_loss_mlp": 1.01870847, + "epoch": 0.6752442507139637, + "flos": 21287556639360.0, + "grad_norm": 1.444934917439602, + "language_loss": 0.65601122, + "learning_rate": 9.537950835443916e-07, + "loss": 0.67676413, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.36328125, + "step": 11231, + "time_per_iteration": 2.3977441787719727 + }, + { + "auxiliary_loss_clip": 0.01061564, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.01494646, + "balance_loss_mlp": 1.02021182, + "epoch": 0.6753043739666316, + "flos": 28255661965440.0, + "grad_norm": 1.7718131118210514, + "language_loss": 0.66768372, + "learning_rate": 9.53473095653575e-07, + "loss": 0.68857801, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 11232, + "time_per_iteration": 3.901428461074829 + }, + { + "auxiliary_loss_clip": 0.01056927, + "auxiliary_loss_mlp": 0.01020849, + "balance_loss_clip": 1.00976288, + "balance_loss_mlp": 1.01908183, + "epoch": 0.6753644972192996, + "flos": 21359303216640.0, + "grad_norm": 1.6145458383466667, + "language_loss": 0.72621816, + "learning_rate": 9.531511451106127e-07, + "loss": 0.74699593, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37890625, + "step": 11233, + "time_per_iteration": 2.409806966781616 + }, + { + "auxiliary_loss_clip": 0.01057875, + "auxiliary_loss_mlp": 0.01024257, + "balance_loss_clip": 1.01188314, + "balance_loss_mlp": 1.01917791, + "epoch": 0.6754246204719675, + "flos": 26540763160320.0, + "grad_norm": 1.6932540247419958, + "language_loss": 0.76745105, + "learning_rate": 9.528292319269918e-07, + "loss": 0.78827238, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38671875, + "step": 11234, + "time_per_iteration": 2.453603744506836 + }, + { + "auxiliary_loss_clip": 0.01057888, + "auxiliary_loss_mlp": 0.01023636, + "balance_loss_clip": 1.01198292, + "balance_loss_mlp": 1.01881385, + "epoch": 0.6754847437246355, + "flos": 25555514192640.0, + "grad_norm": 1.5441608428324598, + "language_loss": 0.75229943, + "learning_rate": 9.525073561142023e-07, + "loss": 0.77311468, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 11235, + "time_per_iteration": 2.4322726726531982 + }, + { + "auxiliary_loss_clip": 0.01007267, + "auxiliary_loss_mlp": 0.0100178, + "balance_loss_clip": 1.00083244, + "balance_loss_mlp": 1.0009712, + "epoch": 0.6755448669773034, + "flos": 59510502432000.0, + "grad_norm": 0.7808532256941619, + "language_loss": 0.51363605, + "learning_rate": 9.521855176837312e-07, + "loss": 0.53372651, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.0625, + "step": 11236, + "time_per_iteration": 2.990741014480591 + }, + { + "auxiliary_loss_clip": 0.01058108, + "auxiliary_loss_mlp": 0.01026199, + "balance_loss_clip": 1.01335454, + "balance_loss_mlp": 1.01982307, + "epoch": 0.6756049902299714, + "flos": 23293421648640.0, + "grad_norm": 1.9011938521218712, + "language_loss": 0.75737196, + "learning_rate": 9.518637166470635e-07, + "loss": 0.77821505, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.3828125, + "step": 11237, + "time_per_iteration": 2.394481897354126 + }, + { + "auxiliary_loss_clip": 0.01057424, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01845562, + "balance_loss_mlp": 1.01958418, + "epoch": 0.6756651134826394, + "flos": 31574121649920.0, + "grad_norm": 2.1190377379009857, + "language_loss": 0.77408189, + "learning_rate": 9.515419530156828e-07, + "loss": 0.79495621, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37890625, + "step": 11238, + "time_per_iteration": 2.466855764389038 + }, + { + "auxiliary_loss_clip": 0.01060392, + "auxiliary_loss_mlp": 0.0102754, + "balance_loss_clip": 1.01478481, + "balance_loss_mlp": 1.01894724, + "epoch": 0.6757252367353074, + "flos": 27271041402240.0, + "grad_norm": 1.8316643070463814, + "language_loss": 0.66278398, + "learning_rate": 9.512202268010745e-07, + "loss": 0.68366331, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.4140625, + "step": 11239, + "time_per_iteration": 2.488096237182617 + }, + { + "auxiliary_loss_clip": 0.01056771, + "auxiliary_loss_mlp": 0.01023086, + "balance_loss_clip": 1.01171887, + "balance_loss_mlp": 1.0192554, + "epoch": 0.6757853599879754, + "flos": 16830125804160.0, + "grad_norm": 1.89899545732981, + "language_loss": 0.82695156, + "learning_rate": 9.50898538014717e-07, + "loss": 0.84775013, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.375, + "step": 11240, + "time_per_iteration": 2.437082529067993 + }, + { + "auxiliary_loss_clip": 0.01062019, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.01293194, + "balance_loss_mlp": 1.01955032, + "epoch": 0.6758454832406433, + "flos": 23218986896640.0, + "grad_norm": 2.295918028778512, + "language_loss": 0.74363035, + "learning_rate": 9.505768866680925e-07, + "loss": 0.76452899, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.42382812, + "step": 11241, + "time_per_iteration": 2.4064927101135254 + }, + { + "auxiliary_loss_clip": 0.01060756, + "auxiliary_loss_mlp": 0.01024277, + "balance_loss_clip": 1.0125289, + "balance_loss_mlp": 1.02088892, + "epoch": 0.6759056064933113, + "flos": 16288622616960.0, + "grad_norm": 2.0794303172202433, + "language_loss": 0.80237007, + "learning_rate": 9.502552727726791e-07, + "loss": 0.82322031, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3984375, + "step": 11242, + "time_per_iteration": 2.393109083175659 + }, + { + "auxiliary_loss_clip": 0.01055168, + "auxiliary_loss_mlp": 0.0102397, + "balance_loss_clip": 1.01253176, + "balance_loss_mlp": 1.01674557, + "epoch": 0.6759657297459792, + "flos": 25921089705600.0, + "grad_norm": 1.7690700574310172, + "language_loss": 0.72794604, + "learning_rate": 9.499336963399562e-07, + "loss": 0.74873745, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38476562, + "step": 11243, + "time_per_iteration": 3.8845319747924805 + }, + { + "auxiliary_loss_clip": 0.01059051, + "auxiliary_loss_mlp": 0.01026644, + "balance_loss_clip": 1.01464581, + "balance_loss_mlp": 1.02032185, + "epoch": 0.6760258529986473, + "flos": 22999767269760.0, + "grad_norm": 1.6310271959159979, + "language_loss": 0.72666746, + "learning_rate": 9.49612157381397e-07, + "loss": 0.74752438, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 11244, + "time_per_iteration": 2.400974988937378 + }, + { + "auxiliary_loss_clip": 0.01059909, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.01806033, + "balance_loss_mlp": 1.01973367, + "epoch": 0.6760859762513152, + "flos": 20958290807040.0, + "grad_norm": 1.8264625480307297, + "language_loss": 0.8554486, + "learning_rate": 9.492906559084788e-07, + "loss": 0.87634873, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 11245, + "time_per_iteration": 2.436004161834717 + }, + { + "auxiliary_loss_clip": 0.01061354, + "auxiliary_loss_mlp": 0.01029932, + "balance_loss_clip": 1.01725972, + "balance_loss_mlp": 1.01951027, + "epoch": 0.6761460995039832, + "flos": 23621814696960.0, + "grad_norm": 2.0179082830801183, + "language_loss": 0.73712695, + "learning_rate": 9.489691919326743e-07, + "loss": 0.75803983, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41796875, + "step": 11246, + "time_per_iteration": 2.417330026626587 + }, + { + "auxiliary_loss_clip": 0.01057355, + "auxiliary_loss_mlp": 0.01022783, + "balance_loss_clip": 1.01141012, + "balance_loss_mlp": 1.01857448, + "epoch": 0.6762062227566511, + "flos": 20770004511360.0, + "grad_norm": 1.860497533929894, + "language_loss": 0.88049084, + "learning_rate": 9.486477654654557e-07, + "loss": 0.9012922, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38671875, + "step": 11247, + "time_per_iteration": 2.3827199935913086 + }, + { + "auxiliary_loss_clip": 0.01007123, + "auxiliary_loss_mlp": 0.01003563, + "balance_loss_clip": 1.00257361, + "balance_loss_mlp": 1.00084233, + "epoch": 0.6762663460093191, + "flos": 52814963157120.0, + "grad_norm": 0.826983564513949, + "language_loss": 0.54055661, + "learning_rate": 9.48326376518294e-07, + "loss": 0.56066346, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.0625, + "step": 11248, + "time_per_iteration": 3.037388563156128 + }, + { + "auxiliary_loss_clip": 0.01054658, + "auxiliary_loss_mlp": 0.0102886, + "balance_loss_clip": 1.01794016, + "balance_loss_mlp": 1.01901102, + "epoch": 0.676326469261987, + "flos": 23695167196800.0, + "grad_norm": 1.6754082186591372, + "language_loss": 0.7392534, + "learning_rate": 9.480050251026579e-07, + "loss": 0.76008862, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.35742188, + "step": 11249, + "time_per_iteration": 2.4326624870300293 + }, + { + "auxiliary_loss_clip": 0.01057573, + "auxiliary_loss_mlp": 0.01027034, + "balance_loss_clip": 1.01353335, + "balance_loss_mlp": 1.01769495, + "epoch": 0.676386592514655, + "flos": 14062874664960.0, + "grad_norm": 2.1116891235298403, + "language_loss": 0.74244881, + "learning_rate": 9.47683711230018e-07, + "loss": 0.76329488, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.3984375, + "step": 11250, + "time_per_iteration": 3.7769620418548584 + }, + { + "auxiliary_loss_clip": 0.01059104, + "auxiliary_loss_mlp": 0.01022995, + "balance_loss_clip": 1.01103818, + "balance_loss_mlp": 1.01969504, + "epoch": 0.676446715767323, + "flos": 20411201802240.0, + "grad_norm": 2.1792004968225704, + "language_loss": 0.7516287, + "learning_rate": 9.473624349118381e-07, + "loss": 0.77244961, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 11251, + "time_per_iteration": 2.3796966075897217 + }, + { + "auxiliary_loss_clip": 0.01057658, + "auxiliary_loss_mlp": 0.01022462, + "balance_loss_clip": 1.01056504, + "balance_loss_mlp": 1.01929343, + "epoch": 0.676506839019991, + "flos": 21287172614400.0, + "grad_norm": 2.464188178754606, + "language_loss": 0.70051247, + "learning_rate": 9.470411961595859e-07, + "loss": 0.72131371, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 11252, + "time_per_iteration": 2.4889590740203857 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.01023272, + "balance_loss_clip": 1.01250172, + "balance_loss_mlp": 1.01976383, + "epoch": 0.676566962272659, + "flos": 29931248712960.0, + "grad_norm": 1.981618483644026, + "language_loss": 0.66494572, + "learning_rate": 9.467199949847249e-07, + "loss": 0.68576628, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.390625, + "step": 11253, + "time_per_iteration": 2.495748996734619 + }, + { + "auxiliary_loss_clip": 0.01059432, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.01254094, + "balance_loss_mlp": 1.01965117, + "epoch": 0.6766270855253269, + "flos": 17930238744960.0, + "grad_norm": 1.6106135530678318, + "language_loss": 0.77032697, + "learning_rate": 9.463988313987177e-07, + "loss": 0.79117876, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.3984375, + "step": 11254, + "time_per_iteration": 2.399510145187378 + }, + { + "auxiliary_loss_clip": 0.01057381, + "auxiliary_loss_mlp": 0.01025556, + "balance_loss_clip": 1.01349759, + "balance_loss_mlp": 1.01822281, + "epoch": 0.6766872087779949, + "flos": 23103948366720.0, + "grad_norm": 2.074330460018026, + "language_loss": 0.72057652, + "learning_rate": 9.460777054130256e-07, + "loss": 0.7414059, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.390625, + "step": 11255, + "time_per_iteration": 2.3892011642456055 + }, + { + "auxiliary_loss_clip": 0.01056661, + "auxiliary_loss_mlp": 0.01029032, + "balance_loss_clip": 1.01610923, + "balance_loss_mlp": 1.01788962, + "epoch": 0.6767473320306628, + "flos": 26211951175680.0, + "grad_norm": 2.000156796664541, + "language_loss": 0.74698007, + "learning_rate": 9.457566170391105e-07, + "loss": 0.76783699, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.38671875, + "step": 11256, + "time_per_iteration": 2.416048049926758 + }, + { + "auxiliary_loss_clip": 0.01060518, + "auxiliary_loss_mlp": 0.01023709, + "balance_loss_clip": 1.01092982, + "balance_loss_mlp": 1.02154088, + "epoch": 0.6768074552833309, + "flos": 18367770303360.0, + "grad_norm": 2.095121930000568, + "language_loss": 0.65095454, + "learning_rate": 9.454355662884283e-07, + "loss": 0.6717968, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.390625, + "step": 11257, + "time_per_iteration": 2.404146671295166 + }, + { + "auxiliary_loss_clip": 0.01055757, + "auxiliary_loss_mlp": 0.01022459, + "balance_loss_clip": 1.01131248, + "balance_loss_mlp": 1.01848459, + "epoch": 0.6768675785359988, + "flos": 23038800036480.0, + "grad_norm": 1.4392495838765131, + "language_loss": 0.75577939, + "learning_rate": 9.451145531724389e-07, + "loss": 0.7765615, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37304688, + "step": 11258, + "time_per_iteration": 2.459279775619507 + }, + { + "auxiliary_loss_clip": 0.0105615, + "auxiliary_loss_mlp": 0.0102528, + "balance_loss_clip": 1.0137707, + "balance_loss_mlp": 1.01843822, + "epoch": 0.6769277017886668, + "flos": 33035131981440.0, + "grad_norm": 1.4814974992411214, + "language_loss": 0.66610682, + "learning_rate": 9.447935777025968e-07, + "loss": 0.68692106, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37695312, + "step": 11259, + "time_per_iteration": 2.5163934230804443 + }, + { + "auxiliary_loss_clip": 0.0105777, + "auxiliary_loss_mlp": 0.01024438, + "balance_loss_clip": 1.01192689, + "balance_loss_mlp": 1.01891375, + "epoch": 0.6769878250413347, + "flos": 20847406728960.0, + "grad_norm": 2.4562714807073527, + "language_loss": 0.85350156, + "learning_rate": 9.444726398903593e-07, + "loss": 0.87432367, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 11260, + "time_per_iteration": 2.390697956085205 + }, + { + "auxiliary_loss_clip": 0.01058536, + "auxiliary_loss_mlp": 0.01026333, + "balance_loss_clip": 1.01363158, + "balance_loss_mlp": 1.01809859, + "epoch": 0.6770479482940027, + "flos": 15595072992000.0, + "grad_norm": 2.0541772983829985, + "language_loss": 0.77334052, + "learning_rate": 9.441517397471765e-07, + "loss": 0.79418921, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 11261, + "time_per_iteration": 2.4111077785491943 + }, + { + "auxiliary_loss_clip": 0.01056706, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01294804, + "balance_loss_mlp": 1.01743042, + "epoch": 0.6771080715466706, + "flos": 18620122677120.0, + "grad_norm": 4.124529250929395, + "language_loss": 0.87713242, + "learning_rate": 9.43830877284503e-07, + "loss": 0.89795387, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 11262, + "time_per_iteration": 2.3563637733459473 + }, + { + "auxiliary_loss_clip": 0.01059322, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.014364, + "balance_loss_mlp": 1.01908708, + "epoch": 0.6771681947993387, + "flos": 12494611036800.0, + "grad_norm": 1.9756955106046352, + "language_loss": 0.74010789, + "learning_rate": 9.435100525137893e-07, + "loss": 0.7609657, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11263, + "time_per_iteration": 2.3786158561706543 + }, + { + "auxiliary_loss_clip": 0.01058877, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.01545024, + "balance_loss_mlp": 1.01815677, + "epoch": 0.6772283180520066, + "flos": 22235867521920.0, + "grad_norm": 2.281760449798072, + "language_loss": 0.67893225, + "learning_rate": 9.431892654464828e-07, + "loss": 0.69980657, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 11264, + "time_per_iteration": 2.385636806488037 + }, + { + "auxiliary_loss_clip": 0.01058423, + "auxiliary_loss_mlp": 0.01026487, + "balance_loss_clip": 1.01414883, + "balance_loss_mlp": 1.01968503, + "epoch": 0.6772884413046746, + "flos": 16142231819520.0, + "grad_norm": 1.885544381020051, + "language_loss": 0.72775877, + "learning_rate": 9.428685160940337e-07, + "loss": 0.74860787, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38671875, + "step": 11265, + "time_per_iteration": 2.424107074737549 + }, + { + "auxiliary_loss_clip": 0.01057086, + "auxiliary_loss_mlp": 0.01026141, + "balance_loss_clip": 1.014274, + "balance_loss_mlp": 1.01888156, + "epoch": 0.6773485645573426, + "flos": 19134742250880.0, + "grad_norm": 1.5948909733287144, + "language_loss": 0.6162523, + "learning_rate": 9.42547804467888e-07, + "loss": 0.6370846, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3828125, + "step": 11266, + "time_per_iteration": 2.400667667388916 + }, + { + "auxiliary_loss_clip": 0.01061768, + "auxiliary_loss_mlp": 0.0102375, + "balance_loss_clip": 1.01101875, + "balance_loss_mlp": 1.02071011, + "epoch": 0.6774086878100105, + "flos": 14136052608000.0, + "grad_norm": 2.3846707222277863, + "language_loss": 0.70772803, + "learning_rate": 9.422271305794911e-07, + "loss": 0.72858322, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 11267, + "time_per_iteration": 2.4092466831207275 + }, + { + "auxiliary_loss_clip": 0.01059271, + "auxiliary_loss_mlp": 0.01027244, + "balance_loss_clip": 1.01467967, + "balance_loss_mlp": 1.01962197, + "epoch": 0.6774688110626785, + "flos": 22196066705280.0, + "grad_norm": 2.246306068879972, + "language_loss": 0.70715415, + "learning_rate": 9.419064944402863e-07, + "loss": 0.72801924, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39648438, + "step": 11268, + "time_per_iteration": 2.3840172290802 + }, + { + "auxiliary_loss_clip": 0.01059003, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.01455259, + "balance_loss_mlp": 1.01962996, + "epoch": 0.6775289343153464, + "flos": 23038834947840.0, + "grad_norm": 1.5180230201637552, + "language_loss": 0.76911461, + "learning_rate": 9.415858960617176e-07, + "loss": 0.78997529, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 11269, + "time_per_iteration": 2.438626527786255 + }, + { + "auxiliary_loss_clip": 0.01059452, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.01824582, + "balance_loss_mlp": 1.01848412, + "epoch": 0.6775890575680145, + "flos": 18292602412800.0, + "grad_norm": 2.772256788437959, + "language_loss": 0.74070096, + "learning_rate": 9.412653354552258e-07, + "loss": 0.76160789, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 11270, + "time_per_iteration": 2.4225120544433594 + }, + { + "auxiliary_loss_clip": 0.01060444, + "auxiliary_loss_mlp": 0.01025988, + "balance_loss_clip": 1.0124104, + "balance_loss_mlp": 1.01981759, + "epoch": 0.6776491808206824, + "flos": 25335317047680.0, + "grad_norm": 1.709996747399506, + "language_loss": 0.73086208, + "learning_rate": 9.409448126322506e-07, + "loss": 0.75172633, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.40625, + "step": 11271, + "time_per_iteration": 2.451498508453369 + }, + { + "auxiliary_loss_clip": 0.0105942, + "auxiliary_loss_mlp": 0.0102597, + "balance_loss_clip": 1.01444244, + "balance_loss_mlp": 1.01988411, + "epoch": 0.6777093040733504, + "flos": 26027121104640.0, + "grad_norm": 1.565812195306252, + "language_loss": 0.67881656, + "learning_rate": 9.406243276042303e-07, + "loss": 0.69967043, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39648438, + "step": 11272, + "time_per_iteration": 3.942746639251709 + }, + { + "auxiliary_loss_clip": 0.01061094, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.01738787, + "balance_loss_mlp": 1.01995039, + "epoch": 0.6777694273260183, + "flos": 18002648638080.0, + "grad_norm": 1.7576467936165878, + "language_loss": 0.73551655, + "learning_rate": 9.40303880382604e-07, + "loss": 0.75643474, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41210938, + "step": 11273, + "time_per_iteration": 2.385559320449829 + }, + { + "auxiliary_loss_clip": 0.01057506, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.01923048, + "balance_loss_mlp": 1.01932549, + "epoch": 0.6778295505786863, + "flos": 23439952091520.0, + "grad_norm": 1.8269661762470804, + "language_loss": 0.70085001, + "learning_rate": 9.399834709788051e-07, + "loss": 0.72173762, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 11274, + "time_per_iteration": 2.3999364376068115 + }, + { + "auxiliary_loss_clip": 0.01060099, + "auxiliary_loss_mlp": 0.01025422, + "balance_loss_clip": 1.01281571, + "balance_loss_mlp": 1.02031398, + "epoch": 0.6778896738313542, + "flos": 19097420140800.0, + "grad_norm": 1.5528620105452944, + "language_loss": 0.65629059, + "learning_rate": 9.3966309940427e-07, + "loss": 0.67714572, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3984375, + "step": 11275, + "time_per_iteration": 2.4120819568634033 + }, + { + "auxiliary_loss_clip": 0.01060394, + "auxiliary_loss_mlp": 0.01025158, + "balance_loss_clip": 1.01364803, + "balance_loss_mlp": 1.02134657, + "epoch": 0.6779497970840223, + "flos": 26102742842880.0, + "grad_norm": 1.8233083964860786, + "language_loss": 0.72482216, + "learning_rate": 9.393427656704307e-07, + "loss": 0.74567765, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 11276, + "time_per_iteration": 2.450122833251953 + }, + { + "auxiliary_loss_clip": 0.01058492, + "auxiliary_loss_mlp": 0.01024803, + "balance_loss_clip": 1.01203573, + "balance_loss_mlp": 1.02002621, + "epoch": 0.6780099203366902, + "flos": 19718210759040.0, + "grad_norm": 1.7383669980717182, + "language_loss": 0.76979643, + "learning_rate": 9.39022469788721e-07, + "loss": 0.79062939, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.38476562, + "step": 11277, + "time_per_iteration": 2.4117329120635986 + }, + { + "auxiliary_loss_clip": 0.0105939, + "auxiliary_loss_mlp": 0.0102297, + "balance_loss_clip": 1.01102519, + "balance_loss_mlp": 1.01902962, + "epoch": 0.6780700435893582, + "flos": 18213803740800.0, + "grad_norm": 3.1298154189220053, + "language_loss": 0.83328015, + "learning_rate": 9.387022117705699e-07, + "loss": 0.8541038, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 11278, + "time_per_iteration": 2.356454610824585 + }, + { + "auxiliary_loss_clip": 0.0105622, + "auxiliary_loss_mlp": 0.01024416, + "balance_loss_clip": 1.01380062, + "balance_loss_mlp": 1.01891506, + "epoch": 0.6781301668420262, + "flos": 25375013130240.0, + "grad_norm": 1.5344010826247092, + "language_loss": 0.70121253, + "learning_rate": 9.383819916274059e-07, + "loss": 0.7220189, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37304688, + "step": 11279, + "time_per_iteration": 2.473766326904297 + }, + { + "auxiliary_loss_clip": 0.01059346, + "auxiliary_loss_mlp": 0.01023362, + "balance_loss_clip": 1.01073146, + "balance_loss_mlp": 1.01936042, + "epoch": 0.6781902900946941, + "flos": 24019405793280.0, + "grad_norm": 1.8331533613069937, + "language_loss": 0.73908246, + "learning_rate": 9.380618093706592e-07, + "loss": 0.75990951, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 11280, + "time_per_iteration": 2.4224629402160645 + }, + { + "auxiliary_loss_clip": 0.01060268, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.01327896, + "balance_loss_mlp": 1.01959264, + "epoch": 0.6782504133473621, + "flos": 19645731043200.0, + "grad_norm": 3.431692930122287, + "language_loss": 0.70241576, + "learning_rate": 9.377416650117533e-07, + "loss": 0.72328275, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40625, + "step": 11281, + "time_per_iteration": 2.393928050994873 + }, + { + "auxiliary_loss_clip": 0.01056924, + "auxiliary_loss_mlp": 0.01020435, + "balance_loss_clip": 1.00936604, + "balance_loss_mlp": 1.01899028, + "epoch": 0.67831053660003, + "flos": 24931686286080.0, + "grad_norm": 2.713262449192202, + "language_loss": 0.63751113, + "learning_rate": 9.374215585621159e-07, + "loss": 0.65828466, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37890625, + "step": 11282, + "time_per_iteration": 2.495755434036255 + }, + { + "auxiliary_loss_clip": 0.01062667, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.0172348, + "balance_loss_mlp": 1.0207876, + "epoch": 0.6783706598526981, + "flos": 31207149682560.0, + "grad_norm": 1.586343825209863, + "language_loss": 0.7432673, + "learning_rate": 9.371014900331699e-07, + "loss": 0.76421118, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.41796875, + "step": 11283, + "time_per_iteration": 5.291429042816162 + }, + { + "auxiliary_loss_clip": 0.01057794, + "auxiliary_loss_mlp": 0.0102284, + "balance_loss_clip": 1.0109849, + "balance_loss_mlp": 1.01962423, + "epoch": 0.678430783105366, + "flos": 35439949630080.0, + "grad_norm": 1.5630854837457568, + "language_loss": 0.56898838, + "learning_rate": 9.367814594363374e-07, + "loss": 0.58979475, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3828125, + "step": 11284, + "time_per_iteration": 2.5541441440582275 + }, + { + "auxiliary_loss_clip": 0.01057389, + "auxiliary_loss_mlp": 0.0102283, + "balance_loss_clip": 1.01113534, + "balance_loss_mlp": 1.01767778, + "epoch": 0.678490906358034, + "flos": 14427926507520.0, + "grad_norm": 1.9362136295300367, + "language_loss": 0.75014114, + "learning_rate": 9.36461466783039e-07, + "loss": 0.77094334, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39648438, + "step": 11285, + "time_per_iteration": 2.4244580268859863 + }, + { + "auxiliary_loss_clip": 0.01058749, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.01547086, + "balance_loss_mlp": 1.02036476, + "epoch": 0.6785510296107019, + "flos": 24310232352000.0, + "grad_norm": 1.6141206482327337, + "language_loss": 0.66493112, + "learning_rate": 9.361415120846958e-07, + "loss": 0.68579561, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38476562, + "step": 11286, + "time_per_iteration": 2.4141108989715576 + }, + { + "auxiliary_loss_clip": 0.01056534, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.01210976, + "balance_loss_mlp": 1.01704359, + "epoch": 0.6786111528633699, + "flos": 26976095303040.0, + "grad_norm": 2.4813771920324017, + "language_loss": 0.53010386, + "learning_rate": 9.358215953527256e-07, + "loss": 0.55091232, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 11287, + "time_per_iteration": 2.4190146923065186 + }, + { + "auxiliary_loss_clip": 0.0105937, + "auxiliary_loss_mlp": 0.01022418, + "balance_loss_clip": 1.01002634, + "balance_loss_mlp": 1.02049005, + "epoch": 0.6786712761160378, + "flos": 24316376751360.0, + "grad_norm": 1.4959485620983028, + "language_loss": 0.771433, + "learning_rate": 9.355017165985453e-07, + "loss": 0.79225093, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38867188, + "step": 11288, + "time_per_iteration": 2.4398958683013916 + }, + { + "auxiliary_loss_clip": 0.0106108, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.01620746, + "balance_loss_mlp": 1.02040172, + "epoch": 0.6787313993687059, + "flos": 22929312412800.0, + "grad_norm": 2.892879917576392, + "language_loss": 0.73298872, + "learning_rate": 9.351818758335696e-07, + "loss": 0.75388539, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40625, + "step": 11289, + "time_per_iteration": 2.4300763607025146 + }, + { + "auxiliary_loss_clip": 0.0105762, + "auxiliary_loss_mlp": 0.01026439, + "balance_loss_clip": 1.01432097, + "balance_loss_mlp": 1.0185703, + "epoch": 0.6787915226213738, + "flos": 26867270995200.0, + "grad_norm": 1.4534389650743078, + "language_loss": 0.80150604, + "learning_rate": 9.348620730692154e-07, + "loss": 0.82234669, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 11290, + "time_per_iteration": 3.8813083171844482 + }, + { + "auxiliary_loss_clip": 0.01057366, + "auxiliary_loss_mlp": 0.01023402, + "balance_loss_clip": 1.01201749, + "balance_loss_mlp": 1.01968598, + "epoch": 0.6788516458740418, + "flos": 20007885242880.0, + "grad_norm": 1.9342793432999188, + "language_loss": 0.74804419, + "learning_rate": 9.345423083168921e-07, + "loss": 0.76885188, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37695312, + "step": 11291, + "time_per_iteration": 2.4091529846191406 + }, + { + "auxiliary_loss_clip": 0.01060224, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.01505518, + "balance_loss_mlp": 1.01996315, + "epoch": 0.6789117691267098, + "flos": 28725942245760.0, + "grad_norm": 2.0450179713088183, + "language_loss": 0.75628865, + "learning_rate": 9.342225815880142e-07, + "loss": 0.7771675, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 11292, + "time_per_iteration": 2.4454476833343506 + }, + { + "auxiliary_loss_clip": 0.01058982, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.01264453, + "balance_loss_mlp": 1.0190959, + "epoch": 0.6789718923793777, + "flos": 23402350690560.0, + "grad_norm": 2.046708114604479, + "language_loss": 0.84468484, + "learning_rate": 9.339028928939907e-07, + "loss": 0.86552751, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3984375, + "step": 11293, + "time_per_iteration": 2.4199564456939697 + }, + { + "auxiliary_loss_clip": 0.01062418, + "auxiliary_loss_mlp": 0.01031201, + "balance_loss_clip": 1.01828492, + "balance_loss_mlp": 1.02105308, + "epoch": 0.6790320156320457, + "flos": 20447825685120.0, + "grad_norm": 2.4879352557606, + "language_loss": 0.79139256, + "learning_rate": 9.335832422462308e-07, + "loss": 0.8123287, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 11294, + "time_per_iteration": 2.3635284900665283 + }, + { + "auxiliary_loss_clip": 0.01057276, + "auxiliary_loss_mlp": 0.01021093, + "balance_loss_clip": 1.0094763, + "balance_loss_mlp": 1.01918137, + "epoch": 0.6790921388847136, + "flos": 24166145704320.0, + "grad_norm": 2.209362612133923, + "language_loss": 0.79009789, + "learning_rate": 9.332636296561418e-07, + "loss": 0.81088161, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38085938, + "step": 11295, + "time_per_iteration": 2.403095006942749 + }, + { + "auxiliary_loss_clip": 0.01056655, + "auxiliary_loss_mlp": 0.01024795, + "balance_loss_clip": 1.01446521, + "balance_loss_mlp": 1.01984596, + "epoch": 0.6791522621373817, + "flos": 21907020625920.0, + "grad_norm": 1.8573085744942148, + "language_loss": 0.71348739, + "learning_rate": 9.329440551351289e-07, + "loss": 0.73430181, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.36914062, + "step": 11296, + "time_per_iteration": 2.3886935710906982 + }, + { + "auxiliary_loss_clip": 0.0105706, + "auxiliary_loss_mlp": 0.0102558, + "balance_loss_clip": 1.01413596, + "balance_loss_mlp": 1.0184257, + "epoch": 0.6792123853900496, + "flos": 24825375596160.0, + "grad_norm": 1.5706765100956344, + "language_loss": 0.70949316, + "learning_rate": 9.326245186945996e-07, + "loss": 0.7303195, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 11297, + "time_per_iteration": 2.4364521503448486 + }, + { + "auxiliary_loss_clip": 0.0106093, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01369524, + "balance_loss_mlp": 1.0197289, + "epoch": 0.6792725086427176, + "flos": 17565326547840.0, + "grad_norm": 2.154926288899104, + "language_loss": 0.6647532, + "learning_rate": 9.323050203459539e-07, + "loss": 0.68562639, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 11298, + "time_per_iteration": 2.362678289413452 + }, + { + "auxiliary_loss_clip": 0.01059671, + "auxiliary_loss_mlp": 0.01021656, + "balance_loss_clip": 1.01079011, + "balance_loss_mlp": 1.02010751, + "epoch": 0.6793326318953855, + "flos": 26940658406400.0, + "grad_norm": 1.6870613311503757, + "language_loss": 0.71719098, + "learning_rate": 9.319855601005966e-07, + "loss": 0.73800421, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.39453125, + "step": 11299, + "time_per_iteration": 2.428150177001953 + }, + { + "auxiliary_loss_clip": 0.01058848, + "auxiliary_loss_mlp": 0.01026338, + "balance_loss_clip": 1.01329088, + "balance_loss_mlp": 1.01975942, + "epoch": 0.6793927551480535, + "flos": 24317074978560.0, + "grad_norm": 1.278900559406549, + "language_loss": 0.77186066, + "learning_rate": 9.316661379699274e-07, + "loss": 0.79271257, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.390625, + "step": 11300, + "time_per_iteration": 2.438361167907715 + }, + { + "auxiliary_loss_clip": 0.01055712, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01331139, + "balance_loss_mlp": 1.01753592, + "epoch": 0.6794528784007214, + "flos": 11435835012480.0, + "grad_norm": 1.7805862429466122, + "language_loss": 0.62634379, + "learning_rate": 9.313467539653454e-07, + "loss": 0.64714998, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 11301, + "time_per_iteration": 2.404844284057617 + }, + { + "auxiliary_loss_clip": 0.01057878, + "auxiliary_loss_mlp": 0.0102118, + "balance_loss_clip": 1.00959921, + "balance_loss_mlp": 1.01952028, + "epoch": 0.6795130016533895, + "flos": 25228482687360.0, + "grad_norm": 2.84472856368307, + "language_loss": 0.8290621, + "learning_rate": 9.310274080982483e-07, + "loss": 0.84985268, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3828125, + "step": 11302, + "time_per_iteration": 2.463226795196533 + }, + { + "auxiliary_loss_clip": 0.01060092, + "auxiliary_loss_mlp": 0.01026837, + "balance_loss_clip": 1.0143435, + "balance_loss_mlp": 1.01955426, + "epoch": 0.6795731249060574, + "flos": 18295430232960.0, + "grad_norm": 1.9460263582456216, + "language_loss": 0.69615972, + "learning_rate": 9.307081003800339e-07, + "loss": 0.71702898, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 11303, + "time_per_iteration": 2.3953444957733154 + }, + { + "auxiliary_loss_clip": 0.0105976, + "auxiliary_loss_mlp": 0.01025996, + "balance_loss_clip": 1.01348543, + "balance_loss_mlp": 1.01924849, + "epoch": 0.6796332481587254, + "flos": 20299410028800.0, + "grad_norm": 3.8900488896021947, + "language_loss": 0.70086884, + "learning_rate": 9.303888308220969e-07, + "loss": 0.72172636, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 11304, + "time_per_iteration": 2.4396238327026367 + }, + { + "auxiliary_loss_clip": 0.01059888, + "auxiliary_loss_mlp": 0.0102316, + "balance_loss_clip": 1.01071477, + "balance_loss_mlp": 1.01992297, + "epoch": 0.6796933714113934, + "flos": 23585714484480.0, + "grad_norm": 2.2403629459921652, + "language_loss": 0.7248199, + "learning_rate": 9.300695994358312e-07, + "loss": 0.74565029, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.3984375, + "step": 11305, + "time_per_iteration": 2.45145320892334 + }, + { + "auxiliary_loss_clip": 0.01060368, + "auxiliary_loss_mlp": 0.01024935, + "balance_loss_clip": 1.01255488, + "balance_loss_mlp": 1.02072597, + "epoch": 0.6797534946640613, + "flos": 27118855319040.0, + "grad_norm": 2.262283720135965, + "language_loss": 0.65880984, + "learning_rate": 9.297504062326285e-07, + "loss": 0.67966282, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39648438, + "step": 11306, + "time_per_iteration": 2.505852699279785 + }, + { + "auxiliary_loss_clip": 0.01057075, + "auxiliary_loss_mlp": 0.01023664, + "balance_loss_clip": 1.01196361, + "balance_loss_mlp": 1.01891708, + "epoch": 0.6798136179167293, + "flos": 22126344986880.0, + "grad_norm": 1.6808596431336669, + "language_loss": 0.79950655, + "learning_rate": 9.294312512238823e-07, + "loss": 0.82031393, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 11307, + "time_per_iteration": 2.4094455242156982 + }, + { + "auxiliary_loss_clip": 0.01058598, + "auxiliary_loss_mlp": 0.01025877, + "balance_loss_clip": 1.01444483, + "balance_loss_mlp": 1.01979959, + "epoch": 0.6798737411693972, + "flos": 17487819596160.0, + "grad_norm": 1.4788592979631736, + "language_loss": 0.89083719, + "learning_rate": 9.291121344209802e-07, + "loss": 0.91168189, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 11308, + "time_per_iteration": 2.383631944656372 + }, + { + "auxiliary_loss_clip": 0.01058264, + "auxiliary_loss_mlp": 0.01026029, + "balance_loss_clip": 1.01398301, + "balance_loss_mlp": 1.01867557, + "epoch": 0.6799338644220653, + "flos": 22891187341440.0, + "grad_norm": 1.9068273566055578, + "language_loss": 0.70926535, + "learning_rate": 9.287930558353106e-07, + "loss": 0.73010832, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39648438, + "step": 11309, + "time_per_iteration": 2.414903163909912 + }, + { + "auxiliary_loss_clip": 0.01059391, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.01617861, + "balance_loss_mlp": 1.01937103, + "epoch": 0.6799939876747332, + "flos": 23179430459520.0, + "grad_norm": 1.7398942775808746, + "language_loss": 0.84488165, + "learning_rate": 9.284740154782622e-07, + "loss": 0.86575949, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40039062, + "step": 11310, + "time_per_iteration": 2.414982318878174 + }, + { + "auxiliary_loss_clip": 0.01059144, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.01376724, + "balance_loss_mlp": 1.02114928, + "epoch": 0.6800541109274012, + "flos": 19498921309440.0, + "grad_norm": 2.3649747692654426, + "language_loss": 0.83061433, + "learning_rate": 9.281550133612197e-07, + "loss": 0.85145658, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38085938, + "step": 11311, + "time_per_iteration": 2.3786652088165283 + }, + { + "auxiliary_loss_clip": 0.01060854, + "auxiliary_loss_mlp": 0.01024497, + "balance_loss_clip": 1.01123548, + "balance_loss_mlp": 1.01925218, + "epoch": 0.6801142341800691, + "flos": 22276436388480.0, + "grad_norm": 1.530260974103352, + "language_loss": 0.76185513, + "learning_rate": 9.278360494955677e-07, + "loss": 0.78270864, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41601562, + "step": 11312, + "time_per_iteration": 3.8766698837280273 + }, + { + "auxiliary_loss_clip": 0.01006988, + "auxiliary_loss_mlp": 0.01001257, + "balance_loss_clip": 1.00033355, + "balance_loss_mlp": 1.00075555, + "epoch": 0.6801743574327371, + "flos": 68711547450240.0, + "grad_norm": 0.6557147434363612, + "language_loss": 0.57277226, + "learning_rate": 9.275171238926884e-07, + "loss": 0.59285474, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06201172, + "step": 11313, + "time_per_iteration": 3.175163507461548 + }, + { + "auxiliary_loss_clip": 0.01057613, + "auxiliary_loss_mlp": 0.01019397, + "balance_loss_clip": 1.00713611, + "balance_loss_mlp": 1.01844335, + "epoch": 0.680234480685405, + "flos": 29459187953280.0, + "grad_norm": 1.8507325703395436, + "language_loss": 0.68843001, + "learning_rate": 9.271982365639659e-07, + "loss": 0.70920014, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.390625, + "step": 11314, + "time_per_iteration": 2.4911270141601562 + }, + { + "auxiliary_loss_clip": 0.0105574, + "auxiliary_loss_mlp": 0.01021538, + "balance_loss_clip": 1.01063037, + "balance_loss_mlp": 1.01754332, + "epoch": 0.6802946039380731, + "flos": 15916169566080.0, + "grad_norm": 1.968846230695246, + "language_loss": 0.81845868, + "learning_rate": 9.268793875207772e-07, + "loss": 0.83923149, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38085938, + "step": 11315, + "time_per_iteration": 2.3831257820129395 + }, + { + "auxiliary_loss_clip": 0.01057245, + "auxiliary_loss_mlp": 0.01021691, + "balance_loss_clip": 1.00898933, + "balance_loss_mlp": 1.0189904, + "epoch": 0.680354727190741, + "flos": 22017555590400.0, + "grad_norm": 1.6178875508086727, + "language_loss": 0.6986838, + "learning_rate": 9.265605767745033e-07, + "loss": 0.71947312, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3828125, + "step": 11316, + "time_per_iteration": 2.413984775543213 + }, + { + "auxiliary_loss_clip": 0.01058433, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_clip": 1.01264739, + "balance_loss_mlp": 1.01960278, + "epoch": 0.680414850443409, + "flos": 18440529310080.0, + "grad_norm": 2.2597491303088857, + "language_loss": 0.6703018, + "learning_rate": 9.262418043365215e-07, + "loss": 0.691127, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 11317, + "time_per_iteration": 2.3787741661071777 + }, + { + "auxiliary_loss_clip": 0.01060845, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.0200659, + "balance_loss_mlp": 1.02076399, + "epoch": 0.680474973696077, + "flos": 26357434277760.0, + "grad_norm": 1.530181577677079, + "language_loss": 0.7501964, + "learning_rate": 9.259230702182075e-07, + "loss": 0.77112699, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11318, + "time_per_iteration": 2.473525285720825 + }, + { + "auxiliary_loss_clip": 0.01054318, + "auxiliary_loss_mlp": 0.01021249, + "balance_loss_clip": 1.01012087, + "balance_loss_mlp": 1.01728356, + "epoch": 0.6805350969487449, + "flos": 18332123938560.0, + "grad_norm": 1.5346548775838882, + "language_loss": 0.68034005, + "learning_rate": 9.256043744309354e-07, + "loss": 0.7010957, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37109375, + "step": 11319, + "time_per_iteration": 2.4227991104125977 + }, + { + "auxiliary_loss_clip": 0.01057358, + "auxiliary_loss_mlp": 0.01026564, + "balance_loss_clip": 1.01475585, + "balance_loss_mlp": 1.01803517, + "epoch": 0.6805952202014129, + "flos": 19936487779200.0, + "grad_norm": 1.8295401387090073, + "language_loss": 0.76305687, + "learning_rate": 9.252857169860804e-07, + "loss": 0.78389603, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 11320, + "time_per_iteration": 2.40329909324646 + }, + { + "auxiliary_loss_clip": 0.01056767, + "auxiliary_loss_mlp": 0.01026892, + "balance_loss_clip": 1.01457739, + "balance_loss_mlp": 1.01835454, + "epoch": 0.6806553434540809, + "flos": 25223245983360.0, + "grad_norm": 2.0822625725930295, + "language_loss": 0.74393284, + "learning_rate": 9.249670978950137e-07, + "loss": 0.76476943, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3828125, + "step": 11321, + "time_per_iteration": 2.4776158332824707 + }, + { + "auxiliary_loss_clip": 0.01058628, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.01740336, + "balance_loss_mlp": 1.01988578, + "epoch": 0.6807154667067489, + "flos": 17784615997440.0, + "grad_norm": 2.0066878638455896, + "language_loss": 0.72737014, + "learning_rate": 9.246485171691058e-07, + "loss": 0.7482481, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 11322, + "time_per_iteration": 3.880328416824341 + }, + { + "auxiliary_loss_clip": 0.0105762, + "auxiliary_loss_mlp": 0.01022105, + "balance_loss_clip": 1.01008844, + "balance_loss_mlp": 1.0188235, + "epoch": 0.6807755899594168, + "flos": 22198824702720.0, + "grad_norm": 1.7077473603564448, + "language_loss": 0.78434259, + "learning_rate": 9.243299748197264e-07, + "loss": 0.80513984, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38867188, + "step": 11323, + "time_per_iteration": 2.4506161212921143 + }, + { + "auxiliary_loss_clip": 0.01007483, + "auxiliary_loss_mlp": 0.01002727, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00112557, + "epoch": 0.6808357132120848, + "flos": 68628105567360.0, + "grad_norm": 0.7518935608317822, + "language_loss": 0.5709728, + "learning_rate": 9.240114708582432e-07, + "loss": 0.59107494, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.06347656, + "step": 11324, + "time_per_iteration": 2.869373321533203 + }, + { + "auxiliary_loss_clip": 0.01058307, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.01232076, + "balance_loss_mlp": 1.01892567, + "epoch": 0.6808958364647527, + "flos": 23842186398720.0, + "grad_norm": 1.8450181894154878, + "language_loss": 0.68155581, + "learning_rate": 9.236930052960225e-07, + "loss": 0.70239377, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.39453125, + "step": 11325, + "time_per_iteration": 2.5145061016082764 + }, + { + "auxiliary_loss_clip": 0.01061222, + "auxiliary_loss_mlp": 0.01024485, + "balance_loss_clip": 1.01194453, + "balance_loss_mlp": 1.01909876, + "epoch": 0.6809559597174207, + "flos": 17710774738560.0, + "grad_norm": 2.172621176775672, + "language_loss": 0.70257807, + "learning_rate": 9.233745781444295e-07, + "loss": 0.7234351, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.421875, + "step": 11326, + "time_per_iteration": 2.3918654918670654 + }, + { + "auxiliary_loss_clip": 0.01057667, + "auxiliary_loss_mlp": 0.01023954, + "balance_loss_clip": 1.01234877, + "balance_loss_mlp": 1.01806247, + "epoch": 0.6810160829700886, + "flos": 22490803336320.0, + "grad_norm": 1.7852504916837342, + "language_loss": 0.7301622, + "learning_rate": 9.230561894148298e-07, + "loss": 0.75097847, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39648438, + "step": 11327, + "time_per_iteration": 2.419940710067749 + }, + { + "auxiliary_loss_clip": 0.01055816, + "auxiliary_loss_mlp": 0.01023789, + "balance_loss_clip": 1.01175523, + "balance_loss_mlp": 1.01766622, + "epoch": 0.6810762062227567, + "flos": 16832045928960.0, + "grad_norm": 1.7990505742833787, + "language_loss": 0.77670944, + "learning_rate": 9.227378391185829e-07, + "loss": 0.7975055, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 11328, + "time_per_iteration": 2.3930208683013916 + }, + { + "auxiliary_loss_clip": 0.01058394, + "auxiliary_loss_mlp": 0.01021709, + "balance_loss_clip": 1.00986004, + "balance_loss_mlp": 1.01908159, + "epoch": 0.6811363294754246, + "flos": 12713830663680.0, + "grad_norm": 2.0937378057686065, + "language_loss": 0.64344281, + "learning_rate": 9.224195272670523e-07, + "loss": 0.66424382, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 11329, + "time_per_iteration": 2.3887884616851807 + }, + { + "auxiliary_loss_clip": 0.01060503, + "auxiliary_loss_mlp": 0.01023507, + "balance_loss_clip": 1.01084137, + "balance_loss_mlp": 1.02016914, + "epoch": 0.6811964527280926, + "flos": 17711019118080.0, + "grad_norm": 2.1655356266458066, + "language_loss": 0.79402852, + "learning_rate": 9.22101253871596e-07, + "loss": 0.81486857, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 11330, + "time_per_iteration": 3.7611358165740967 + }, + { + "auxiliary_loss_clip": 0.01058324, + "auxiliary_loss_mlp": 0.01022593, + "balance_loss_clip": 1.01094043, + "balance_loss_mlp": 1.01952147, + "epoch": 0.6812565759807605, + "flos": 24862313681280.0, + "grad_norm": 2.4785127415954014, + "language_loss": 0.63639903, + "learning_rate": 9.217830189435749e-07, + "loss": 0.6572082, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38671875, + "step": 11331, + "time_per_iteration": 2.448093891143799 + }, + { + "auxiliary_loss_clip": 0.01058252, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01657581, + "balance_loss_mlp": 1.02008629, + "epoch": 0.6813166992334285, + "flos": 17165047276800.0, + "grad_norm": 1.7854177380261336, + "language_loss": 0.7214222, + "learning_rate": 9.214648224943429e-07, + "loss": 0.74229181, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38085938, + "step": 11332, + "time_per_iteration": 2.4125442504882812 + }, + { + "auxiliary_loss_clip": 0.01059843, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01876867, + "balance_loss_mlp": 1.01955891, + "epoch": 0.6813768224860965, + "flos": 18842554149120.0, + "grad_norm": 3.482242860595682, + "language_loss": 0.74243629, + "learning_rate": 9.211466645352577e-07, + "loss": 0.763349, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 11333, + "time_per_iteration": 2.3815085887908936 + }, + { + "auxiliary_loss_clip": 0.01061117, + "auxiliary_loss_mlp": 0.01020662, + "balance_loss_clip": 1.00810969, + "balance_loss_mlp": 1.01992369, + "epoch": 0.6814369457387645, + "flos": 24531651394560.0, + "grad_norm": 1.4808589335230133, + "language_loss": 0.72030091, + "learning_rate": 9.20828545077673e-07, + "loss": 0.74111867, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41210938, + "step": 11334, + "time_per_iteration": 2.416727304458618 + }, + { + "auxiliary_loss_clip": 0.01055466, + "auxiliary_loss_mlp": 0.01025045, + "balance_loss_clip": 1.01368999, + "balance_loss_mlp": 1.0178833, + "epoch": 0.6814970689914325, + "flos": 18222007910400.0, + "grad_norm": 1.6978545132478386, + "language_loss": 0.86147219, + "learning_rate": 9.205104641329416e-07, + "loss": 0.88227731, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37695312, + "step": 11335, + "time_per_iteration": 2.388427734375 + }, + { + "auxiliary_loss_clip": 0.01053747, + "auxiliary_loss_mlp": 0.01021829, + "balance_loss_clip": 1.01103401, + "balance_loss_mlp": 1.01756573, + "epoch": 0.6815571922441004, + "flos": 25227609903360.0, + "grad_norm": 1.7436531665451849, + "language_loss": 0.81843352, + "learning_rate": 9.201924217124139e-07, + "loss": 0.83918929, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.36132812, + "step": 11336, + "time_per_iteration": 2.4438955783843994 + }, + { + "auxiliary_loss_clip": 0.01056394, + "auxiliary_loss_mlp": 0.01021184, + "balance_loss_clip": 1.01007986, + "balance_loss_mlp": 1.01849866, + "epoch": 0.6816173154967684, + "flos": 19455280243200.0, + "grad_norm": 1.9173430926184734, + "language_loss": 0.81372029, + "learning_rate": 9.198744178274421e-07, + "loss": 0.83449602, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 11337, + "time_per_iteration": 2.3882157802581787 + }, + { + "auxiliary_loss_clip": 0.01054673, + "auxiliary_loss_mlp": 0.01023011, + "balance_loss_clip": 1.01215696, + "balance_loss_mlp": 1.01823473, + "epoch": 0.6816774387494363, + "flos": 17930483124480.0, + "grad_norm": 1.646535783727833, + "language_loss": 0.72412539, + "learning_rate": 9.195564524893738e-07, + "loss": 0.74490225, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36328125, + "step": 11338, + "time_per_iteration": 2.3965094089508057 + }, + { + "auxiliary_loss_clip": 0.01060652, + "auxiliary_loss_mlp": 0.01026814, + "balance_loss_clip": 1.01414776, + "balance_loss_mlp": 1.0207864, + "epoch": 0.6817375620021043, + "flos": 22232027272320.0, + "grad_norm": 1.5943121255912192, + "language_loss": 0.77453464, + "learning_rate": 9.192385257095565e-07, + "loss": 0.79540932, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 11339, + "time_per_iteration": 2.4254109859466553 + }, + { + "auxiliary_loss_clip": 0.01058944, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.0170393, + "balance_loss_mlp": 1.01992786, + "epoch": 0.6817976852547722, + "flos": 25373232650880.0, + "grad_norm": 2.24646711130546, + "language_loss": 0.70967245, + "learning_rate": 9.189206374993361e-07, + "loss": 0.73055112, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 11340, + "time_per_iteration": 2.4525270462036133 + }, + { + "auxiliary_loss_clip": 0.01061534, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.01592064, + "balance_loss_mlp": 1.02059269, + "epoch": 0.6818578085074403, + "flos": 22264880728320.0, + "grad_norm": 2.6786744417766246, + "language_loss": 0.83965051, + "learning_rate": 9.186027878700576e-07, + "loss": 0.86054611, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 11341, + "time_per_iteration": 2.4335830211639404 + }, + { + "auxiliary_loss_clip": 0.01058275, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.01303196, + "balance_loss_mlp": 1.01900983, + "epoch": 0.6819179317601082, + "flos": 19317128526720.0, + "grad_norm": 2.961539290041472, + "language_loss": 0.68685174, + "learning_rate": 9.182849768330636e-07, + "loss": 0.70769507, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.39257812, + "step": 11342, + "time_per_iteration": 2.4038050174713135 + }, + { + "auxiliary_loss_clip": 0.01056871, + "auxiliary_loss_mlp": 0.01024947, + "balance_loss_clip": 1.01216722, + "balance_loss_mlp": 1.01799715, + "epoch": 0.6819780550127762, + "flos": 21103110593280.0, + "grad_norm": 1.5118903309294058, + "language_loss": 0.74966347, + "learning_rate": 9.179672043996956e-07, + "loss": 0.77048171, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.38867188, + "step": 11343, + "time_per_iteration": 2.409865140914917 + }, + { + "auxiliary_loss_clip": 0.01060181, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.01554716, + "balance_loss_mlp": 1.02030778, + "epoch": 0.6820381782654441, + "flos": 29715101285760.0, + "grad_norm": 1.690032246482668, + "language_loss": 0.79220915, + "learning_rate": 9.176494705812963e-07, + "loss": 0.81308889, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3984375, + "step": 11344, + "time_per_iteration": 2.4569272994995117 + }, + { + "auxiliary_loss_clip": 0.01058379, + "auxiliary_loss_mlp": 0.0102662, + "balance_loss_clip": 1.01494956, + "balance_loss_mlp": 1.01884878, + "epoch": 0.6820983015181121, + "flos": 29240841110400.0, + "grad_norm": 2.2043772693164643, + "language_loss": 0.67028189, + "learning_rate": 9.173317753892016e-07, + "loss": 0.69113195, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39648438, + "step": 11345, + "time_per_iteration": 2.5083422660827637 + }, + { + "auxiliary_loss_clip": 0.01060043, + "auxiliary_loss_mlp": 0.0102744, + "balance_loss_clip": 1.01488745, + "balance_loss_mlp": 1.02043581, + "epoch": 0.6821584247707801, + "flos": 18871008773760.0, + "grad_norm": 2.3138142632289305, + "language_loss": 0.64776301, + "learning_rate": 9.170141188347517e-07, + "loss": 0.66863781, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39648438, + "step": 11346, + "time_per_iteration": 2.4826903343200684 + }, + { + "auxiliary_loss_clip": 0.01058382, + "auxiliary_loss_mlp": 0.01024193, + "balance_loss_clip": 1.01122344, + "balance_loss_mlp": 1.01946092, + "epoch": 0.6822185480234481, + "flos": 21323517206400.0, + "grad_norm": 2.1086702424037247, + "language_loss": 0.78245306, + "learning_rate": 9.166965009292815e-07, + "loss": 0.80327886, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.390625, + "step": 11347, + "time_per_iteration": 2.3952183723449707 + }, + { + "auxiliary_loss_clip": 0.01007302, + "auxiliary_loss_mlp": 0.01000754, + "balance_loss_clip": 0.99977642, + "balance_loss_mlp": 1.00097275, + "epoch": 0.6822786712761161, + "flos": 63485434010880.0, + "grad_norm": 0.7205605020246206, + "language_loss": 0.52517533, + "learning_rate": 9.16378921684128e-07, + "loss": 0.5452559, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.06347656, + "step": 11348, + "time_per_iteration": 3.0771484375 + }, + { + "auxiliary_loss_clip": 0.01055536, + "auxiliary_loss_mlp": 0.01024009, + "balance_loss_clip": 1.01262426, + "balance_loss_mlp": 1.01795375, + "epoch": 0.682338794528784, + "flos": 21067883164800.0, + "grad_norm": 1.6255377176238088, + "language_loss": 0.79847401, + "learning_rate": 9.16061381110622e-07, + "loss": 0.81926942, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.375, + "step": 11349, + "time_per_iteration": 2.3939037322998047 + }, + { + "auxiliary_loss_clip": 0.01063052, + "auxiliary_loss_mlp": 0.01025495, + "balance_loss_clip": 1.01108265, + "balance_loss_mlp": 1.01962733, + "epoch": 0.682398917781452, + "flos": 36281775265920.0, + "grad_norm": 1.8787039796999003, + "language_loss": 0.73632872, + "learning_rate": 9.157438792200975e-07, + "loss": 0.75721413, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.43359375, + "step": 11350, + "time_per_iteration": 2.525472640991211 + }, + { + "auxiliary_loss_clip": 0.01058555, + "auxiliary_loss_mlp": 0.01020614, + "balance_loss_clip": 1.00865698, + "balance_loss_mlp": 1.02043271, + "epoch": 0.6824590410341199, + "flos": 24858159229440.0, + "grad_norm": 4.444436627299718, + "language_loss": 0.79403567, + "learning_rate": 9.154264160238853e-07, + "loss": 0.81482732, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38085938, + "step": 11351, + "time_per_iteration": 3.8658883571624756 + }, + { + "auxiliary_loss_clip": 0.01057758, + "auxiliary_loss_mlp": 0.01021483, + "balance_loss_clip": 1.01034904, + "balance_loss_mlp": 1.01862752, + "epoch": 0.6825191642867879, + "flos": 22451386544640.0, + "grad_norm": 1.731210798065209, + "language_loss": 0.73291945, + "learning_rate": 9.151089915333143e-07, + "loss": 0.75371188, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.390625, + "step": 11352, + "time_per_iteration": 2.424696207046509 + }, + { + "auxiliary_loss_clip": 0.01060131, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.0139693, + "balance_loss_mlp": 1.01970077, + "epoch": 0.6825792875394558, + "flos": 29423087740800.0, + "grad_norm": 1.552523456815037, + "language_loss": 0.79946584, + "learning_rate": 9.147916057597127e-07, + "loss": 0.8203311, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40429688, + "step": 11353, + "time_per_iteration": 2.494189739227295 + }, + { + "auxiliary_loss_clip": 0.01056971, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.01275611, + "balance_loss_mlp": 1.01879144, + "epoch": 0.6826394107921239, + "flos": 18769969699200.0, + "grad_norm": 1.853415417898717, + "language_loss": 0.78258181, + "learning_rate": 9.144742587144065e-07, + "loss": 0.80339712, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38085938, + "step": 11354, + "time_per_iteration": 2.4651975631713867 + }, + { + "auxiliary_loss_clip": 0.01056549, + "auxiliary_loss_mlp": 0.01022409, + "balance_loss_clip": 1.01130497, + "balance_loss_mlp": 1.01857328, + "epoch": 0.6826995340447918, + "flos": 16616666551680.0, + "grad_norm": 1.8551245053429473, + "language_loss": 0.61764336, + "learning_rate": 9.141569504087232e-07, + "loss": 0.63843298, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 11355, + "time_per_iteration": 2.4200196266174316 + }, + { + "auxiliary_loss_clip": 0.01055757, + "auxiliary_loss_mlp": 0.01024504, + "balance_loss_clip": 1.01297021, + "balance_loss_mlp": 1.01763678, + "epoch": 0.6827596572974598, + "flos": 20847301994880.0, + "grad_norm": 1.7713513694878138, + "language_loss": 0.82156587, + "learning_rate": 9.138396808539837e-07, + "loss": 0.84236854, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38085938, + "step": 11356, + "time_per_iteration": 2.400268316268921 + }, + { + "auxiliary_loss_clip": 0.01061019, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.01362324, + "balance_loss_mlp": 1.02093506, + "epoch": 0.6828197805501277, + "flos": 22746961048320.0, + "grad_norm": 2.088429426160453, + "language_loss": 0.78527117, + "learning_rate": 9.135224500615126e-07, + "loss": 0.80614281, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40039062, + "step": 11357, + "time_per_iteration": 2.419868230819702 + }, + { + "auxiliary_loss_clip": 0.01007071, + "auxiliary_loss_mlp": 0.0100058, + "balance_loss_clip": 0.99958456, + "balance_loss_mlp": 1.00069726, + "epoch": 0.6828799038027957, + "flos": 71642854535040.0, + "grad_norm": 0.8160695482165858, + "language_loss": 0.59685767, + "learning_rate": 9.132052580426309e-07, + "loss": 0.61693418, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.06347656, + "step": 11358, + "time_per_iteration": 3.1311240196228027 + }, + { + "auxiliary_loss_clip": 0.01063047, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.01444435, + "balance_loss_mlp": 1.02091908, + "epoch": 0.6829400270554637, + "flos": 19828117319040.0, + "grad_norm": 1.7731040277930705, + "language_loss": 0.78218502, + "learning_rate": 9.128881048086576e-07, + "loss": 0.803087, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.421875, + "step": 11359, + "time_per_iteration": 2.428117036819458 + }, + { + "auxiliary_loss_clip": 0.01058453, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.01519656, + "balance_loss_mlp": 1.01852512, + "epoch": 0.6830001503081317, + "flos": 21979570164480.0, + "grad_norm": 1.7733116814253638, + "language_loss": 0.81807667, + "learning_rate": 9.125709903709109e-07, + "loss": 0.83893639, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 11360, + "time_per_iteration": 2.419032573699951 + }, + { + "auxiliary_loss_clip": 0.01061456, + "auxiliary_loss_mlp": 0.01023639, + "balance_loss_clip": 1.01135993, + "balance_loss_mlp": 1.0219965, + "epoch": 0.6830602735607997, + "flos": 24315608701440.0, + "grad_norm": 1.7258353156482282, + "language_loss": 0.75150883, + "learning_rate": 9.122539147407098e-07, + "loss": 0.77235979, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 11361, + "time_per_iteration": 3.8582701683044434 + }, + { + "auxiliary_loss_clip": 0.01007392, + "auxiliary_loss_mlp": 0.01000477, + "balance_loss_clip": 0.99957067, + "balance_loss_mlp": 1.00097382, + "epoch": 0.6831203968134676, + "flos": 57687268078080.0, + "grad_norm": 0.8682955834392074, + "language_loss": 0.59698224, + "learning_rate": 9.11936877929367e-07, + "loss": 0.61706096, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06445312, + "step": 11362, + "time_per_iteration": 4.563758134841919 + }, + { + "auxiliary_loss_clip": 0.0106003, + "auxiliary_loss_mlp": 0.01022155, + "balance_loss_clip": 1.00990009, + "balance_loss_mlp": 1.01911652, + "epoch": 0.6831805200661356, + "flos": 14387671843200.0, + "grad_norm": 2.1205171020579567, + "language_loss": 0.7551198, + "learning_rate": 9.116198799481988e-07, + "loss": 0.77594161, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41015625, + "step": 11363, + "time_per_iteration": 2.392162561416626 + }, + { + "auxiliary_loss_clip": 0.01057943, + "auxiliary_loss_mlp": 0.01024781, + "balance_loss_clip": 1.01277089, + "balance_loss_mlp": 1.0182966, + "epoch": 0.6832406433188035, + "flos": 22819196384640.0, + "grad_norm": 2.335586992463906, + "language_loss": 0.8971951, + "learning_rate": 9.113029208085171e-07, + "loss": 0.91802239, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39648438, + "step": 11364, + "time_per_iteration": 2.484243869781494 + }, + { + "auxiliary_loss_clip": 0.0105634, + "auxiliary_loss_mlp": 0.01022503, + "balance_loss_clip": 1.0117625, + "balance_loss_mlp": 1.01911855, + "epoch": 0.6833007665714715, + "flos": 17560892805120.0, + "grad_norm": 1.8609026076226716, + "language_loss": 0.88294107, + "learning_rate": 9.109860005216347e-07, + "loss": 0.90372944, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37304688, + "step": 11365, + "time_per_iteration": 2.4552578926086426 + }, + { + "auxiliary_loss_clip": 0.01061406, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.01370049, + "balance_loss_mlp": 1.01998043, + "epoch": 0.6833608898241395, + "flos": 22445102499840.0, + "grad_norm": 2.4318521231951715, + "language_loss": 0.8156209, + "learning_rate": 9.106691190988596e-07, + "loss": 0.83650154, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.4140625, + "step": 11366, + "time_per_iteration": 2.3914685249328613 + }, + { + "auxiliary_loss_clip": 0.01057991, + "auxiliary_loss_mlp": 0.01022505, + "balance_loss_clip": 1.01000023, + "balance_loss_mlp": 1.01856041, + "epoch": 0.6834210130768075, + "flos": 24533501696640.0, + "grad_norm": 1.735907360511487, + "language_loss": 0.76646841, + "learning_rate": 9.10352276551502e-07, + "loss": 0.78727341, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 11367, + "time_per_iteration": 2.4451732635498047 + }, + { + "auxiliary_loss_clip": 0.01058274, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.01847374, + "balance_loss_mlp": 1.01925588, + "epoch": 0.6834811363294754, + "flos": 20046115048320.0, + "grad_norm": 1.5529909652226286, + "language_loss": 0.69030219, + "learning_rate": 9.100354728908688e-07, + "loss": 0.71119225, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.390625, + "step": 11368, + "time_per_iteration": 2.3842661380767822 + }, + { + "auxiliary_loss_clip": 0.01058992, + "auxiliary_loss_mlp": 0.01023822, + "balance_loss_clip": 1.0125742, + "balance_loss_mlp": 1.02012765, + "epoch": 0.6835412595821434, + "flos": 24789589585920.0, + "grad_norm": 2.3721163897996247, + "language_loss": 0.73454064, + "learning_rate": 9.097187081282658e-07, + "loss": 0.75536877, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38867188, + "step": 11369, + "time_per_iteration": 3.8295366764068604 + }, + { + "auxiliary_loss_clip": 0.0105815, + "auxiliary_loss_mlp": 0.01022515, + "balance_loss_clip": 1.01036751, + "balance_loss_mlp": 1.01840818, + "epoch": 0.6836013828348113, + "flos": 19499340245760.0, + "grad_norm": 1.7622249987156993, + "language_loss": 0.78955901, + "learning_rate": 9.094019822749976e-07, + "loss": 0.81036568, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 11370, + "time_per_iteration": 2.4224441051483154 + }, + { + "auxiliary_loss_clip": 0.01007026, + "auxiliary_loss_mlp": 0.01001756, + "balance_loss_clip": 1.00079644, + "balance_loss_mlp": 1.0006696, + "epoch": 0.6836615060874793, + "flos": 58360706893440.0, + "grad_norm": 0.7442096743435535, + "language_loss": 0.59851527, + "learning_rate": 9.090852953423674e-07, + "loss": 0.61860311, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.06347656, + "step": 11371, + "time_per_iteration": 2.9455175399780273 + }, + { + "auxiliary_loss_clip": 0.01056296, + "auxiliary_loss_mlp": 0.01022384, + "balance_loss_clip": 1.01174438, + "balance_loss_mlp": 1.01866531, + "epoch": 0.6837216293401474, + "flos": 12166078343040.0, + "grad_norm": 1.7230679184413564, + "language_loss": 0.6374895, + "learning_rate": 9.087686473416766e-07, + "loss": 0.65827632, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.375, + "step": 11372, + "time_per_iteration": 2.4289181232452393 + }, + { + "auxiliary_loss_clip": 0.01056898, + "auxiliary_loss_mlp": 0.01025703, + "balance_loss_clip": 1.01340103, + "balance_loss_mlp": 1.01808572, + "epoch": 0.6837817525928153, + "flos": 22126484632320.0, + "grad_norm": 1.5110059314943842, + "language_loss": 0.72098398, + "learning_rate": 9.084520382842253e-07, + "loss": 0.74181008, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38867188, + "step": 11373, + "time_per_iteration": 2.41644024848938 + }, + { + "auxiliary_loss_clip": 0.0100697, + "auxiliary_loss_mlp": 0.0100105, + "balance_loss_clip": 1.00009656, + "balance_loss_mlp": 1.0005517, + "epoch": 0.6838418758454833, + "flos": 65002409118720.0, + "grad_norm": 1.2379590706965962, + "language_loss": 0.56639111, + "learning_rate": 9.081354681813136e-07, + "loss": 0.58647132, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.06445312, + "step": 11374, + "time_per_iteration": 3.2287232875823975 + }, + { + "auxiliary_loss_clip": 0.01059815, + "auxiliary_loss_mlp": 0.0102287, + "balance_loss_clip": 1.01071668, + "balance_loss_mlp": 1.02001762, + "epoch": 0.6839019990981512, + "flos": 21029827916160.0, + "grad_norm": 1.442300866951778, + "language_loss": 0.73922759, + "learning_rate": 9.078189370442386e-07, + "loss": 0.76005447, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3984375, + "step": 11375, + "time_per_iteration": 2.405219554901123 + }, + { + "auxiliary_loss_clip": 0.01057178, + "auxiliary_loss_mlp": 0.01021931, + "balance_loss_clip": 1.01068938, + "balance_loss_mlp": 1.0189234, + "epoch": 0.6839621223508192, + "flos": 24934409372160.0, + "grad_norm": 1.5743904021060386, + "language_loss": 0.70519012, + "learning_rate": 9.07502444884296e-07, + "loss": 0.72598124, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3828125, + "step": 11376, + "time_per_iteration": 2.472360849380493 + }, + { + "auxiliary_loss_clip": 0.01059881, + "auxiliary_loss_mlp": 0.0102447, + "balance_loss_clip": 1.01217985, + "balance_loss_mlp": 1.01951575, + "epoch": 0.6840222456034871, + "flos": 26357643745920.0, + "grad_norm": 1.8981915109691643, + "language_loss": 0.74715328, + "learning_rate": 9.071859917127804e-07, + "loss": 0.76799691, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 11377, + "time_per_iteration": 2.441364049911499 + }, + { + "auxiliary_loss_clip": 0.01006942, + "auxiliary_loss_mlp": 0.01002584, + "balance_loss_clip": 1.00167763, + "balance_loss_mlp": 1.00063467, + "epoch": 0.6840823688561551, + "flos": 65984865177600.0, + "grad_norm": 0.7143133175880789, + "language_loss": 0.56849241, + "learning_rate": 9.068695775409872e-07, + "loss": 0.58858764, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06298828, + "step": 11378, + "time_per_iteration": 3.1806328296661377 + }, + { + "auxiliary_loss_clip": 0.0105763, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.01131415, + "balance_loss_mlp": 1.01831293, + "epoch": 0.684142492108823, + "flos": 21396520592640.0, + "grad_norm": 1.6982074964997558, + "language_loss": 0.78613889, + "learning_rate": 9.065532023802051e-07, + "loss": 0.80695367, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 11379, + "time_per_iteration": 2.398530960083008 + }, + { + "auxiliary_loss_clip": 0.01054875, + "auxiliary_loss_mlp": 0.01020807, + "balance_loss_clip": 1.01071608, + "balance_loss_mlp": 1.01939201, + "epoch": 0.6842026153614911, + "flos": 18800588828160.0, + "grad_norm": 1.7652977984281695, + "language_loss": 0.69269609, + "learning_rate": 9.062368662417276e-07, + "loss": 0.71345294, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.35546875, + "step": 11380, + "time_per_iteration": 2.3921585083007812 + }, + { + "auxiliary_loss_clip": 0.01055505, + "auxiliary_loss_mlp": 0.01023215, + "balance_loss_clip": 1.01172972, + "balance_loss_mlp": 1.01711178, + "epoch": 0.684262738614159, + "flos": 19645381929600.0, + "grad_norm": 2.0009357886872037, + "language_loss": 0.78094578, + "learning_rate": 9.059205691368421e-07, + "loss": 0.80173302, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38476562, + "step": 11381, + "time_per_iteration": 2.3941097259521484 + }, + { + "auxiliary_loss_clip": 0.01006937, + "auxiliary_loss_mlp": 0.01001586, + "balance_loss_clip": 1.00060892, + "balance_loss_mlp": 1.00055528, + "epoch": 0.684322861866827, + "flos": 62351699564160.0, + "grad_norm": 0.8712498281188904, + "language_loss": 0.67277813, + "learning_rate": 9.056043110768385e-07, + "loss": 0.69286335, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.06347656, + "step": 11382, + "time_per_iteration": 2.9277901649475098 + }, + { + "auxiliary_loss_clip": 0.01006705, + "auxiliary_loss_mlp": 0.01001375, + "balance_loss_clip": 1.00036144, + "balance_loss_mlp": 1.00042224, + "epoch": 0.6843829851194949, + "flos": 65801606117760.0, + "grad_norm": 0.8190649935466133, + "language_loss": 0.58124483, + "learning_rate": 9.052880920730006e-07, + "loss": 0.60132563, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.0625, + "step": 11383, + "time_per_iteration": 3.1353719234466553 + }, + { + "auxiliary_loss_clip": 0.01057014, + "auxiliary_loss_mlp": 0.01024555, + "balance_loss_clip": 1.013111, + "balance_loss_mlp": 1.01773393, + "epoch": 0.6844431083721629, + "flos": 27853916417280.0, + "grad_norm": 1.6204900767970887, + "language_loss": 0.79698092, + "learning_rate": 9.049719121366153e-07, + "loss": 0.81779659, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39453125, + "step": 11384, + "time_per_iteration": 2.4809491634368896 + }, + { + "auxiliary_loss_clip": 0.01060548, + "auxiliary_loss_mlp": 0.01024606, + "balance_loss_clip": 1.01250672, + "balance_loss_mlp": 1.01964045, + "epoch": 0.684503231624831, + "flos": 18254163139200.0, + "grad_norm": 1.6718480284828643, + "language_loss": 0.77214998, + "learning_rate": 9.046557712789667e-07, + "loss": 0.79300153, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.41015625, + "step": 11385, + "time_per_iteration": 2.434901714324951 + }, + { + "auxiliary_loss_clip": 0.01063941, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.01551032, + "balance_loss_mlp": 1.02046239, + "epoch": 0.6845633548774989, + "flos": 17638713959040.0, + "grad_norm": 2.0069990367604422, + "language_loss": 0.84721458, + "learning_rate": 9.043396695113344e-07, + "loss": 0.86815274, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.43554688, + "step": 11386, + "time_per_iteration": 2.3931376934051514 + }, + { + "auxiliary_loss_clip": 0.01057541, + "auxiliary_loss_mlp": 0.01025439, + "balance_loss_clip": 1.01424527, + "balance_loss_mlp": 1.01915836, + "epoch": 0.6846234781301669, + "flos": 20806698216960.0, + "grad_norm": 2.3841537955850267, + "language_loss": 0.83423817, + "learning_rate": 9.040236068450016e-07, + "loss": 0.85506797, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.3828125, + "step": 11387, + "time_per_iteration": 2.412458896636963 + }, + { + "auxiliary_loss_clip": 0.01056673, + "auxiliary_loss_mlp": 0.01020854, + "balance_loss_clip": 1.00942194, + "balance_loss_mlp": 1.01855278, + "epoch": 0.6846836013828348, + "flos": 36099703192320.0, + "grad_norm": 1.6416520612612215, + "language_loss": 0.72547925, + "learning_rate": 9.037075832912473e-07, + "loss": 0.7462545, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38085938, + "step": 11388, + "time_per_iteration": 2.551164150238037 + }, + { + "auxiliary_loss_clip": 0.01055678, + "auxiliary_loss_mlp": 0.01022218, + "balance_loss_clip": 1.01124477, + "balance_loss_mlp": 1.01857495, + "epoch": 0.6847437246355028, + "flos": 43140811904640.0, + "grad_norm": 1.823643891296676, + "language_loss": 0.62692976, + "learning_rate": 9.033915988613492e-07, + "loss": 0.64770877, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 11389, + "time_per_iteration": 2.6451070308685303 + }, + { + "auxiliary_loss_clip": 0.01006746, + "auxiliary_loss_mlp": 0.01000998, + "balance_loss_clip": 0.99999678, + "balance_loss_mlp": 1.00054502, + "epoch": 0.6848038478881707, + "flos": 71660556460800.0, + "grad_norm": 0.7374778539365218, + "language_loss": 0.55923998, + "learning_rate": 9.030756535665834e-07, + "loss": 0.57931745, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.06201172, + "step": 11390, + "time_per_iteration": 3.117983818054199 + }, + { + "auxiliary_loss_clip": 0.01058703, + "auxiliary_loss_mlp": 0.01023052, + "balance_loss_clip": 1.01133347, + "balance_loss_mlp": 1.02042532, + "epoch": 0.6848639711408387, + "flos": 19936801981440.0, + "grad_norm": 2.114303865192027, + "language_loss": 0.87815577, + "learning_rate": 9.027597474182267e-07, + "loss": 0.89897335, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 11391, + "time_per_iteration": 3.8626785278320312 + }, + { + "auxiliary_loss_clip": 0.01060058, + "auxiliary_loss_mlp": 0.01024958, + "balance_loss_clip": 1.01335883, + "balance_loss_mlp": 1.02072251, + "epoch": 0.6849240943935067, + "flos": 26866363299840.0, + "grad_norm": 3.844382325997501, + "language_loss": 0.79166126, + "learning_rate": 9.02443880427552e-07, + "loss": 0.81251144, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 11392, + "time_per_iteration": 2.448519468307495 + }, + { + "auxiliary_loss_clip": 0.01055248, + "auxiliary_loss_mlp": 0.01023031, + "balance_loss_clip": 1.01121116, + "balance_loss_mlp": 1.01848423, + "epoch": 0.6849842176461747, + "flos": 13734516528000.0, + "grad_norm": 2.027053053852798, + "language_loss": 0.70797384, + "learning_rate": 9.021280526058322e-07, + "loss": 0.72875655, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3671875, + "step": 11393, + "time_per_iteration": 2.3930606842041016 + }, + { + "auxiliary_loss_clip": 0.01059509, + "auxiliary_loss_mlp": 0.01023175, + "balance_loss_clip": 1.01028872, + "balance_loss_mlp": 1.01885343, + "epoch": 0.6850443408988426, + "flos": 24971906039040.0, + "grad_norm": 1.8740626436906158, + "language_loss": 0.6462428, + "learning_rate": 9.018122639643373e-07, + "loss": 0.66706967, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 11394, + "time_per_iteration": 2.448310136795044 + }, + { + "auxiliary_loss_clip": 0.01059266, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.01330709, + "balance_loss_mlp": 1.01923037, + "epoch": 0.6851044641515106, + "flos": 27743032339200.0, + "grad_norm": 1.5347544088323761, + "language_loss": 0.80619472, + "learning_rate": 9.014965145143392e-07, + "loss": 0.82703698, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40039062, + "step": 11395, + "time_per_iteration": 2.46160626411438 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.01025842, + "balance_loss_clip": 1.013134, + "balance_loss_mlp": 1.0198735, + "epoch": 0.6851645874041785, + "flos": 24349963345920.0, + "grad_norm": 1.7029521137046677, + "language_loss": 0.73258549, + "learning_rate": 9.011808042671035e-07, + "loss": 0.7534318, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38867188, + "step": 11396, + "time_per_iteration": 2.470796823501587 + }, + { + "auxiliary_loss_clip": 0.01059476, + "auxiliary_loss_mlp": 0.01023273, + "balance_loss_clip": 1.01024985, + "balance_loss_mlp": 1.01994467, + "epoch": 0.6852247106568465, + "flos": 15077171750400.0, + "grad_norm": 2.299311165160741, + "language_loss": 0.7889356, + "learning_rate": 9.00865133233899e-07, + "loss": 0.80976307, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.39453125, + "step": 11397, + "time_per_iteration": 2.403425693511963 + }, + { + "auxiliary_loss_clip": 0.0105884, + "auxiliary_loss_mlp": 0.01023876, + "balance_loss_clip": 1.011657, + "balance_loss_mlp": 1.01863611, + "epoch": 0.6852848339095146, + "flos": 18769027092480.0, + "grad_norm": 1.8687069124512279, + "language_loss": 0.71954298, + "learning_rate": 9.005495014259905e-07, + "loss": 0.74037015, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 11398, + "time_per_iteration": 2.433225154876709 + }, + { + "auxiliary_loss_clip": 0.01056219, + "auxiliary_loss_mlp": 0.01024119, + "balance_loss_clip": 1.01269829, + "balance_loss_mlp": 1.01789665, + "epoch": 0.6853449571621825, + "flos": 27853148367360.0, + "grad_norm": 1.7683806324396392, + "language_loss": 0.71114153, + "learning_rate": 9.002339088546424e-07, + "loss": 0.73194486, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 11399, + "time_per_iteration": 2.4983601570129395 + }, + { + "auxiliary_loss_clip": 0.01058062, + "auxiliary_loss_mlp": 0.01022401, + "balance_loss_clip": 1.01044476, + "balance_loss_mlp": 1.01906085, + "epoch": 0.6854050804148505, + "flos": 18149528194560.0, + "grad_norm": 2.1148947010209467, + "language_loss": 0.69882047, + "learning_rate": 8.999183555311169e-07, + "loss": 0.71962512, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 11400, + "time_per_iteration": 2.41316556930542 + }, + { + "auxiliary_loss_clip": 0.01064775, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.01746297, + "balance_loss_mlp": 1.02072787, + "epoch": 0.6854652036675184, + "flos": 16325281411200.0, + "grad_norm": 1.769231151812737, + "language_loss": 0.78135842, + "learning_rate": 8.996028414666752e-07, + "loss": 0.80233169, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.43945312, + "step": 11401, + "time_per_iteration": 5.245810270309448 + }, + { + "auxiliary_loss_clip": 0.0105784, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.01464367, + "balance_loss_mlp": 1.01967144, + "epoch": 0.6855253269201864, + "flos": 14939892817920.0, + "grad_norm": 1.7555333638479833, + "language_loss": 0.75613701, + "learning_rate": 8.992873666725786e-07, + "loss": 0.77697641, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38085938, + "step": 11402, + "time_per_iteration": 2.390002727508545 + }, + { + "auxiliary_loss_clip": 0.01058274, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.0191164, + "balance_loss_mlp": 1.0186702, + "epoch": 0.6855854501728543, + "flos": 23036670443520.0, + "grad_norm": 2.070220952709313, + "language_loss": 0.72973979, + "learning_rate": 8.989719311600832e-07, + "loss": 0.75064051, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39648438, + "step": 11403, + "time_per_iteration": 2.419844627380371 + }, + { + "auxiliary_loss_clip": 0.01056114, + "auxiliary_loss_mlp": 0.01018824, + "balance_loss_clip": 1.00808334, + "balance_loss_mlp": 1.01862121, + "epoch": 0.6856455734255223, + "flos": 13252994789760.0, + "grad_norm": 1.9386323347137622, + "language_loss": 0.81450713, + "learning_rate": 8.986565349404482e-07, + "loss": 0.83525652, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.375, + "step": 11404, + "time_per_iteration": 2.364647626876831 + }, + { + "auxiliary_loss_clip": 0.01058877, + "auxiliary_loss_mlp": 0.01027088, + "balance_loss_clip": 1.01559639, + "balance_loss_mlp": 1.01966286, + "epoch": 0.6857056966781903, + "flos": 23332279858560.0, + "grad_norm": 1.4054640001403185, + "language_loss": 0.77592659, + "learning_rate": 8.983411780249284e-07, + "loss": 0.79678619, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 11405, + "time_per_iteration": 2.423056125640869 + }, + { + "auxiliary_loss_clip": 0.01007041, + "auxiliary_loss_mlp": 0.01002396, + "balance_loss_clip": 1.00140047, + "balance_loss_mlp": 1.00075674, + "epoch": 0.6857658199308583, + "flos": 61849648080000.0, + "grad_norm": 0.793264838853731, + "language_loss": 0.61051685, + "learning_rate": 8.980258604247781e-07, + "loss": 0.63061118, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.06298828, + "step": 11406, + "time_per_iteration": 3.086237668991089 + }, + { + "auxiliary_loss_clip": 0.01057087, + "auxiliary_loss_mlp": 0.01028878, + "balance_loss_clip": 1.01664114, + "balance_loss_mlp": 1.01831102, + "epoch": 0.6858259431835262, + "flos": 16653604636800.0, + "grad_norm": 2.0352936616995683, + "language_loss": 0.76635665, + "learning_rate": 8.977105821512496e-07, + "loss": 0.78721631, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38867188, + "step": 11407, + "time_per_iteration": 2.3669943809509277 + }, + { + "auxiliary_loss_clip": 0.01060579, + "auxiliary_loss_mlp": 0.01027395, + "balance_loss_clip": 1.01452029, + "balance_loss_mlp": 1.01955914, + "epoch": 0.6858860664361942, + "flos": 21871863020160.0, + "grad_norm": 2.1676497090270965, + "language_loss": 0.71279836, + "learning_rate": 8.973953432155956e-07, + "loss": 0.7336781, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 11408, + "time_per_iteration": 3.936854362487793 + }, + { + "auxiliary_loss_clip": 0.0106006, + "auxiliary_loss_mlp": 0.01022199, + "balance_loss_clip": 1.0100162, + "balance_loss_mlp": 1.0188719, + "epoch": 0.6859461896888621, + "flos": 15266749766400.0, + "grad_norm": 1.7631580257409907, + "language_loss": 0.70032334, + "learning_rate": 8.970801436290658e-07, + "loss": 0.72114593, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41210938, + "step": 11409, + "time_per_iteration": 2.391387939453125 + }, + { + "auxiliary_loss_clip": 0.0105368, + "auxiliary_loss_mlp": 0.01022464, + "balance_loss_clip": 1.01178837, + "balance_loss_mlp": 1.01826525, + "epoch": 0.6860063129415301, + "flos": 18619424449920.0, + "grad_norm": 1.6301711127841927, + "language_loss": 0.78100097, + "learning_rate": 8.967649834029085e-07, + "loss": 0.80176234, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.35351562, + "step": 11410, + "time_per_iteration": 2.4337971210479736 + }, + { + "auxiliary_loss_clip": 0.01059783, + "auxiliary_loss_mlp": 0.01029575, + "balance_loss_clip": 1.01643801, + "balance_loss_mlp": 1.01793194, + "epoch": 0.6860664361941982, + "flos": 23950242656640.0, + "grad_norm": 3.1142407382370365, + "language_loss": 0.71835202, + "learning_rate": 8.964498625483703e-07, + "loss": 0.73924565, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41796875, + "step": 11411, + "time_per_iteration": 2.43038010597229 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01024104, + "balance_loss_clip": 1.01157534, + "balance_loss_mlp": 1.01842141, + "epoch": 0.6861265594468661, + "flos": 20406872793600.0, + "grad_norm": 1.8968211978321994, + "language_loss": 0.70325953, + "learning_rate": 8.961347810766993e-07, + "loss": 0.72408009, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 11412, + "time_per_iteration": 2.404721736907959 + }, + { + "auxiliary_loss_clip": 0.01059042, + "auxiliary_loss_mlp": 0.01022726, + "balance_loss_clip": 1.01054275, + "balance_loss_mlp": 1.01961958, + "epoch": 0.6861866826995341, + "flos": 11428014867840.0, + "grad_norm": 2.752676611228628, + "language_loss": 0.71869963, + "learning_rate": 8.958197389991371e-07, + "loss": 0.73951733, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 11413, + "time_per_iteration": 2.3499867916107178 + }, + { + "auxiliary_loss_clip": 0.01057958, + "auxiliary_loss_mlp": 0.01024804, + "balance_loss_clip": 1.01346695, + "balance_loss_mlp": 1.01890671, + "epoch": 0.686246805952202, + "flos": 15996678894720.0, + "grad_norm": 1.6273199517666894, + "language_loss": 0.73514539, + "learning_rate": 8.955047363269288e-07, + "loss": 0.75597298, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.390625, + "step": 11414, + "time_per_iteration": 2.390825033187866 + }, + { + "auxiliary_loss_clip": 0.01061701, + "auxiliary_loss_mlp": 0.01025892, + "balance_loss_clip": 1.01327395, + "balance_loss_mlp": 1.02063763, + "epoch": 0.68630692920487, + "flos": 19825743346560.0, + "grad_norm": 2.725619573093813, + "language_loss": 0.86594439, + "learning_rate": 8.95189773071316e-07, + "loss": 0.88682032, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 11415, + "time_per_iteration": 2.3795831203460693 + }, + { + "auxiliary_loss_clip": 0.01060494, + "auxiliary_loss_mlp": 0.01024126, + "balance_loss_clip": 1.01158476, + "balance_loss_mlp": 1.01928318, + "epoch": 0.6863670524575379, + "flos": 26285024384640.0, + "grad_norm": 1.8620395675158048, + "language_loss": 0.6712923, + "learning_rate": 8.948748492435369e-07, + "loss": 0.69213855, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41210938, + "step": 11416, + "time_per_iteration": 2.4602913856506348 + }, + { + "auxiliary_loss_clip": 0.01059099, + "auxiliary_loss_mlp": 0.01022745, + "balance_loss_clip": 1.01081252, + "balance_loss_mlp": 1.0193851, + "epoch": 0.686427175710206, + "flos": 19172099272320.0, + "grad_norm": 1.6570911643335073, + "language_loss": 0.76216674, + "learning_rate": 8.945599648548325e-07, + "loss": 0.78298515, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 11417, + "time_per_iteration": 2.4147403240203857 + }, + { + "auxiliary_loss_clip": 0.01057374, + "auxiliary_loss_mlp": 0.01022322, + "balance_loss_clip": 1.01063371, + "balance_loss_mlp": 1.01859426, + "epoch": 0.6864872989628739, + "flos": 18915627358080.0, + "grad_norm": 1.7728900604423494, + "language_loss": 0.77408612, + "learning_rate": 8.942451199164386e-07, + "loss": 0.79488313, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 11418, + "time_per_iteration": 2.400787115097046 + }, + { + "auxiliary_loss_clip": 0.01056841, + "auxiliary_loss_mlp": 0.01023893, + "balance_loss_clip": 1.01248479, + "balance_loss_mlp": 1.01841807, + "epoch": 0.6865474222155419, + "flos": 25955060325120.0, + "grad_norm": 1.6730316869134647, + "language_loss": 0.76492202, + "learning_rate": 8.939303144395936e-07, + "loss": 0.78572941, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 11419, + "time_per_iteration": 2.4423277378082275 + }, + { + "auxiliary_loss_clip": 0.01057157, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.01713264, + "balance_loss_mlp": 1.01935518, + "epoch": 0.6866075454682098, + "flos": 18477118281600.0, + "grad_norm": 1.863496537947953, + "language_loss": 0.71679091, + "learning_rate": 8.93615548435529e-07, + "loss": 0.73764932, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 11420, + "time_per_iteration": 2.372908353805542 + }, + { + "auxiliary_loss_clip": 0.0105582, + "auxiliary_loss_mlp": 0.0102111, + "balance_loss_clip": 1.00973701, + "balance_loss_mlp": 1.01751959, + "epoch": 0.6866676687208778, + "flos": 34238588146560.0, + "grad_norm": 1.826619709078517, + "language_loss": 0.74661839, + "learning_rate": 8.933008219154803e-07, + "loss": 0.76738763, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3828125, + "step": 11421, + "time_per_iteration": 2.531064510345459 + }, + { + "auxiliary_loss_clip": 0.01054286, + "auxiliary_loss_mlp": 0.0102526, + "balance_loss_clip": 1.01403618, + "balance_loss_mlp": 1.01765394, + "epoch": 0.6867277919735457, + "flos": 21720794100480.0, + "grad_norm": 1.709431451649204, + "language_loss": 0.77328676, + "learning_rate": 8.929861348906784e-07, + "loss": 0.79408216, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3671875, + "step": 11422, + "time_per_iteration": 2.5039350986480713 + }, + { + "auxiliary_loss_clip": 0.01060137, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.01686025, + "balance_loss_mlp": 1.02055717, + "epoch": 0.6867879152262137, + "flos": 24096842922240.0, + "grad_norm": 1.8472339949444916, + "language_loss": 0.77320808, + "learning_rate": 8.926714873723537e-07, + "loss": 0.79410338, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 11423, + "time_per_iteration": 2.497734308242798 + }, + { + "auxiliary_loss_clip": 0.01058433, + "auxiliary_loss_mlp": 0.0102425, + "balance_loss_clip": 1.01190615, + "balance_loss_mlp": 1.01905394, + "epoch": 0.6868480384788818, + "flos": 21614762701440.0, + "grad_norm": 2.177597490148885, + "language_loss": 0.7275424, + "learning_rate": 8.923568793717347e-07, + "loss": 0.74836922, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 11424, + "time_per_iteration": 2.4091973304748535 + }, + { + "auxiliary_loss_clip": 0.01058474, + "auxiliary_loss_mlp": 0.0102409, + "balance_loss_clip": 1.0121398, + "balance_loss_mlp": 1.01885438, + "epoch": 0.6869081617315497, + "flos": 26284954561920.0, + "grad_norm": 3.755109655409233, + "language_loss": 0.65152669, + "learning_rate": 8.920423109000501e-07, + "loss": 0.67235231, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 11425, + "time_per_iteration": 2.442230224609375 + }, + { + "auxiliary_loss_clip": 0.01058012, + "auxiliary_loss_mlp": 0.01023793, + "balance_loss_clip": 1.01234269, + "balance_loss_mlp": 1.02018929, + "epoch": 0.6869682849842177, + "flos": 21104053200000.0, + "grad_norm": 1.4088839514605371, + "language_loss": 0.71768117, + "learning_rate": 8.917277819685254e-07, + "loss": 0.73849922, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37890625, + "step": 11426, + "time_per_iteration": 2.4266088008880615 + }, + { + "auxiliary_loss_clip": 0.01056794, + "auxiliary_loss_mlp": 0.01023692, + "balance_loss_clip": 1.01217663, + "balance_loss_mlp": 1.01878679, + "epoch": 0.6870284082368856, + "flos": 17091694776960.0, + "grad_norm": 2.595482270871291, + "language_loss": 0.85402274, + "learning_rate": 8.914132925883855e-07, + "loss": 0.87482762, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 11427, + "time_per_iteration": 2.379101276397705 + }, + { + "auxiliary_loss_clip": 0.0105832, + "auxiliary_loss_mlp": 0.01024951, + "balance_loss_clip": 1.01378727, + "balance_loss_mlp": 1.01987338, + "epoch": 0.6870885314895536, + "flos": 27306862323840.0, + "grad_norm": 1.3828084786987767, + "language_loss": 0.75889832, + "learning_rate": 8.910988427708526e-07, + "loss": 0.77973104, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38476562, + "step": 11428, + "time_per_iteration": 2.454709053039551 + }, + { + "auxiliary_loss_clip": 0.01059981, + "auxiliary_loss_mlp": 0.01028524, + "balance_loss_clip": 1.01569688, + "balance_loss_mlp": 1.01925945, + "epoch": 0.6871486547422215, + "flos": 20813471020800.0, + "grad_norm": 1.6777292796896113, + "language_loss": 0.72720623, + "learning_rate": 8.907844325271511e-07, + "loss": 0.74809128, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40625, + "step": 11429, + "time_per_iteration": 2.388010263442993 + }, + { + "auxiliary_loss_clip": 0.01060041, + "auxiliary_loss_mlp": 0.01023315, + "balance_loss_clip": 1.01180577, + "balance_loss_mlp": 1.01977503, + "epoch": 0.6872087779948896, + "flos": 30152807400960.0, + "grad_norm": 1.7644072008889125, + "language_loss": 0.83184338, + "learning_rate": 8.904700618684993e-07, + "loss": 0.85267693, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40234375, + "step": 11430, + "time_per_iteration": 2.464534282684326 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01020555, + "balance_loss_clip": 1.00906324, + "balance_loss_mlp": 1.01937628, + "epoch": 0.6872689012475575, + "flos": 20703529549440.0, + "grad_norm": 1.7891832222139021, + "language_loss": 0.70363665, + "learning_rate": 8.901557308061163e-07, + "loss": 0.72443151, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39648438, + "step": 11431, + "time_per_iteration": 3.8697562217712402 + }, + { + "auxiliary_loss_clip": 0.01058337, + "auxiliary_loss_mlp": 0.01023593, + "balance_loss_clip": 1.01208973, + "balance_loss_mlp": 1.01910329, + "epoch": 0.6873290245002255, + "flos": 25519658359680.0, + "grad_norm": 1.8758451630607171, + "language_loss": 0.80066484, + "learning_rate": 8.898414393512217e-07, + "loss": 0.82148415, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 11432, + "time_per_iteration": 2.429009437561035 + }, + { + "auxiliary_loss_clip": 0.01055228, + "auxiliary_loss_mlp": 0.0102442, + "balance_loss_clip": 1.01320863, + "balance_loss_mlp": 1.01876652, + "epoch": 0.6873891477528934, + "flos": 25190322704640.0, + "grad_norm": 1.6191415548069168, + "language_loss": 0.67753363, + "learning_rate": 8.89527187515029e-07, + "loss": 0.6983301, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.36523438, + "step": 11433, + "time_per_iteration": 2.4571428298950195 + }, + { + "auxiliary_loss_clip": 0.01059157, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01887262, + "balance_loss_mlp": 1.0201261, + "epoch": 0.6874492710055614, + "flos": 35150938462080.0, + "grad_norm": 1.850655278129343, + "language_loss": 0.72835493, + "learning_rate": 8.892129753087554e-07, + "loss": 0.74925226, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 11434, + "time_per_iteration": 2.5360608100891113 + }, + { + "auxiliary_loss_clip": 0.01058219, + "auxiliary_loss_mlp": 0.01020899, + "balance_loss_clip": 1.00939512, + "balance_loss_mlp": 1.02006006, + "epoch": 0.6875093942582293, + "flos": 17821239880320.0, + "grad_norm": 1.6386020666291585, + "language_loss": 0.8043952, + "learning_rate": 8.888988027436124e-07, + "loss": 0.82518637, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 11435, + "time_per_iteration": 2.4741156101226807 + }, + { + "auxiliary_loss_clip": 0.01058333, + "auxiliary_loss_mlp": 0.01026435, + "balance_loss_clip": 1.01469326, + "balance_loss_mlp": 1.01844466, + "epoch": 0.6875695175108973, + "flos": 20703494638080.0, + "grad_norm": 2.3609508053406323, + "language_loss": 0.80544162, + "learning_rate": 8.885846698308148e-07, + "loss": 0.8262893, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3984375, + "step": 11436, + "time_per_iteration": 2.443342924118042 + }, + { + "auxiliary_loss_clip": 0.01052786, + "auxiliary_loss_mlp": 0.01021092, + "balance_loss_clip": 1.01135826, + "balance_loss_mlp": 1.01784563, + "epoch": 0.6876296407635654, + "flos": 25372848625920.0, + "grad_norm": 1.6890009547037808, + "language_loss": 0.82145292, + "learning_rate": 8.882705765815697e-07, + "loss": 0.8421917, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.34960938, + "step": 11437, + "time_per_iteration": 2.4717020988464355 + }, + { + "auxiliary_loss_clip": 0.01063058, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.01899934, + "balance_loss_mlp": 1.02038682, + "epoch": 0.6876897640162333, + "flos": 23221186312320.0, + "grad_norm": 2.75351729649528, + "language_loss": 0.77905571, + "learning_rate": 8.879565230070889e-07, + "loss": 0.80000806, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.42773438, + "step": 11438, + "time_per_iteration": 2.416496515274048 + }, + { + "auxiliary_loss_clip": 0.0105451, + "auxiliary_loss_mlp": 0.01021964, + "balance_loss_clip": 1.01115203, + "balance_loss_mlp": 1.01809418, + "epoch": 0.6877498872689013, + "flos": 27123149416320.0, + "grad_norm": 1.626673880514064, + "language_loss": 0.72291094, + "learning_rate": 8.876425091185793e-07, + "loss": 0.74367571, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36523438, + "step": 11439, + "time_per_iteration": 2.478018283843994 + }, + { + "auxiliary_loss_clip": 0.01057113, + "auxiliary_loss_mlp": 0.01022751, + "balance_loss_clip": 1.01155734, + "balance_loss_mlp": 1.01857996, + "epoch": 0.6878100105215692, + "flos": 11580899178240.0, + "grad_norm": 1.9682481713654305, + "language_loss": 0.75529957, + "learning_rate": 8.873285349272472e-07, + "loss": 0.77609819, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38476562, + "step": 11440, + "time_per_iteration": 2.367910146713257 + }, + { + "auxiliary_loss_clip": 0.01056146, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.01768339, + "balance_loss_mlp": 1.017887, + "epoch": 0.6878701337742372, + "flos": 20302133114880.0, + "grad_norm": 1.6130076002646885, + "language_loss": 0.71993989, + "learning_rate": 8.870146004442969e-07, + "loss": 0.74079466, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3828125, + "step": 11441, + "time_per_iteration": 3.887524127960205 + }, + { + "auxiliary_loss_clip": 0.01060733, + "auxiliary_loss_mlp": 0.01024883, + "balance_loss_clip": 1.01135266, + "balance_loss_mlp": 1.01946235, + "epoch": 0.6879302570269051, + "flos": 13839360940800.0, + "grad_norm": 1.6096354958626855, + "language_loss": 0.66364652, + "learning_rate": 8.86700705680933e-07, + "loss": 0.68450266, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41210938, + "step": 11442, + "time_per_iteration": 2.3975579738616943 + }, + { + "auxiliary_loss_clip": 0.01057521, + "auxiliary_loss_mlp": 0.0102443, + "balance_loss_clip": 1.01361763, + "balance_loss_mlp": 1.0189904, + "epoch": 0.6879903802795732, + "flos": 21323587029120.0, + "grad_norm": 2.215281465415575, + "language_loss": 0.69816196, + "learning_rate": 8.863868506483574e-07, + "loss": 0.71898144, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.38671875, + "step": 11443, + "time_per_iteration": 2.4502294063568115 + }, + { + "auxiliary_loss_clip": 0.01061568, + "auxiliary_loss_mlp": 0.01026561, + "balance_loss_clip": 1.01442599, + "balance_loss_mlp": 1.02097106, + "epoch": 0.6880505035322411, + "flos": 25150975735680.0, + "grad_norm": 1.4113852706100212, + "language_loss": 0.76252937, + "learning_rate": 8.860730353577705e-07, + "loss": 0.78341067, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40625, + "step": 11444, + "time_per_iteration": 2.4424262046813965 + }, + { + "auxiliary_loss_clip": 0.01055394, + "auxiliary_loss_mlp": 0.01021637, + "balance_loss_clip": 1.01025891, + "balance_loss_mlp": 1.01761103, + "epoch": 0.6881106267849091, + "flos": 23214588065280.0, + "grad_norm": 2.5725621931044533, + "language_loss": 0.76385766, + "learning_rate": 8.857592598203718e-07, + "loss": 0.78462803, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37890625, + "step": 11445, + "time_per_iteration": 2.3905415534973145 + }, + { + "auxiliary_loss_clip": 0.01006752, + "auxiliary_loss_mlp": 0.0100048, + "balance_loss_clip": 0.99950892, + "balance_loss_mlp": 1.00053406, + "epoch": 0.688170750037577, + "flos": 48482173342080.0, + "grad_norm": 0.8217931523352354, + "language_loss": 0.58390814, + "learning_rate": 8.854455240473587e-07, + "loss": 0.60398048, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.06201172, + "step": 11446, + "time_per_iteration": 3.103346586227417 + }, + { + "auxiliary_loss_clip": 0.01058335, + "auxiliary_loss_mlp": 0.01022737, + "balance_loss_clip": 1.01077485, + "balance_loss_mlp": 1.01881099, + "epoch": 0.688230873290245, + "flos": 22782537590400.0, + "grad_norm": 1.5166852241008448, + "language_loss": 0.75743568, + "learning_rate": 8.85131828049928e-07, + "loss": 0.7782464, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39648438, + "step": 11447, + "time_per_iteration": 2.441352128982544 + }, + { + "auxiliary_loss_clip": 0.01059496, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01322138, + "balance_loss_mlp": 1.0188309, + "epoch": 0.6882909965429129, + "flos": 22454563478400.0, + "grad_norm": 2.0815190404002157, + "language_loss": 0.72916192, + "learning_rate": 8.848181718392737e-07, + "loss": 0.75000596, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40625, + "step": 11448, + "time_per_iteration": 3.798980712890625 + }, + { + "auxiliary_loss_clip": 0.01060547, + "auxiliary_loss_mlp": 0.01024739, + "balance_loss_clip": 1.0116204, + "balance_loss_mlp": 1.01920235, + "epoch": 0.688351119795581, + "flos": 26212928693760.0, + "grad_norm": 1.6821494969518649, + "language_loss": 0.74280584, + "learning_rate": 8.84504555426592e-07, + "loss": 0.76365864, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 11449, + "time_per_iteration": 2.4691076278686523 + }, + { + "auxiliary_loss_clip": 0.0105496, + "auxiliary_loss_mlp": 0.01024978, + "balance_loss_clip": 1.01401043, + "balance_loss_mlp": 1.01777434, + "epoch": 0.6884112430482489, + "flos": 22564155836160.0, + "grad_norm": 1.8981966433734272, + "language_loss": 0.70034206, + "learning_rate": 8.841909788230715e-07, + "loss": 0.72114134, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37109375, + "step": 11450, + "time_per_iteration": 2.444225549697876 + }, + { + "auxiliary_loss_clip": 0.01060077, + "auxiliary_loss_mlp": 0.01028489, + "balance_loss_clip": 1.01535177, + "balance_loss_mlp": 1.01893663, + "epoch": 0.6884713663009169, + "flos": 17340276723840.0, + "grad_norm": 1.8258688536667027, + "language_loss": 0.75864911, + "learning_rate": 8.838774420399058e-07, + "loss": 0.77953476, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41015625, + "step": 11451, + "time_per_iteration": 2.39186692237854 + }, + { + "auxiliary_loss_clip": 0.01056644, + "auxiliary_loss_mlp": 0.01023482, + "balance_loss_clip": 1.01165032, + "balance_loss_mlp": 1.01759648, + "epoch": 0.6885314895535849, + "flos": 26469575164800.0, + "grad_norm": 1.376554361279503, + "language_loss": 0.69335055, + "learning_rate": 8.835639450882821e-07, + "loss": 0.7141518, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.390625, + "step": 11452, + "time_per_iteration": 2.4580564498901367 + }, + { + "auxiliary_loss_clip": 0.01057043, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.01458013, + "balance_loss_mlp": 1.01868057, + "epoch": 0.6885916128062528, + "flos": 20520514869120.0, + "grad_norm": 1.9847568250996823, + "language_loss": 0.71833169, + "learning_rate": 8.832504879793912e-07, + "loss": 0.73917115, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3828125, + "step": 11453, + "time_per_iteration": 2.3930046558380127 + }, + { + "auxiliary_loss_clip": 0.01006678, + "auxiliary_loss_mlp": 0.01002577, + "balance_loss_clip": 1.00162923, + "balance_loss_mlp": 1.00053835, + "epoch": 0.6886517360589208, + "flos": 70712629603200.0, + "grad_norm": 0.7816040507721983, + "language_loss": 0.50708902, + "learning_rate": 8.829370707244162e-07, + "loss": 0.52718151, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.06152344, + "step": 11454, + "time_per_iteration": 2.9271037578582764 + }, + { + "auxiliary_loss_clip": 0.0105722, + "auxiliary_loss_mlp": 0.01026125, + "balance_loss_clip": 1.01512206, + "balance_loss_mlp": 1.02005279, + "epoch": 0.6887118593115887, + "flos": 17892602432640.0, + "grad_norm": 1.778762574574926, + "language_loss": 0.75547802, + "learning_rate": 8.826236933345443e-07, + "loss": 0.77631146, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37109375, + "step": 11455, + "time_per_iteration": 2.409266471862793 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01031612, + "balance_loss_clip": 1.01811194, + "balance_loss_mlp": 1.01817751, + "epoch": 0.6887719825642568, + "flos": 17452173231360.0, + "grad_norm": 2.4315732097649687, + "language_loss": 0.724769, + "learning_rate": 8.823103558209586e-07, + "loss": 0.745673, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.40625, + "step": 11456, + "time_per_iteration": 2.390833854675293 + }, + { + "auxiliary_loss_clip": 0.01055583, + "auxiliary_loss_mlp": 0.01025111, + "balance_loss_clip": 1.01395917, + "balance_loss_mlp": 1.01931882, + "epoch": 0.6888321058169247, + "flos": 23069244608640.0, + "grad_norm": 3.1401407210678913, + "language_loss": 0.80832744, + "learning_rate": 8.819970581948415e-07, + "loss": 0.8291344, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36328125, + "step": 11457, + "time_per_iteration": 2.445619583129883 + }, + { + "auxiliary_loss_clip": 0.01061766, + "auxiliary_loss_mlp": 0.01025254, + "balance_loss_clip": 1.01343465, + "balance_loss_mlp": 1.02052689, + "epoch": 0.6888922290695927, + "flos": 23367681843840.0, + "grad_norm": 1.6609054087060438, + "language_loss": 0.75987554, + "learning_rate": 8.816838004673725e-07, + "loss": 0.78074574, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.41210938, + "step": 11458, + "time_per_iteration": 2.4761316776275635 + }, + { + "auxiliary_loss_clip": 0.01059495, + "auxiliary_loss_mlp": 0.01024234, + "balance_loss_clip": 1.01265275, + "balance_loss_mlp": 1.020123, + "epoch": 0.6889523523222606, + "flos": 17630893814400.0, + "grad_norm": 2.231880880893956, + "language_loss": 0.68878084, + "learning_rate": 8.813705826497337e-07, + "loss": 0.70961815, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39453125, + "step": 11459, + "time_per_iteration": 2.361516237258911 + }, + { + "auxiliary_loss_clip": 0.01058003, + "auxiliary_loss_mlp": 0.01023872, + "balance_loss_clip": 1.01268458, + "balance_loss_mlp": 1.01948857, + "epoch": 0.6890124755749286, + "flos": 25226981498880.0, + "grad_norm": 1.4789044227513013, + "language_loss": 0.68545109, + "learning_rate": 8.810574047531006e-07, + "loss": 0.70626986, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.38476562, + "step": 11460, + "time_per_iteration": 2.474442481994629 + }, + { + "auxiliary_loss_clip": 0.01060431, + "auxiliary_loss_mlp": 0.01025977, + "balance_loss_clip": 1.01302505, + "balance_loss_mlp": 1.01915026, + "epoch": 0.6890725988275965, + "flos": 20229199551360.0, + "grad_norm": 2.263803289271149, + "language_loss": 0.76969218, + "learning_rate": 8.807442667886496e-07, + "loss": 0.79055625, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 11461, + "time_per_iteration": 2.44023060798645 + }, + { + "auxiliary_loss_clip": 0.01058975, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.01672614, + "balance_loss_mlp": 1.02014244, + "epoch": 0.6891327220802645, + "flos": 14534516488320.0, + "grad_norm": 1.9077390562579113, + "language_loss": 0.69174898, + "learning_rate": 8.804311687675574e-07, + "loss": 0.7126224, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38867188, + "step": 11462, + "time_per_iteration": 2.434659004211426 + }, + { + "auxiliary_loss_clip": 0.01059335, + "auxiliary_loss_mlp": 0.01020332, + "balance_loss_clip": 1.00857806, + "balance_loss_mlp": 1.01985967, + "epoch": 0.6891928453329325, + "flos": 21138163464960.0, + "grad_norm": 1.5512544674994446, + "language_loss": 0.87344426, + "learning_rate": 8.801181107009969e-07, + "loss": 0.89424098, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.39453125, + "step": 11463, + "time_per_iteration": 2.438202381134033 + }, + { + "auxiliary_loss_clip": 0.01054528, + "auxiliary_loss_mlp": 0.01019239, + "balance_loss_clip": 1.00938654, + "balance_loss_mlp": 1.01813889, + "epoch": 0.6892529685856005, + "flos": 17857549560960.0, + "grad_norm": 2.1198764072555516, + "language_loss": 0.68489718, + "learning_rate": 8.798050926001404e-07, + "loss": 0.70563495, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.36328125, + "step": 11464, + "time_per_iteration": 2.4089303016662598 + }, + { + "auxiliary_loss_clip": 0.0105837, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.01574731, + "balance_loss_mlp": 1.0193274, + "epoch": 0.6893130918382685, + "flos": 29933517951360.0, + "grad_norm": 1.894722080118835, + "language_loss": 0.65155935, + "learning_rate": 8.794921144761578e-07, + "loss": 0.67241842, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.390625, + "step": 11465, + "time_per_iteration": 2.435086250305176 + }, + { + "auxiliary_loss_clip": 0.01059888, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.02052486, + "balance_loss_mlp": 1.01935053, + "epoch": 0.6893732150909364, + "flos": 24387390190080.0, + "grad_norm": 1.4877115165048047, + "language_loss": 0.72626531, + "learning_rate": 8.79179176340221e-07, + "loss": 0.74718827, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40625, + "step": 11466, + "time_per_iteration": 2.500251054763794 + }, + { + "auxiliary_loss_clip": 0.01058198, + "auxiliary_loss_mlp": 0.01021294, + "balance_loss_clip": 1.01027298, + "balance_loss_mlp": 1.01944661, + "epoch": 0.6894333383436044, + "flos": 16981927862400.0, + "grad_norm": 1.8229094576984217, + "language_loss": 0.72184908, + "learning_rate": 8.788662782034948e-07, + "loss": 0.74264401, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38671875, + "step": 11467, + "time_per_iteration": 2.353635787963867 + }, + { + "auxiliary_loss_clip": 0.01060294, + "auxiliary_loss_mlp": 0.0102983, + "balance_loss_clip": 1.01609731, + "balance_loss_mlp": 1.01877475, + "epoch": 0.6894934615962723, + "flos": 18984650849280.0, + "grad_norm": 2.2038336740870568, + "language_loss": 0.81321013, + "learning_rate": 8.785534200771478e-07, + "loss": 0.83411139, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.41601562, + "step": 11468, + "time_per_iteration": 2.390570640563965 + }, + { + "auxiliary_loss_clip": 0.01059859, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.01625919, + "balance_loss_mlp": 1.0200572, + "epoch": 0.6895535848489404, + "flos": 34530252577920.0, + "grad_norm": 2.0184005550012865, + "language_loss": 0.65922594, + "learning_rate": 8.782406019723441e-07, + "loss": 0.68010759, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 11469, + "time_per_iteration": 2.5008201599121094 + }, + { + "auxiliary_loss_clip": 0.01006639, + "auxiliary_loss_mlp": 0.01002792, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00058889, + "epoch": 0.6896137081016083, + "flos": 50072954797440.0, + "grad_norm": 0.8020869594840828, + "language_loss": 0.55221212, + "learning_rate": 8.77927823900249e-07, + "loss": 0.57230651, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.06054688, + "step": 11470, + "time_per_iteration": 4.3974645137786865 + }, + { + "auxiliary_loss_clip": 0.01054496, + "auxiliary_loss_mlp": 0.01027113, + "balance_loss_clip": 1.01699841, + "balance_loss_mlp": 1.01876605, + "epoch": 0.6896738313542763, + "flos": 19937186006400.0, + "grad_norm": 1.8540702926167558, + "language_loss": 0.78368312, + "learning_rate": 8.776150858720222e-07, + "loss": 0.80449927, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.35742188, + "step": 11471, + "time_per_iteration": 2.3782827854156494 + }, + { + "auxiliary_loss_clip": 0.01057616, + "auxiliary_loss_mlp": 0.01025067, + "balance_loss_clip": 1.01320004, + "balance_loss_mlp": 1.01874781, + "epoch": 0.6897339546069442, + "flos": 21724424881920.0, + "grad_norm": 2.404309274614216, + "language_loss": 0.7846415, + "learning_rate": 8.773023878988266e-07, + "loss": 0.80546832, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38867188, + "step": 11472, + "time_per_iteration": 2.4041476249694824 + }, + { + "auxiliary_loss_clip": 0.01056809, + "auxiliary_loss_mlp": 0.01024336, + "balance_loss_clip": 1.01326132, + "balance_loss_mlp": 1.01792896, + "epoch": 0.6897940778596122, + "flos": 19825533878400.0, + "grad_norm": 1.6874624156736768, + "language_loss": 0.76250088, + "learning_rate": 8.769897299918208e-07, + "loss": 0.78331232, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.390625, + "step": 11473, + "time_per_iteration": 2.411024570465088 + }, + { + "auxiliary_loss_clip": 0.01059508, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.01357388, + "balance_loss_mlp": 1.01916242, + "epoch": 0.6898542011122801, + "flos": 17309133924480.0, + "grad_norm": 1.757441601581931, + "language_loss": 0.75471747, + "learning_rate": 8.766771121621628e-07, + "loss": 0.77557242, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 11474, + "time_per_iteration": 2.3758556842803955 + }, + { + "auxiliary_loss_clip": 0.01057845, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.0138855, + "balance_loss_mlp": 1.01980174, + "epoch": 0.6899143243649482, + "flos": 24752895880320.0, + "grad_norm": 1.535598741119482, + "language_loss": 0.74474418, + "learning_rate": 8.763645344210091e-07, + "loss": 0.76558375, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38085938, + "step": 11475, + "time_per_iteration": 2.417649745941162 + }, + { + "auxiliary_loss_clip": 0.01058282, + "auxiliary_loss_mlp": 0.01021393, + "balance_loss_clip": 1.00950742, + "balance_loss_mlp": 1.01904142, + "epoch": 0.6899744476176161, + "flos": 17233686743040.0, + "grad_norm": 1.89846694030223, + "language_loss": 0.88388658, + "learning_rate": 8.76051996779515e-07, + "loss": 0.90468335, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 11476, + "time_per_iteration": 2.4322190284729004 + }, + { + "auxiliary_loss_clip": 0.01056346, + "auxiliary_loss_mlp": 0.01025365, + "balance_loss_clip": 1.01317036, + "balance_loss_mlp": 1.01834249, + "epoch": 0.6900345708702841, + "flos": 25409507420160.0, + "grad_norm": 1.6068383031625888, + "language_loss": 0.70000196, + "learning_rate": 8.757394992488338e-07, + "loss": 0.72081912, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38085938, + "step": 11477, + "time_per_iteration": 2.4319493770599365 + }, + { + "auxiliary_loss_clip": 0.01060284, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.02126515, + "balance_loss_mlp": 1.01899719, + "epoch": 0.6900946941229521, + "flos": 23433249110400.0, + "grad_norm": 3.422447503766648, + "language_loss": 0.68049294, + "learning_rate": 8.754270418401173e-07, + "loss": 0.70143569, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4140625, + "step": 11478, + "time_per_iteration": 2.4392688274383545 + }, + { + "auxiliary_loss_clip": 0.01059251, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.01573801, + "balance_loss_mlp": 1.01958942, + "epoch": 0.69015481737562, + "flos": 17819180110080.0, + "grad_norm": 1.853151612206263, + "language_loss": 0.75823319, + "learning_rate": 8.751146245645178e-07, + "loss": 0.77911049, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39648438, + "step": 11479, + "time_per_iteration": 2.391864776611328 + }, + { + "auxiliary_loss_clip": 0.01058767, + "auxiliary_loss_mlp": 0.01020545, + "balance_loss_clip": 1.00818276, + "balance_loss_mlp": 1.01937163, + "epoch": 0.690214940628288, + "flos": 17455559633280.0, + "grad_norm": 1.819423097180096, + "language_loss": 0.78492182, + "learning_rate": 8.748022474331835e-07, + "loss": 0.80571496, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 11480, + "time_per_iteration": 3.79115891456604 + }, + { + "auxiliary_loss_clip": 0.01058881, + "auxiliary_loss_mlp": 0.01023525, + "balance_loss_clip": 1.01249862, + "balance_loss_mlp": 1.02063811, + "epoch": 0.6902750638809559, + "flos": 29565498643200.0, + "grad_norm": 1.835418612366482, + "language_loss": 0.7522999, + "learning_rate": 8.74489910457263e-07, + "loss": 0.77312398, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 11481, + "time_per_iteration": 3.9351046085357666 + }, + { + "auxiliary_loss_clip": 0.0105713, + "auxiliary_loss_mlp": 0.01023283, + "balance_loss_clip": 1.01123655, + "balance_loss_mlp": 1.01883936, + "epoch": 0.690335187133624, + "flos": 25555933128960.0, + "grad_norm": 1.717508385247596, + "language_loss": 0.68862522, + "learning_rate": 8.741776136479014e-07, + "loss": 0.70942932, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 11482, + "time_per_iteration": 2.4631423950195312 + }, + { + "auxiliary_loss_clip": 0.01057854, + "auxiliary_loss_mlp": 0.01023499, + "balance_loss_clip": 1.01201296, + "balance_loss_mlp": 1.01942527, + "epoch": 0.6903953103862919, + "flos": 22487451845760.0, + "grad_norm": 1.668227610066798, + "language_loss": 0.82842064, + "learning_rate": 8.738653570162464e-07, + "loss": 0.84923422, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38476562, + "step": 11483, + "time_per_iteration": 2.451010227203369 + }, + { + "auxiliary_loss_clip": 0.01058099, + "auxiliary_loss_mlp": 0.01022706, + "balance_loss_clip": 1.01167274, + "balance_loss_mlp": 1.01918995, + "epoch": 0.6904554336389599, + "flos": 26099426263680.0, + "grad_norm": 1.7860884141923954, + "language_loss": 0.65492725, + "learning_rate": 8.735531405734387e-07, + "loss": 0.67573529, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38867188, + "step": 11484, + "time_per_iteration": 2.4139389991760254 + }, + { + "auxiliary_loss_clip": 0.0105672, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.01864636, + "balance_loss_mlp": 1.01846647, + "epoch": 0.6905155568916278, + "flos": 31170525799680.0, + "grad_norm": 1.6566603767791497, + "language_loss": 0.73517704, + "learning_rate": 8.732409643306223e-07, + "loss": 0.75604796, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 11485, + "time_per_iteration": 2.4827823638916016 + }, + { + "auxiliary_loss_clip": 0.01061179, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.01558089, + "balance_loss_mlp": 1.02008224, + "epoch": 0.6905756801442958, + "flos": 17638713959040.0, + "grad_norm": 1.863634730479343, + "language_loss": 0.84894294, + "learning_rate": 8.729288282989369e-07, + "loss": 0.86983311, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.41015625, + "step": 11486, + "time_per_iteration": 2.412964105606079 + }, + { + "auxiliary_loss_clip": 0.01057409, + "auxiliary_loss_mlp": 0.01023093, + "balance_loss_clip": 1.01139307, + "balance_loss_mlp": 1.01866424, + "epoch": 0.6906358033969637, + "flos": 22342666970880.0, + "grad_norm": 1.5863275127135883, + "language_loss": 0.72402406, + "learning_rate": 8.72616732489524e-07, + "loss": 0.74482906, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 11487, + "time_per_iteration": 2.4033377170562744 + }, + { + "auxiliary_loss_clip": 0.01006555, + "auxiliary_loss_mlp": 0.01002388, + "balance_loss_clip": 1.0014286, + "balance_loss_mlp": 1.00040376, + "epoch": 0.6906959266496318, + "flos": 69744172999680.0, + "grad_norm": 0.9083858086817863, + "language_loss": 0.66708064, + "learning_rate": 8.723046769135183e-07, + "loss": 0.68717015, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.06152344, + "step": 11488, + "time_per_iteration": 4.434304475784302 + }, + { + "auxiliary_loss_clip": 0.01061224, + "auxiliary_loss_mlp": 0.01025456, + "balance_loss_clip": 1.01207495, + "balance_loss_mlp": 1.01973867, + "epoch": 0.6907560499022997, + "flos": 21433179386880.0, + "grad_norm": 1.8586346330476655, + "language_loss": 0.74937707, + "learning_rate": 8.719926615820587e-07, + "loss": 0.77024388, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41601562, + "step": 11489, + "time_per_iteration": 2.396143913269043 + }, + { + "auxiliary_loss_clip": 0.01059131, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.01805568, + "balance_loss_mlp": 1.01817703, + "epoch": 0.6908161731549677, + "flos": 14391337536000.0, + "grad_norm": 1.801769078294002, + "language_loss": 0.62155253, + "learning_rate": 8.716806865062803e-07, + "loss": 0.64245814, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41015625, + "step": 11490, + "time_per_iteration": 2.380067825317383 + }, + { + "auxiliary_loss_clip": 0.01058379, + "auxiliary_loss_mlp": 0.01023658, + "balance_loss_clip": 1.01136732, + "balance_loss_mlp": 1.01918793, + "epoch": 0.6908762964076357, + "flos": 20009945013120.0, + "grad_norm": 1.693592085157612, + "language_loss": 0.72876418, + "learning_rate": 8.713687516973142e-07, + "loss": 0.74958456, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.390625, + "step": 11491, + "time_per_iteration": 2.400446891784668 + }, + { + "auxiliary_loss_clip": 0.01057096, + "auxiliary_loss_mlp": 0.0102676, + "balance_loss_clip": 1.01514268, + "balance_loss_mlp": 1.01889253, + "epoch": 0.6909364196603036, + "flos": 28767767921280.0, + "grad_norm": 1.3075012590320831, + "language_loss": 0.70119238, + "learning_rate": 8.710568571662948e-07, + "loss": 0.722031, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 11492, + "time_per_iteration": 2.489530563354492 + }, + { + "auxiliary_loss_clip": 0.01060635, + "auxiliary_loss_mlp": 0.01025676, + "balance_loss_clip": 1.01315284, + "balance_loss_mlp": 1.01975334, + "epoch": 0.6909965429129716, + "flos": 22527043194240.0, + "grad_norm": 1.7609974694825774, + "language_loss": 0.7455982, + "learning_rate": 8.707450029243524e-07, + "loss": 0.76646131, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 11493, + "time_per_iteration": 2.424762487411499 + }, + { + "auxiliary_loss_clip": 0.01057806, + "auxiliary_loss_mlp": 0.01021977, + "balance_loss_clip": 1.01024079, + "balance_loss_mlp": 1.01929402, + "epoch": 0.6910566661656395, + "flos": 18404952768000.0, + "grad_norm": 2.1115278621331686, + "language_loss": 0.74741709, + "learning_rate": 8.70433188982616e-07, + "loss": 0.76821494, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38476562, + "step": 11494, + "time_per_iteration": 2.397420644760132 + }, + { + "auxiliary_loss_clip": 0.01056489, + "auxiliary_loss_mlp": 0.01023887, + "balance_loss_clip": 1.01259804, + "balance_loss_mlp": 1.01814616, + "epoch": 0.6911167894183076, + "flos": 30772655412480.0, + "grad_norm": 1.4461721974801598, + "language_loss": 0.68290901, + "learning_rate": 8.701214153522127e-07, + "loss": 0.70371276, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3828125, + "step": 11495, + "time_per_iteration": 2.497405529022217 + }, + { + "auxiliary_loss_clip": 0.01057865, + "auxiliary_loss_mlp": 0.01024861, + "balance_loss_clip": 1.01196289, + "balance_loss_mlp": 1.01785779, + "epoch": 0.6911769126709755, + "flos": 13734865641600.0, + "grad_norm": 1.7446766484115743, + "language_loss": 0.74118072, + "learning_rate": 8.698096820442704e-07, + "loss": 0.76200795, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40039062, + "step": 11496, + "time_per_iteration": 2.382288694381714 + }, + { + "auxiliary_loss_clip": 0.01054164, + "auxiliary_loss_mlp": 0.01021061, + "balance_loss_clip": 1.00992715, + "balance_loss_mlp": 1.0168128, + "epoch": 0.6912370359236435, + "flos": 17565885129600.0, + "grad_norm": 1.6902211876722018, + "language_loss": 0.79871511, + "learning_rate": 8.694979890699135e-07, + "loss": 0.81946737, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37304688, + "step": 11497, + "time_per_iteration": 2.3894574642181396 + }, + { + "auxiliary_loss_clip": 0.01058369, + "auxiliary_loss_mlp": 0.01024189, + "balance_loss_clip": 1.01158881, + "balance_loss_mlp": 1.01859927, + "epoch": 0.6912971591763114, + "flos": 22089686192640.0, + "grad_norm": 1.4815796605653364, + "language_loss": 0.80265725, + "learning_rate": 8.691863364402655e-07, + "loss": 0.82348287, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3984375, + "step": 11498, + "time_per_iteration": 2.412646532058716 + }, + { + "auxiliary_loss_clip": 0.01055602, + "auxiliary_loss_mlp": 0.01024378, + "balance_loss_clip": 1.01260614, + "balance_loss_mlp": 1.0176841, + "epoch": 0.6913572824289794, + "flos": 29970176745600.0, + "grad_norm": 2.669993967505526, + "language_loss": 0.8614763, + "learning_rate": 8.688747241664471e-07, + "loss": 0.88227606, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.37890625, + "step": 11499, + "time_per_iteration": 2.472987651824951 + }, + { + "auxiliary_loss_clip": 0.01056233, + "auxiliary_loss_mlp": 0.0102306, + "balance_loss_clip": 1.01203895, + "balance_loss_mlp": 1.0183754, + "epoch": 0.6914174056816473, + "flos": 20447895507840.0, + "grad_norm": 1.5894946852640002, + "language_loss": 0.75335789, + "learning_rate": 8.68563152259582e-07, + "loss": 0.77415079, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37890625, + "step": 11500, + "time_per_iteration": 2.3847901821136475 + }, + { + "auxiliary_loss_clip": 0.01059756, + "auxiliary_loss_mlp": 0.01026486, + "balance_loss_clip": 1.01421297, + "balance_loss_mlp": 1.01994157, + "epoch": 0.6914775289343154, + "flos": 21281621708160.0, + "grad_norm": 1.7556217312978186, + "language_loss": 0.77195632, + "learning_rate": 8.682516207307862e-07, + "loss": 0.79281867, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3984375, + "step": 11501, + "time_per_iteration": 2.421639919281006 + }, + { + "auxiliary_loss_clip": 0.01060012, + "auxiliary_loss_mlp": 0.01025318, + "balance_loss_clip": 1.01304531, + "balance_loss_mlp": 1.0196557, + "epoch": 0.6915376521869833, + "flos": 23876994890880.0, + "grad_norm": 1.722396074346708, + "language_loss": 0.73668993, + "learning_rate": 8.679401295911794e-07, + "loss": 0.75754321, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 11502, + "time_per_iteration": 2.425060510635376 + }, + { + "auxiliary_loss_clip": 0.01061075, + "auxiliary_loss_mlp": 0.0102426, + "balance_loss_clip": 1.01112926, + "balance_loss_mlp": 1.01954532, + "epoch": 0.6915977754396513, + "flos": 11509466803200.0, + "grad_norm": 1.7915987632028683, + "language_loss": 0.73591483, + "learning_rate": 8.676286788518774e-07, + "loss": 0.75676823, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 11503, + "time_per_iteration": 2.420104503631592 + }, + { + "auxiliary_loss_clip": 0.01058579, + "auxiliary_loss_mlp": 0.01023359, + "balance_loss_clip": 1.01186085, + "balance_loss_mlp": 1.0196104, + "epoch": 0.6916578986923193, + "flos": 22600186225920.0, + "grad_norm": 1.6219886514758126, + "language_loss": 0.77133518, + "learning_rate": 8.673172685239951e-07, + "loss": 0.79215455, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 11504, + "time_per_iteration": 2.403688907623291 + }, + { + "auxiliary_loss_clip": 0.01058629, + "auxiliary_loss_mlp": 0.01021389, + "balance_loss_clip": 1.01023734, + "balance_loss_mlp": 1.0188812, + "epoch": 0.6917180219449872, + "flos": 23476226860800.0, + "grad_norm": 1.7499946148043246, + "language_loss": 0.73527932, + "learning_rate": 8.670058986186459e-07, + "loss": 0.75607949, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.3984375, + "step": 11505, + "time_per_iteration": 2.425032615661621 + }, + { + "auxiliary_loss_clip": 0.01057647, + "auxiliary_loss_mlp": 0.01026294, + "balance_loss_clip": 1.01446831, + "balance_loss_mlp": 1.01904142, + "epoch": 0.6917781451976552, + "flos": 23731407054720.0, + "grad_norm": 1.9397651994878615, + "language_loss": 0.87067717, + "learning_rate": 8.666945691469409e-07, + "loss": 0.89151663, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38671875, + "step": 11506, + "time_per_iteration": 2.4298064708709717 + }, + { + "auxiliary_loss_clip": 0.01056809, + "auxiliary_loss_mlp": 0.01020772, + "balance_loss_clip": 1.00958431, + "balance_loss_mlp": 1.0185957, + "epoch": 0.6918382684503231, + "flos": 31465436987520.0, + "grad_norm": 2.0744217324146583, + "language_loss": 0.7001282, + "learning_rate": 8.663832801199933e-07, + "loss": 0.72090405, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.3828125, + "step": 11507, + "time_per_iteration": 2.485255241394043 + }, + { + "auxiliary_loss_clip": 0.01058631, + "auxiliary_loss_mlp": 0.010243, + "balance_loss_clip": 1.01242685, + "balance_loss_mlp": 1.01852882, + "epoch": 0.6918983917029912, + "flos": 21649466459520.0, + "grad_norm": 1.7927764479796122, + "language_loss": 0.77451944, + "learning_rate": 8.660720315489087e-07, + "loss": 0.79534876, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40234375, + "step": 11508, + "time_per_iteration": 2.432852268218994 + }, + { + "auxiliary_loss_clip": 0.01054372, + "auxiliary_loss_mlp": 0.01025916, + "balance_loss_clip": 1.01616526, + "balance_loss_mlp": 1.01876163, + "epoch": 0.6919585149556591, + "flos": 25549090502400.0, + "grad_norm": 3.920505476477462, + "language_loss": 0.74796784, + "learning_rate": 8.657608234447972e-07, + "loss": 0.76877081, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.35546875, + "step": 11509, + "time_per_iteration": 2.4462239742279053 + }, + { + "auxiliary_loss_clip": 0.01059323, + "auxiliary_loss_mlp": 0.01028243, + "balance_loss_clip": 1.01663756, + "balance_loss_mlp": 1.02011442, + "epoch": 0.6920186382083271, + "flos": 23658648048000.0, + "grad_norm": 1.440415863171698, + "language_loss": 0.63849276, + "learning_rate": 8.654496558187643e-07, + "loss": 0.6593684, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.390625, + "step": 11510, + "time_per_iteration": 3.8481104373931885 + }, + { + "auxiliary_loss_clip": 0.01061367, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.01287544, + "balance_loss_mlp": 1.02005053, + "epoch": 0.692078761460995, + "flos": 19060970814720.0, + "grad_norm": 1.7707292855690195, + "language_loss": 0.75022697, + "learning_rate": 8.651385286819149e-07, + "loss": 0.77110338, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 11511, + "time_per_iteration": 2.376166343688965 + }, + { + "auxiliary_loss_clip": 0.01055652, + "auxiliary_loss_mlp": 0.0102482, + "balance_loss_clip": 1.01424015, + "balance_loss_mlp": 1.01816964, + "epoch": 0.692138884713663, + "flos": 29022005508480.0, + "grad_norm": 1.6334351246311116, + "language_loss": 0.62454671, + "learning_rate": 8.648274420453514e-07, + "loss": 0.64535141, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.375, + "step": 11512, + "time_per_iteration": 2.4715189933776855 + }, + { + "auxiliary_loss_clip": 0.01057672, + "auxiliary_loss_mlp": 0.01024623, + "balance_loss_clip": 1.01247001, + "balance_loss_mlp": 1.01797581, + "epoch": 0.6921990079663309, + "flos": 14756947960320.0, + "grad_norm": 1.7265367835002559, + "language_loss": 0.83153403, + "learning_rate": 8.645163959201771e-07, + "loss": 0.85235697, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39648438, + "step": 11513, + "time_per_iteration": 2.377126693725586 + }, + { + "auxiliary_loss_clip": 0.01057354, + "auxiliary_loss_mlp": 0.01025035, + "balance_loss_clip": 1.01403165, + "balance_loss_mlp": 1.01915503, + "epoch": 0.692259131218999, + "flos": 23840720121600.0, + "grad_norm": 1.4821510809838099, + "language_loss": 0.78409803, + "learning_rate": 8.64205390317492e-07, + "loss": 0.80492187, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 11514, + "time_per_iteration": 2.4808788299560547 + }, + { + "auxiliary_loss_clip": 0.01059092, + "auxiliary_loss_mlp": 0.01025354, + "balance_loss_clip": 1.01364768, + "balance_loss_mlp": 1.01934469, + "epoch": 0.6923192544716669, + "flos": 19134078935040.0, + "grad_norm": 1.8762894744508194, + "language_loss": 0.75000054, + "learning_rate": 8.638944252483948e-07, + "loss": 0.77084506, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3984375, + "step": 11515, + "time_per_iteration": 2.3725101947784424 + }, + { + "auxiliary_loss_clip": 0.01055344, + "auxiliary_loss_mlp": 0.01024116, + "balance_loss_clip": 1.01302934, + "balance_loss_mlp": 1.01835942, + "epoch": 0.6923793777243349, + "flos": 28073380423680.0, + "grad_norm": 2.013694622536591, + "language_loss": 0.53981936, + "learning_rate": 8.635835007239824e-07, + "loss": 0.56061399, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36914062, + "step": 11516, + "time_per_iteration": 2.4502267837524414 + }, + { + "auxiliary_loss_clip": 0.0105639, + "auxiliary_loss_mlp": 0.01024316, + "balance_loss_clip": 1.01263309, + "balance_loss_mlp": 1.01981544, + "epoch": 0.6924395009770029, + "flos": 16580321959680.0, + "grad_norm": 1.6985075630250803, + "language_loss": 0.73149598, + "learning_rate": 8.632726167553532e-07, + "loss": 0.752303, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.36523438, + "step": 11517, + "time_per_iteration": 2.407344341278076 + }, + { + "auxiliary_loss_clip": 0.01059359, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.01709414, + "balance_loss_mlp": 1.01903629, + "epoch": 0.6924996242296708, + "flos": 16654337775360.0, + "grad_norm": 2.416889986536669, + "language_loss": 0.76291704, + "learning_rate": 8.629617733535987e-07, + "loss": 0.78380579, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40429688, + "step": 11518, + "time_per_iteration": 2.40413236618042 + }, + { + "auxiliary_loss_clip": 0.01060364, + "auxiliary_loss_mlp": 0.0102394, + "balance_loss_clip": 1.01091599, + "balance_loss_mlp": 1.01969814, + "epoch": 0.6925597474823388, + "flos": 34752649138560.0, + "grad_norm": 1.477403353781455, + "language_loss": 0.6482383, + "learning_rate": 8.626509705298146e-07, + "loss": 0.66908133, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 11519, + "time_per_iteration": 2.567308187484741 + }, + { + "auxiliary_loss_clip": 0.01056493, + "auxiliary_loss_mlp": 0.01023736, + "balance_loss_clip": 1.01149881, + "balance_loss_mlp": 1.01786816, + "epoch": 0.6926198707350067, + "flos": 21870641122560.0, + "grad_norm": 1.8732355152320206, + "language_loss": 0.6266306, + "learning_rate": 8.623402082950926e-07, + "loss": 0.64743292, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38671875, + "step": 11520, + "time_per_iteration": 5.31326961517334 + }, + { + "auxiliary_loss_clip": 0.01062424, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.01475453, + "balance_loss_mlp": 1.01976895, + "epoch": 0.6926799939876748, + "flos": 13005425272320.0, + "grad_norm": 1.7657278138370855, + "language_loss": 0.77976549, + "learning_rate": 8.620294866605204e-07, + "loss": 0.80067551, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.42578125, + "step": 11521, + "time_per_iteration": 2.357619047164917 + }, + { + "auxiliary_loss_clip": 0.01060903, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.01591325, + "balance_loss_mlp": 1.0190146, + "epoch": 0.6927401172403427, + "flos": 16760369174400.0, + "grad_norm": 1.9437399619855908, + "language_loss": 0.82078904, + "learning_rate": 8.617188056371894e-07, + "loss": 0.84168708, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 11522, + "time_per_iteration": 2.4031827449798584 + }, + { + "auxiliary_loss_clip": 0.01059518, + "auxiliary_loss_mlp": 0.0102177, + "balance_loss_clip": 1.01002169, + "balance_loss_mlp": 1.02030516, + "epoch": 0.6928002404930107, + "flos": 25704383696640.0, + "grad_norm": 1.5727011642211104, + "language_loss": 0.76843268, + "learning_rate": 8.614081652361855e-07, + "loss": 0.78924549, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39257812, + "step": 11523, + "time_per_iteration": 2.5318384170532227 + }, + { + "auxiliary_loss_clip": 0.01056716, + "auxiliary_loss_mlp": 0.01024979, + "balance_loss_clip": 1.01364255, + "balance_loss_mlp": 1.0185504, + "epoch": 0.6928603637456786, + "flos": 18587269221120.0, + "grad_norm": 1.8093865614650628, + "language_loss": 0.67945027, + "learning_rate": 8.61097565468597e-07, + "loss": 0.70026726, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 11524, + "time_per_iteration": 2.377894401550293 + }, + { + "auxiliary_loss_clip": 0.01058506, + "auxiliary_loss_mlp": 0.01025616, + "balance_loss_clip": 1.01290274, + "balance_loss_mlp": 1.01859355, + "epoch": 0.6929204869983466, + "flos": 22199767309440.0, + "grad_norm": 2.845438781347939, + "language_loss": 0.6476742, + "learning_rate": 8.607870063455051e-07, + "loss": 0.66851538, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 11525, + "time_per_iteration": 2.3876404762268066 + }, + { + "auxiliary_loss_clip": 0.01059428, + "auxiliary_loss_mlp": 0.01027693, + "balance_loss_clip": 1.01573014, + "balance_loss_mlp": 1.01990032, + "epoch": 0.6929806102510145, + "flos": 17893789418880.0, + "grad_norm": 1.9657594286079807, + "language_loss": 0.70448369, + "learning_rate": 8.604764878779953e-07, + "loss": 0.72535491, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 11526, + "time_per_iteration": 2.358471632003784 + }, + { + "auxiliary_loss_clip": 0.0105639, + "auxiliary_loss_mlp": 0.01022411, + "balance_loss_clip": 1.01170039, + "balance_loss_mlp": 1.01829374, + "epoch": 0.6930407335036826, + "flos": 19754171326080.0, + "grad_norm": 2.6032180315907736, + "language_loss": 0.76053321, + "learning_rate": 8.601660100771486e-07, + "loss": 0.78132129, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.38085938, + "step": 11527, + "time_per_iteration": 3.907961130142212 + }, + { + "auxiliary_loss_clip": 0.01053593, + "auxiliary_loss_mlp": 0.01020906, + "balance_loss_clip": 1.00986159, + "balance_loss_mlp": 1.01682699, + "epoch": 0.6931008567563505, + "flos": 21543155769600.0, + "grad_norm": 1.7524413163197985, + "language_loss": 0.79381037, + "learning_rate": 8.598555729540449e-07, + "loss": 0.81455535, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3671875, + "step": 11528, + "time_per_iteration": 2.422232151031494 + }, + { + "auxiliary_loss_clip": 0.01054056, + "auxiliary_loss_mlp": 0.01023979, + "balance_loss_clip": 1.01298761, + "balance_loss_mlp": 1.01761174, + "epoch": 0.6931609800090185, + "flos": 26248819438080.0, + "grad_norm": 1.480431010307303, + "language_loss": 0.76231259, + "learning_rate": 8.595451765197624e-07, + "loss": 0.78309298, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.36523438, + "step": 11529, + "time_per_iteration": 2.4562289714813232 + }, + { + "auxiliary_loss_clip": 0.01056749, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.01724505, + "balance_loss_mlp": 1.01871502, + "epoch": 0.6932211032616865, + "flos": 32342001292800.0, + "grad_norm": 1.7407637130370763, + "language_loss": 0.72664094, + "learning_rate": 8.592348207853795e-07, + "loss": 0.74749959, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38085938, + "step": 11530, + "time_per_iteration": 2.481351852416992 + }, + { + "auxiliary_loss_clip": 0.01062145, + "auxiliary_loss_mlp": 0.01028835, + "balance_loss_clip": 1.01556063, + "balance_loss_mlp": 1.01984, + "epoch": 0.6932812265143544, + "flos": 22048139808000.0, + "grad_norm": 1.8466645014388083, + "language_loss": 0.78792739, + "learning_rate": 8.589245057619714e-07, + "loss": 0.80883718, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.42382812, + "step": 11531, + "time_per_iteration": 2.3914437294006348 + }, + { + "auxiliary_loss_clip": 0.01055454, + "auxiliary_loss_mlp": 0.01026462, + "balance_loss_clip": 1.01532817, + "balance_loss_mlp": 1.01747251, + "epoch": 0.6933413497670224, + "flos": 26255243128320.0, + "grad_norm": 1.4492978857960845, + "language_loss": 0.69378763, + "learning_rate": 8.586142314606126e-07, + "loss": 0.71460682, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 11532, + "time_per_iteration": 2.4629294872283936 + }, + { + "auxiliary_loss_clip": 0.01058685, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.01558161, + "balance_loss_mlp": 1.01950228, + "epoch": 0.6934014730196904, + "flos": 19571994518400.0, + "grad_norm": 2.1983445203786234, + "language_loss": 0.66599333, + "learning_rate": 8.583039978923751e-07, + "loss": 0.68686283, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 11533, + "time_per_iteration": 2.3933334350585938 + }, + { + "auxiliary_loss_clip": 0.01058961, + "auxiliary_loss_mlp": 0.010245, + "balance_loss_clip": 1.01245427, + "balance_loss_mlp": 1.01869893, + "epoch": 0.6934615962723584, + "flos": 22118385196800.0, + "grad_norm": 2.617702345544729, + "language_loss": 0.59144402, + "learning_rate": 8.579938050683326e-07, + "loss": 0.61227858, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40234375, + "step": 11534, + "time_per_iteration": 2.36181378364563 + }, + { + "auxiliary_loss_clip": 0.01056414, + "auxiliary_loss_mlp": 0.01022523, + "balance_loss_clip": 1.010566, + "balance_loss_mlp": 1.01771379, + "epoch": 0.6935217195250263, + "flos": 21359757064320.0, + "grad_norm": 2.3584645651836036, + "language_loss": 0.7141118, + "learning_rate": 8.57683652999553e-07, + "loss": 0.73490113, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38671875, + "step": 11535, + "time_per_iteration": 2.395111322402954 + }, + { + "auxiliary_loss_clip": 0.01007225, + "auxiliary_loss_mlp": 0.01000505, + "balance_loss_clip": 0.9995867, + "balance_loss_mlp": 1.00103474, + "epoch": 0.6935818427776943, + "flos": 64060137901440.0, + "grad_norm": 0.7168904854642948, + "language_loss": 0.59071499, + "learning_rate": 8.573735416971046e-07, + "loss": 0.61079228, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.06201172, + "step": 11536, + "time_per_iteration": 3.147862195968628 + }, + { + "auxiliary_loss_clip": 0.01059797, + "auxiliary_loss_mlp": 0.01022116, + "balance_loss_clip": 1.01009989, + "balance_loss_mlp": 1.01902699, + "epoch": 0.6936419660303622, + "flos": 20301539621760.0, + "grad_norm": 2.6634615054113455, + "language_loss": 0.72800362, + "learning_rate": 8.570634711720568e-07, + "loss": 0.74882275, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40820312, + "step": 11537, + "time_per_iteration": 2.4104433059692383 + }, + { + "auxiliary_loss_clip": 0.01006853, + "auxiliary_loss_mlp": 0.0100137, + "balance_loss_clip": 1.00037479, + "balance_loss_mlp": 1.00064254, + "epoch": 0.6937020892830302, + "flos": 67179349123200.0, + "grad_norm": 0.7455322412419763, + "language_loss": 0.55545813, + "learning_rate": 8.567534414354722e-07, + "loss": 0.57554036, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.06225586, + "step": 11538, + "time_per_iteration": 3.0526585578918457 + }, + { + "auxiliary_loss_clip": 0.01054275, + "auxiliary_loss_mlp": 0.01023008, + "balance_loss_clip": 1.01299417, + "balance_loss_mlp": 1.01828921, + "epoch": 0.6937622125356981, + "flos": 23877064713600.0, + "grad_norm": 1.4551193505206648, + "language_loss": 0.77582967, + "learning_rate": 8.564434524984172e-07, + "loss": 0.79660249, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.359375, + "step": 11539, + "time_per_iteration": 2.450789451599121 + }, + { + "auxiliary_loss_clip": 0.01056952, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.01315236, + "balance_loss_mlp": 1.01848769, + "epoch": 0.6938223357883662, + "flos": 28729363559040.0, + "grad_norm": 1.6787987835089875, + "language_loss": 0.63959026, + "learning_rate": 8.561335043719531e-07, + "loss": 0.66040593, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3828125, + "step": 11540, + "time_per_iteration": 2.4740142822265625 + }, + { + "auxiliary_loss_clip": 0.01057308, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.01581407, + "balance_loss_mlp": 1.01918817, + "epoch": 0.6938824590410341, + "flos": 28653846554880.0, + "grad_norm": 1.3885732647298505, + "language_loss": 0.86771792, + "learning_rate": 8.558235970671434e-07, + "loss": 0.88856053, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38085938, + "step": 11541, + "time_per_iteration": 2.5124149322509766 + }, + { + "auxiliary_loss_clip": 0.01056338, + "auxiliary_loss_mlp": 0.01023022, + "balance_loss_clip": 1.01197755, + "balance_loss_mlp": 1.01867592, + "epoch": 0.6939425822937021, + "flos": 18982241965440.0, + "grad_norm": 1.6333697816278634, + "language_loss": 0.84127539, + "learning_rate": 8.555137305950448e-07, + "loss": 0.86206901, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 11542, + "time_per_iteration": 2.435520648956299 + }, + { + "auxiliary_loss_clip": 0.01056599, + "auxiliary_loss_mlp": 0.01023095, + "balance_loss_clip": 1.01059651, + "balance_loss_mlp": 1.01755393, + "epoch": 0.6940027055463701, + "flos": 23074725692160.0, + "grad_norm": 2.2056245705710387, + "language_loss": 0.75236821, + "learning_rate": 8.552039049667181e-07, + "loss": 0.77316517, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 11543, + "time_per_iteration": 2.4653520584106445 + }, + { + "auxiliary_loss_clip": 0.01055604, + "auxiliary_loss_mlp": 0.01022651, + "balance_loss_clip": 1.01176739, + "balance_loss_mlp": 1.01761961, + "epoch": 0.694062828799038, + "flos": 18185593495680.0, + "grad_norm": 1.7353081441218665, + "language_loss": 0.76104534, + "learning_rate": 8.548941201932191e-07, + "loss": 0.78182787, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37890625, + "step": 11544, + "time_per_iteration": 2.3722567558288574 + }, + { + "auxiliary_loss_clip": 0.01059558, + "auxiliary_loss_mlp": 0.01022304, + "balance_loss_clip": 1.01099682, + "balance_loss_mlp": 1.02051365, + "epoch": 0.694122952051706, + "flos": 17820576564480.0, + "grad_norm": 1.7496761794547757, + "language_loss": 0.71019757, + "learning_rate": 8.545843762856033e-07, + "loss": 0.73101616, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.390625, + "step": 11545, + "time_per_iteration": 2.370508909225464 + }, + { + "auxiliary_loss_clip": 0.01054952, + "auxiliary_loss_mlp": 0.01020689, + "balance_loss_clip": 1.01034153, + "balance_loss_mlp": 1.01891613, + "epoch": 0.694183075304374, + "flos": 21214239050880.0, + "grad_norm": 1.6497841023671944, + "language_loss": 0.79282618, + "learning_rate": 8.542746732549241e-07, + "loss": 0.81358266, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.359375, + "step": 11546, + "time_per_iteration": 2.3843963146209717 + }, + { + "auxiliary_loss_clip": 0.01057453, + "auxiliary_loss_mlp": 0.01023892, + "balance_loss_clip": 1.01307344, + "balance_loss_mlp": 1.019279, + "epoch": 0.694243198557042, + "flos": 24059381166720.0, + "grad_norm": 1.3288660461771755, + "language_loss": 0.78910935, + "learning_rate": 8.539650111122363e-07, + "loss": 0.80992281, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3828125, + "step": 11547, + "time_per_iteration": 2.4479877948760986 + }, + { + "auxiliary_loss_clip": 0.01058691, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.01793015, + "balance_loss_mlp": 1.0195272, + "epoch": 0.6943033218097099, + "flos": 21140816728320.0, + "grad_norm": 3.1180729561528113, + "language_loss": 0.80877328, + "learning_rate": 8.536553898685876e-07, + "loss": 0.82965535, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 11548, + "time_per_iteration": 2.3709657192230225 + }, + { + "auxiliary_loss_clip": 0.0100705, + "auxiliary_loss_mlp": 0.01005607, + "balance_loss_clip": 1.00475514, + "balance_loss_mlp": 1.00080025, + "epoch": 0.6943634450623779, + "flos": 57808869943680.0, + "grad_norm": 0.6911178263562192, + "language_loss": 0.53694236, + "learning_rate": 8.533458095350302e-07, + "loss": 0.55706894, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.0625, + "step": 11549, + "time_per_iteration": 3.0338873863220215 + }, + { + "auxiliary_loss_clip": 0.01056873, + "auxiliary_loss_mlp": 0.01023771, + "balance_loss_clip": 1.01196957, + "balance_loss_mlp": 1.0182519, + "epoch": 0.6944235683150458, + "flos": 30589396352640.0, + "grad_norm": 1.5292478161263885, + "language_loss": 0.73196268, + "learning_rate": 8.530362701226111e-07, + "loss": 0.75276911, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38671875, + "step": 11550, + "time_per_iteration": 3.8827273845672607 + }, + { + "auxiliary_loss_clip": 0.01060138, + "auxiliary_loss_mlp": 0.01026313, + "balance_loss_clip": 1.01451182, + "balance_loss_mlp": 1.02037477, + "epoch": 0.6944836915677138, + "flos": 19718420227200.0, + "grad_norm": 1.7142111549006143, + "language_loss": 0.74255174, + "learning_rate": 8.527267716423774e-07, + "loss": 0.76341629, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 11551, + "time_per_iteration": 2.3918135166168213 + }, + { + "auxiliary_loss_clip": 0.01057994, + "auxiliary_loss_mlp": 0.01025763, + "balance_loss_clip": 1.01428926, + "balance_loss_mlp": 1.01932371, + "epoch": 0.6945438148203817, + "flos": 24862418415360.0, + "grad_norm": 1.5572472049952308, + "language_loss": 0.84225774, + "learning_rate": 8.524173141053739e-07, + "loss": 0.86309534, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38671875, + "step": 11552, + "time_per_iteration": 2.4191043376922607 + }, + { + "auxiliary_loss_clip": 0.01056707, + "auxiliary_loss_mlp": 0.01026175, + "balance_loss_clip": 1.01476097, + "balance_loss_mlp": 1.01916802, + "epoch": 0.6946039380730498, + "flos": 33325295224320.0, + "grad_norm": 1.4894123645715933, + "language_loss": 0.67282939, + "learning_rate": 8.521078975226439e-07, + "loss": 0.69365823, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.375, + "step": 11553, + "time_per_iteration": 2.506551742553711 + }, + { + "auxiliary_loss_clip": 0.01059541, + "auxiliary_loss_mlp": 0.01022922, + "balance_loss_clip": 1.01009548, + "balance_loss_mlp": 1.01943052, + "epoch": 0.6946640613257177, + "flos": 20849885435520.0, + "grad_norm": 1.5636652032436469, + "language_loss": 0.7959134, + "learning_rate": 8.517985219052317e-07, + "loss": 0.81673801, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40039062, + "step": 11554, + "time_per_iteration": 2.396320104598999 + }, + { + "auxiliary_loss_clip": 0.01059004, + "auxiliary_loss_mlp": 0.01024242, + "balance_loss_clip": 1.01266074, + "balance_loss_mlp": 1.01909304, + "epoch": 0.6947241845783857, + "flos": 19353822232320.0, + "grad_norm": 1.6469504579866097, + "language_loss": 0.72498322, + "learning_rate": 8.514891872641751e-07, + "loss": 0.74581563, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40039062, + "step": 11555, + "time_per_iteration": 2.432352066040039 + }, + { + "auxiliary_loss_clip": 0.01056633, + "auxiliary_loss_mlp": 0.01024276, + "balance_loss_clip": 1.01221204, + "balance_loss_mlp": 1.01910353, + "epoch": 0.6947843078310536, + "flos": 27119169521280.0, + "grad_norm": 1.9989867783414101, + "language_loss": 0.78678751, + "learning_rate": 8.511798936105162e-07, + "loss": 0.80759656, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.375, + "step": 11556, + "time_per_iteration": 2.4583423137664795 + }, + { + "auxiliary_loss_clip": 0.01057014, + "auxiliary_loss_mlp": 0.01024678, + "balance_loss_clip": 1.01314425, + "balance_loss_mlp": 1.01868069, + "epoch": 0.6948444310837216, + "flos": 28583845545600.0, + "grad_norm": 3.2912408694389814, + "language_loss": 0.60186458, + "learning_rate": 8.508706409552908e-07, + "loss": 0.6226815, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 11557, + "time_per_iteration": 2.482941150665283 + }, + { + "auxiliary_loss_clip": 0.01056099, + "auxiliary_loss_mlp": 0.01025671, + "balance_loss_clip": 1.01431, + "balance_loss_mlp": 1.01829267, + "epoch": 0.6949045543363896, + "flos": 15668355669120.0, + "grad_norm": 1.704783820406344, + "language_loss": 0.85199785, + "learning_rate": 8.505614293095378e-07, + "loss": 0.87281549, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 11558, + "time_per_iteration": 2.3726584911346436 + }, + { + "auxiliary_loss_clip": 0.01057768, + "auxiliary_loss_mlp": 0.01024532, + "balance_loss_clip": 1.01180613, + "balance_loss_mlp": 1.0184505, + "epoch": 0.6949646775890576, + "flos": 23258264042880.0, + "grad_norm": 1.6367991820869667, + "language_loss": 0.70720005, + "learning_rate": 8.502522586842893e-07, + "loss": 0.72802305, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39257812, + "step": 11559, + "time_per_iteration": 3.849191188812256 + }, + { + "auxiliary_loss_clip": 0.01056258, + "auxiliary_loss_mlp": 0.0102197, + "balance_loss_clip": 1.01049614, + "balance_loss_mlp": 1.0189569, + "epoch": 0.6950248008417256, + "flos": 22381455358080.0, + "grad_norm": 1.6147370729110062, + "language_loss": 0.78957355, + "learning_rate": 8.499431290905809e-07, + "loss": 0.81035578, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37304688, + "step": 11560, + "time_per_iteration": 3.8619377613067627 + }, + { + "auxiliary_loss_clip": 0.01056116, + "auxiliary_loss_mlp": 0.01022659, + "balance_loss_clip": 1.01163208, + "balance_loss_mlp": 1.01919413, + "epoch": 0.6950849240943935, + "flos": 23476226860800.0, + "grad_norm": 1.4933872146830247, + "language_loss": 0.769418, + "learning_rate": 8.496340405394437e-07, + "loss": 0.79020578, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36914062, + "step": 11561, + "time_per_iteration": 2.4656670093536377 + }, + { + "auxiliary_loss_clip": 0.01055107, + "auxiliary_loss_mlp": 0.01025927, + "balance_loss_clip": 1.01390457, + "balance_loss_mlp": 1.01680803, + "epoch": 0.6951450473470615, + "flos": 17419599066240.0, + "grad_norm": 1.9905931568333248, + "language_loss": 0.78636116, + "learning_rate": 8.493249930419089e-07, + "loss": 0.80717146, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 11562, + "time_per_iteration": 2.4283411502838135 + }, + { + "auxiliary_loss_clip": 0.01056907, + "auxiliary_loss_mlp": 0.01021374, + "balance_loss_clip": 1.00999534, + "balance_loss_mlp": 1.01859772, + "epoch": 0.6952051705997294, + "flos": 20484693947520.0, + "grad_norm": 2.0867973951270518, + "language_loss": 0.69257104, + "learning_rate": 8.490159866090043e-07, + "loss": 0.71335381, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3828125, + "step": 11563, + "time_per_iteration": 2.3882150650024414 + }, + { + "auxiliary_loss_clip": 0.01053333, + "auxiliary_loss_mlp": 0.0102461, + "balance_loss_clip": 1.01361275, + "balance_loss_mlp": 1.01737475, + "epoch": 0.6952652938523974, + "flos": 13988719203840.0, + "grad_norm": 2.313782650450377, + "language_loss": 0.73555362, + "learning_rate": 8.487070212517598e-07, + "loss": 0.75633311, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.359375, + "step": 11564, + "time_per_iteration": 2.395888090133667 + }, + { + "auxiliary_loss_clip": 0.01066215, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.01460028, + "balance_loss_mlp": 1.02225459, + "epoch": 0.6953254171050653, + "flos": 30952702627200.0, + "grad_norm": 3.2479556909129634, + "language_loss": 0.72287405, + "learning_rate": 8.483980969811994e-07, + "loss": 0.74382329, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.43945312, + "step": 11565, + "time_per_iteration": 2.461282730102539 + }, + { + "auxiliary_loss_clip": 0.01006787, + "auxiliary_loss_mlp": 0.01001524, + "balance_loss_clip": 1.00064754, + "balance_loss_mlp": 1.00067115, + "epoch": 0.6953855403577334, + "flos": 61667261804160.0, + "grad_norm": 0.8897515446812349, + "language_loss": 0.5833807, + "learning_rate": 8.480892138083482e-07, + "loss": 0.60346377, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06103516, + "step": 11566, + "time_per_iteration": 2.9048550128936768 + }, + { + "auxiliary_loss_clip": 0.01058578, + "auxiliary_loss_mlp": 0.01024779, + "balance_loss_clip": 1.01263165, + "balance_loss_mlp": 1.01803517, + "epoch": 0.6954456636104013, + "flos": 23037927252480.0, + "grad_norm": 2.7177999671552175, + "language_loss": 0.74484479, + "learning_rate": 8.477803717442305e-07, + "loss": 0.76567841, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40625, + "step": 11567, + "time_per_iteration": 3.834009885787964 + }, + { + "auxiliary_loss_clip": 0.01054255, + "auxiliary_loss_mlp": 0.01021794, + "balance_loss_clip": 1.01020098, + "balance_loss_mlp": 1.01730442, + "epoch": 0.6955057868630693, + "flos": 23917284466560.0, + "grad_norm": 1.289120769961404, + "language_loss": 0.76905614, + "learning_rate": 8.474715707998676e-07, + "loss": 0.78981662, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37109375, + "step": 11568, + "time_per_iteration": 2.481703519821167 + }, + { + "auxiliary_loss_clip": 0.01058396, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.01413131, + "balance_loss_mlp": 1.0200119, + "epoch": 0.6955659101157372, + "flos": 22593727624320.0, + "grad_norm": 1.572816379059826, + "language_loss": 0.72716689, + "learning_rate": 8.471628109862794e-07, + "loss": 0.74800587, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38476562, + "step": 11569, + "time_per_iteration": 2.4165775775909424 + }, + { + "auxiliary_loss_clip": 0.01059958, + "auxiliary_loss_mlp": 0.01027004, + "balance_loss_clip": 1.01517296, + "balance_loss_mlp": 1.02005172, + "epoch": 0.6956260333684052, + "flos": 24571347477120.0, + "grad_norm": 1.4946353599811, + "language_loss": 0.76806593, + "learning_rate": 8.468540923144845e-07, + "loss": 0.78893554, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 11570, + "time_per_iteration": 2.43357253074646 + }, + { + "auxiliary_loss_clip": 0.0105642, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.01536584, + "balance_loss_mlp": 1.01770723, + "epoch": 0.6956861566210732, + "flos": 25844944296960.0, + "grad_norm": 1.8736464532200787, + "language_loss": 0.74716872, + "learning_rate": 8.465454147955023e-07, + "loss": 0.76800978, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38671875, + "step": 11571, + "time_per_iteration": 2.439837694168091 + }, + { + "auxiliary_loss_clip": 0.01054939, + "auxiliary_loss_mlp": 0.0102291, + "balance_loss_clip": 1.01121569, + "balance_loss_mlp": 1.01894462, + "epoch": 0.6957462798737412, + "flos": 15300580740480.0, + "grad_norm": 1.7327751721463591, + "language_loss": 0.74392939, + "learning_rate": 8.462367784403457e-07, + "loss": 0.76470792, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.36132812, + "step": 11572, + "time_per_iteration": 2.3755388259887695 + }, + { + "auxiliary_loss_clip": 0.01056337, + "auxiliary_loss_mlp": 0.01022099, + "balance_loss_clip": 1.01031542, + "balance_loss_mlp": 1.01769781, + "epoch": 0.6958064031264092, + "flos": 36245360851200.0, + "grad_norm": 3.2548106853672305, + "language_loss": 0.70133024, + "learning_rate": 8.459281832600314e-07, + "loss": 0.72211456, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 11573, + "time_per_iteration": 2.5487332344055176 + }, + { + "auxiliary_loss_clip": 0.01060603, + "auxiliary_loss_mlp": 0.01022776, + "balance_loss_clip": 1.01105189, + "balance_loss_mlp": 1.01991725, + "epoch": 0.6958665263790771, + "flos": 19207710725760.0, + "grad_norm": 1.6468494373356206, + "language_loss": 0.74430799, + "learning_rate": 8.456196292655706e-07, + "loss": 0.76514173, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40625, + "step": 11574, + "time_per_iteration": 2.434858798980713 + }, + { + "auxiliary_loss_clip": 0.01058729, + "auxiliary_loss_mlp": 0.01022411, + "balance_loss_clip": 1.01110423, + "balance_loss_mlp": 1.01954496, + "epoch": 0.6959266496317451, + "flos": 21794844827520.0, + "grad_norm": 1.713561162731871, + "language_loss": 0.7460078, + "learning_rate": 8.453111164679776e-07, + "loss": 0.76681918, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.39257812, + "step": 11575, + "time_per_iteration": 2.3965299129486084 + }, + { + "auxiliary_loss_clip": 0.01058423, + "auxiliary_loss_mlp": 0.01025572, + "balance_loss_clip": 1.01339507, + "balance_loss_mlp": 1.01897514, + "epoch": 0.695986772884413, + "flos": 20557208574720.0, + "grad_norm": 1.9322510302989326, + "language_loss": 0.78751445, + "learning_rate": 8.45002644878259e-07, + "loss": 0.8083545, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39453125, + "step": 11576, + "time_per_iteration": 2.417430877685547 + }, + { + "auxiliary_loss_clip": 0.01059877, + "auxiliary_loss_mlp": 0.01025681, + "balance_loss_clip": 1.01378357, + "balance_loss_mlp": 1.01909292, + "epoch": 0.696046896137081, + "flos": 14935424163840.0, + "grad_norm": 3.032430723144245, + "language_loss": 0.56755459, + "learning_rate": 8.446942145074258e-07, + "loss": 0.5884102, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40625, + "step": 11577, + "time_per_iteration": 2.3656578063964844 + }, + { + "auxiliary_loss_clip": 0.01054178, + "auxiliary_loss_mlp": 0.0101999, + "balance_loss_clip": 1.00983334, + "balance_loss_mlp": 1.01813221, + "epoch": 0.696107019389749, + "flos": 30738824438400.0, + "grad_norm": 1.3457710022287621, + "language_loss": 0.76657784, + "learning_rate": 8.443858253664844e-07, + "loss": 0.78731954, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.36132812, + "step": 11578, + "time_per_iteration": 2.479719400405884 + }, + { + "auxiliary_loss_clip": 0.01057761, + "auxiliary_loss_mlp": 0.01025584, + "balance_loss_clip": 1.01454544, + "balance_loss_mlp": 1.0191642, + "epoch": 0.696167142642417, + "flos": 20775695063040.0, + "grad_norm": 1.8627492430772956, + "language_loss": 0.79436916, + "learning_rate": 8.440774774664401e-07, + "loss": 0.81520259, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38671875, + "step": 11579, + "time_per_iteration": 2.398063898086548 + }, + { + "auxiliary_loss_clip": 0.01054449, + "auxiliary_loss_mlp": 0.01020003, + "balance_loss_clip": 1.00894046, + "balance_loss_mlp": 1.01735735, + "epoch": 0.6962272658950849, + "flos": 22564051102080.0, + "grad_norm": 1.8097180211976995, + "language_loss": 0.72353023, + "learning_rate": 8.437691708182975e-07, + "loss": 0.74427474, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37109375, + "step": 11580, + "time_per_iteration": 2.4092466831207275 + }, + { + "auxiliary_loss_clip": 0.01058917, + "auxiliary_loss_mlp": 0.01022385, + "balance_loss_clip": 1.00995779, + "balance_loss_mlp": 1.01893473, + "epoch": 0.6962873891477529, + "flos": 22199069082240.0, + "grad_norm": 2.9217707541157916, + "language_loss": 0.65649164, + "learning_rate": 8.434609054330586e-07, + "loss": 0.67730474, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40039062, + "step": 11581, + "time_per_iteration": 2.422780752182007 + }, + { + "auxiliary_loss_clip": 0.01057391, + "auxiliary_loss_mlp": 0.01022935, + "balance_loss_clip": 1.01160967, + "balance_loss_mlp": 1.0189085, + "epoch": 0.6963475124004208, + "flos": 12489025219200.0, + "grad_norm": 2.167512787680744, + "language_loss": 0.76840341, + "learning_rate": 8.431526813217254e-07, + "loss": 0.78920668, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38476562, + "step": 11582, + "time_per_iteration": 2.3754336833953857 + }, + { + "auxiliary_loss_clip": 0.01056984, + "auxiliary_loss_mlp": 0.01024288, + "balance_loss_clip": 1.01309466, + "balance_loss_mlp": 1.01786733, + "epoch": 0.6964076356530888, + "flos": 17164139581440.0, + "grad_norm": 1.9745121665137888, + "language_loss": 0.69795537, + "learning_rate": 8.428444984952962e-07, + "loss": 0.71876812, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.390625, + "step": 11583, + "time_per_iteration": 2.38165545463562 + }, + { + "auxiliary_loss_clip": 0.0105924, + "auxiliary_loss_mlp": 0.01026124, + "balance_loss_clip": 1.01393521, + "balance_loss_mlp": 1.0198133, + "epoch": 0.6964677589057569, + "flos": 19936313222400.0, + "grad_norm": 1.9426067864757033, + "language_loss": 0.82180035, + "learning_rate": 8.425363569647712e-07, + "loss": 0.84265399, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39453125, + "step": 11584, + "time_per_iteration": 2.3950014114379883 + }, + { + "auxiliary_loss_clip": 0.01060533, + "auxiliary_loss_mlp": 0.01023684, + "balance_loss_clip": 1.01126814, + "balance_loss_mlp": 1.02044046, + "epoch": 0.6965278821584248, + "flos": 22782956526720.0, + "grad_norm": 2.268569791794884, + "language_loss": 0.80149323, + "learning_rate": 8.422282567411463e-07, + "loss": 0.82233536, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40039062, + "step": 11585, + "time_per_iteration": 2.4173247814178467 + }, + { + "auxiliary_loss_clip": 0.01054375, + "auxiliary_loss_mlp": 0.01023284, + "balance_loss_clip": 1.01244211, + "balance_loss_mlp": 1.0175935, + "epoch": 0.6965880054110928, + "flos": 20046533984640.0, + "grad_norm": 1.6737171806398161, + "language_loss": 0.77688074, + "learning_rate": 8.419201978354167e-07, + "loss": 0.79765725, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3671875, + "step": 11586, + "time_per_iteration": 2.4148612022399902 + }, + { + "auxiliary_loss_clip": 0.01051594, + "auxiliary_loss_mlp": 0.01022498, + "balance_loss_clip": 1.01241291, + "balance_loss_mlp": 1.01690078, + "epoch": 0.6966481286637607, + "flos": 21907160271360.0, + "grad_norm": 1.563034281241357, + "language_loss": 0.78270346, + "learning_rate": 8.416121802585756e-07, + "loss": 0.80344433, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.34765625, + "step": 11587, + "time_per_iteration": 2.4288203716278076 + }, + { + "auxiliary_loss_clip": 0.01056144, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.0149945, + "balance_loss_mlp": 1.01936579, + "epoch": 0.6967082519164287, + "flos": 15632255456640.0, + "grad_norm": 2.104499894644182, + "language_loss": 0.72766531, + "learning_rate": 8.413042040216173e-07, + "loss": 0.74849069, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3671875, + "step": 11588, + "time_per_iteration": 2.3855245113372803 + }, + { + "auxiliary_loss_clip": 0.01055548, + "auxiliary_loss_mlp": 0.01023311, + "balance_loss_clip": 1.01267135, + "balance_loss_mlp": 1.01786995, + "epoch": 0.6967683751690966, + "flos": 24023455511040.0, + "grad_norm": 1.887320285612445, + "language_loss": 0.75216579, + "learning_rate": 8.409962691355303e-07, + "loss": 0.77295446, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37695312, + "step": 11589, + "time_per_iteration": 3.8515000343322754 + }, + { + "auxiliary_loss_clip": 0.01058075, + "auxiliary_loss_mlp": 0.01024663, + "balance_loss_clip": 1.01290274, + "balance_loss_mlp": 1.01868796, + "epoch": 0.6968284984217646, + "flos": 31023506597760.0, + "grad_norm": 4.091222977359106, + "language_loss": 0.63189566, + "learning_rate": 8.406883756113059e-07, + "loss": 0.65272295, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.39453125, + "step": 11590, + "time_per_iteration": 2.4701828956604004 + }, + { + "auxiliary_loss_clip": 0.0100739, + "auxiliary_loss_mlp": 0.0100036, + "balance_loss_clip": 0.9993943, + "balance_loss_mlp": 1.0012995, + "epoch": 0.6968886216744326, + "flos": 67619673590400.0, + "grad_norm": 0.7717877708492105, + "language_loss": 0.61280918, + "learning_rate": 8.403805234599311e-07, + "loss": 0.63288677, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.06103516, + "step": 11591, + "time_per_iteration": 3.1584770679473877 + }, + { + "auxiliary_loss_clip": 0.01059213, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.01518643, + "balance_loss_mlp": 1.01939559, + "epoch": 0.6969487449271006, + "flos": 24862523149440.0, + "grad_norm": 1.815671405650028, + "language_loss": 0.69228065, + "learning_rate": 8.400727126923926e-07, + "loss": 0.71314287, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 11592, + "time_per_iteration": 2.4369304180145264 + }, + { + "auxiliary_loss_clip": 0.01055948, + "auxiliary_loss_mlp": 0.0102066, + "balance_loss_clip": 1.00968683, + "balance_loss_mlp": 1.01905382, + "epoch": 0.6970088681797685, + "flos": 28766580935040.0, + "grad_norm": 1.7240237918397592, + "language_loss": 0.62357879, + "learning_rate": 8.397649433196742e-07, + "loss": 0.64434481, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.36914062, + "step": 11593, + "time_per_iteration": 2.4687740802764893 + }, + { + "auxiliary_loss_clip": 0.01053656, + "auxiliary_loss_mlp": 0.01019872, + "balance_loss_clip": 1.00947094, + "balance_loss_mlp": 1.01855767, + "epoch": 0.6970689914324365, + "flos": 27307316171520.0, + "grad_norm": 1.3739315408035249, + "language_loss": 0.54225957, + "learning_rate": 8.394572153527617e-07, + "loss": 0.56299484, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.3515625, + "step": 11594, + "time_per_iteration": 2.4652693271636963 + }, + { + "auxiliary_loss_clip": 0.01054898, + "auxiliary_loss_mlp": 0.01021367, + "balance_loss_clip": 1.01025677, + "balance_loss_mlp": 1.01749754, + "epoch": 0.6971291146851044, + "flos": 19135231009920.0, + "grad_norm": 2.2270881250152574, + "language_loss": 0.76985133, + "learning_rate": 8.391495288026365e-07, + "loss": 0.79061395, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37304688, + "step": 11595, + "time_per_iteration": 2.4320034980773926 + }, + { + "auxiliary_loss_clip": 0.01061321, + "auxiliary_loss_mlp": 0.0102489, + "balance_loss_clip": 1.01219416, + "balance_loss_mlp": 1.02040839, + "epoch": 0.6971892379377724, + "flos": 14609649467520.0, + "grad_norm": 2.1999942293134462, + "language_loss": 0.71370578, + "learning_rate": 8.388418836802771e-07, + "loss": 0.73456794, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 11596, + "time_per_iteration": 2.369732618331909 + }, + { + "auxiliary_loss_clip": 0.0105888, + "auxiliary_loss_mlp": 0.01025494, + "balance_loss_clip": 1.01352525, + "balance_loss_mlp": 1.0203464, + "epoch": 0.6972493611904405, + "flos": 22306427112960.0, + "grad_norm": 1.7010348276613103, + "language_loss": 0.76273173, + "learning_rate": 8.385342799966646e-07, + "loss": 0.78357548, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38476562, + "step": 11597, + "time_per_iteration": 2.428927183151245 + }, + { + "auxiliary_loss_clip": 0.01056598, + "auxiliary_loss_mlp": 0.0102391, + "balance_loss_clip": 1.01270485, + "balance_loss_mlp": 1.01893306, + "epoch": 0.6973094844431084, + "flos": 17419424509440.0, + "grad_norm": 2.0503737398634074, + "language_loss": 0.74775612, + "learning_rate": 8.382267177627762e-07, + "loss": 0.76856124, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37695312, + "step": 11598, + "time_per_iteration": 3.7944259643554688 + }, + { + "auxiliary_loss_clip": 0.01060141, + "auxiliary_loss_mlp": 0.01025903, + "balance_loss_clip": 1.01358318, + "balance_loss_mlp": 1.01984024, + "epoch": 0.6973696076957764, + "flos": 27234138228480.0, + "grad_norm": 1.8556924198053064, + "language_loss": 0.89722806, + "learning_rate": 8.379191969895876e-07, + "loss": 0.9180885, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 11599, + "time_per_iteration": 3.9041433334350586 + }, + { + "auxiliary_loss_clip": 0.01060798, + "auxiliary_loss_mlp": 0.01027654, + "balance_loss_clip": 1.01427221, + "balance_loss_mlp": 1.01958799, + "epoch": 0.6974297309484443, + "flos": 22016997008640.0, + "grad_norm": 2.5048727023857595, + "language_loss": 0.70126086, + "learning_rate": 8.37611717688073e-07, + "loss": 0.72214544, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 11600, + "time_per_iteration": 2.4482600688934326 + }, + { + "auxiliary_loss_clip": 0.01057344, + "auxiliary_loss_mlp": 0.01021593, + "balance_loss_clip": 1.01006591, + "balance_loss_mlp": 1.01804495, + "epoch": 0.6974898542011123, + "flos": 28365184500480.0, + "grad_norm": 2.1652871462856043, + "language_loss": 0.7677995, + "learning_rate": 8.37304279869207e-07, + "loss": 0.78858888, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 11601, + "time_per_iteration": 2.4721298217773438 + }, + { + "auxiliary_loss_clip": 0.01054163, + "auxiliary_loss_mlp": 0.01023344, + "balance_loss_clip": 1.01257932, + "balance_loss_mlp": 1.01844871, + "epoch": 0.6975499774537802, + "flos": 15231138312960.0, + "grad_norm": 1.8010813967439945, + "language_loss": 0.7176035, + "learning_rate": 8.369968835439604e-07, + "loss": 0.73837858, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.35742188, + "step": 11602, + "time_per_iteration": 2.410259485244751 + }, + { + "auxiliary_loss_clip": 0.01056099, + "auxiliary_loss_mlp": 0.01024835, + "balance_loss_clip": 1.01371861, + "balance_loss_mlp": 1.01787424, + "epoch": 0.6976101007064482, + "flos": 22156510268160.0, + "grad_norm": 3.227678999041087, + "language_loss": 0.76474613, + "learning_rate": 8.366895287233033e-07, + "loss": 0.78555548, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.3828125, + "step": 11603, + "time_per_iteration": 2.394674777984619 + }, + { + "auxiliary_loss_clip": 0.01055589, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_clip": 1.01238048, + "balance_loss_mlp": 1.01783395, + "epoch": 0.6976702239591162, + "flos": 22272421582080.0, + "grad_norm": 1.5404749035713936, + "language_loss": 0.62320578, + "learning_rate": 8.363822154182039e-07, + "loss": 0.64399713, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 11604, + "time_per_iteration": 2.432276487350464 + }, + { + "auxiliary_loss_clip": 0.01058692, + "auxiliary_loss_mlp": 0.01025193, + "balance_loss_clip": 1.01267052, + "balance_loss_mlp": 1.01933622, + "epoch": 0.6977303472117842, + "flos": 25847423003520.0, + "grad_norm": 2.1267047515175306, + "language_loss": 0.81346488, + "learning_rate": 8.360749436396315e-07, + "loss": 0.83430374, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 11605, + "time_per_iteration": 2.412855625152588 + }, + { + "auxiliary_loss_clip": 0.01057611, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.01494193, + "balance_loss_mlp": 1.01901817, + "epoch": 0.6977904704644521, + "flos": 20958535186560.0, + "grad_norm": 1.7653885175526292, + "language_loss": 0.70317972, + "learning_rate": 8.35767713398549e-07, + "loss": 0.72402227, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 11606, + "time_per_iteration": 3.801241874694824 + }, + { + "auxiliary_loss_clip": 0.01058691, + "auxiliary_loss_mlp": 0.01026594, + "balance_loss_clip": 1.014727, + "balance_loss_mlp": 1.01849484, + "epoch": 0.6978505937171201, + "flos": 22053935093760.0, + "grad_norm": 1.9553472566166317, + "language_loss": 0.82355011, + "learning_rate": 8.354605247059228e-07, + "loss": 0.84440297, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 11607, + "time_per_iteration": 2.4186203479766846 + }, + { + "auxiliary_loss_clip": 0.01061525, + "auxiliary_loss_mlp": 0.01023725, + "balance_loss_clip": 1.01183987, + "balance_loss_mlp": 1.02057588, + "epoch": 0.697910716969788, + "flos": 20042798469120.0, + "grad_norm": 1.8241352695096273, + "language_loss": 0.79071033, + "learning_rate": 8.351533775727147e-07, + "loss": 0.81156284, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41015625, + "step": 11608, + "time_per_iteration": 2.402369737625122 + }, + { + "auxiliary_loss_clip": 0.01057447, + "auxiliary_loss_mlp": 0.01026747, + "balance_loss_clip": 1.01485026, + "balance_loss_mlp": 1.0193665, + "epoch": 0.697970840222456, + "flos": 15887330916480.0, + "grad_norm": 2.114531099361619, + "language_loss": 0.89861166, + "learning_rate": 8.348462720098863e-07, + "loss": 0.91945362, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38085938, + "step": 11609, + "time_per_iteration": 2.358245849609375 + }, + { + "auxiliary_loss_clip": 0.0105765, + "auxiliary_loss_mlp": 0.01021898, + "balance_loss_clip": 1.01028717, + "balance_loss_mlp": 1.01764321, + "epoch": 0.698030963475124, + "flos": 21214553253120.0, + "grad_norm": 1.7562175564281766, + "language_loss": 0.64180887, + "learning_rate": 8.345392080283972e-07, + "loss": 0.66260439, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40039062, + "step": 11610, + "time_per_iteration": 2.4086973667144775 + }, + { + "auxiliary_loss_clip": 0.01056096, + "auxiliary_loss_mlp": 0.01022983, + "balance_loss_clip": 1.01231933, + "balance_loss_mlp": 1.01807594, + "epoch": 0.698091086727792, + "flos": 33758497774080.0, + "grad_norm": 1.7530104267020925, + "language_loss": 0.84446657, + "learning_rate": 8.342321856392054e-07, + "loss": 0.86525738, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.38085938, + "step": 11611, + "time_per_iteration": 2.5036981105804443 + }, + { + "auxiliary_loss_clip": 0.01057921, + "auxiliary_loss_mlp": 0.01025843, + "balance_loss_clip": 1.01392245, + "balance_loss_mlp": 1.01824296, + "epoch": 0.69815120998046, + "flos": 15886946891520.0, + "grad_norm": 2.617884140612428, + "language_loss": 0.70986336, + "learning_rate": 8.339252048532695e-07, + "loss": 0.73070103, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 11612, + "time_per_iteration": 2.4422593116760254 + }, + { + "auxiliary_loss_clip": 0.01055403, + "auxiliary_loss_mlp": 0.01025393, + "balance_loss_clip": 1.01451564, + "balance_loss_mlp": 1.01810014, + "epoch": 0.6982113332331279, + "flos": 18946211575680.0, + "grad_norm": 2.0206999356382425, + "language_loss": 0.80864722, + "learning_rate": 8.33618265681542e-07, + "loss": 0.82945514, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37304688, + "step": 11613, + "time_per_iteration": 2.398373603820801 + }, + { + "auxiliary_loss_clip": 0.01055746, + "auxiliary_loss_mlp": 0.01022078, + "balance_loss_clip": 1.01143229, + "balance_loss_mlp": 1.01866078, + "epoch": 0.6982714564857959, + "flos": 24388437530880.0, + "grad_norm": 2.1050597399765247, + "language_loss": 0.7533927, + "learning_rate": 8.333113681349792e-07, + "loss": 0.77417088, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37109375, + "step": 11614, + "time_per_iteration": 2.4367921352386475 + }, + { + "auxiliary_loss_clip": 0.01058675, + "auxiliary_loss_mlp": 0.01022924, + "balance_loss_clip": 1.01186156, + "balance_loss_mlp": 1.02041483, + "epoch": 0.6983315797384638, + "flos": 20082704019840.0, + "grad_norm": 2.7262857000950254, + "language_loss": 0.77584219, + "learning_rate": 8.330045122245326e-07, + "loss": 0.79665828, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.3828125, + "step": 11615, + "time_per_iteration": 2.3845553398132324 + }, + { + "auxiliary_loss_clip": 0.0105507, + "auxiliary_loss_mlp": 0.01021739, + "balance_loss_clip": 1.01096821, + "balance_loss_mlp": 1.01854992, + "epoch": 0.6983917029911318, + "flos": 13511701031040.0, + "grad_norm": 2.5906792347461196, + "language_loss": 0.72749507, + "learning_rate": 8.326976979611528e-07, + "loss": 0.74826318, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36523438, + "step": 11616, + "time_per_iteration": 2.387112617492676 + }, + { + "auxiliary_loss_clip": 0.01056738, + "auxiliary_loss_mlp": 0.0102507, + "balance_loss_clip": 1.01355481, + "balance_loss_mlp": 1.01975834, + "epoch": 0.6984518262437998, + "flos": 22017311210880.0, + "grad_norm": 1.620568119741991, + "language_loss": 0.84447932, + "learning_rate": 8.323909253557891e-07, + "loss": 0.86529738, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36914062, + "step": 11617, + "time_per_iteration": 2.438170909881592 + }, + { + "auxiliary_loss_clip": 0.01060156, + "auxiliary_loss_mlp": 0.01027025, + "balance_loss_clip": 1.01590896, + "balance_loss_mlp": 1.02069402, + "epoch": 0.6985119494964678, + "flos": 18769620585600.0, + "grad_norm": 2.545982818353417, + "language_loss": 0.59756887, + "learning_rate": 8.320841944193904e-07, + "loss": 0.61844075, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.39453125, + "step": 11618, + "time_per_iteration": 2.4339380264282227 + }, + { + "auxiliary_loss_clip": 0.0105771, + "auxiliary_loss_mlp": 0.01025406, + "balance_loss_clip": 1.01450396, + "balance_loss_mlp": 1.019418, + "epoch": 0.6985720727491357, + "flos": 22381734648960.0, + "grad_norm": 1.7391461453538866, + "language_loss": 0.82841939, + "learning_rate": 8.317775051629026e-07, + "loss": 0.84925056, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.3828125, + "step": 11619, + "time_per_iteration": 2.4378249645233154 + }, + { + "auxiliary_loss_clip": 0.01053257, + "auxiliary_loss_mlp": 0.01021347, + "balance_loss_clip": 1.01113105, + "balance_loss_mlp": 1.01736236, + "epoch": 0.6986321960018037, + "flos": 39566299242240.0, + "grad_norm": 1.8117774071807498, + "language_loss": 0.67020983, + "learning_rate": 8.314708575972706e-07, + "loss": 0.69095588, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.359375, + "step": 11620, + "time_per_iteration": 2.605787992477417 + }, + { + "auxiliary_loss_clip": 0.01057678, + "auxiliary_loss_mlp": 0.01023709, + "balance_loss_clip": 1.01163268, + "balance_loss_mlp": 1.01790285, + "epoch": 0.6986923192544716, + "flos": 17966757893760.0, + "grad_norm": 1.9517035971962295, + "language_loss": 0.73213136, + "learning_rate": 8.311642517334371e-07, + "loss": 0.7529453, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 11621, + "time_per_iteration": 2.4287145137786865 + }, + { + "auxiliary_loss_clip": 0.01058506, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.01588321, + "balance_loss_mlp": 1.01860547, + "epoch": 0.6987524425071396, + "flos": 25593115593600.0, + "grad_norm": 1.6218288277090944, + "language_loss": 0.68517512, + "learning_rate": 8.308576875823463e-07, + "loss": 0.70603561, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3984375, + "step": 11622, + "time_per_iteration": 2.518447160720825 + }, + { + "auxiliary_loss_clip": 0.01051607, + "auxiliary_loss_mlp": 0.01019119, + "balance_loss_clip": 1.00840807, + "balance_loss_mlp": 1.01591587, + "epoch": 0.6988125657598077, + "flos": 17529121601280.0, + "grad_norm": 1.5776753099475354, + "language_loss": 0.62579995, + "learning_rate": 8.305511651549359e-07, + "loss": 0.6465072, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.35742188, + "step": 11623, + "time_per_iteration": 2.360358476638794 + }, + { + "auxiliary_loss_clip": 0.010576, + "auxiliary_loss_mlp": 0.01023325, + "balance_loss_clip": 1.0118928, + "balance_loss_mlp": 1.01760387, + "epoch": 0.6988726890124756, + "flos": 39164169669120.0, + "grad_norm": 3.447938780086215, + "language_loss": 0.60454154, + "learning_rate": 8.302446844621469e-07, + "loss": 0.62535083, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3984375, + "step": 11624, + "time_per_iteration": 2.598320960998535 + }, + { + "auxiliary_loss_clip": 0.0105815, + "auxiliary_loss_mlp": 0.0102629, + "balance_loss_clip": 1.01367164, + "balance_loss_mlp": 1.01796651, + "epoch": 0.6989328122651436, + "flos": 20192436023040.0, + "grad_norm": 1.6251976549197287, + "language_loss": 0.78168577, + "learning_rate": 8.299382455149169e-07, + "loss": 0.80253017, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40234375, + "step": 11625, + "time_per_iteration": 2.3882265090942383 + }, + { + "auxiliary_loss_clip": 0.01054366, + "auxiliary_loss_mlp": 0.01018223, + "balance_loss_clip": 1.00776839, + "balance_loss_mlp": 1.01839399, + "epoch": 0.6989929355178115, + "flos": 21833807771520.0, + "grad_norm": 1.7494608961074176, + "language_loss": 0.65870583, + "learning_rate": 8.296318483241797e-07, + "loss": 0.67943174, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.359375, + "step": 11626, + "time_per_iteration": 2.3939595222473145 + }, + { + "auxiliary_loss_clip": 0.01058463, + "auxiliary_loss_mlp": 0.0102604, + "balance_loss_clip": 1.01338542, + "balance_loss_mlp": 1.01949024, + "epoch": 0.6990530587704795, + "flos": 26321683178880.0, + "grad_norm": 1.7151432934033417, + "language_loss": 0.70482016, + "learning_rate": 8.293254929008719e-07, + "loss": 0.72566515, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 11627, + "time_per_iteration": 2.4258158206939697 + }, + { + "auxiliary_loss_clip": 0.01055351, + "auxiliary_loss_mlp": 0.01020074, + "balance_loss_clip": 1.00965464, + "balance_loss_mlp": 1.01931179, + "epoch": 0.6991131820231474, + "flos": 19827942762240.0, + "grad_norm": 1.7966939797867123, + "language_loss": 0.62401986, + "learning_rate": 8.290191792559253e-07, + "loss": 0.64477414, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.359375, + "step": 11628, + "time_per_iteration": 3.8171331882476807 + }, + { + "auxiliary_loss_clip": 0.01057298, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.01363397, + "balance_loss_mlp": 1.01864374, + "epoch": 0.6991733052758154, + "flos": 33983407952640.0, + "grad_norm": 2.2729298386839694, + "language_loss": 0.68213117, + "learning_rate": 8.287129074002735e-07, + "loss": 0.70296067, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 11629, + "time_per_iteration": 2.4879562854766846 + }, + { + "auxiliary_loss_clip": 0.01058239, + "auxiliary_loss_mlp": 0.01024088, + "balance_loss_clip": 1.0130074, + "balance_loss_mlp": 1.01958287, + "epoch": 0.6992334285284834, + "flos": 15632220545280.0, + "grad_norm": 1.7130019882575527, + "language_loss": 0.72539163, + "learning_rate": 8.284066773448437e-07, + "loss": 0.74621487, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.38671875, + "step": 11630, + "time_per_iteration": 2.385709285736084 + }, + { + "auxiliary_loss_clip": 0.01056811, + "auxiliary_loss_mlp": 0.0102325, + "balance_loss_clip": 1.01195467, + "balance_loss_mlp": 1.01949239, + "epoch": 0.6992935517811514, + "flos": 21725192931840.0, + "grad_norm": 1.6193117169731328, + "language_loss": 0.73861331, + "learning_rate": 8.281004891005666e-07, + "loss": 0.75941396, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37304688, + "step": 11631, + "time_per_iteration": 2.3841307163238525 + }, + { + "auxiliary_loss_clip": 0.0106117, + "auxiliary_loss_mlp": 0.01024025, + "balance_loss_clip": 1.01103091, + "balance_loss_mlp": 1.02074373, + "epoch": 0.6993536750338193, + "flos": 20114370489600.0, + "grad_norm": 1.9437750689012478, + "language_loss": 0.89198244, + "learning_rate": 8.277943426783684e-07, + "loss": 0.91283441, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 11632, + "time_per_iteration": 2.3728206157684326 + }, + { + "auxiliary_loss_clip": 0.0105896, + "auxiliary_loss_mlp": 0.01024931, + "balance_loss_clip": 1.0129621, + "balance_loss_mlp": 1.01955187, + "epoch": 0.6994137982864873, + "flos": 22009665623040.0, + "grad_norm": 1.4808211614648914, + "language_loss": 0.80017281, + "learning_rate": 8.274882380891752e-07, + "loss": 0.82101172, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 11633, + "time_per_iteration": 2.4030184745788574 + }, + { + "auxiliary_loss_clip": 0.01060052, + "auxiliary_loss_mlp": 0.01024324, + "balance_loss_clip": 1.01117504, + "balance_loss_mlp": 1.01982236, + "epoch": 0.6994739215391552, + "flos": 25517877880320.0, + "grad_norm": 2.224671625739836, + "language_loss": 0.67139125, + "learning_rate": 8.271821753439097e-07, + "loss": 0.69223499, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40234375, + "step": 11634, + "time_per_iteration": 2.4552290439605713 + }, + { + "auxiliary_loss_clip": 0.01007566, + "auxiliary_loss_mlp": 0.01002922, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00115728, + "epoch": 0.6995340447918232, + "flos": 59125095400320.0, + "grad_norm": 0.72669142670471, + "language_loss": 0.53214353, + "learning_rate": 8.26876154453497e-07, + "loss": 0.55224842, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.06445312, + "step": 11635, + "time_per_iteration": 3.1659693717956543 + }, + { + "auxiliary_loss_clip": 0.01056759, + "auxiliary_loss_mlp": 0.01020353, + "balance_loss_clip": 1.00924826, + "balance_loss_mlp": 1.01822186, + "epoch": 0.6995941680444913, + "flos": 17966862627840.0, + "grad_norm": 1.505458068399106, + "language_loss": 0.82315534, + "learning_rate": 8.265701754288554e-07, + "loss": 0.84392643, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38671875, + "step": 11636, + "time_per_iteration": 2.4222090244293213 + }, + { + "auxiliary_loss_clip": 0.01058801, + "auxiliary_loss_mlp": 0.01022934, + "balance_loss_clip": 1.01091814, + "balance_loss_mlp": 1.01899719, + "epoch": 0.6996542912971592, + "flos": 21979046494080.0, + "grad_norm": 1.8414446251443022, + "language_loss": 0.8291266, + "learning_rate": 8.262642382809064e-07, + "loss": 0.849944, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 11637, + "time_per_iteration": 3.8376030921936035 + }, + { + "auxiliary_loss_clip": 0.01055742, + "auxiliary_loss_mlp": 0.01024042, + "balance_loss_clip": 1.01254988, + "balance_loss_mlp": 1.01730001, + "epoch": 0.6997144145498272, + "flos": 11685534122880.0, + "grad_norm": 1.777207183421924, + "language_loss": 0.67649049, + "learning_rate": 8.259583430205668e-07, + "loss": 0.69728833, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38476562, + "step": 11638, + "time_per_iteration": 2.3427529335021973 + }, + { + "auxiliary_loss_clip": 0.01061598, + "auxiliary_loss_mlp": 0.01027643, + "balance_loss_clip": 1.01429796, + "balance_loss_mlp": 1.02024472, + "epoch": 0.6997745378024951, + "flos": 29605858041600.0, + "grad_norm": 2.23734871860766, + "language_loss": 0.7811445, + "learning_rate": 8.256524896587555e-07, + "loss": 0.802037, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 11639, + "time_per_iteration": 3.854995012283325 + }, + { + "auxiliary_loss_clip": 0.01059922, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.01926017, + "balance_loss_mlp": 1.02065539, + "epoch": 0.6998346610551631, + "flos": 20885566711680.0, + "grad_norm": 2.276820835779646, + "language_loss": 0.82064664, + "learning_rate": 8.253466782063854e-07, + "loss": 0.84156364, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 11640, + "time_per_iteration": 2.393209457397461 + }, + { + "auxiliary_loss_clip": 0.010566, + "auxiliary_loss_mlp": 0.01020386, + "balance_loss_clip": 1.00921035, + "balance_loss_mlp": 1.01778209, + "epoch": 0.699894784307831, + "flos": 27161798158080.0, + "grad_norm": 2.652548328012818, + "language_loss": 0.75499767, + "learning_rate": 8.250409086743699e-07, + "loss": 0.77576756, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.38867188, + "step": 11641, + "time_per_iteration": 2.4801523685455322 + }, + { + "auxiliary_loss_clip": 0.01057776, + "auxiliary_loss_mlp": 0.01022078, + "balance_loss_clip": 1.00972772, + "balance_loss_mlp": 1.01939166, + "epoch": 0.699954907560499, + "flos": 20922574619520.0, + "grad_norm": 2.0220349963947877, + "language_loss": 0.7782768, + "learning_rate": 8.247351810736234e-07, + "loss": 0.79907537, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.3828125, + "step": 11642, + "time_per_iteration": 2.3812668323516846 + }, + { + "auxiliary_loss_clip": 0.01062946, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.01613045, + "balance_loss_mlp": 1.02052939, + "epoch": 0.700015030813167, + "flos": 28656534729600.0, + "grad_norm": 2.243630225037215, + "language_loss": 0.74914265, + "learning_rate": 8.244294954150539e-07, + "loss": 0.7700727, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.42578125, + "step": 11643, + "time_per_iteration": 2.458421468734741 + }, + { + "auxiliary_loss_clip": 0.01058557, + "auxiliary_loss_mlp": 0.01024402, + "balance_loss_clip": 1.01234424, + "balance_loss_mlp": 1.01890469, + "epoch": 0.700075154065835, + "flos": 29204007759360.0, + "grad_norm": 1.480750435007196, + "language_loss": 0.60357463, + "learning_rate": 8.241238517095723e-07, + "loss": 0.62440419, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39648438, + "step": 11644, + "time_per_iteration": 2.455565929412842 + }, + { + "auxiliary_loss_clip": 0.01056825, + "auxiliary_loss_mlp": 0.01026008, + "balance_loss_clip": 1.0145216, + "balance_loss_mlp": 1.01908851, + "epoch": 0.7001352773185029, + "flos": 23111314663680.0, + "grad_norm": 1.6057110461568351, + "language_loss": 0.81793225, + "learning_rate": 8.238182499680853e-07, + "loss": 0.83876055, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37890625, + "step": 11645, + "time_per_iteration": 2.4412431716918945 + }, + { + "auxiliary_loss_clip": 0.01057557, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.01512742, + "balance_loss_mlp": 1.01930141, + "epoch": 0.7001954005711709, + "flos": 21321841461120.0, + "grad_norm": 2.016072442165029, + "language_loss": 0.6824671, + "learning_rate": 8.235126902015006e-07, + "loss": 0.70331395, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 11646, + "time_per_iteration": 3.9557273387908936 + }, + { + "auxiliary_loss_clip": 0.0105799, + "auxiliary_loss_mlp": 0.01023724, + "balance_loss_clip": 1.01274431, + "balance_loss_mlp": 1.01940846, + "epoch": 0.7002555238238388, + "flos": 24534653771520.0, + "grad_norm": 2.069093606519249, + "language_loss": 0.71064687, + "learning_rate": 8.232071724207204e-07, + "loss": 0.73146403, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.38671875, + "step": 11647, + "time_per_iteration": 2.4422528743743896 + }, + { + "auxiliary_loss_clip": 0.01061229, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.01704931, + "balance_loss_mlp": 1.02039385, + "epoch": 0.7003156470765068, + "flos": 39054996247680.0, + "grad_norm": 1.6648471382641865, + "language_loss": 0.71708322, + "learning_rate": 8.229016966366498e-07, + "loss": 0.737988, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40820312, + "step": 11648, + "time_per_iteration": 2.5724682807922363 + }, + { + "auxiliary_loss_clip": 0.01056403, + "auxiliary_loss_mlp": 0.01027457, + "balance_loss_clip": 1.01561403, + "balance_loss_mlp": 1.01777005, + "epoch": 0.7003757703291749, + "flos": 28802820792960.0, + "grad_norm": 1.4791721512014842, + "language_loss": 0.77418905, + "learning_rate": 8.225962628601897e-07, + "loss": 0.79502767, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38671875, + "step": 11649, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.01058802, + "auxiliary_loss_mlp": 0.01022806, + "balance_loss_clip": 1.01131415, + "balance_loss_mlp": 1.01974332, + "epoch": 0.7004358935818428, + "flos": 15953142562560.0, + "grad_norm": 1.7805245909807559, + "language_loss": 0.78499019, + "learning_rate": 8.222908711022404e-07, + "loss": 0.80580628, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 11650, + "time_per_iteration": 2.4261298179626465 + }, + { + "auxiliary_loss_clip": 0.01055673, + "auxiliary_loss_mlp": 0.01021829, + "balance_loss_clip": 1.01017034, + "balance_loss_mlp": 1.01714575, + "epoch": 0.7004960168345108, + "flos": 20410957422720.0, + "grad_norm": 2.4183150706762095, + "language_loss": 0.78473413, + "learning_rate": 8.219855213736999e-07, + "loss": 0.80550921, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38476562, + "step": 11651, + "time_per_iteration": 2.3866071701049805 + }, + { + "auxiliary_loss_clip": 0.01058914, + "auxiliary_loss_mlp": 0.01024273, + "balance_loss_clip": 1.01284099, + "balance_loss_mlp": 1.01933861, + "epoch": 0.7005561400871787, + "flos": 17346595680000.0, + "grad_norm": 1.8505195309158187, + "language_loss": 0.69213367, + "learning_rate": 8.216802136854673e-07, + "loss": 0.71296555, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39648438, + "step": 11652, + "time_per_iteration": 2.4045352935791016 + }, + { + "auxiliary_loss_clip": 0.0105737, + "auxiliary_loss_mlp": 0.01023694, + "balance_loss_clip": 1.01142156, + "balance_loss_mlp": 1.01922989, + "epoch": 0.7006162633398467, + "flos": 25300927491840.0, + "grad_norm": 1.3587706162479525, + "language_loss": 0.74267346, + "learning_rate": 8.213749480484353e-07, + "loss": 0.76348412, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38085938, + "step": 11653, + "time_per_iteration": 2.42864990234375 + }, + { + "auxiliary_loss_clip": 0.01058904, + "auxiliary_loss_mlp": 0.01023245, + "balance_loss_clip": 1.01124656, + "balance_loss_mlp": 1.0193094, + "epoch": 0.7006763865925146, + "flos": 20667918096000.0, + "grad_norm": 1.9199105740819855, + "language_loss": 0.70076901, + "learning_rate": 8.210697244735006e-07, + "loss": 0.72159052, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39648438, + "step": 11654, + "time_per_iteration": 2.4615890979766846 + }, + { + "auxiliary_loss_clip": 0.01059915, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.01478314, + "balance_loss_mlp": 1.01913095, + "epoch": 0.7007365098451827, + "flos": 20045451732480.0, + "grad_norm": 2.63391301075869, + "language_loss": 0.72692895, + "learning_rate": 8.207645429715546e-07, + "loss": 0.74781293, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.40625, + "step": 11655, + "time_per_iteration": 2.370312213897705 + }, + { + "auxiliary_loss_clip": 0.01065763, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.01622677, + "balance_loss_mlp": 1.02251601, + "epoch": 0.7007966330978506, + "flos": 20776323467520.0, + "grad_norm": 1.8255486803474437, + "language_loss": 0.7052474, + "learning_rate": 8.204594035534888e-07, + "loss": 0.72620714, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.43359375, + "step": 11656, + "time_per_iteration": 2.397951364517212 + }, + { + "auxiliary_loss_clip": 0.01053796, + "auxiliary_loss_mlp": 0.01022591, + "balance_loss_clip": 1.01188004, + "balance_loss_mlp": 1.01725328, + "epoch": 0.7008567563505186, + "flos": 29637035752320.0, + "grad_norm": 1.8015158670856564, + "language_loss": 0.77652901, + "learning_rate": 8.201543062301928e-07, + "loss": 0.79729295, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36523438, + "step": 11657, + "time_per_iteration": 2.4563724994659424 + }, + { + "auxiliary_loss_clip": 0.01061128, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01719284, + "balance_loss_mlp": 1.01979136, + "epoch": 0.7009168796031865, + "flos": 17091066372480.0, + "grad_norm": 2.5424994928387905, + "language_loss": 0.78338742, + "learning_rate": 8.198492510125541e-07, + "loss": 0.80430019, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.4140625, + "step": 11658, + "time_per_iteration": 2.3695080280303955 + }, + { + "auxiliary_loss_clip": 0.01055642, + "auxiliary_loss_mlp": 0.01022105, + "balance_loss_clip": 1.01108372, + "balance_loss_mlp": 1.01802361, + "epoch": 0.7009770028558545, + "flos": 20447930419200.0, + "grad_norm": 1.7801011021537572, + "language_loss": 0.8258698, + "learning_rate": 8.19544237911461e-07, + "loss": 0.84664726, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 11659, + "time_per_iteration": 2.40639328956604 + }, + { + "auxiliary_loss_clip": 0.01055026, + "auxiliary_loss_mlp": 0.01023042, + "balance_loss_clip": 1.01206303, + "balance_loss_mlp": 1.01783276, + "epoch": 0.7010371261085224, + "flos": 19244125140480.0, + "grad_norm": 2.2327337883868346, + "language_loss": 0.6930142, + "learning_rate": 8.192392669377963e-07, + "loss": 0.71379483, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37109375, + "step": 11660, + "time_per_iteration": 2.4475629329681396 + }, + { + "auxiliary_loss_clip": 0.01058966, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.01896584, + "balance_loss_mlp": 1.01869392, + "epoch": 0.7010972493611904, + "flos": 22126484632320.0, + "grad_norm": 1.7468345899951543, + "language_loss": 0.8102026, + "learning_rate": 8.189343381024456e-07, + "loss": 0.83111501, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40234375, + "step": 11661, + "time_per_iteration": 2.4452693462371826 + }, + { + "auxiliary_loss_clip": 0.01055849, + "auxiliary_loss_mlp": 0.01025329, + "balance_loss_clip": 1.01451731, + "balance_loss_mlp": 1.01763868, + "epoch": 0.7011573726138585, + "flos": 31389885072000.0, + "grad_norm": 1.6945447400196083, + "language_loss": 0.79065454, + "learning_rate": 8.186294514162897e-07, + "loss": 0.81146634, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3828125, + "step": 11662, + "time_per_iteration": 2.489665985107422 + }, + { + "auxiliary_loss_clip": 0.01058438, + "auxiliary_loss_mlp": 0.01019304, + "balance_loss_clip": 1.00733542, + "balance_loss_mlp": 1.0180589, + "epoch": 0.7012174958665264, + "flos": 18149598017280.0, + "grad_norm": 2.3817366382748375, + "language_loss": 0.83468956, + "learning_rate": 8.183246068902113e-07, + "loss": 0.85546696, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40429688, + "step": 11663, + "time_per_iteration": 2.364264965057373 + }, + { + "auxiliary_loss_clip": 0.01007255, + "auxiliary_loss_mlp": 0.01000172, + "balance_loss_clip": 0.99918288, + "balance_loss_mlp": 1.00091457, + "epoch": 0.7012776191191944, + "flos": 60648216773760.0, + "grad_norm": 0.8120725568524961, + "language_loss": 0.53730357, + "learning_rate": 8.180198045350864e-07, + "loss": 0.55737782, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.06347656, + "step": 11664, + "time_per_iteration": 3.0251355171203613 + }, + { + "auxiliary_loss_clip": 0.01059482, + "auxiliary_loss_mlp": 0.01025088, + "balance_loss_clip": 1.01257694, + "balance_loss_mlp": 1.01925647, + "epoch": 0.7013377423718623, + "flos": 27197374700160.0, + "grad_norm": 1.815814775733652, + "language_loss": 0.7938056, + "learning_rate": 8.17715044361795e-07, + "loss": 0.81465131, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40234375, + "step": 11665, + "time_per_iteration": 2.4308278560638428 + }, + { + "auxiliary_loss_clip": 0.01055598, + "auxiliary_loss_mlp": 0.01020498, + "balance_loss_clip": 1.00946474, + "balance_loss_mlp": 1.01731753, + "epoch": 0.7013978656245303, + "flos": 16542650736000.0, + "grad_norm": 2.182858156690661, + "language_loss": 0.77408457, + "learning_rate": 8.174103263812124e-07, + "loss": 0.79484546, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 11666, + "time_per_iteration": 2.3587749004364014 + }, + { + "auxiliary_loss_clip": 0.010584, + "auxiliary_loss_mlp": 0.01024706, + "balance_loss_clip": 1.01299322, + "balance_loss_mlp": 1.02003336, + "epoch": 0.7014579888771982, + "flos": 23142806576640.0, + "grad_norm": 8.291228861499924, + "language_loss": 0.80922097, + "learning_rate": 8.171056506042135e-07, + "loss": 0.83005202, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38476562, + "step": 11667, + "time_per_iteration": 2.3977580070495605 + }, + { + "auxiliary_loss_clip": 0.01057181, + "auxiliary_loss_mlp": 0.01023477, + "balance_loss_clip": 1.01138353, + "balance_loss_mlp": 1.018255, + "epoch": 0.7015181121298663, + "flos": 25080939815040.0, + "grad_norm": 2.005067915214569, + "language_loss": 0.65662819, + "learning_rate": 8.168010170416704e-07, + "loss": 0.6774348, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.390625, + "step": 11668, + "time_per_iteration": 3.8341422080993652 + }, + { + "auxiliary_loss_clip": 0.01059538, + "auxiliary_loss_mlp": 0.0102091, + "balance_loss_clip": 1.00854766, + "balance_loss_mlp": 1.01882172, + "epoch": 0.7015782353825342, + "flos": 23326868597760.0, + "grad_norm": 1.8461898087077944, + "language_loss": 0.7668829, + "learning_rate": 8.164964257044569e-07, + "loss": 0.78768736, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40625, + "step": 11669, + "time_per_iteration": 2.41625714302063 + }, + { + "auxiliary_loss_clip": 0.01007384, + "auxiliary_loss_mlp": 0.01001706, + "balance_loss_clip": 1.00080025, + "balance_loss_mlp": 1.0009985, + "epoch": 0.7016383586352022, + "flos": 70393732444800.0, + "grad_norm": 0.6879596114489489, + "language_loss": 0.54458737, + "learning_rate": 8.161918766034408e-07, + "loss": 0.56467831, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06396484, + "step": 11670, + "time_per_iteration": 3.1956474781036377 + }, + { + "auxiliary_loss_clip": 0.01058872, + "auxiliary_loss_mlp": 0.01026305, + "balance_loss_clip": 1.01414013, + "balance_loss_mlp": 1.01978433, + "epoch": 0.7016984818878701, + "flos": 19791249056640.0, + "grad_norm": 2.7751009174847336, + "language_loss": 0.81564295, + "learning_rate": 8.158873697494908e-07, + "loss": 0.83649468, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 11671, + "time_per_iteration": 2.429922580718994 + }, + { + "auxiliary_loss_clip": 0.01059636, + "auxiliary_loss_mlp": 0.01023423, + "balance_loss_clip": 1.01178837, + "balance_loss_mlp": 1.02046371, + "epoch": 0.7017586051405381, + "flos": 12896077294080.0, + "grad_norm": 2.9787747069331645, + "language_loss": 0.74751663, + "learning_rate": 8.155829051534753e-07, + "loss": 0.76834726, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 11672, + "time_per_iteration": 2.4162371158599854 + }, + { + "auxiliary_loss_clip": 0.01056661, + "auxiliary_loss_mlp": 0.01022328, + "balance_loss_clip": 1.01083004, + "balance_loss_mlp": 1.0181644, + "epoch": 0.701818728393206, + "flos": 18331844647680.0, + "grad_norm": 1.7759768783521768, + "language_loss": 0.87001264, + "learning_rate": 8.152784828262593e-07, + "loss": 0.8908025, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38476562, + "step": 11673, + "time_per_iteration": 2.3732597827911377 + }, + { + "auxiliary_loss_clip": 0.01055786, + "auxiliary_loss_mlp": 0.01023967, + "balance_loss_clip": 1.01251674, + "balance_loss_mlp": 1.01857269, + "epoch": 0.701878851645874, + "flos": 17383254474240.0, + "grad_norm": 6.333678278373648, + "language_loss": 0.73086643, + "learning_rate": 8.149741027787069e-07, + "loss": 0.75166392, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37304688, + "step": 11674, + "time_per_iteration": 2.4741461277008057 + }, + { + "auxiliary_loss_clip": 0.01055805, + "auxiliary_loss_mlp": 0.01023875, + "balance_loss_clip": 1.0126816, + "balance_loss_mlp": 1.01961231, + "epoch": 0.701938974898542, + "flos": 23914351912320.0, + "grad_norm": 1.5136593746021167, + "language_loss": 0.71883702, + "learning_rate": 8.146697650216798e-07, + "loss": 0.7396338, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.36132812, + "step": 11675, + "time_per_iteration": 2.427698850631714 + }, + { + "auxiliary_loss_clip": 0.01056299, + "auxiliary_loss_mlp": 0.01023578, + "balance_loss_clip": 1.01141214, + "balance_loss_mlp": 1.01785469, + "epoch": 0.70199909815121, + "flos": 21794600448000.0, + "grad_norm": 2.2666902830251745, + "language_loss": 0.51088876, + "learning_rate": 8.143654695660412e-07, + "loss": 0.5316875, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38476562, + "step": 11676, + "time_per_iteration": 2.430854320526123 + }, + { + "auxiliary_loss_clip": 0.0100721, + "auxiliary_loss_mlp": 0.01000646, + "balance_loss_clip": 0.99972808, + "balance_loss_mlp": 1.00078869, + "epoch": 0.702059221403878, + "flos": 71711459089920.0, + "grad_norm": 0.8080827773174146, + "language_loss": 0.59199095, + "learning_rate": 8.140612164226475e-07, + "loss": 0.61206961, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.06445312, + "step": 11677, + "time_per_iteration": 4.516522407531738 + }, + { + "auxiliary_loss_clip": 0.01060021, + "auxiliary_loss_mlp": 0.01022242, + "balance_loss_clip": 1.00990343, + "balance_loss_mlp": 1.01883388, + "epoch": 0.7021193446565459, + "flos": 28109794838400.0, + "grad_norm": 2.463469266250139, + "language_loss": 0.62814736, + "learning_rate": 8.137570056023593e-07, + "loss": 0.64897001, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.41210938, + "step": 11678, + "time_per_iteration": 3.9457767009735107 + }, + { + "auxiliary_loss_clip": 0.0105569, + "auxiliary_loss_mlp": 0.01025454, + "balance_loss_clip": 1.01409936, + "balance_loss_mlp": 1.01810348, + "epoch": 0.7021794679092139, + "flos": 22923936063360.0, + "grad_norm": 1.5515572964579265, + "language_loss": 0.79432219, + "learning_rate": 8.134528371160321e-07, + "loss": 0.81513357, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37695312, + "step": 11679, + "time_per_iteration": 2.414508104324341 + }, + { + "auxiliary_loss_clip": 0.01057782, + "auxiliary_loss_mlp": 0.01024704, + "balance_loss_clip": 1.01388526, + "balance_loss_mlp": 1.020033, + "epoch": 0.7022395911618818, + "flos": 18076839010560.0, + "grad_norm": 2.3920961713544253, + "language_loss": 0.70832509, + "learning_rate": 8.131487109745212e-07, + "loss": 0.72914994, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37890625, + "step": 11680, + "time_per_iteration": 2.415681838989258 + }, + { + "auxiliary_loss_clip": 0.0106082, + "auxiliary_loss_mlp": 0.01027074, + "balance_loss_clip": 1.01380002, + "balance_loss_mlp": 1.01917493, + "epoch": 0.7022997144145499, + "flos": 16033372600320.0, + "grad_norm": 1.7025439725334346, + "language_loss": 0.74891877, + "learning_rate": 8.128446271886789e-07, + "loss": 0.76979774, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41601562, + "step": 11681, + "time_per_iteration": 2.3588805198669434 + }, + { + "auxiliary_loss_clip": 0.01057971, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_clip": 1.01370704, + "balance_loss_mlp": 1.01849008, + "epoch": 0.7023598376672178, + "flos": 26467480483200.0, + "grad_norm": 1.5535841060349485, + "language_loss": 0.76869464, + "learning_rate": 8.125405857693588e-07, + "loss": 0.78952563, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39453125, + "step": 11682, + "time_per_iteration": 2.454876184463501 + }, + { + "auxiliary_loss_clip": 0.01059958, + "auxiliary_loss_mlp": 0.01025253, + "balance_loss_clip": 1.01304591, + "balance_loss_mlp": 1.01924479, + "epoch": 0.7024199609198858, + "flos": 17054966160000.0, + "grad_norm": 3.112871190494291, + "language_loss": 0.73390323, + "learning_rate": 8.12236586727411e-07, + "loss": 0.75475532, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 11683, + "time_per_iteration": 2.4014554023742676 + }, + { + "auxiliary_loss_clip": 0.01057546, + "auxiliary_loss_mlp": 0.01024225, + "balance_loss_clip": 1.01157069, + "balance_loss_mlp": 1.01856351, + "epoch": 0.7024800841725537, + "flos": 25847841939840.0, + "grad_norm": 1.8218214891842237, + "language_loss": 0.71333826, + "learning_rate": 8.119326300736837e-07, + "loss": 0.73415595, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 11684, + "time_per_iteration": 2.451327323913574 + }, + { + "auxiliary_loss_clip": 0.01056908, + "auxiliary_loss_mlp": 0.01025306, + "balance_loss_clip": 1.01349807, + "balance_loss_mlp": 1.01848626, + "epoch": 0.7025402074252217, + "flos": 23511908136960.0, + "grad_norm": 2.0463898873282496, + "language_loss": 0.71874702, + "learning_rate": 8.116287158190251e-07, + "loss": 0.73956913, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38476562, + "step": 11685, + "time_per_iteration": 2.443833351135254 + }, + { + "auxiliary_loss_clip": 0.0105768, + "auxiliary_loss_mlp": 0.01026336, + "balance_loss_clip": 1.0143379, + "balance_loss_mlp": 1.01819444, + "epoch": 0.7026003306778896, + "flos": 20150121588480.0, + "grad_norm": 1.8586703061331382, + "language_loss": 0.84572327, + "learning_rate": 8.113248439742808e-07, + "loss": 0.86656344, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 11686, + "time_per_iteration": 3.874119997024536 + }, + { + "auxiliary_loss_clip": 0.01006927, + "auxiliary_loss_mlp": 0.01001527, + "balance_loss_clip": 1.00064445, + "balance_loss_mlp": 1.00052238, + "epoch": 0.7026604539305576, + "flos": 64769294770560.0, + "grad_norm": 0.9794289078310043, + "language_loss": 0.69937658, + "learning_rate": 8.110210145502949e-07, + "loss": 0.71946108, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.06445312, + "step": 11687, + "time_per_iteration": 3.2338755130767822 + }, + { + "auxiliary_loss_clip": 0.01058256, + "auxiliary_loss_mlp": 0.01024169, + "balance_loss_clip": 1.01295197, + "balance_loss_mlp": 1.01947165, + "epoch": 0.7027205771832256, + "flos": 21870396743040.0, + "grad_norm": 3.061255716842376, + "language_loss": 0.77096325, + "learning_rate": 8.107172275579099e-07, + "loss": 0.79178751, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38671875, + "step": 11688, + "time_per_iteration": 2.4081709384918213 + }, + { + "auxiliary_loss_clip": 0.01058099, + "auxiliary_loss_mlp": 0.01025672, + "balance_loss_clip": 1.01346469, + "balance_loss_mlp": 1.01857197, + "epoch": 0.7027807004358936, + "flos": 23366669414400.0, + "grad_norm": 2.161905898709337, + "language_loss": 0.68768662, + "learning_rate": 8.104134830079688e-07, + "loss": 0.70852435, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 11689, + "time_per_iteration": 2.403057336807251 + }, + { + "auxiliary_loss_clip": 0.01059727, + "auxiliary_loss_mlp": 0.01024723, + "balance_loss_clip": 1.0120331, + "balance_loss_mlp": 1.01881313, + "epoch": 0.7028408236885616, + "flos": 15303373649280.0, + "grad_norm": 2.6249717614980836, + "language_loss": 0.80153859, + "learning_rate": 8.101097809113105e-07, + "loss": 0.82238299, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 11690, + "time_per_iteration": 2.4251983165740967 + }, + { + "auxiliary_loss_clip": 0.01059726, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.0144062, + "balance_loss_mlp": 1.02041864, + "epoch": 0.7029009469412295, + "flos": 22017101742720.0, + "grad_norm": 1.667408319960402, + "language_loss": 0.76009262, + "learning_rate": 8.098061212787732e-07, + "loss": 0.78095591, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39257812, + "step": 11691, + "time_per_iteration": 2.4273617267608643 + }, + { + "auxiliary_loss_clip": 0.0105866, + "auxiliary_loss_mlp": 0.01022471, + "balance_loss_clip": 1.01171851, + "balance_loss_mlp": 1.02031279, + "epoch": 0.7029610701938975, + "flos": 21834436176000.0, + "grad_norm": 4.033384458454239, + "language_loss": 0.8222096, + "learning_rate": 8.095025041211932e-07, + "loss": 0.84302092, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.38476562, + "step": 11692, + "time_per_iteration": 2.466222047805786 + }, + { + "auxiliary_loss_clip": 0.01056907, + "auxiliary_loss_mlp": 0.01022194, + "balance_loss_clip": 1.00984383, + "balance_loss_mlp": 1.0182873, + "epoch": 0.7030211934465654, + "flos": 19134637516800.0, + "grad_norm": 1.610166003822813, + "language_loss": 0.76214421, + "learning_rate": 8.091989294494079e-07, + "loss": 0.78293526, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38671875, + "step": 11693, + "time_per_iteration": 2.456744432449341 + }, + { + "auxiliary_loss_clip": 0.010592, + "auxiliary_loss_mlp": 0.01023788, + "balance_loss_clip": 1.01243913, + "balance_loss_mlp": 1.02066171, + "epoch": 0.7030813166992335, + "flos": 38544461303040.0, + "grad_norm": 1.3955066610938445, + "language_loss": 0.6831044, + "learning_rate": 8.088953972742482e-07, + "loss": 0.70393431, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38476562, + "step": 11694, + "time_per_iteration": 2.6222243309020996 + }, + { + "auxiliary_loss_clip": 0.01057254, + "auxiliary_loss_mlp": 0.0102336, + "balance_loss_clip": 1.01118255, + "balance_loss_mlp": 1.01793456, + "epoch": 0.7031414399519014, + "flos": 14720009875200.0, + "grad_norm": 2.169317768893043, + "language_loss": 0.75874126, + "learning_rate": 8.085919076065488e-07, + "loss": 0.77954739, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 11695, + "time_per_iteration": 2.422978401184082 + }, + { + "auxiliary_loss_clip": 0.01061944, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.01817477, + "balance_loss_mlp": 1.02041149, + "epoch": 0.7032015632045694, + "flos": 14026390427520.0, + "grad_norm": 2.0584110655596017, + "language_loss": 0.73917824, + "learning_rate": 8.082884604571394e-07, + "loss": 0.76011008, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 11696, + "time_per_iteration": 2.4759674072265625 + }, + { + "auxiliary_loss_clip": 0.01058177, + "auxiliary_loss_mlp": 0.01025341, + "balance_loss_clip": 1.01322365, + "balance_loss_mlp": 1.0186671, + "epoch": 0.7032616864572373, + "flos": 27635918688000.0, + "grad_norm": 1.5921399707515425, + "language_loss": 0.75306463, + "learning_rate": 8.079850558368495e-07, + "loss": 0.77389979, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 11697, + "time_per_iteration": 2.472139835357666 + }, + { + "auxiliary_loss_clip": 0.0106196, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.01936328, + "balance_loss_mlp": 1.02075613, + "epoch": 0.7033218097099053, + "flos": 17966338957440.0, + "grad_norm": 2.7076881592877804, + "language_loss": 0.80982757, + "learning_rate": 8.076816937565061e-07, + "loss": 0.83077759, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.41015625, + "step": 11698, + "time_per_iteration": 2.4816761016845703 + }, + { + "auxiliary_loss_clip": 0.01058664, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.01530159, + "balance_loss_mlp": 1.0182035, + "epoch": 0.7033819329625732, + "flos": 19500666877440.0, + "grad_norm": 1.4604608204078975, + "language_loss": 0.8492465, + "learning_rate": 8.073783742269364e-07, + "loss": 0.8701129, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 11699, + "time_per_iteration": 2.4438514709472656 + }, + { + "auxiliary_loss_clip": 0.01059585, + "auxiliary_loss_mlp": 0.01021731, + "balance_loss_clip": 1.00887418, + "balance_loss_mlp": 1.01958549, + "epoch": 0.7034420562152413, + "flos": 23986517425920.0, + "grad_norm": 2.1003336354665096, + "language_loss": 0.78967643, + "learning_rate": 8.070750972589658e-07, + "loss": 0.81048959, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3984375, + "step": 11700, + "time_per_iteration": 2.4672563076019287 + }, + { + "auxiliary_loss_clip": 0.01057684, + "auxiliary_loss_mlp": 0.01023216, + "balance_loss_clip": 1.01091957, + "balance_loss_mlp": 1.0182569, + "epoch": 0.7035021794679092, + "flos": 35041974508800.0, + "grad_norm": 1.69629125405954, + "language_loss": 0.74656367, + "learning_rate": 8.067718628634148e-07, + "loss": 0.76737273, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 11701, + "time_per_iteration": 2.530568838119507 + }, + { + "auxiliary_loss_clip": 0.01060303, + "auxiliary_loss_mlp": 0.01025217, + "balance_loss_clip": 1.01228309, + "balance_loss_mlp": 1.01952004, + "epoch": 0.7035623027205772, + "flos": 10996697531520.0, + "grad_norm": 2.0985962201883313, + "language_loss": 0.840523, + "learning_rate": 8.064686710511075e-07, + "loss": 0.86137819, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40820312, + "step": 11702, + "time_per_iteration": 2.3910021781921387 + }, + { + "auxiliary_loss_clip": 0.01057594, + "auxiliary_loss_mlp": 0.01025114, + "balance_loss_clip": 1.01352727, + "balance_loss_mlp": 1.01834023, + "epoch": 0.7036224259732452, + "flos": 23622582746880.0, + "grad_norm": 1.8103986544853627, + "language_loss": 0.60812473, + "learning_rate": 8.061655218328631e-07, + "loss": 0.62895185, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 11703, + "time_per_iteration": 2.4149158000946045 + }, + { + "auxiliary_loss_clip": 0.01058105, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.0160718, + "balance_loss_mlp": 1.01786733, + "epoch": 0.7036825492259131, + "flos": 31684831171200.0, + "grad_norm": 2.3374450581612267, + "language_loss": 0.61164498, + "learning_rate": 8.058624152195003e-07, + "loss": 0.6325168, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40234375, + "step": 11704, + "time_per_iteration": 2.489269256591797 + }, + { + "auxiliary_loss_clip": 0.01054726, + "auxiliary_loss_mlp": 0.01025587, + "balance_loss_clip": 1.01541281, + "balance_loss_mlp": 1.01808023, + "epoch": 0.7037426724785811, + "flos": 30191491054080.0, + "grad_norm": 1.5679892363578927, + "language_loss": 0.7072345, + "learning_rate": 8.055593512218357e-07, + "loss": 0.72803766, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.3671875, + "step": 11705, + "time_per_iteration": 2.473233222961426 + }, + { + "auxiliary_loss_clip": 0.01057374, + "auxiliary_loss_mlp": 0.01025462, + "balance_loss_clip": 1.01422036, + "balance_loss_mlp": 1.01880515, + "epoch": 0.703802795731249, + "flos": 24310511642880.0, + "grad_norm": 1.9602960227288642, + "language_loss": 0.61532426, + "learning_rate": 8.052563298506858e-07, + "loss": 0.63615257, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38476562, + "step": 11706, + "time_per_iteration": 2.4576711654663086 + }, + { + "auxiliary_loss_clip": 0.01058586, + "auxiliary_loss_mlp": 0.01025506, + "balance_loss_clip": 1.01380014, + "balance_loss_mlp": 1.0182718, + "epoch": 0.7038629189839171, + "flos": 22527846155520.0, + "grad_norm": 2.025739967195442, + "language_loss": 0.67800093, + "learning_rate": 8.049533511168645e-07, + "loss": 0.69884187, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40234375, + "step": 11707, + "time_per_iteration": 3.895209789276123 + }, + { + "auxiliary_loss_clip": 0.01054742, + "auxiliary_loss_mlp": 0.0102334, + "balance_loss_clip": 1.01228905, + "balance_loss_mlp": 1.01806402, + "epoch": 0.703923042236585, + "flos": 26249273285760.0, + "grad_norm": 1.672270137494906, + "language_loss": 0.76133448, + "learning_rate": 8.04650415031184e-07, + "loss": 0.78211534, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3671875, + "step": 11708, + "time_per_iteration": 2.4830853939056396 + }, + { + "auxiliary_loss_clip": 0.01056839, + "auxiliary_loss_mlp": 0.01021646, + "balance_loss_clip": 1.01045203, + "balance_loss_mlp": 1.01892042, + "epoch": 0.703983165489253, + "flos": 19389259128960.0, + "grad_norm": 1.7703723540544798, + "language_loss": 0.72535074, + "learning_rate": 8.043475216044547e-07, + "loss": 0.74613559, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37890625, + "step": 11709, + "time_per_iteration": 2.4030232429504395 + }, + { + "auxiliary_loss_clip": 0.01055873, + "auxiliary_loss_mlp": 0.01023103, + "balance_loss_clip": 1.01147389, + "balance_loss_mlp": 1.01790106, + "epoch": 0.7040432887419209, + "flos": 16682897134080.0, + "grad_norm": 2.113884830036197, + "language_loss": 0.79060447, + "learning_rate": 8.040446708474879e-07, + "loss": 0.81139421, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38085938, + "step": 11710, + "time_per_iteration": 2.3706603050231934 + }, + { + "auxiliary_loss_clip": 0.0105932, + "auxiliary_loss_mlp": 0.01026631, + "balance_loss_clip": 1.01448965, + "balance_loss_mlp": 1.01921475, + "epoch": 0.7041034119945889, + "flos": 21140362880640.0, + "grad_norm": 1.5359516987690942, + "language_loss": 0.77375418, + "learning_rate": 8.037418627710892e-07, + "loss": 0.79461372, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11711, + "time_per_iteration": 2.4435079097747803 + }, + { + "auxiliary_loss_clip": 0.01054864, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.01405716, + "balance_loss_mlp": 1.017537, + "epoch": 0.7041635352472568, + "flos": 16909343412480.0, + "grad_norm": 1.8922948112183666, + "language_loss": 0.74360901, + "learning_rate": 8.034390973860672e-07, + "loss": 0.76440263, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.37304688, + "step": 11712, + "time_per_iteration": 2.4116017818450928 + }, + { + "auxiliary_loss_clip": 0.01007284, + "auxiliary_loss_mlp": 0.01001963, + "balance_loss_clip": 1.00108683, + "balance_loss_mlp": 1.00077617, + "epoch": 0.7042236584999249, + "flos": 71677558293120.0, + "grad_norm": 0.8790638592938387, + "language_loss": 0.64635074, + "learning_rate": 8.031363747032256e-07, + "loss": 0.66644311, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06542969, + "step": 11713, + "time_per_iteration": 3.117393732070923 + }, + { + "auxiliary_loss_clip": 0.01056677, + "auxiliary_loss_mlp": 0.01020842, + "balance_loss_clip": 1.00995278, + "balance_loss_mlp": 1.01850438, + "epoch": 0.7042837817525928, + "flos": 28656918754560.0, + "grad_norm": 2.061485027554399, + "language_loss": 0.72662503, + "learning_rate": 8.028336947333682e-07, + "loss": 0.74740016, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.38085938, + "step": 11714, + "time_per_iteration": 2.477229356765747 + }, + { + "auxiliary_loss_clip": 0.01057599, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01554561, + "balance_loss_mlp": 1.01883292, + "epoch": 0.7043439050052608, + "flos": 19752600314880.0, + "grad_norm": 3.2663672084137514, + "language_loss": 0.79603159, + "learning_rate": 8.025310574872967e-07, + "loss": 0.81688273, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38671875, + "step": 11715, + "time_per_iteration": 2.4157824516296387 + }, + { + "auxiliary_loss_clip": 0.01059082, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.01831019, + "balance_loss_mlp": 1.01822543, + "epoch": 0.7044040282579288, + "flos": 11537956339200.0, + "grad_norm": 2.1083372847611566, + "language_loss": 0.71411812, + "learning_rate": 8.022284629758109e-07, + "loss": 0.7350139, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40820312, + "step": 11716, + "time_per_iteration": 2.369102954864502 + }, + { + "auxiliary_loss_clip": 0.01060451, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.01679718, + "balance_loss_mlp": 1.01972735, + "epoch": 0.7044641515105967, + "flos": 33654735613440.0, + "grad_norm": 1.914242810746655, + "language_loss": 0.6626581, + "learning_rate": 8.019259112097117e-07, + "loss": 0.68354344, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.40625, + "step": 11717, + "time_per_iteration": 5.341396808624268 + }, + { + "auxiliary_loss_clip": 0.01058001, + "auxiliary_loss_mlp": 0.01023602, + "balance_loss_clip": 1.01253963, + "balance_loss_mlp": 1.01864791, + "epoch": 0.7045242747632647, + "flos": 26722660677120.0, + "grad_norm": 1.4033331401746734, + "language_loss": 0.74151385, + "learning_rate": 8.016234021997934e-07, + "loss": 0.76232994, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.39453125, + "step": 11718, + "time_per_iteration": 2.4645094871520996 + }, + { + "auxiliary_loss_clip": 0.01057245, + "auxiliary_loss_mlp": 0.01022847, + "balance_loss_clip": 1.01192117, + "balance_loss_mlp": 1.01824188, + "epoch": 0.7045843980159326, + "flos": 26796432113280.0, + "grad_norm": 1.7875949544570189, + "language_loss": 0.79913223, + "learning_rate": 8.01320935956854e-07, + "loss": 0.81993318, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.390625, + "step": 11719, + "time_per_iteration": 2.4874343872070312 + }, + { + "auxiliary_loss_clip": 0.01058016, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.01609612, + "balance_loss_mlp": 1.019732, + "epoch": 0.7046445212686007, + "flos": 41573176680960.0, + "grad_norm": 3.180911341024504, + "language_loss": 0.65584701, + "learning_rate": 8.010185124916868e-07, + "loss": 0.67671096, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3828125, + "step": 11720, + "time_per_iteration": 2.5910277366638184 + }, + { + "auxiliary_loss_clip": 0.01058976, + "auxiliary_loss_mlp": 0.0102492, + "balance_loss_clip": 1.01248097, + "balance_loss_mlp": 1.01948559, + "epoch": 0.7047046445212686, + "flos": 15559252070400.0, + "grad_norm": 3.8782516697385643, + "language_loss": 0.80045795, + "learning_rate": 8.007161318150851e-07, + "loss": 0.82129693, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39453125, + "step": 11721, + "time_per_iteration": 2.3903748989105225 + }, + { + "auxiliary_loss_clip": 0.01007382, + "auxiliary_loss_mlp": 0.01003894, + "balance_loss_clip": 1.00291634, + "balance_loss_mlp": 1.00097466, + "epoch": 0.7047647677739366, + "flos": 70406475091200.0, + "grad_norm": 0.8001737988307572, + "language_loss": 0.64804059, + "learning_rate": 8.004137939378388e-07, + "loss": 0.66815335, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.06445312, + "step": 11722, + "time_per_iteration": 3.158966541290283 + }, + { + "auxiliary_loss_clip": 0.01057828, + "auxiliary_loss_mlp": 0.01023248, + "balance_loss_clip": 1.01211429, + "balance_loss_mlp": 1.01976717, + "epoch": 0.7048248910266045, + "flos": 23658892427520.0, + "grad_norm": 1.5981278140965238, + "language_loss": 0.65985906, + "learning_rate": 8.0011149887074e-07, + "loss": 0.68066978, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38085938, + "step": 11723, + "time_per_iteration": 2.443753480911255 + }, + { + "auxiliary_loss_clip": 0.01058914, + "auxiliary_loss_mlp": 0.0102434, + "balance_loss_clip": 1.01216316, + "balance_loss_mlp": 1.01912677, + "epoch": 0.7048850142792725, + "flos": 21396101656320.0, + "grad_norm": 1.7223846647209855, + "language_loss": 0.8128733, + "learning_rate": 7.998092466245739e-07, + "loss": 0.83370584, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3984375, + "step": 11724, + "time_per_iteration": 2.4478919506073 + }, + { + "auxiliary_loss_clip": 0.01058081, + "auxiliary_loss_mlp": 0.01026219, + "balance_loss_clip": 1.0141964, + "balance_loss_mlp": 1.01819348, + "epoch": 0.7049451375319404, + "flos": 21647162309760.0, + "grad_norm": 1.5875890147503509, + "language_loss": 0.73559201, + "learning_rate": 7.995070372101291e-07, + "loss": 0.75643492, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 11725, + "time_per_iteration": 3.939319372177124 + }, + { + "auxiliary_loss_clip": 0.01058996, + "auxiliary_loss_mlp": 0.01023017, + "balance_loss_clip": 1.0103513, + "balance_loss_mlp": 1.01910591, + "epoch": 0.7050052607846085, + "flos": 14865911913600.0, + "grad_norm": 3.111623327679327, + "language_loss": 0.78655797, + "learning_rate": 7.992048706381896e-07, + "loss": 0.80737811, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 11726, + "time_per_iteration": 2.387848138809204 + }, + { + "auxiliary_loss_clip": 0.01058716, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.01580465, + "balance_loss_mlp": 1.01921368, + "epoch": 0.7050653840372764, + "flos": 19240843472640.0, + "grad_norm": 1.8060992903280317, + "language_loss": 0.70438051, + "learning_rate": 7.989027469195409e-07, + "loss": 0.72523755, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.39453125, + "step": 11727, + "time_per_iteration": 2.3719289302825928 + }, + { + "auxiliary_loss_clip": 0.01053118, + "auxiliary_loss_mlp": 0.01022142, + "balance_loss_clip": 1.01143146, + "balance_loss_mlp": 1.01776659, + "epoch": 0.7051255072899444, + "flos": 27779237285760.0, + "grad_norm": 1.5843062831750527, + "language_loss": 0.69039178, + "learning_rate": 7.98600666064962e-07, + "loss": 0.71114433, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.35351562, + "step": 11728, + "time_per_iteration": 2.49446702003479 + }, + { + "auxiliary_loss_clip": 0.01057877, + "auxiliary_loss_mlp": 0.01023069, + "balance_loss_clip": 1.01167274, + "balance_loss_mlp": 1.01956928, + "epoch": 0.7051856305426124, + "flos": 27890784679680.0, + "grad_norm": 4.081289045117951, + "language_loss": 0.73329878, + "learning_rate": 7.982986280852355e-07, + "loss": 0.75410831, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3828125, + "step": 11729, + "time_per_iteration": 2.439624786376953 + }, + { + "auxiliary_loss_clip": 0.01060264, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.01600266, + "balance_loss_mlp": 1.01893556, + "epoch": 0.7052457537952803, + "flos": 25042465630080.0, + "grad_norm": 1.664567384359027, + "language_loss": 0.78424501, + "learning_rate": 7.97996632991141e-07, + "loss": 0.80513036, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.4140625, + "step": 11730, + "time_per_iteration": 2.4403135776519775 + }, + { + "auxiliary_loss_clip": 0.01059172, + "auxiliary_loss_mlp": 0.01025061, + "balance_loss_clip": 1.01275253, + "balance_loss_mlp": 1.01904237, + "epoch": 0.7053058770479483, + "flos": 21870641122560.0, + "grad_norm": 2.2199667578963576, + "language_loss": 0.63405359, + "learning_rate": 7.976946807934528e-07, + "loss": 0.6548959, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40039062, + "step": 11731, + "time_per_iteration": 2.4280858039855957 + }, + { + "auxiliary_loss_clip": 0.01059219, + "auxiliary_loss_mlp": 0.01023449, + "balance_loss_clip": 1.01131988, + "balance_loss_mlp": 1.0189079, + "epoch": 0.7053660003006162, + "flos": 16397796038400.0, + "grad_norm": 1.8714766985098146, + "language_loss": 0.74558681, + "learning_rate": 7.973927715029499e-07, + "loss": 0.76641357, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11732, + "time_per_iteration": 2.4224681854248047 + }, + { + "auxiliary_loss_clip": 0.01057258, + "auxiliary_loss_mlp": 0.01021853, + "balance_loss_clip": 1.01052165, + "balance_loss_mlp": 1.01914859, + "epoch": 0.7054261235532843, + "flos": 22710441899520.0, + "grad_norm": 1.4474669029095186, + "language_loss": 0.71165657, + "learning_rate": 7.970909051304044e-07, + "loss": 0.73244774, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38085938, + "step": 11733, + "time_per_iteration": 2.425022602081299 + }, + { + "auxiliary_loss_clip": 0.01056895, + "auxiliary_loss_mlp": 0.01023464, + "balance_loss_clip": 1.01166821, + "balance_loss_mlp": 1.01829171, + "epoch": 0.7054862468059522, + "flos": 13588858869120.0, + "grad_norm": 1.8947355423732963, + "language_loss": 0.73662299, + "learning_rate": 7.967890816865921e-07, + "loss": 0.75742662, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38476562, + "step": 11734, + "time_per_iteration": 2.3793718814849854 + }, + { + "auxiliary_loss_clip": 0.01058969, + "auxiliary_loss_mlp": 0.01027433, + "balance_loss_clip": 1.01481497, + "balance_loss_mlp": 1.01869202, + "epoch": 0.7055463700586202, + "flos": 15879999530880.0, + "grad_norm": 2.4167545100676033, + "language_loss": 0.69380015, + "learning_rate": 7.964873011822808e-07, + "loss": 0.71466416, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 11735, + "time_per_iteration": 2.3652937412261963 + }, + { + "auxiliary_loss_clip": 0.01059051, + "auxiliary_loss_mlp": 0.01024562, + "balance_loss_clip": 1.01140785, + "balance_loss_mlp": 1.01882625, + "epoch": 0.7056064933112881, + "flos": 23075039894400.0, + "grad_norm": 2.6568313655366813, + "language_loss": 0.72308069, + "learning_rate": 7.961855636282427e-07, + "loss": 0.74391687, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40234375, + "step": 11736, + "time_per_iteration": 2.4763498306274414 + }, + { + "auxiliary_loss_clip": 0.0105575, + "auxiliary_loss_mlp": 0.01020594, + "balance_loss_clip": 1.01007986, + "balance_loss_mlp": 1.01864398, + "epoch": 0.7056666165639561, + "flos": 24056134410240.0, + "grad_norm": 2.2745743766422093, + "language_loss": 0.69229615, + "learning_rate": 7.958838690352449e-07, + "loss": 0.71305954, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.37109375, + "step": 11737, + "time_per_iteration": 2.4512977600097656 + }, + { + "auxiliary_loss_clip": 0.01061413, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.01613998, + "balance_loss_mlp": 1.02031159, + "epoch": 0.705726739816624, + "flos": 17492288250240.0, + "grad_norm": 2.1273872966313037, + "language_loss": 0.67464185, + "learning_rate": 7.955822174140549e-07, + "loss": 0.69554889, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41015625, + "step": 11738, + "time_per_iteration": 2.432893991470337 + }, + { + "auxiliary_loss_clip": 0.01057096, + "auxiliary_loss_mlp": 0.01026792, + "balance_loss_clip": 1.01371443, + "balance_loss_mlp": 1.01833391, + "epoch": 0.7057868630692921, + "flos": 51348578342400.0, + "grad_norm": 2.762339606847065, + "language_loss": 0.75239825, + "learning_rate": 7.952806087754364e-07, + "loss": 0.77323711, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.38671875, + "step": 11739, + "time_per_iteration": 2.679354667663574 + }, + { + "auxiliary_loss_clip": 0.01055845, + "auxiliary_loss_mlp": 0.01022911, + "balance_loss_clip": 1.01156271, + "balance_loss_mlp": 1.01831412, + "epoch": 0.70584698632196, + "flos": 26101800236160.0, + "grad_norm": 1.6086141220536552, + "language_loss": 0.71203315, + "learning_rate": 7.949790431301557e-07, + "loss": 0.73282069, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 11740, + "time_per_iteration": 2.446223735809326 + }, + { + "auxiliary_loss_clip": 0.01057331, + "auxiliary_loss_mlp": 0.01022697, + "balance_loss_clip": 1.01185489, + "balance_loss_mlp": 1.01911962, + "epoch": 0.705907109574628, + "flos": 21542073517440.0, + "grad_norm": 1.8607480440707056, + "language_loss": 0.8218894, + "learning_rate": 7.94677520488972e-07, + "loss": 0.84268969, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3828125, + "step": 11741, + "time_per_iteration": 2.4016988277435303 + }, + { + "auxiliary_loss_clip": 0.01057713, + "auxiliary_loss_mlp": 0.01024077, + "balance_loss_clip": 1.01225758, + "balance_loss_mlp": 1.01803887, + "epoch": 0.705967232827296, + "flos": 22709743672320.0, + "grad_norm": 24.064672200781168, + "language_loss": 0.7802282, + "learning_rate": 7.943760408626474e-07, + "loss": 0.80104613, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39648438, + "step": 11742, + "time_per_iteration": 2.4103751182556152 + }, + { + "auxiliary_loss_clip": 0.01060035, + "auxiliary_loss_mlp": 0.0102509, + "balance_loss_clip": 1.01278782, + "balance_loss_mlp": 1.01926112, + "epoch": 0.7060273560799639, + "flos": 28690051501440.0, + "grad_norm": 1.4582462264491378, + "language_loss": 0.64180779, + "learning_rate": 7.940746042619404e-07, + "loss": 0.66265905, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 11743, + "time_per_iteration": 2.452486753463745 + }, + { + "auxiliary_loss_clip": 0.01059479, + "auxiliary_loss_mlp": 0.01022476, + "balance_loss_clip": 1.0094583, + "balance_loss_mlp": 1.01893663, + "epoch": 0.7060874793326319, + "flos": 15705258842880.0, + "grad_norm": 3.0724829819986006, + "language_loss": 0.65943354, + "learning_rate": 7.937732106976098e-07, + "loss": 0.68025309, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 11744, + "time_per_iteration": 2.403808832168579 + }, + { + "auxiliary_loss_clip": 0.01056215, + "auxiliary_loss_mlp": 0.01024634, + "balance_loss_clip": 1.01262331, + "balance_loss_mlp": 1.0183239, + "epoch": 0.7061476025852998, + "flos": 21505694014080.0, + "grad_norm": 2.7183139590945244, + "language_loss": 0.64125657, + "learning_rate": 7.9347186018041e-07, + "loss": 0.66206509, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37890625, + "step": 11745, + "time_per_iteration": 2.382481098175049 + }, + { + "auxiliary_loss_clip": 0.01059297, + "auxiliary_loss_mlp": 0.01024869, + "balance_loss_clip": 1.01325226, + "balance_loss_mlp": 1.02003467, + "epoch": 0.7062077258379679, + "flos": 28180633720320.0, + "grad_norm": 1.7935357280348954, + "language_loss": 0.71581459, + "learning_rate": 7.931705527210952e-07, + "loss": 0.73665625, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 11746, + "time_per_iteration": 3.86710524559021 + }, + { + "auxiliary_loss_clip": 0.01058741, + "auxiliary_loss_mlp": 0.01025778, + "balance_loss_clip": 1.01365995, + "balance_loss_mlp": 1.01963568, + "epoch": 0.7062678490906358, + "flos": 27852485051520.0, + "grad_norm": 1.3626177189816981, + "language_loss": 0.7948463, + "learning_rate": 7.928692883304206e-07, + "loss": 0.81569147, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 11747, + "time_per_iteration": 2.430205821990967 + }, + { + "auxiliary_loss_clip": 0.01056609, + "auxiliary_loss_mlp": 0.01023998, + "balance_loss_clip": 1.01289368, + "balance_loss_mlp": 1.0193305, + "epoch": 0.7063279723433038, + "flos": 23183759468160.0, + "grad_norm": 4.216189905535511, + "language_loss": 0.65051174, + "learning_rate": 7.925680670191344e-07, + "loss": 0.67131782, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37304688, + "step": 11748, + "time_per_iteration": 2.407656192779541 + }, + { + "auxiliary_loss_clip": 0.01057706, + "auxiliary_loss_mlp": 0.01026631, + "balance_loss_clip": 1.01425695, + "balance_loss_mlp": 1.01800644, + "epoch": 0.7063880955959717, + "flos": 20187757900800.0, + "grad_norm": 1.8032026864713744, + "language_loss": 0.79449111, + "learning_rate": 7.922668887979889e-07, + "loss": 0.81533444, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 11749, + "time_per_iteration": 2.3881402015686035 + }, + { + "auxiliary_loss_clip": 0.01007447, + "auxiliary_loss_mlp": 0.01003828, + "balance_loss_clip": 1.00287437, + "balance_loss_mlp": 1.00100875, + "epoch": 0.7064482188486397, + "flos": 63665376491520.0, + "grad_norm": 0.7850226349497644, + "language_loss": 0.5401029, + "learning_rate": 7.919657536777304e-07, + "loss": 0.56021565, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.06445312, + "step": 11750, + "time_per_iteration": 3.0080296993255615 + }, + { + "auxiliary_loss_clip": 0.01058123, + "auxiliary_loss_mlp": 0.01024855, + "balance_loss_clip": 1.01271355, + "balance_loss_mlp": 1.01963782, + "epoch": 0.7065083421013076, + "flos": 25190078325120.0, + "grad_norm": 2.0028687400537994, + "language_loss": 0.79031372, + "learning_rate": 7.916646616691085e-07, + "loss": 0.81114346, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38476562, + "step": 11751, + "time_per_iteration": 2.43397855758667 + }, + { + "auxiliary_loss_clip": 0.01053798, + "auxiliary_loss_mlp": 0.01021846, + "balance_loss_clip": 1.01093233, + "balance_loss_mlp": 1.017488, + "epoch": 0.7065684653539757, + "flos": 22892583795840.0, + "grad_norm": 1.6549456263849047, + "language_loss": 0.67745888, + "learning_rate": 7.913636127828651e-07, + "loss": 0.69821537, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36328125, + "step": 11752, + "time_per_iteration": 2.438093662261963 + }, + { + "auxiliary_loss_clip": 0.01057199, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.01613665, + "balance_loss_mlp": 1.01942742, + "epoch": 0.7066285886066436, + "flos": 23549125512960.0, + "grad_norm": 2.050474879054968, + "language_loss": 0.76294184, + "learning_rate": 7.910626070297461e-07, + "loss": 0.78379321, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 11753, + "time_per_iteration": 2.425234794616699 + }, + { + "auxiliary_loss_clip": 0.01056, + "auxiliary_loss_mlp": 0.01023321, + "balance_loss_clip": 1.01165056, + "balance_loss_mlp": 1.01867294, + "epoch": 0.7066887118593116, + "flos": 21068232278400.0, + "grad_norm": 1.7286594469491827, + "language_loss": 0.71030396, + "learning_rate": 7.907616444204928e-07, + "loss": 0.73109716, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37304688, + "step": 11754, + "time_per_iteration": 2.426517963409424 + }, + { + "auxiliary_loss_clip": 0.01056996, + "auxiliary_loss_mlp": 0.01026493, + "balance_loss_clip": 1.01481652, + "balance_loss_mlp": 1.01806664, + "epoch": 0.7067488351119796, + "flos": 21175311018240.0, + "grad_norm": 1.8998645570549175, + "language_loss": 0.89292479, + "learning_rate": 7.90460724965846e-07, + "loss": 0.91375965, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 11755, + "time_per_iteration": 2.3941378593444824 + }, + { + "auxiliary_loss_clip": 0.01056725, + "auxiliary_loss_mlp": 0.01027929, + "balance_loss_clip": 1.01638365, + "balance_loss_mlp": 1.01790214, + "epoch": 0.7068089583646475, + "flos": 20448174798720.0, + "grad_norm": 1.841713922102235, + "language_loss": 0.79767603, + "learning_rate": 7.901598486765438e-07, + "loss": 0.81852257, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38867188, + "step": 11756, + "time_per_iteration": 3.9592440128326416 + }, + { + "auxiliary_loss_clip": 0.01007237, + "auxiliary_loss_mlp": 0.01001508, + "balance_loss_clip": 1.00060248, + "balance_loss_mlp": 1.00070071, + "epoch": 0.7068690816173155, + "flos": 59106452734080.0, + "grad_norm": 0.8278457896412512, + "language_loss": 0.60413778, + "learning_rate": 7.89859015563326e-07, + "loss": 0.6242252, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06542969, + "step": 11757, + "time_per_iteration": 2.834179639816284 + }, + { + "auxiliary_loss_clip": 0.01056676, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.01592302, + "balance_loss_mlp": 1.01817811, + "epoch": 0.7069292048699835, + "flos": 16250672102400.0, + "grad_norm": 1.7368225240942736, + "language_loss": 0.80231655, + "learning_rate": 7.895582256369256e-07, + "loss": 0.82316029, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38476562, + "step": 11758, + "time_per_iteration": 2.377772331237793 + }, + { + "auxiliary_loss_clip": 0.01055061, + "auxiliary_loss_mlp": 0.0102768, + "balance_loss_clip": 1.01475215, + "balance_loss_mlp": 1.01743865, + "epoch": 0.7069893281226515, + "flos": 41171151841920.0, + "grad_norm": 1.7130072884864276, + "language_loss": 0.7736398, + "learning_rate": 7.892574789080793e-07, + "loss": 0.79446721, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.375, + "step": 11759, + "time_per_iteration": 2.5769765377044678 + }, + { + "auxiliary_loss_clip": 0.01057152, + "auxiliary_loss_mlp": 0.01022913, + "balance_loss_clip": 1.01146317, + "balance_loss_mlp": 1.01843977, + "epoch": 0.7070494513753194, + "flos": 24206121077760.0, + "grad_norm": 1.5973959874714065, + "language_loss": 0.68057549, + "learning_rate": 7.88956775387519e-07, + "loss": 0.7013762, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 11760, + "time_per_iteration": 2.422759771347046 + }, + { + "auxiliary_loss_clip": 0.01059174, + "auxiliary_loss_mlp": 0.01023881, + "balance_loss_clip": 1.01254416, + "balance_loss_mlp": 1.01999903, + "epoch": 0.7071095746279874, + "flos": 20184860257920.0, + "grad_norm": 1.8064129154111639, + "language_loss": 0.80254179, + "learning_rate": 7.886561150859763e-07, + "loss": 0.82337236, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.39257812, + "step": 11761, + "time_per_iteration": 2.393888473510742 + }, + { + "auxiliary_loss_clip": 0.01058506, + "auxiliary_loss_mlp": 0.01024841, + "balance_loss_clip": 1.01285434, + "balance_loss_mlp": 1.01945817, + "epoch": 0.7071696978806553, + "flos": 18182172182400.0, + "grad_norm": 2.287684927144999, + "language_loss": 0.75105047, + "learning_rate": 7.883554980141811e-07, + "loss": 0.77188396, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 11762, + "time_per_iteration": 2.3851962089538574 + }, + { + "auxiliary_loss_clip": 0.01057115, + "auxiliary_loss_mlp": 0.01023428, + "balance_loss_clip": 1.01220489, + "balance_loss_mlp": 1.01909363, + "epoch": 0.7072298211333233, + "flos": 24130638984960.0, + "grad_norm": 1.7248592829497842, + "language_loss": 0.76949263, + "learning_rate": 7.880549241828604e-07, + "loss": 0.7902981, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38085938, + "step": 11763, + "time_per_iteration": 2.435248851776123 + }, + { + "auxiliary_loss_clip": 0.01058587, + "auxiliary_loss_mlp": 0.01025877, + "balance_loss_clip": 1.01390862, + "balance_loss_mlp": 1.01993608, + "epoch": 0.7072899443859912, + "flos": 27197200143360.0, + "grad_norm": 2.0892059413676405, + "language_loss": 0.79097176, + "learning_rate": 7.877543936027437e-07, + "loss": 0.81181645, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38671875, + "step": 11764, + "time_per_iteration": 3.865684986114502 + }, + { + "auxiliary_loss_clip": 0.01060447, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.01538694, + "balance_loss_mlp": 1.01967478, + "epoch": 0.7073500676386593, + "flos": 16434664300800.0, + "grad_norm": 1.745557839083156, + "language_loss": 0.79628509, + "learning_rate": 7.874539062845528e-07, + "loss": 0.81716776, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40820312, + "step": 11765, + "time_per_iteration": 2.3797287940979004 + }, + { + "auxiliary_loss_clip": 0.01054743, + "auxiliary_loss_mlp": 0.0102469, + "balance_loss_clip": 1.01356757, + "balance_loss_mlp": 1.01806021, + "epoch": 0.7074101908913272, + "flos": 27672472748160.0, + "grad_norm": 1.5991375636901306, + "language_loss": 0.68879175, + "learning_rate": 7.871534622390137e-07, + "loss": 0.70958608, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.3671875, + "step": 11766, + "time_per_iteration": 2.427687406539917 + }, + { + "auxiliary_loss_clip": 0.01057515, + "auxiliary_loss_mlp": 0.01027113, + "balance_loss_clip": 1.01522827, + "balance_loss_mlp": 1.01792014, + "epoch": 0.7074703141439952, + "flos": 22236949774080.0, + "grad_norm": 1.7256098359278642, + "language_loss": 0.64188337, + "learning_rate": 7.868530614768478e-07, + "loss": 0.66272968, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 11767, + "time_per_iteration": 2.4240596294403076 + }, + { + "auxiliary_loss_clip": 0.01056855, + "auxiliary_loss_mlp": 0.01021803, + "balance_loss_clip": 1.00970972, + "balance_loss_mlp": 1.01788867, + "epoch": 0.7075304373966632, + "flos": 29641923342720.0, + "grad_norm": 1.4821599176761482, + "language_loss": 0.52732331, + "learning_rate": 7.865527040087756e-07, + "loss": 0.54810989, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 11768, + "time_per_iteration": 2.4777684211730957 + }, + { + "auxiliary_loss_clip": 0.0105527, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_clip": 1.01271987, + "balance_loss_mlp": 1.01828575, + "epoch": 0.7075905606493311, + "flos": 19754206237440.0, + "grad_norm": 1.6945167911450891, + "language_loss": 0.75370002, + "learning_rate": 7.862523898455151e-07, + "loss": 0.77448815, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36914062, + "step": 11769, + "time_per_iteration": 2.4131319522857666 + }, + { + "auxiliary_loss_clip": 0.01007642, + "auxiliary_loss_mlp": 0.0100438, + "balance_loss_clip": 1.00342619, + "balance_loss_mlp": 1.00124097, + "epoch": 0.7076506839019991, + "flos": 65713136999040.0, + "grad_norm": 0.857889671379262, + "language_loss": 0.61987269, + "learning_rate": 7.859521189977856e-07, + "loss": 0.63999283, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.06396484, + "step": 11770, + "time_per_iteration": 2.998178005218506 + }, + { + "auxiliary_loss_clip": 0.01058259, + "auxiliary_loss_mlp": 0.01026029, + "balance_loss_clip": 1.01389909, + "balance_loss_mlp": 1.01789606, + "epoch": 0.707710807154667, + "flos": 23764260510720.0, + "grad_norm": 2.256023710520534, + "language_loss": 0.76389962, + "learning_rate": 7.856518914763019e-07, + "loss": 0.78474259, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40429688, + "step": 11771, + "time_per_iteration": 2.425105571746826 + }, + { + "auxiliary_loss_clip": 0.0105488, + "auxiliary_loss_mlp": 0.0101887, + "balance_loss_clip": 1.00799775, + "balance_loss_mlp": 1.01855135, + "epoch": 0.7077709304073351, + "flos": 21250304352000.0, + "grad_norm": 1.764682761597799, + "language_loss": 0.71414471, + "learning_rate": 7.853517072917786e-07, + "loss": 0.73488212, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.36328125, + "step": 11772, + "time_per_iteration": 2.3967108726501465 + }, + { + "auxiliary_loss_clip": 0.0105928, + "auxiliary_loss_mlp": 0.01025076, + "balance_loss_clip": 1.01197505, + "balance_loss_mlp": 1.01851666, + "epoch": 0.707831053660003, + "flos": 20739699584640.0, + "grad_norm": 1.6507808636825974, + "language_loss": 0.69801611, + "learning_rate": 7.850515664549278e-07, + "loss": 0.71885967, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40820312, + "step": 11773, + "time_per_iteration": 2.401939630508423 + }, + { + "auxiliary_loss_clip": 0.01057224, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.01678264, + "balance_loss_mlp": 1.01820731, + "epoch": 0.707891176912671, + "flos": 21979919278080.0, + "grad_norm": 1.7274448559977744, + "language_loss": 0.7033509, + "learning_rate": 7.847514689764625e-07, + "loss": 0.72420907, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.390625, + "step": 11774, + "time_per_iteration": 2.4462125301361084 + }, + { + "auxiliary_loss_clip": 0.01054532, + "auxiliary_loss_mlp": 0.01024624, + "balance_loss_clip": 1.01369905, + "balance_loss_mlp": 1.01865602, + "epoch": 0.7079513001653389, + "flos": 21067918076160.0, + "grad_norm": 1.595488084024525, + "language_loss": 0.74356711, + "learning_rate": 7.844514148670909e-07, + "loss": 0.76435864, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.359375, + "step": 11775, + "time_per_iteration": 2.4125890731811523 + }, + { + "auxiliary_loss_clip": 0.01057609, + "auxiliary_loss_mlp": 0.01022854, + "balance_loss_clip": 1.01155329, + "balance_loss_mlp": 1.01934242, + "epoch": 0.7080114234180069, + "flos": 18039691457280.0, + "grad_norm": 1.7638844557668913, + "language_loss": 0.87072563, + "learning_rate": 7.841514041375206e-07, + "loss": 0.89153028, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 11776, + "time_per_iteration": 2.3972127437591553 + }, + { + "auxiliary_loss_clip": 0.01057457, + "auxiliary_loss_mlp": 0.0102337, + "balance_loss_clip": 1.01211619, + "balance_loss_mlp": 1.01991606, + "epoch": 0.7080715466706748, + "flos": 15121371398400.0, + "grad_norm": 1.5620087365736695, + "language_loss": 0.74988234, + "learning_rate": 7.838514367984599e-07, + "loss": 0.77069056, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.375, + "step": 11777, + "time_per_iteration": 2.388424873352051 + }, + { + "auxiliary_loss_clip": 0.01062628, + "auxiliary_loss_mlp": 0.01026052, + "balance_loss_clip": 1.0129329, + "balance_loss_mlp": 1.02108967, + "epoch": 0.7081316699233429, + "flos": 14422096310400.0, + "grad_norm": 2.118024558051579, + "language_loss": 0.82161963, + "learning_rate": 7.835515128606132e-07, + "loss": 0.84250641, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41601562, + "step": 11778, + "time_per_iteration": 2.384431838989258 + }, + { + "auxiliary_loss_clip": 0.01054684, + "auxiliary_loss_mlp": 0.01021612, + "balance_loss_clip": 1.01066232, + "balance_loss_mlp": 1.01905346, + "epoch": 0.7081917931760108, + "flos": 23221256135040.0, + "grad_norm": 1.538883554432175, + "language_loss": 0.84729576, + "learning_rate": 7.832516323346839e-07, + "loss": 0.86805868, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.35546875, + "step": 11779, + "time_per_iteration": 2.4318366050720215 + }, + { + "auxiliary_loss_clip": 0.01058326, + "auxiliary_loss_mlp": 0.01025238, + "balance_loss_clip": 1.01370466, + "balance_loss_mlp": 1.01936054, + "epoch": 0.7082519164286788, + "flos": 39306964596480.0, + "grad_norm": 4.809994298912176, + "language_loss": 0.71989179, + "learning_rate": 7.829517952313733e-07, + "loss": 0.74072748, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38867188, + "step": 11780, + "time_per_iteration": 2.542623996734619 + }, + { + "auxiliary_loss_clip": 0.01055311, + "auxiliary_loss_mlp": 0.01019401, + "balance_loss_clip": 1.00856447, + "balance_loss_mlp": 1.01887107, + "epoch": 0.7083120396813467, + "flos": 21650129775360.0, + "grad_norm": 1.3805843949975738, + "language_loss": 0.73088932, + "learning_rate": 7.82652001561384e-07, + "loss": 0.75163645, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36328125, + "step": 11781, + "time_per_iteration": 2.4132473468780518 + }, + { + "auxiliary_loss_clip": 0.01057094, + "auxiliary_loss_mlp": 0.01022487, + "balance_loss_clip": 1.01159692, + "balance_loss_mlp": 1.01967716, + "epoch": 0.7083721629340147, + "flos": 17566059686400.0, + "grad_norm": 1.6807257891052128, + "language_loss": 0.77118933, + "learning_rate": 7.823522513354117e-07, + "loss": 0.79198515, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.375, + "step": 11782, + "time_per_iteration": 2.3670740127563477 + }, + { + "auxiliary_loss_clip": 0.01055535, + "auxiliary_loss_mlp": 0.01020721, + "balance_loss_clip": 1.01017118, + "balance_loss_mlp": 1.01833379, + "epoch": 0.7084322861866827, + "flos": 29349246481920.0, + "grad_norm": 1.410373385200969, + "language_loss": 0.69951957, + "learning_rate": 7.820525445641564e-07, + "loss": 0.72028214, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.37109375, + "step": 11783, + "time_per_iteration": 2.462576150894165 + }, + { + "auxiliary_loss_clip": 0.01058431, + "auxiliary_loss_mlp": 0.01024037, + "balance_loss_clip": 1.01218748, + "balance_loss_mlp": 1.01892424, + "epoch": 0.7084924094393507, + "flos": 20193238984320.0, + "grad_norm": 2.0867971608929174, + "language_loss": 0.78952646, + "learning_rate": 7.817528812583125e-07, + "loss": 0.81035113, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.39453125, + "step": 11784, + "time_per_iteration": 2.388256549835205 + }, + { + "auxiliary_loss_clip": 0.01056833, + "auxiliary_loss_mlp": 0.01024957, + "balance_loss_clip": 1.01342916, + "balance_loss_mlp": 1.01904917, + "epoch": 0.7085525326920187, + "flos": 23476087215360.0, + "grad_norm": 20.24890433396057, + "language_loss": 0.77301157, + "learning_rate": 7.81453261428575e-07, + "loss": 0.79382944, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37695312, + "step": 11785, + "time_per_iteration": 2.4191784858703613 + }, + { + "auxiliary_loss_clip": 0.01055429, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.01516795, + "balance_loss_mlp": 1.01814389, + "epoch": 0.7086126559446866, + "flos": 25957608854400.0, + "grad_norm": 2.6473203199433932, + "language_loss": 0.7798202, + "learning_rate": 7.811536850856351e-07, + "loss": 0.80064094, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37304688, + "step": 11786, + "time_per_iteration": 3.9454920291900635 + }, + { + "auxiliary_loss_clip": 0.01058041, + "auxiliary_loss_mlp": 0.01020623, + "balance_loss_clip": 1.00913763, + "balance_loss_mlp": 1.01874804, + "epoch": 0.7086727791973546, + "flos": 26723568372480.0, + "grad_norm": 1.8079525589429595, + "language_loss": 0.79036063, + "learning_rate": 7.808541522401856e-07, + "loss": 0.81114727, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 11787, + "time_per_iteration": 2.4424779415130615 + }, + { + "auxiliary_loss_clip": 0.01057833, + "auxiliary_loss_mlp": 0.0102195, + "balance_loss_clip": 1.01035142, + "balance_loss_mlp": 1.01926017, + "epoch": 0.7087329024500225, + "flos": 21682459560960.0, + "grad_norm": 1.74320859465369, + "language_loss": 0.76910907, + "learning_rate": 7.805546629029156e-07, + "loss": 0.78990686, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38476562, + "step": 11788, + "time_per_iteration": 2.4635841846466064 + }, + { + "auxiliary_loss_clip": 0.01059692, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.01569629, + "balance_loss_mlp": 1.01879787, + "epoch": 0.7087930257026905, + "flos": 17930099099520.0, + "grad_norm": 1.927601560836376, + "language_loss": 0.68212742, + "learning_rate": 7.802552170845126e-07, + "loss": 0.70301509, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41015625, + "step": 11789, + "time_per_iteration": 2.378394842147827 + }, + { + "auxiliary_loss_clip": 0.01057938, + "auxiliary_loss_mlp": 0.01024032, + "balance_loss_clip": 1.0120697, + "balance_loss_mlp": 1.01790464, + "epoch": 0.7088531489553584, + "flos": 18910669944960.0, + "grad_norm": 1.6756668539238408, + "language_loss": 0.77667212, + "learning_rate": 7.799558147956637e-07, + "loss": 0.79749179, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40039062, + "step": 11790, + "time_per_iteration": 2.3973259925842285 + }, + { + "auxiliary_loss_clip": 0.01059038, + "auxiliary_loss_mlp": 0.01024036, + "balance_loss_clip": 1.01128662, + "balance_loss_mlp": 1.01784921, + "epoch": 0.7089132722080265, + "flos": 27379656241920.0, + "grad_norm": 1.7217292396072703, + "language_loss": 0.71014965, + "learning_rate": 7.796564560470534e-07, + "loss": 0.7309804, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41210938, + "step": 11791, + "time_per_iteration": 2.4613358974456787 + }, + { + "auxiliary_loss_clip": 0.01057548, + "auxiliary_loss_mlp": 0.01028545, + "balance_loss_clip": 1.01564622, + "balance_loss_mlp": 1.01857519, + "epoch": 0.7089733954606944, + "flos": 22161851706240.0, + "grad_norm": 1.539565479833714, + "language_loss": 0.82112825, + "learning_rate": 7.793571408493649e-07, + "loss": 0.84198916, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.390625, + "step": 11792, + "time_per_iteration": 2.4312522411346436 + }, + { + "auxiliary_loss_clip": 0.01057697, + "auxiliary_loss_mlp": 0.01024908, + "balance_loss_clip": 1.01271307, + "balance_loss_mlp": 1.01769018, + "epoch": 0.7090335187133624, + "flos": 24424677388800.0, + "grad_norm": 2.0539467443546244, + "language_loss": 0.64484668, + "learning_rate": 7.790578692132794e-07, + "loss": 0.66567272, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40039062, + "step": 11793, + "time_per_iteration": 2.4358372688293457 + }, + { + "auxiliary_loss_clip": 0.01056414, + "auxiliary_loss_mlp": 0.01023819, + "balance_loss_clip": 1.01306653, + "balance_loss_mlp": 1.01851082, + "epoch": 0.7090936419660303, + "flos": 21834156885120.0, + "grad_norm": 2.1041567807540402, + "language_loss": 0.70442671, + "learning_rate": 7.787586411494788e-07, + "loss": 0.72522902, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37890625, + "step": 11794, + "time_per_iteration": 2.4374377727508545 + }, + { + "auxiliary_loss_clip": 0.01057456, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.01824057, + "balance_loss_mlp": 1.0199213, + "epoch": 0.7091537652186983, + "flos": 20081377388160.0, + "grad_norm": 1.6947392401741426, + "language_loss": 0.76136291, + "learning_rate": 7.784594566686409e-07, + "loss": 0.78222924, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.375, + "step": 11795, + "time_per_iteration": 2.3953464031219482 + }, + { + "auxiliary_loss_clip": 0.01057543, + "auxiliary_loss_mlp": 0.01025114, + "balance_loss_clip": 1.01421213, + "balance_loss_mlp": 1.01916456, + "epoch": 0.7092138884713663, + "flos": 13150733817600.0, + "grad_norm": 2.5523673663401523, + "language_loss": 0.7478078, + "learning_rate": 7.781603157814427e-07, + "loss": 0.76863432, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.3828125, + "step": 11796, + "time_per_iteration": 3.7798616886138916 + }, + { + "auxiliary_loss_clip": 0.0106243, + "auxiliary_loss_mlp": 0.0102634, + "balance_loss_clip": 1.01352465, + "balance_loss_mlp": 1.02075076, + "epoch": 0.7092740117240343, + "flos": 21645102539520.0, + "grad_norm": 1.7036289922297467, + "language_loss": 0.74347138, + "learning_rate": 7.778612184985592e-07, + "loss": 0.76435906, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41601562, + "step": 11797, + "time_per_iteration": 2.3881027698516846 + }, + { + "auxiliary_loss_clip": 0.01057917, + "auxiliary_loss_mlp": 0.01024899, + "balance_loss_clip": 1.01324582, + "balance_loss_mlp": 1.01970387, + "epoch": 0.7093341349767023, + "flos": 21031468750080.0, + "grad_norm": 1.4523565346184337, + "language_loss": 0.79407924, + "learning_rate": 7.775621648306665e-07, + "loss": 0.81490743, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 11798, + "time_per_iteration": 2.4330615997314453 + }, + { + "auxiliary_loss_clip": 0.01056651, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01461124, + "balance_loss_mlp": 1.01844716, + "epoch": 0.7093942582293702, + "flos": 22016578072320.0, + "grad_norm": 1.9339360338209473, + "language_loss": 0.66999233, + "learning_rate": 7.772631547884343e-07, + "loss": 0.69082451, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3828125, + "step": 11799, + "time_per_iteration": 2.3950605392456055 + }, + { + "auxiliary_loss_clip": 0.0105701, + "auxiliary_loss_mlp": 0.01020814, + "balance_loss_clip": 1.00883913, + "balance_loss_mlp": 1.01834476, + "epoch": 0.7094543814820382, + "flos": 27234347696640.0, + "grad_norm": 1.7952189541367782, + "language_loss": 0.81628078, + "learning_rate": 7.769641883825355e-07, + "loss": 0.83705902, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38671875, + "step": 11800, + "time_per_iteration": 2.4640839099884033 + }, + { + "auxiliary_loss_clip": 0.01053962, + "auxiliary_loss_mlp": 0.01023126, + "balance_loss_clip": 1.0124929, + "balance_loss_mlp": 1.01696837, + "epoch": 0.7095145047347061, + "flos": 12088466657280.0, + "grad_norm": 1.7320230120689073, + "language_loss": 0.86154652, + "learning_rate": 7.76665265623639e-07, + "loss": 0.88231742, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37109375, + "step": 11801, + "time_per_iteration": 2.3319995403289795 + }, + { + "auxiliary_loss_clip": 0.01057822, + "auxiliary_loss_mlp": 0.01024888, + "balance_loss_clip": 1.0137713, + "balance_loss_mlp": 1.01873636, + "epoch": 0.7095746279873741, + "flos": 19382975084160.0, + "grad_norm": 1.909937087851846, + "language_loss": 0.7620666, + "learning_rate": 7.763663865224122e-07, + "loss": 0.78289366, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.390625, + "step": 11802, + "time_per_iteration": 2.389946937561035 + }, + { + "auxiliary_loss_clip": 0.01060352, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01771009, + "balance_loss_mlp": 1.01962256, + "epoch": 0.709634751240042, + "flos": 21359547596160.0, + "grad_norm": 1.770906577193093, + "language_loss": 0.81468469, + "learning_rate": 7.760675510895207e-07, + "loss": 0.8355847, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40625, + "step": 11803, + "time_per_iteration": 2.380082845687866 + }, + { + "auxiliary_loss_clip": 0.01057329, + "auxiliary_loss_mlp": 0.01023802, + "balance_loss_clip": 1.0114634, + "balance_loss_mlp": 1.01770127, + "epoch": 0.7096948744927101, + "flos": 13916204576640.0, + "grad_norm": 2.2295396470345765, + "language_loss": 0.6719839, + "learning_rate": 7.757687593356308e-07, + "loss": 0.69279522, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39648438, + "step": 11804, + "time_per_iteration": 3.7256217002868652 + }, + { + "auxiliary_loss_clip": 0.01053717, + "auxiliary_loss_mlp": 0.01019961, + "balance_loss_clip": 1.00969064, + "balance_loss_mlp": 1.0182004, + "epoch": 0.709754997745378, + "flos": 30297068605440.0, + "grad_norm": 1.8206260475844094, + "language_loss": 0.7829814, + "learning_rate": 7.754700112714054e-07, + "loss": 0.80371821, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.35546875, + "step": 11805, + "time_per_iteration": 2.4594228267669678 + }, + { + "auxiliary_loss_clip": 0.01057898, + "auxiliary_loss_mlp": 0.0102572, + "balance_loss_clip": 1.01364386, + "balance_loss_mlp": 1.01920485, + "epoch": 0.709815120998046, + "flos": 18514161100800.0, + "grad_norm": 1.9718772181902855, + "language_loss": 0.78846639, + "learning_rate": 7.751713069075041e-07, + "loss": 0.80930257, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38671875, + "step": 11806, + "time_per_iteration": 2.3677783012390137 + }, + { + "auxiliary_loss_clip": 0.01056856, + "auxiliary_loss_mlp": 0.01025081, + "balance_loss_clip": 1.0135951, + "balance_loss_mlp": 1.01845872, + "epoch": 0.7098752442507139, + "flos": 22271513886720.0, + "grad_norm": 2.6014530118431547, + "language_loss": 0.5648576, + "learning_rate": 7.74872646254589e-07, + "loss": 0.58567703, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38476562, + "step": 11807, + "time_per_iteration": 2.383009195327759 + }, + { + "auxiliary_loss_clip": 0.01058938, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01757073, + "balance_loss_mlp": 1.01823366, + "epoch": 0.7099353675033819, + "flos": 19274604624000.0, + "grad_norm": 2.3333447548937625, + "language_loss": 0.68278456, + "learning_rate": 7.745740293233176e-07, + "loss": 0.70366919, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.40625, + "step": 11808, + "time_per_iteration": 2.382005214691162 + }, + { + "auxiliary_loss_clip": 0.01058482, + "auxiliary_loss_mlp": 0.0102475, + "balance_loss_clip": 1.0135386, + "balance_loss_mlp": 1.01909316, + "epoch": 0.70999549075605, + "flos": 21907439562240.0, + "grad_norm": 2.1117721765596964, + "language_loss": 0.71312451, + "learning_rate": 7.742754561243469e-07, + "loss": 0.73395681, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.39453125, + "step": 11809, + "time_per_iteration": 2.3871235847473145 + }, + { + "auxiliary_loss_clip": 0.010077, + "auxiliary_loss_mlp": 0.01002128, + "balance_loss_clip": 1.00119233, + "balance_loss_mlp": 1.001055, + "epoch": 0.7100556140087179, + "flos": 70453015666560.0, + "grad_norm": 0.7556774289581409, + "language_loss": 0.59450555, + "learning_rate": 7.739769266683318e-07, + "loss": 0.61460388, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.06640625, + "step": 11810, + "time_per_iteration": 3.1369516849517822 + }, + { + "auxiliary_loss_clip": 0.0105797, + "auxiliary_loss_mlp": 0.01026132, + "balance_loss_clip": 1.01489091, + "balance_loss_mlp": 1.01961935, + "epoch": 0.7101157372613859, + "flos": 23038450922880.0, + "grad_norm": 6.140779034735049, + "language_loss": 0.74360013, + "learning_rate": 7.73678440965927e-07, + "loss": 0.76444113, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3828125, + "step": 11811, + "time_per_iteration": 2.385640859603882 + }, + { + "auxiliary_loss_clip": 0.01055043, + "auxiliary_loss_mlp": 0.0102316, + "balance_loss_clip": 1.01192486, + "balance_loss_mlp": 1.0187242, + "epoch": 0.7101758605140538, + "flos": 23184213315840.0, + "grad_norm": 1.6479507758444183, + "language_loss": 0.83600247, + "learning_rate": 7.73379999027784e-07, + "loss": 0.85678458, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.36328125, + "step": 11812, + "time_per_iteration": 2.451754331588745 + }, + { + "auxiliary_loss_clip": 0.01058855, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.01383662, + "balance_loss_mlp": 1.01961732, + "epoch": 0.7102359837667218, + "flos": 23694992640000.0, + "grad_norm": 1.5371790575660718, + "language_loss": 0.70379019, + "learning_rate": 7.730816008645537e-07, + "loss": 0.72463703, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 11813, + "time_per_iteration": 2.3917348384857178 + }, + { + "auxiliary_loss_clip": 0.01054636, + "auxiliary_loss_mlp": 0.01018828, + "balance_loss_clip": 1.00788426, + "balance_loss_mlp": 1.0176475, + "epoch": 0.7102961070193897, + "flos": 19390097001600.0, + "grad_norm": 1.9476774244518797, + "language_loss": 0.81635308, + "learning_rate": 7.727832464868846e-07, + "loss": 0.83708769, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 11814, + "time_per_iteration": 2.385869026184082 + }, + { + "auxiliary_loss_clip": 0.01057175, + "auxiliary_loss_mlp": 0.0102433, + "balance_loss_clip": 1.01223612, + "balance_loss_mlp": 1.01919317, + "epoch": 0.7103562302720577, + "flos": 21506427152640.0, + "grad_norm": 1.7131113186318874, + "language_loss": 0.76220715, + "learning_rate": 7.724849359054257e-07, + "loss": 0.78302222, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.37890625, + "step": 11815, + "time_per_iteration": 2.403526544570923 + }, + { + "auxiliary_loss_clip": 0.01055493, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.01268673, + "balance_loss_mlp": 1.01813531, + "epoch": 0.7104163535247257, + "flos": 14534272108800.0, + "grad_norm": 1.7701060889054367, + "language_loss": 0.78816813, + "learning_rate": 7.721866691308208e-07, + "loss": 0.80896282, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37304688, + "step": 11816, + "time_per_iteration": 2.384295701980591 + }, + { + "auxiliary_loss_clip": 0.01059126, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.01443887, + "balance_loss_mlp": 1.01969337, + "epoch": 0.7104764767773937, + "flos": 11399525331840.0, + "grad_norm": 2.135262363624506, + "language_loss": 0.86618382, + "learning_rate": 7.718884461737159e-07, + "loss": 0.88704181, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 11817, + "time_per_iteration": 2.3602707386016846 + }, + { + "auxiliary_loss_clip": 0.01056179, + "auxiliary_loss_mlp": 0.01027929, + "balance_loss_clip": 1.01691985, + "balance_loss_mlp": 1.01830137, + "epoch": 0.7105366000300616, + "flos": 11689688574720.0, + "grad_norm": 2.1795035683319623, + "language_loss": 0.84733391, + "learning_rate": 7.715902670447532e-07, + "loss": 0.86817503, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37890625, + "step": 11818, + "time_per_iteration": 2.3752529621124268 + }, + { + "auxiliary_loss_clip": 0.01058073, + "auxiliary_loss_mlp": 0.01024156, + "balance_loss_clip": 1.01246774, + "balance_loss_mlp": 1.01823807, + "epoch": 0.7105967232827296, + "flos": 19353019271040.0, + "grad_norm": 2.0726415342578806, + "language_loss": 0.75511956, + "learning_rate": 7.712921317545742e-07, + "loss": 0.77594185, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3984375, + "step": 11819, + "time_per_iteration": 2.378812551498413 + }, + { + "auxiliary_loss_clip": 0.01058606, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.01248288, + "balance_loss_mlp": 1.01901829, + "epoch": 0.7106568465353975, + "flos": 22929277501440.0, + "grad_norm": 1.578106024464024, + "language_loss": 0.70960951, + "learning_rate": 7.709940403138182e-07, + "loss": 0.73044908, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39648438, + "step": 11820, + "time_per_iteration": 2.4080734252929688 + }, + { + "auxiliary_loss_clip": 0.01057324, + "auxiliary_loss_mlp": 0.01022747, + "balance_loss_clip": 1.01144028, + "balance_loss_mlp": 1.0185219, + "epoch": 0.7107169697880655, + "flos": 19098642038400.0, + "grad_norm": 1.6993106355787229, + "language_loss": 0.75580382, + "learning_rate": 7.706959927331232e-07, + "loss": 0.77660453, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38867188, + "step": 11821, + "time_per_iteration": 2.376163959503174 + }, + { + "auxiliary_loss_clip": 0.0100748, + "auxiliary_loss_mlp": 0.01002099, + "balance_loss_clip": 1.00123525, + "balance_loss_mlp": 1.00092542, + "epoch": 0.7107770930407336, + "flos": 63635071564800.0, + "grad_norm": 0.7723808389241912, + "language_loss": 0.55130529, + "learning_rate": 7.703979890231272e-07, + "loss": 0.57140112, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.00866699, + "router_z_loss_mlp": 0.06542969, + "step": 11822, + "time_per_iteration": 3.06742787361145 + }, + { + "auxiliary_loss_clip": 0.01053124, + "auxiliary_loss_mlp": 0.0102153, + "balance_loss_clip": 1.01044393, + "balance_loss_mlp": 1.01713967, + "epoch": 0.7108372162934015, + "flos": 22053376512000.0, + "grad_norm": 1.9619657204404886, + "language_loss": 0.60405886, + "learning_rate": 7.701000291944626e-07, + "loss": 0.62480539, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.359375, + "step": 11823, + "time_per_iteration": 2.42301344871521 + }, + { + "auxiliary_loss_clip": 0.01057464, + "auxiliary_loss_mlp": 0.01024477, + "balance_loss_clip": 1.01302147, + "balance_loss_mlp": 1.01907325, + "epoch": 0.7108973395460695, + "flos": 19134148757760.0, + "grad_norm": 2.036343291532973, + "language_loss": 0.70656049, + "learning_rate": 7.69802113257765e-07, + "loss": 0.72737998, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3828125, + "step": 11824, + "time_per_iteration": 2.383565902709961 + }, + { + "auxiliary_loss_clip": 0.01053204, + "auxiliary_loss_mlp": 0.01025308, + "balance_loss_clip": 1.01507342, + "balance_loss_mlp": 1.0179882, + "epoch": 0.7109574627987374, + "flos": 17893475216640.0, + "grad_norm": 3.1332713683129163, + "language_loss": 0.72086972, + "learning_rate": 7.695042412236656e-07, + "loss": 0.74165487, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.3515625, + "step": 11825, + "time_per_iteration": 2.372851848602295 + }, + { + "auxiliary_loss_clip": 0.01057323, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.0139401, + "balance_loss_mlp": 1.01758599, + "epoch": 0.7110175860514054, + "flos": 28978538999040.0, + "grad_norm": 1.6698384631427137, + "language_loss": 0.79125649, + "learning_rate": 7.692064131027947e-07, + "loss": 0.81208897, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 11826, + "time_per_iteration": 3.8952202796936035 + }, + { + "auxiliary_loss_clip": 0.01055138, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.01602626, + "balance_loss_mlp": 1.01804733, + "epoch": 0.7110777093040733, + "flos": 26172220181760.0, + "grad_norm": 1.7153622528529897, + "language_loss": 0.83125305, + "learning_rate": 7.6890862890578e-07, + "loss": 0.8520726, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37109375, + "step": 11827, + "time_per_iteration": 2.424691677093506 + }, + { + "auxiliary_loss_clip": 0.01057913, + "auxiliary_loss_mlp": 0.01024045, + "balance_loss_clip": 1.01175439, + "balance_loss_mlp": 1.01884365, + "epoch": 0.7111378325567413, + "flos": 26868737272320.0, + "grad_norm": 1.364439375087764, + "language_loss": 0.6222899, + "learning_rate": 7.686108886432512e-07, + "loss": 0.6431095, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.390625, + "step": 11828, + "time_per_iteration": 2.4508934020996094 + }, + { + "auxiliary_loss_clip": 0.01056256, + "auxiliary_loss_mlp": 0.01023861, + "balance_loss_clip": 1.01236331, + "balance_loss_mlp": 1.01834559, + "epoch": 0.7111979558094093, + "flos": 27270587554560.0, + "grad_norm": 1.6136449754656923, + "language_loss": 0.72188371, + "learning_rate": 7.683131923258308e-07, + "loss": 0.74268484, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 11829, + "time_per_iteration": 2.431138038635254 + }, + { + "auxiliary_loss_clip": 0.01056952, + "auxiliary_loss_mlp": 0.01024106, + "balance_loss_clip": 1.01287603, + "balance_loss_mlp": 1.01903081, + "epoch": 0.7112580790620773, + "flos": 25045747297920.0, + "grad_norm": 1.7188420459301896, + "language_loss": 0.80130804, + "learning_rate": 7.680155399641448e-07, + "loss": 0.82211864, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37890625, + "step": 11830, + "time_per_iteration": 2.432354688644409 + }, + { + "auxiliary_loss_clip": 0.01057102, + "auxiliary_loss_mlp": 0.01025811, + "balance_loss_clip": 1.01418817, + "balance_loss_mlp": 1.01832128, + "epoch": 0.7113182023147452, + "flos": 21645730944000.0, + "grad_norm": 1.9302002742941777, + "language_loss": 0.84084278, + "learning_rate": 7.677179315688147e-07, + "loss": 0.86167192, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 11831, + "time_per_iteration": 2.384316921234131 + }, + { + "auxiliary_loss_clip": 0.01057817, + "auxiliary_loss_mlp": 0.01022786, + "balance_loss_clip": 1.01107347, + "balance_loss_mlp": 1.01851606, + "epoch": 0.7113783255674132, + "flos": 20995228892160.0, + "grad_norm": 1.9716480862065622, + "language_loss": 0.77102768, + "learning_rate": 7.67420367150463e-07, + "loss": 0.79183376, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 11832, + "time_per_iteration": 2.4040608406066895 + }, + { + "auxiliary_loss_clip": 0.01007404, + "auxiliary_loss_mlp": 0.01001381, + "balance_loss_clip": 1.00034356, + "balance_loss_mlp": 1.000844, + "epoch": 0.7114384488200811, + "flos": 66768142596480.0, + "grad_norm": 0.7471661479425632, + "language_loss": 0.56596935, + "learning_rate": 7.671228467197069e-07, + "loss": 0.58605719, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.06542969, + "step": 11833, + "time_per_iteration": 2.9580652713775635 + }, + { + "auxiliary_loss_clip": 0.01055395, + "auxiliary_loss_mlp": 0.01027501, + "balance_loss_clip": 1.01661682, + "balance_loss_mlp": 1.01812863, + "epoch": 0.7114985720727491, + "flos": 25008879035520.0, + "grad_norm": 1.6305561455502948, + "language_loss": 0.71153677, + "learning_rate": 7.668253702871652e-07, + "loss": 0.73236567, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37109375, + "step": 11834, + "time_per_iteration": 2.439903497695923 + }, + { + "auxiliary_loss_clip": 0.0105969, + "auxiliary_loss_mlp": 0.01023329, + "balance_loss_clip": 1.01103258, + "balance_loss_mlp": 1.01988697, + "epoch": 0.7115586953254172, + "flos": 21469070131200.0, + "grad_norm": 1.8393067429414545, + "language_loss": 0.79418182, + "learning_rate": 7.665279378634548e-07, + "loss": 0.81501204, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 11835, + "time_per_iteration": 5.320026874542236 + }, + { + "auxiliary_loss_clip": 0.01055152, + "auxiliary_loss_mlp": 0.0102234, + "balance_loss_clip": 1.0115099, + "balance_loss_mlp": 1.01741982, + "epoch": 0.7116188185780851, + "flos": 28621307301120.0, + "grad_norm": 1.4762076347110151, + "language_loss": 0.75805235, + "learning_rate": 7.662305494591883e-07, + "loss": 0.77882731, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37695312, + "step": 11836, + "time_per_iteration": 2.450211524963379 + }, + { + "auxiliary_loss_clip": 0.01056261, + "auxiliary_loss_mlp": 0.01023255, + "balance_loss_clip": 1.0118413, + "balance_loss_mlp": 1.01795113, + "epoch": 0.7116789418307531, + "flos": 25292653499520.0, + "grad_norm": 1.687607041041567, + "language_loss": 0.6980511, + "learning_rate": 7.659332050849803e-07, + "loss": 0.7188462, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 11837, + "time_per_iteration": 2.437499523162842 + }, + { + "auxiliary_loss_clip": 0.0105916, + "auxiliary_loss_mlp": 0.01027419, + "balance_loss_clip": 1.01327467, + "balance_loss_mlp": 1.01899076, + "epoch": 0.711739065083421, + "flos": 25556107685760.0, + "grad_norm": 1.889910923777013, + "language_loss": 0.61601353, + "learning_rate": 7.656359047514411e-07, + "loss": 0.63687932, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.40234375, + "step": 11838, + "time_per_iteration": 2.441587448120117 + }, + { + "auxiliary_loss_clip": 0.01057741, + "auxiliary_loss_mlp": 0.01020977, + "balance_loss_clip": 1.00952744, + "balance_loss_mlp": 1.01957214, + "epoch": 0.711799188336089, + "flos": 26139785662080.0, + "grad_norm": 3.030323282811118, + "language_loss": 0.67388606, + "learning_rate": 7.653386484691828e-07, + "loss": 0.69467318, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 11839, + "time_per_iteration": 2.431864023208618 + }, + { + "auxiliary_loss_clip": 0.01058554, + "auxiliary_loss_mlp": 0.01026753, + "balance_loss_clip": 1.01496935, + "balance_loss_mlp": 1.01994729, + "epoch": 0.7118593115887569, + "flos": 21139629742080.0, + "grad_norm": 1.900836884702583, + "language_loss": 0.83899474, + "learning_rate": 7.650414362488107e-07, + "loss": 0.85984784, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 11840, + "time_per_iteration": 2.3764772415161133 + }, + { + "auxiliary_loss_clip": 0.01059753, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01592708, + "balance_loss_mlp": 1.01973987, + "epoch": 0.711919434841425, + "flos": 14974806044160.0, + "grad_norm": 1.6945637195118575, + "language_loss": 0.76559353, + "learning_rate": 7.647442681009337e-07, + "loss": 0.78646696, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40039062, + "step": 11841, + "time_per_iteration": 2.3595268726348877 + }, + { + "auxiliary_loss_clip": 0.01057564, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.01469851, + "balance_loss_mlp": 1.01824319, + "epoch": 0.7119795580940929, + "flos": 16508051712000.0, + "grad_norm": 2.2711841445616106, + "language_loss": 0.75069511, + "learning_rate": 7.644471440361564e-07, + "loss": 0.7715385, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39453125, + "step": 11842, + "time_per_iteration": 2.3719377517700195 + }, + { + "auxiliary_loss_clip": 0.01007053, + "auxiliary_loss_mlp": 0.00999801, + "balance_loss_clip": 0.99884105, + "balance_loss_mlp": 1.00055385, + "epoch": 0.7120396813467609, + "flos": 66567286344960.0, + "grad_norm": 0.7808239332952486, + "language_loss": 0.61631322, + "learning_rate": 7.641500640650825e-07, + "loss": 0.6363818, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.06542969, + "step": 11843, + "time_per_iteration": 3.0047032833099365 + }, + { + "auxiliary_loss_clip": 0.01055772, + "auxiliary_loss_mlp": 0.01021691, + "balance_loss_clip": 1.01051521, + "balance_loss_mlp": 1.019068, + "epoch": 0.7120998045994288, + "flos": 26431519916160.0, + "grad_norm": 4.264728241210653, + "language_loss": 0.80069464, + "learning_rate": 7.638530281983128e-07, + "loss": 0.82146919, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.3671875, + "step": 11844, + "time_per_iteration": 3.851933002471924 + }, + { + "auxiliary_loss_clip": 0.01054923, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.01175404, + "balance_loss_mlp": 1.01702833, + "epoch": 0.7121599278520968, + "flos": 16427263092480.0, + "grad_norm": 2.1380086199509583, + "language_loss": 0.78890562, + "learning_rate": 7.635560364464504e-07, + "loss": 0.8096894, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37890625, + "step": 11845, + "time_per_iteration": 2.3563907146453857 + }, + { + "auxiliary_loss_clip": 0.0105744, + "auxiliary_loss_mlp": 0.01024151, + "balance_loss_clip": 1.01203346, + "balance_loss_mlp": 1.02076173, + "epoch": 0.7122200511047647, + "flos": 28948618097280.0, + "grad_norm": 1.8970258726259441, + "language_loss": 0.77287269, + "learning_rate": 7.632590888200912e-07, + "loss": 0.79368854, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3671875, + "step": 11846, + "time_per_iteration": 2.4347758293151855 + }, + { + "auxiliary_loss_clip": 0.01057945, + "auxiliary_loss_mlp": 0.01024364, + "balance_loss_clip": 1.01317573, + "balance_loss_mlp": 1.0190835, + "epoch": 0.7122801743574327, + "flos": 16470939070080.0, + "grad_norm": 1.872733626931005, + "language_loss": 0.5840717, + "learning_rate": 7.629621853298343e-07, + "loss": 0.60489476, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.38867188, + "step": 11847, + "time_per_iteration": 2.370479106903076 + }, + { + "auxiliary_loss_clip": 0.01054237, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.0142107, + "balance_loss_mlp": 1.01802599, + "epoch": 0.7123402976101008, + "flos": 20630002492800.0, + "grad_norm": 1.3359374089380844, + "language_loss": 0.71386838, + "learning_rate": 7.626653259862743e-07, + "loss": 0.73466963, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.36328125, + "step": 11848, + "time_per_iteration": 2.4121084213256836 + }, + { + "auxiliary_loss_clip": 0.01007423, + "auxiliary_loss_mlp": 0.01002667, + "balance_loss_clip": 1.00176132, + "balance_loss_mlp": 1.00089836, + "epoch": 0.7124004208627687, + "flos": 62322756180480.0, + "grad_norm": 0.8183332508716309, + "language_loss": 0.63048601, + "learning_rate": 7.623685108000075e-07, + "loss": 0.65058696, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06542969, + "step": 11849, + "time_per_iteration": 3.041691541671753 + }, + { + "auxiliary_loss_clip": 0.01057815, + "auxiliary_loss_mlp": 0.01027606, + "balance_loss_clip": 1.01623344, + "balance_loss_mlp": 1.01861429, + "epoch": 0.7124605441154367, + "flos": 39674425322880.0, + "grad_norm": 1.5560028176705811, + "language_loss": 0.64124179, + "learning_rate": 7.620717397816243e-07, + "loss": 0.66209602, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.39257812, + "step": 11850, + "time_per_iteration": 2.569902181625366 + }, + { + "auxiliary_loss_clip": 0.01056514, + "auxiliary_loss_mlp": 0.01023528, + "balance_loss_clip": 1.01227427, + "balance_loss_mlp": 1.01835132, + "epoch": 0.7125206673681046, + "flos": 28180668631680.0, + "grad_norm": 1.6125370157727235, + "language_loss": 0.70024061, + "learning_rate": 7.617750129417157e-07, + "loss": 0.72104108, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3828125, + "step": 11851, + "time_per_iteration": 2.4309754371643066 + }, + { + "auxiliary_loss_clip": 0.01056829, + "auxiliary_loss_mlp": 0.01022702, + "balance_loss_clip": 1.01186001, + "balance_loss_mlp": 1.01970291, + "epoch": 0.7125807906207726, + "flos": 26175746229120.0, + "grad_norm": 3.6094750931758486, + "language_loss": 0.8338989, + "learning_rate": 7.614783302908731e-07, + "loss": 0.85469425, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37109375, + "step": 11852, + "time_per_iteration": 2.453864574432373 + }, + { + "auxiliary_loss_clip": 0.01058475, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.01162958, + "balance_loss_mlp": 1.01917696, + "epoch": 0.7126409138734405, + "flos": 17156598727680.0, + "grad_norm": 2.131262835785237, + "language_loss": 0.73381102, + "learning_rate": 7.611816918396816e-07, + "loss": 0.75463986, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 11853, + "time_per_iteration": 2.3772051334381104 + }, + { + "auxiliary_loss_clip": 0.01054627, + "auxiliary_loss_mlp": 0.0102191, + "balance_loss_clip": 1.01181889, + "balance_loss_mlp": 1.01823831, + "epoch": 0.7127010371261085, + "flos": 18768957269760.0, + "grad_norm": 1.8526747247199542, + "language_loss": 0.92216313, + "learning_rate": 7.608850975987297e-07, + "loss": 0.94292843, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.36328125, + "step": 11854, + "time_per_iteration": 2.3957433700561523 + }, + { + "auxiliary_loss_clip": 0.01056053, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.01846457, + "balance_loss_mlp": 1.01847696, + "epoch": 0.7127611603787765, + "flos": 20375380880640.0, + "grad_norm": 2.790540967374152, + "language_loss": 0.79799509, + "learning_rate": 7.605885475786007e-07, + "loss": 0.81885219, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37695312, + "step": 11855, + "time_per_iteration": 2.3851590156555176 + }, + { + "auxiliary_loss_clip": 0.01057618, + "auxiliary_loss_mlp": 0.01024527, + "balance_loss_clip": 1.01210582, + "balance_loss_mlp": 1.01844847, + "epoch": 0.7128212836314445, + "flos": 20447965330560.0, + "grad_norm": 1.9364557851693098, + "language_loss": 0.73022652, + "learning_rate": 7.602920417898795e-07, + "loss": 0.75104797, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 11856, + "time_per_iteration": 2.4165844917297363 + }, + { + "auxiliary_loss_clip": 0.01059015, + "auxiliary_loss_mlp": 0.01024157, + "balance_loss_clip": 1.01201522, + "balance_loss_mlp": 1.01907837, + "epoch": 0.7128814068841124, + "flos": 23439707712000.0, + "grad_norm": 1.5548981063604292, + "language_loss": 0.69923049, + "learning_rate": 7.59995580243145e-07, + "loss": 0.7200622, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40039062, + "step": 11857, + "time_per_iteration": 2.4477930068969727 + }, + { + "auxiliary_loss_clip": 0.01056594, + "auxiliary_loss_mlp": 0.01023676, + "balance_loss_clip": 1.01238656, + "balance_loss_mlp": 1.01878929, + "epoch": 0.7129415301367804, + "flos": 18221972999040.0, + "grad_norm": 2.370411920566125, + "language_loss": 0.85002971, + "learning_rate": 7.596991629489793e-07, + "loss": 0.87083244, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 11858, + "time_per_iteration": 2.3813254833221436 + }, + { + "auxiliary_loss_clip": 0.01062708, + "auxiliary_loss_mlp": 0.01024925, + "balance_loss_clip": 1.01272953, + "balance_loss_mlp": 1.02122974, + "epoch": 0.7130016533894483, + "flos": 15522977301120.0, + "grad_norm": 1.9184101859463687, + "language_loss": 0.80256498, + "learning_rate": 7.594027899179602e-07, + "loss": 0.82344127, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.4140625, + "step": 11859, + "time_per_iteration": 2.356269359588623 + }, + { + "auxiliary_loss_clip": 0.01057135, + "auxiliary_loss_mlp": 0.01021892, + "balance_loss_clip": 1.01060867, + "balance_loss_mlp": 1.02010036, + "epoch": 0.7130617766421163, + "flos": 57113646439680.0, + "grad_norm": 1.8804915287220445, + "language_loss": 0.68911529, + "learning_rate": 7.591064611606642e-07, + "loss": 0.70990551, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37109375, + "step": 11860, + "time_per_iteration": 2.7238662242889404 + }, + { + "auxiliary_loss_clip": 0.01057783, + "auxiliary_loss_mlp": 0.01025786, + "balance_loss_clip": 1.01348948, + "balance_loss_mlp": 1.0191853, + "epoch": 0.7131218998947844, + "flos": 19787338984320.0, + "grad_norm": 2.0795875686262524, + "language_loss": 0.64516294, + "learning_rate": 7.58810176687666e-07, + "loss": 0.66599858, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38671875, + "step": 11861, + "time_per_iteration": 2.3768815994262695 + }, + { + "auxiliary_loss_clip": 0.01061176, + "auxiliary_loss_mlp": 0.01026329, + "balance_loss_clip": 1.01382446, + "balance_loss_mlp": 1.02049959, + "epoch": 0.7131820231474523, + "flos": 26650669720320.0, + "grad_norm": 3.881466518144616, + "language_loss": 0.67288995, + "learning_rate": 7.585139365095412e-07, + "loss": 0.69376504, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 11862, + "time_per_iteration": 2.4373741149902344 + }, + { + "auxiliary_loss_clip": 0.01056002, + "auxiliary_loss_mlp": 0.01029062, + "balance_loss_clip": 1.01820827, + "balance_loss_mlp": 1.01883948, + "epoch": 0.7132421464001203, + "flos": 29204321961600.0, + "grad_norm": 2.2284501327360524, + "language_loss": 0.66944277, + "learning_rate": 7.582177406368591e-07, + "loss": 0.69029343, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37109375, + "step": 11863, + "time_per_iteration": 2.4444966316223145 + }, + { + "auxiliary_loss_clip": 0.01060972, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.01487601, + "balance_loss_mlp": 1.02007031, + "epoch": 0.7133022696527882, + "flos": 23072561187840.0, + "grad_norm": 2.7510165572921768, + "language_loss": 0.65747583, + "learning_rate": 7.579215890801923e-07, + "loss": 0.67836034, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 11864, + "time_per_iteration": 2.4048690795898438 + }, + { + "auxiliary_loss_clip": 0.01060145, + "auxiliary_loss_mlp": 0.01025755, + "balance_loss_clip": 1.01344061, + "balance_loss_mlp": 1.01993632, + "epoch": 0.7133623929054562, + "flos": 17456153126400.0, + "grad_norm": 1.9021067402224863, + "language_loss": 0.85451531, + "learning_rate": 7.576254818501091e-07, + "loss": 0.87537432, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 11865, + "time_per_iteration": 3.811561346054077 + }, + { + "auxiliary_loss_clip": 0.01057241, + "auxiliary_loss_mlp": 0.01023401, + "balance_loss_clip": 1.01111042, + "balance_loss_mlp": 1.01889372, + "epoch": 0.7134225161581241, + "flos": 19535545192320.0, + "grad_norm": 2.3224864860930823, + "language_loss": 0.74897897, + "learning_rate": 7.573294189571766e-07, + "loss": 0.76978534, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3828125, + "step": 11866, + "time_per_iteration": 2.390885591506958 + }, + { + "auxiliary_loss_clip": 0.01059464, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.01441777, + "balance_loss_mlp": 1.02024007, + "epoch": 0.7134826394107922, + "flos": 26248889260800.0, + "grad_norm": 2.500084476422644, + "language_loss": 0.70750701, + "learning_rate": 7.570334004119606e-07, + "loss": 0.7283718, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39257812, + "step": 11867, + "time_per_iteration": 2.4315176010131836 + }, + { + "auxiliary_loss_clip": 0.01055909, + "auxiliary_loss_mlp": 0.01022879, + "balance_loss_clip": 1.01192951, + "balance_loss_mlp": 1.01870584, + "epoch": 0.7135427626634601, + "flos": 15814397352960.0, + "grad_norm": 1.964210230402764, + "language_loss": 0.71814722, + "learning_rate": 7.567374262250246e-07, + "loss": 0.73893511, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 11868, + "time_per_iteration": 2.3614754676818848 + }, + { + "auxiliary_loss_clip": 0.01060635, + "auxiliary_loss_mlp": 0.01023899, + "balance_loss_clip": 1.01070869, + "balance_loss_mlp": 1.02015126, + "epoch": 0.7136028859161281, + "flos": 18222426846720.0, + "grad_norm": 2.3151622028383394, + "language_loss": 0.65978152, + "learning_rate": 7.56441496406933e-07, + "loss": 0.68062687, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40625, + "step": 11869, + "time_per_iteration": 2.3568570613861084 + }, + { + "auxiliary_loss_clip": 0.01055852, + "auxiliary_loss_mlp": 0.01019549, + "balance_loss_clip": 1.00808096, + "balance_loss_mlp": 1.01778901, + "epoch": 0.713663009168796, + "flos": 24313723488000.0, + "grad_norm": 1.7491978611081913, + "language_loss": 0.77922565, + "learning_rate": 7.561456109682442e-07, + "loss": 0.79997969, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38085938, + "step": 11870, + "time_per_iteration": 2.428544521331787 + }, + { + "auxiliary_loss_clip": 0.01058445, + "auxiliary_loss_mlp": 0.01024363, + "balance_loss_clip": 1.0124774, + "balance_loss_mlp": 1.02023232, + "epoch": 0.713723132421464, + "flos": 26537376758400.0, + "grad_norm": 1.6536442001194267, + "language_loss": 0.80859178, + "learning_rate": 7.558497699195198e-07, + "loss": 0.82941985, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 11871, + "time_per_iteration": 2.4342219829559326 + }, + { + "auxiliary_loss_clip": 0.01059904, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.01765513, + "balance_loss_mlp": 1.0197475, + "epoch": 0.7137832556741319, + "flos": 19864636467840.0, + "grad_norm": 1.586909121785915, + "language_loss": 0.79039574, + "learning_rate": 7.55553973271317e-07, + "loss": 0.81129599, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40039062, + "step": 11872, + "time_per_iteration": 2.424175500869751 + }, + { + "auxiliary_loss_clip": 0.01058329, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.01176548, + "balance_loss_mlp": 1.01881742, + "epoch": 0.7138433789267999, + "flos": 21687870821760.0, + "grad_norm": 2.2349626021603903, + "language_loss": 0.83764005, + "learning_rate": 7.552582210341916e-07, + "loss": 0.85846484, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 11873, + "time_per_iteration": 2.374058961868286 + }, + { + "auxiliary_loss_clip": 0.01054876, + "auxiliary_loss_mlp": 0.0102049, + "balance_loss_clip": 1.00933838, + "balance_loss_mlp": 1.01745737, + "epoch": 0.713903502179468, + "flos": 17601775873920.0, + "grad_norm": 1.881932386115031, + "language_loss": 0.86557448, + "learning_rate": 7.549625132186976e-07, + "loss": 0.8863281, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37304688, + "step": 11874, + "time_per_iteration": 2.372166156768799 + }, + { + "auxiliary_loss_clip": 0.0105718, + "auxiliary_loss_mlp": 0.01022327, + "balance_loss_clip": 1.01088333, + "balance_loss_mlp": 1.01810145, + "epoch": 0.7139636254321359, + "flos": 18039377255040.0, + "grad_norm": 1.9184750048301253, + "language_loss": 0.83223355, + "learning_rate": 7.546668498353896e-07, + "loss": 0.8530286, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 11875, + "time_per_iteration": 5.27921986579895 + }, + { + "auxiliary_loss_clip": 0.01062567, + "auxiliary_loss_mlp": 0.01025251, + "balance_loss_clip": 1.01132715, + "balance_loss_mlp": 1.02009761, + "epoch": 0.7140237486848039, + "flos": 23330010620160.0, + "grad_norm": 3.588191021975456, + "language_loss": 0.69682026, + "learning_rate": 7.543712308948185e-07, + "loss": 0.7176984, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.42578125, + "step": 11876, + "time_per_iteration": 2.4017279148101807 + }, + { + "auxiliary_loss_clip": 0.01059599, + "auxiliary_loss_mlp": 0.01021053, + "balance_loss_clip": 1.00890541, + "balance_loss_mlp": 1.01980996, + "epoch": 0.7140838719374718, + "flos": 16836130558080.0, + "grad_norm": 2.111896257257286, + "language_loss": 0.77897507, + "learning_rate": 7.540756564075341e-07, + "loss": 0.79978156, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 11877, + "time_per_iteration": 2.3912394046783447 + }, + { + "auxiliary_loss_clip": 0.01058061, + "auxiliary_loss_mlp": 0.01023469, + "balance_loss_clip": 1.01211452, + "balance_loss_mlp": 1.01955295, + "epoch": 0.7141439951901398, + "flos": 21140956373760.0, + "grad_norm": 1.8491725862045914, + "language_loss": 0.69468111, + "learning_rate": 7.537801263840837e-07, + "loss": 0.71549642, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38476562, + "step": 11878, + "time_per_iteration": 2.463575839996338 + }, + { + "auxiliary_loss_clip": 0.01056042, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.01658881, + "balance_loss_mlp": 1.01881289, + "epoch": 0.7142041184428077, + "flos": 24716551288320.0, + "grad_norm": 1.7646181770303961, + "language_loss": 0.67822015, + "learning_rate": 7.534846408350163e-07, + "loss": 0.69905555, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37109375, + "step": 11879, + "time_per_iteration": 2.411792039871216 + }, + { + "auxiliary_loss_clip": 0.01058377, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.01368165, + "balance_loss_mlp": 1.01959634, + "epoch": 0.7142642416954758, + "flos": 21907125360000.0, + "grad_norm": 1.4507472610240841, + "language_loss": 0.69289279, + "learning_rate": 7.531891997708752e-07, + "loss": 0.71373796, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.38671875, + "step": 11880, + "time_per_iteration": 2.3850080966949463 + }, + { + "auxiliary_loss_clip": 0.01059047, + "auxiliary_loss_mlp": 0.01025253, + "balance_loss_clip": 1.01315904, + "balance_loss_mlp": 1.01872468, + "epoch": 0.7143243649481437, + "flos": 20804813003520.0, + "grad_norm": 2.3947296508214047, + "language_loss": 0.80411386, + "learning_rate": 7.528938032022036e-07, + "loss": 0.82495689, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11881, + "time_per_iteration": 2.3918604850769043 + }, + { + "auxiliary_loss_clip": 0.01057426, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.01583242, + "balance_loss_mlp": 1.01862788, + "epoch": 0.7143844882008117, + "flos": 27233789114880.0, + "grad_norm": 1.4995261894981355, + "language_loss": 0.6362325, + "learning_rate": 7.525984511395449e-07, + "loss": 0.65707469, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.38867188, + "step": 11882, + "time_per_iteration": 2.427870273590088 + }, + { + "auxiliary_loss_clip": 0.01060151, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.01338077, + "balance_loss_mlp": 1.01952565, + "epoch": 0.7144446114534796, + "flos": 17928702645120.0, + "grad_norm": 1.6149608682020473, + "language_loss": 0.68660939, + "learning_rate": 7.523031435934386e-07, + "loss": 0.70746756, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40625, + "step": 11883, + "time_per_iteration": 3.7735037803649902 + }, + { + "auxiliary_loss_clip": 0.01058974, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.01815641, + "balance_loss_mlp": 1.01964927, + "epoch": 0.7145047347061476, + "flos": 20739909052800.0, + "grad_norm": 2.1066803476855718, + "language_loss": 0.77317083, + "learning_rate": 7.520078805744239e-07, + "loss": 0.7940653, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39257812, + "step": 11884, + "time_per_iteration": 2.3894076347351074 + }, + { + "auxiliary_loss_clip": 0.01062252, + "auxiliary_loss_mlp": 0.01028678, + "balance_loss_clip": 1.01452756, + "balance_loss_mlp": 1.01926744, + "epoch": 0.7145648579588155, + "flos": 21177545345280.0, + "grad_norm": 2.3406369688046706, + "language_loss": 0.7388956, + "learning_rate": 7.51712662093037e-07, + "loss": 0.7598049, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.4296875, + "step": 11885, + "time_per_iteration": 2.3780646324157715 + }, + { + "auxiliary_loss_clip": 0.01007576, + "auxiliary_loss_mlp": 0.01001523, + "balance_loss_clip": 1.00053334, + "balance_loss_mlp": 1.00119948, + "epoch": 0.7146249812114835, + "flos": 64781094810240.0, + "grad_norm": 0.8887265906364654, + "language_loss": 0.66386217, + "learning_rate": 7.514174881598155e-07, + "loss": 0.68395317, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.06396484, + "step": 11886, + "time_per_iteration": 2.9943158626556396 + }, + { + "auxiliary_loss_clip": 0.01057321, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.01117003, + "balance_loss_mlp": 1.01858187, + "epoch": 0.7146851044641516, + "flos": 18112904311680.0, + "grad_norm": 1.5801283093963985, + "language_loss": 0.7519753, + "learning_rate": 7.511223587852906e-07, + "loss": 0.77278304, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38671875, + "step": 11887, + "time_per_iteration": 2.3767919540405273 + }, + { + "auxiliary_loss_clip": 0.01058277, + "auxiliary_loss_mlp": 0.01024321, + "balance_loss_clip": 1.01287055, + "balance_loss_mlp": 1.0193907, + "epoch": 0.7147452277168195, + "flos": 19242868331520.0, + "grad_norm": 2.2720017625619398, + "language_loss": 0.71675432, + "learning_rate": 7.508272739799972e-07, + "loss": 0.7375803, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 11888, + "time_per_iteration": 2.3729329109191895 + }, + { + "auxiliary_loss_clip": 0.01059666, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.01526308, + "balance_loss_mlp": 1.01931334, + "epoch": 0.7148053509694875, + "flos": 23763701928960.0, + "grad_norm": 2.914262087769313, + "language_loss": 0.83914077, + "learning_rate": 7.50532233754465e-07, + "loss": 0.86001313, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 11889, + "time_per_iteration": 2.465358257293701 + }, + { + "auxiliary_loss_clip": 0.01057052, + "auxiliary_loss_mlp": 0.01023007, + "balance_loss_clip": 1.01094866, + "balance_loss_mlp": 1.01847422, + "epoch": 0.7148654742221554, + "flos": 22484414557440.0, + "grad_norm": 1.7750868965817481, + "language_loss": 0.76062202, + "learning_rate": 7.502372381192233e-07, + "loss": 0.78142262, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 11890, + "time_per_iteration": 2.3841259479522705 + }, + { + "auxiliary_loss_clip": 0.01058187, + "auxiliary_loss_mlp": 0.01025969, + "balance_loss_clip": 1.01496005, + "balance_loss_mlp": 1.01980567, + "epoch": 0.7149255974748234, + "flos": 24678112014720.0, + "grad_norm": 1.7088648460507407, + "language_loss": 0.73755658, + "learning_rate": 7.499422870847991e-07, + "loss": 0.75839812, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 11891, + "time_per_iteration": 2.4418816566467285 + }, + { + "auxiliary_loss_clip": 0.01059555, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.01482868, + "balance_loss_mlp": 1.0195899, + "epoch": 0.7149857207274913, + "flos": 18404603654400.0, + "grad_norm": 1.7561633206656984, + "language_loss": 0.67768049, + "learning_rate": 7.4964738066172e-07, + "loss": 0.69854772, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 11892, + "time_per_iteration": 2.361300468444824 + }, + { + "auxiliary_loss_clip": 0.01059477, + "auxiliary_loss_mlp": 0.01023799, + "balance_loss_clip": 1.01021504, + "balance_loss_mlp": 1.01817846, + "epoch": 0.7150458439801594, + "flos": 24968449814400.0, + "grad_norm": 4.3501606762249665, + "language_loss": 0.74021435, + "learning_rate": 7.493525188605095e-07, + "loss": 0.76104712, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.4140625, + "step": 11893, + "time_per_iteration": 2.4264211654663086 + }, + { + "auxiliary_loss_clip": 0.01059646, + "auxiliary_loss_mlp": 0.01025697, + "balance_loss_clip": 1.01238167, + "balance_loss_mlp": 1.01923084, + "epoch": 0.7151059672328273, + "flos": 16689844494720.0, + "grad_norm": 2.760970874309989, + "language_loss": 0.66216618, + "learning_rate": 7.490577016916905e-07, + "loss": 0.68301964, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40429688, + "step": 11894, + "time_per_iteration": 2.355877161026001 + }, + { + "auxiliary_loss_clip": 0.01059839, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.01558387, + "balance_loss_mlp": 1.01952982, + "epoch": 0.7151660904854953, + "flos": 27270587554560.0, + "grad_norm": 1.604435428773259, + "language_loss": 0.78072584, + "learning_rate": 7.487629291657844e-07, + "loss": 0.80160105, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11895, + "time_per_iteration": 2.458163261413574 + }, + { + "auxiliary_loss_clip": 0.01058123, + "auxiliary_loss_mlp": 0.01024574, + "balance_loss_clip": 1.01256371, + "balance_loss_mlp": 1.01821542, + "epoch": 0.7152262137381632, + "flos": 18331286065920.0, + "grad_norm": 1.7884924448016823, + "language_loss": 0.696042, + "learning_rate": 7.484682012933107e-07, + "loss": 0.716869, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40039062, + "step": 11896, + "time_per_iteration": 2.366248607635498 + }, + { + "auxiliary_loss_clip": 0.0105834, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.02089024, + "balance_loss_mlp": 1.01842928, + "epoch": 0.7152863369908312, + "flos": 21798196318080.0, + "grad_norm": 2.4032529275001457, + "language_loss": 0.869169, + "learning_rate": 7.481735180847876e-07, + "loss": 0.89008832, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 11897, + "time_per_iteration": 2.418959617614746 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01021604, + "balance_loss_clip": 1.00964165, + "balance_loss_mlp": 1.01956666, + "epoch": 0.7153464602434991, + "flos": 22157487786240.0, + "grad_norm": 1.941379913751532, + "language_loss": 0.78235406, + "learning_rate": 7.478788795507309e-07, + "loss": 0.80315578, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 11898, + "time_per_iteration": 2.4031717777252197 + }, + { + "auxiliary_loss_clip": 0.01061521, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.01353002, + "balance_loss_mlp": 1.02031231, + "epoch": 0.7154065834961671, + "flos": 24714945365760.0, + "grad_norm": 1.7029228903735754, + "language_loss": 0.72947496, + "learning_rate": 7.47584285701657e-07, + "loss": 0.7503562, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41210938, + "step": 11899, + "time_per_iteration": 2.4246368408203125 + }, + { + "auxiliary_loss_clip": 0.01056512, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01240468, + "balance_loss_mlp": 1.01822925, + "epoch": 0.7154667067488351, + "flos": 22600395694080.0, + "grad_norm": 2.1040390796639086, + "language_loss": 0.759386, + "learning_rate": 7.472897365480781e-07, + "loss": 0.78020024, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3828125, + "step": 11900, + "time_per_iteration": 2.3961870670318604 + }, + { + "auxiliary_loss_clip": 0.01059288, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.01428556, + "balance_loss_mlp": 1.01955867, + "epoch": 0.7155268300015031, + "flos": 18770144256000.0, + "grad_norm": 1.9108014131252191, + "language_loss": 0.80949128, + "learning_rate": 7.469952321005061e-07, + "loss": 0.83034623, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 11901, + "time_per_iteration": 2.393080472946167 + }, + { + "auxiliary_loss_clip": 0.01061103, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.01499331, + "balance_loss_mlp": 1.01939785, + "epoch": 0.7155869532541711, + "flos": 18295360410240.0, + "grad_norm": 1.8464001029185102, + "language_loss": 0.81350791, + "learning_rate": 7.467007723694507e-07, + "loss": 0.83439434, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41796875, + "step": 11902, + "time_per_iteration": 2.3558666706085205 + }, + { + "auxiliary_loss_clip": 0.01057559, + "auxiliary_loss_mlp": 0.0102681, + "balance_loss_clip": 1.01459694, + "balance_loss_mlp": 1.01867449, + "epoch": 0.715647076506839, + "flos": 11980096197120.0, + "grad_norm": 1.6559874990329704, + "language_loss": 0.67826498, + "learning_rate": 7.464063573654222e-07, + "loss": 0.69910872, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38867188, + "step": 11903, + "time_per_iteration": 2.365272283554077 + }, + { + "auxiliary_loss_clip": 0.01055308, + "auxiliary_loss_mlp": 0.01023264, + "balance_loss_clip": 1.01222563, + "balance_loss_mlp": 1.01832294, + "epoch": 0.715707199759507, + "flos": 18950680229760.0, + "grad_norm": 1.8249437545119387, + "language_loss": 0.76261234, + "learning_rate": 7.461119870989248e-07, + "loss": 0.78339803, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.37109375, + "step": 11904, + "time_per_iteration": 2.381528377532959 + }, + { + "auxiliary_loss_clip": 0.01058996, + "auxiliary_loss_mlp": 0.01024197, + "balance_loss_clip": 1.01154935, + "balance_loss_mlp": 1.01926041, + "epoch": 0.7157673230121749, + "flos": 15303513294720.0, + "grad_norm": 2.185786036792145, + "language_loss": 0.72493607, + "learning_rate": 7.458176615804657e-07, + "loss": 0.74576801, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 11905, + "time_per_iteration": 3.9376490116119385 + }, + { + "auxiliary_loss_clip": 0.01057282, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.01304197, + "balance_loss_mlp": 1.01835203, + "epoch": 0.715827446264843, + "flos": 23220732464640.0, + "grad_norm": 1.5262454280243556, + "language_loss": 0.77786779, + "learning_rate": 7.455233808205483e-07, + "loss": 0.79868758, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38867188, + "step": 11906, + "time_per_iteration": 2.3984854221343994 + }, + { + "auxiliary_loss_clip": 0.01006977, + "auxiliary_loss_mlp": 0.0100081, + "balance_loss_clip": 0.99989229, + "balance_loss_mlp": 1.00068355, + "epoch": 0.7158875695175109, + "flos": 60972490281600.0, + "grad_norm": 0.7333790089103979, + "language_loss": 0.55364084, + "learning_rate": 7.452291448296744e-07, + "loss": 0.57371873, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.0625, + "step": 11907, + "time_per_iteration": 2.976529836654663 + }, + { + "auxiliary_loss_clip": 0.01058737, + "auxiliary_loss_mlp": 0.01026, + "balance_loss_clip": 1.0142045, + "balance_loss_mlp": 1.01904798, + "epoch": 0.7159476927701789, + "flos": 17127829900800.0, + "grad_norm": 1.7757036263977795, + "language_loss": 0.69148445, + "learning_rate": 7.44934953618344e-07, + "loss": 0.71233177, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3984375, + "step": 11908, + "time_per_iteration": 2.3912549018859863 + }, + { + "auxiliary_loss_clip": 0.01058127, + "auxiliary_loss_mlp": 0.01025732, + "balance_loss_clip": 1.0131433, + "balance_loss_mlp": 1.01882589, + "epoch": 0.7160078160228468, + "flos": 22489546527360.0, + "grad_norm": 1.6156630674001826, + "language_loss": 0.70822167, + "learning_rate": 7.446408071970576e-07, + "loss": 0.72906029, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39257812, + "step": 11909, + "time_per_iteration": 2.3975822925567627 + }, + { + "auxiliary_loss_clip": 0.01058922, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01483274, + "balance_loss_mlp": 1.01960409, + "epoch": 0.7160679392755148, + "flos": 30589640732160.0, + "grad_norm": 1.4364800302368665, + "language_loss": 0.75133234, + "learning_rate": 7.443467055763113e-07, + "loss": 0.7721858, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 11910, + "time_per_iteration": 2.4708926677703857 + }, + { + "auxiliary_loss_clip": 0.01057942, + "auxiliary_loss_mlp": 0.01025561, + "balance_loss_clip": 1.01500511, + "balance_loss_mlp": 1.01883817, + "epoch": 0.7161280625281827, + "flos": 21322609511040.0, + "grad_norm": 1.8167598486722436, + "language_loss": 0.76298463, + "learning_rate": 7.440526487666014e-07, + "loss": 0.78381968, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.390625, + "step": 11911, + "time_per_iteration": 2.410891532897949 + }, + { + "auxiliary_loss_clip": 0.01057936, + "auxiliary_loss_mlp": 0.01025509, + "balance_loss_clip": 1.01302755, + "balance_loss_mlp": 1.01898146, + "epoch": 0.7161881857808508, + "flos": 61857889027200.0, + "grad_norm": 1.7095787782872387, + "language_loss": 0.58433414, + "learning_rate": 7.437586367784217e-07, + "loss": 0.60516858, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 11912, + "time_per_iteration": 2.77535343170166 + }, + { + "auxiliary_loss_clip": 0.01057889, + "auxiliary_loss_mlp": 0.0102531, + "balance_loss_clip": 1.01287091, + "balance_loss_mlp": 1.01900744, + "epoch": 0.7162483090335187, + "flos": 20811097048320.0, + "grad_norm": 1.6536561631892854, + "language_loss": 0.78065062, + "learning_rate": 7.434646696222648e-07, + "loss": 0.80148262, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.38867188, + "step": 11913, + "time_per_iteration": 2.4303574562072754 + }, + { + "auxiliary_loss_clip": 0.01054417, + "auxiliary_loss_mlp": 0.01024505, + "balance_loss_clip": 1.01392508, + "balance_loss_mlp": 1.01797748, + "epoch": 0.7163084322861867, + "flos": 24096389074560.0, + "grad_norm": 1.570007483663626, + "language_loss": 0.74170774, + "learning_rate": 7.431707473086215e-07, + "loss": 0.76249695, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36523438, + "step": 11914, + "time_per_iteration": 2.4045283794403076 + }, + { + "auxiliary_loss_clip": 0.01055532, + "auxiliary_loss_mlp": 0.01021849, + "balance_loss_clip": 1.01094723, + "balance_loss_mlp": 1.01906633, + "epoch": 0.7163685555388547, + "flos": 20079946022400.0, + "grad_norm": 1.6093688503595818, + "language_loss": 0.74001104, + "learning_rate": 7.428768698479808e-07, + "loss": 0.7607848, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36328125, + "step": 11915, + "time_per_iteration": 5.2783873081207275 + }, + { + "auxiliary_loss_clip": 0.0105973, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.01181889, + "balance_loss_mlp": 1.01886797, + "epoch": 0.7164286787915226, + "flos": 17456013480960.0, + "grad_norm": 1.772969632995078, + "language_loss": 0.81009716, + "learning_rate": 7.425830372508324e-07, + "loss": 0.83092904, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40820312, + "step": 11916, + "time_per_iteration": 2.355010509490967 + }, + { + "auxiliary_loss_clip": 0.01057657, + "auxiliary_loss_mlp": 0.01026424, + "balance_loss_clip": 1.01566577, + "balance_loss_mlp": 1.01909745, + "epoch": 0.7164888020441906, + "flos": 19717896556800.0, + "grad_norm": 1.7472561223006782, + "language_loss": 0.68348587, + "learning_rate": 7.422892495276593e-07, + "loss": 0.70432669, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.38671875, + "step": 11917, + "time_per_iteration": 2.3779685497283936 + }, + { + "auxiliary_loss_clip": 0.01059741, + "auxiliary_loss_mlp": 0.01023893, + "balance_loss_clip": 1.01157856, + "balance_loss_mlp": 1.02068758, + "epoch": 0.7165489252968585, + "flos": 21469454156160.0, + "grad_norm": 1.7577748059548755, + "language_loss": 0.75977278, + "learning_rate": 7.419955066889485e-07, + "loss": 0.78060901, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 11918, + "time_per_iteration": 2.397847890853882 + }, + { + "auxiliary_loss_clip": 0.01006875, + "auxiliary_loss_mlp": 0.01000326, + "balance_loss_clip": 0.99949151, + "balance_loss_mlp": 1.00051284, + "epoch": 0.7166090485495266, + "flos": 69924499505280.0, + "grad_norm": 0.6575741736282789, + "language_loss": 0.53942233, + "learning_rate": 7.417018087451812e-07, + "loss": 0.55949432, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.00836182, + "router_z_loss_mlp": 0.06347656, + "step": 11919, + "time_per_iteration": 3.06589412689209 + }, + { + "auxiliary_loss_clip": 0.01059278, + "auxiliary_loss_mlp": 0.0102494, + "balance_loss_clip": 1.01294804, + "balance_loss_mlp": 1.01985741, + "epoch": 0.7166691718021945, + "flos": 27342683245440.0, + "grad_norm": 1.9618299522805813, + "language_loss": 0.69285953, + "learning_rate": 7.414081557068412e-07, + "loss": 0.71370161, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 11920, + "time_per_iteration": 2.436887264251709 + }, + { + "auxiliary_loss_clip": 0.01058006, + "auxiliary_loss_mlp": 0.01026427, + "balance_loss_clip": 1.01449454, + "balance_loss_mlp": 1.01915359, + "epoch": 0.7167292950548625, + "flos": 30407568658560.0, + "grad_norm": 1.8539402013807393, + "language_loss": 0.74905217, + "learning_rate": 7.411145475844052e-07, + "loss": 0.76989651, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38867188, + "step": 11921, + "time_per_iteration": 2.4449961185455322 + }, + { + "auxiliary_loss_clip": 0.0106143, + "auxiliary_loss_mlp": 0.01025853, + "balance_loss_clip": 1.01240051, + "balance_loss_mlp": 1.01897955, + "epoch": 0.7167894183075304, + "flos": 14570477055360.0, + "grad_norm": 3.350170257750669, + "language_loss": 0.75579667, + "learning_rate": 7.408209843883536e-07, + "loss": 0.7766695, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.42382812, + "step": 11922, + "time_per_iteration": 2.3747031688690186 + }, + { + "auxiliary_loss_clip": 0.01006949, + "auxiliary_loss_mlp": 0.01001653, + "balance_loss_clip": 1.00074673, + "balance_loss_mlp": 1.00062108, + "epoch": 0.7168495415601984, + "flos": 64107725817600.0, + "grad_norm": 0.7484143236173347, + "language_loss": 0.55125618, + "learning_rate": 7.405274661291619e-07, + "loss": 0.57134223, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06347656, + "step": 11923, + "time_per_iteration": 4.254017353057861 + }, + { + "auxiliary_loss_clip": 0.01058961, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.01313019, + "balance_loss_mlp": 1.01869106, + "epoch": 0.7169096648128663, + "flos": 24680276519040.0, + "grad_norm": 1.5364284741042726, + "language_loss": 0.76833665, + "learning_rate": 7.402339928173051e-07, + "loss": 0.78918219, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40234375, + "step": 11924, + "time_per_iteration": 2.406129837036133 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.0102282, + "balance_loss_clip": 1.01186478, + "balance_loss_mlp": 1.01946855, + "epoch": 0.7169697880655344, + "flos": 20666486730240.0, + "grad_norm": 1.728508591174122, + "language_loss": 0.68053728, + "learning_rate": 7.39940564463256e-07, + "loss": 0.70135331, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.39453125, + "step": 11925, + "time_per_iteration": 2.410896062850952 + }, + { + "auxiliary_loss_clip": 0.01054087, + "auxiliary_loss_mlp": 0.01020676, + "balance_loss_clip": 1.0094341, + "balance_loss_mlp": 1.0181706, + "epoch": 0.7170299113182023, + "flos": 21031643306880.0, + "grad_norm": 1.4802744990108818, + "language_loss": 0.77354264, + "learning_rate": 7.396471810774876e-07, + "loss": 0.7942903, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.359375, + "step": 11926, + "time_per_iteration": 2.4310624599456787 + }, + { + "auxiliary_loss_clip": 0.01059572, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.01415861, + "balance_loss_mlp": 1.01843464, + "epoch": 0.7170900345708703, + "flos": 22892199770880.0, + "grad_norm": 2.2422029658496463, + "language_loss": 0.77357996, + "learning_rate": 7.3935384267047e-07, + "loss": 0.7944476, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 11927, + "time_per_iteration": 2.3910670280456543 + }, + { + "auxiliary_loss_clip": 0.01057499, + "auxiliary_loss_mlp": 0.01022715, + "balance_loss_clip": 1.01106191, + "balance_loss_mlp": 1.01974928, + "epoch": 0.7171501578235383, + "flos": 15517915153920.0, + "grad_norm": 1.5752664916069479, + "language_loss": 0.68896466, + "learning_rate": 7.390605492526696e-07, + "loss": 0.70976681, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37890625, + "step": 11928, + "time_per_iteration": 2.400447368621826 + }, + { + "auxiliary_loss_clip": 0.01055911, + "auxiliary_loss_mlp": 0.01019744, + "balance_loss_clip": 1.00889015, + "balance_loss_mlp": 1.01882207, + "epoch": 0.7172102810762062, + "flos": 26103091956480.0, + "grad_norm": 1.7793997695791421, + "language_loss": 0.65168428, + "learning_rate": 7.387673008345552e-07, + "loss": 0.67244077, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37109375, + "step": 11929, + "time_per_iteration": 2.4240479469299316 + }, + { + "auxiliary_loss_clip": 0.01057567, + "auxiliary_loss_mlp": 0.01021863, + "balance_loss_clip": 1.00974572, + "balance_loss_mlp": 1.01764464, + "epoch": 0.7172704043288742, + "flos": 21505589280000.0, + "grad_norm": 1.8218024026399766, + "language_loss": 0.70036197, + "learning_rate": 7.384740974265917e-07, + "loss": 0.72115624, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 11930, + "time_per_iteration": 2.4150047302246094 + }, + { + "auxiliary_loss_clip": 0.01058321, + "auxiliary_loss_mlp": 0.01022898, + "balance_loss_clip": 1.01125717, + "balance_loss_mlp": 1.01889384, + "epoch": 0.7173305275815421, + "flos": 18769934787840.0, + "grad_norm": 1.7989305449484096, + "language_loss": 0.80656993, + "learning_rate": 7.381809390392426e-07, + "loss": 0.82738209, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39453125, + "step": 11931, + "time_per_iteration": 2.4141762256622314 + }, + { + "auxiliary_loss_clip": 0.01056245, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01652503, + "balance_loss_mlp": 1.0182879, + "epoch": 0.7173906508342102, + "flos": 16178960436480.0, + "grad_norm": 2.2369277295251813, + "language_loss": 0.78147256, + "learning_rate": 7.378878256829695e-07, + "loss": 0.80230963, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37890625, + "step": 11932, + "time_per_iteration": 2.362708806991577 + }, + { + "auxiliary_loss_clip": 0.01056892, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.01471567, + "balance_loss_mlp": 1.01787686, + "epoch": 0.7174507740868781, + "flos": 26612684294400.0, + "grad_norm": 1.3864543705674857, + "language_loss": 0.75110781, + "learning_rate": 7.375947573682344e-07, + "loss": 0.77194107, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 11933, + "time_per_iteration": 2.4577887058258057 + }, + { + "auxiliary_loss_clip": 0.01062994, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01470685, + "balance_loss_mlp": 1.02091753, + "epoch": 0.7175108973395461, + "flos": 18432185495040.0, + "grad_norm": 2.5017570137835548, + "language_loss": 0.69438815, + "learning_rate": 7.373017341054939e-07, + "loss": 0.7153129, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.421875, + "step": 11934, + "time_per_iteration": 2.340301036834717 + }, + { + "auxiliary_loss_clip": 0.010577, + "auxiliary_loss_mlp": 0.01023276, + "balance_loss_clip": 1.01145637, + "balance_loss_mlp": 1.01878285, + "epoch": 0.717571020592214, + "flos": 23913828241920.0, + "grad_norm": 2.4170241076390226, + "language_loss": 0.86560583, + "learning_rate": 7.370087559052072e-07, + "loss": 0.8864156, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38867188, + "step": 11935, + "time_per_iteration": 2.430102586746216 + }, + { + "auxiliary_loss_clip": 0.01057478, + "auxiliary_loss_mlp": 0.01021398, + "balance_loss_clip": 1.0092926, + "balance_loss_mlp": 1.01785815, + "epoch": 0.717631143844882, + "flos": 38255310489600.0, + "grad_norm": 1.501061321488206, + "language_loss": 0.74028409, + "learning_rate": 7.367158227778285e-07, + "loss": 0.76107287, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 11936, + "time_per_iteration": 2.5746891498565674 + }, + { + "auxiliary_loss_clip": 0.01006883, + "auxiliary_loss_mlp": 0.01000649, + "balance_loss_clip": 0.99979645, + "balance_loss_mlp": 1.00054908, + "epoch": 0.7176912670975499, + "flos": 65512385481600.0, + "grad_norm": 0.7584158239566438, + "language_loss": 0.55960464, + "learning_rate": 7.36422934733814e-07, + "loss": 0.57967997, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.06347656, + "step": 11937, + "time_per_iteration": 3.0119669437408447 + }, + { + "auxiliary_loss_clip": 0.01060565, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.01703191, + "balance_loss_mlp": 1.01977861, + "epoch": 0.717751390350218, + "flos": 31279838866560.0, + "grad_norm": 1.7124783451171095, + "language_loss": 0.54449624, + "learning_rate": 7.361300917836131e-07, + "loss": 0.56540376, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40820312, + "step": 11938, + "time_per_iteration": 2.48083233833313 + }, + { + "auxiliary_loss_clip": 0.01054928, + "auxiliary_loss_mlp": 0.01024447, + "balance_loss_clip": 1.012604, + "balance_loss_mlp": 1.0177865, + "epoch": 0.7178115136028859, + "flos": 19858177866240.0, + "grad_norm": 1.6962891332104222, + "language_loss": 0.76914465, + "learning_rate": 7.358372939376789e-07, + "loss": 0.78993845, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.37109375, + "step": 11939, + "time_per_iteration": 2.4222161769866943 + }, + { + "auxiliary_loss_clip": 0.01007129, + "auxiliary_loss_mlp": 0.01000629, + "balance_loss_clip": 0.99978822, + "balance_loss_mlp": 1.00064218, + "epoch": 0.7178716368555539, + "flos": 64345169174400.0, + "grad_norm": 0.7548441734959199, + "language_loss": 0.61468142, + "learning_rate": 7.355445412064598e-07, + "loss": 0.63475901, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.00842285, + "router_z_loss_mlp": 0.06494141, + "step": 11940, + "time_per_iteration": 3.018367290496826 + }, + { + "auxiliary_loss_clip": 0.01055788, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.01637316, + "balance_loss_mlp": 1.0185101, + "epoch": 0.7179317601082219, + "flos": 26761344330240.0, + "grad_norm": 1.4060774694757816, + "language_loss": 0.59173149, + "learning_rate": 7.352518336004037e-07, + "loss": 0.61256963, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37304688, + "step": 11941, + "time_per_iteration": 2.4577579498291016 + }, + { + "auxiliary_loss_clip": 0.01057727, + "auxiliary_loss_mlp": 0.01023515, + "balance_loss_clip": 1.01202369, + "balance_loss_mlp": 1.01905429, + "epoch": 0.7179918833608898, + "flos": 23512676186880.0, + "grad_norm": 1.9711899834851532, + "language_loss": 0.79706955, + "learning_rate": 7.349591711299561e-07, + "loss": 0.81788194, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38671875, + "step": 11942, + "time_per_iteration": 2.4008190631866455 + }, + { + "auxiliary_loss_clip": 0.01058149, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.01637053, + "balance_loss_mlp": 1.01845121, + "epoch": 0.7180520066135578, + "flos": 17164628340480.0, + "grad_norm": 2.1394745689003094, + "language_loss": 0.66137493, + "learning_rate": 7.34666553805561e-07, + "loss": 0.68223608, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39648438, + "step": 11943, + "time_per_iteration": 2.3841476440429688 + }, + { + "auxiliary_loss_clip": 0.01055161, + "auxiliary_loss_mlp": 0.01022014, + "balance_loss_clip": 1.01051664, + "balance_loss_mlp": 1.01797891, + "epoch": 0.7181121298662257, + "flos": 17565675661440.0, + "grad_norm": 2.009534072337897, + "language_loss": 0.78998482, + "learning_rate": 7.343739816376631e-07, + "loss": 0.81075656, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37109375, + "step": 11944, + "time_per_iteration": 3.78669810295105 + }, + { + "auxiliary_loss_clip": 0.01057952, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.01477134, + "balance_loss_mlp": 1.01841354, + "epoch": 0.7181722531188938, + "flos": 11946858716160.0, + "grad_norm": 3.377400457344838, + "language_loss": 0.62937224, + "learning_rate": 7.34081454636701e-07, + "loss": 0.65023273, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.39453125, + "step": 11945, + "time_per_iteration": 2.3590242862701416 + }, + { + "auxiliary_loss_clip": 0.01006975, + "auxiliary_loss_mlp": 0.01001822, + "balance_loss_clip": 1.00094604, + "balance_loss_mlp": 1.00065136, + "epoch": 0.7182323763715617, + "flos": 65500480707840.0, + "grad_norm": 0.6964237769788442, + "language_loss": 0.51065618, + "learning_rate": 7.337889728131159e-07, + "loss": 0.53074414, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06347656, + "step": 11946, + "time_per_iteration": 2.988936185836792 + }, + { + "auxiliary_loss_clip": 0.01061253, + "auxiliary_loss_mlp": 0.01028507, + "balance_loss_clip": 1.01513147, + "balance_loss_mlp": 1.02022386, + "epoch": 0.7182924996242297, + "flos": 20629897758720.0, + "grad_norm": 1.5647049165013358, + "language_loss": 0.74194145, + "learning_rate": 7.334965361773453e-07, + "loss": 0.76283908, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41015625, + "step": 11947, + "time_per_iteration": 2.383014440536499 + }, + { + "auxiliary_loss_clip": 0.01056167, + "auxiliary_loss_mlp": 0.01020067, + "balance_loss_clip": 1.00806284, + "balance_loss_mlp": 1.01746631, + "epoch": 0.7183526228768976, + "flos": 16215514496640.0, + "grad_norm": 1.5252751673518627, + "language_loss": 0.72614449, + "learning_rate": 7.332041447398256e-07, + "loss": 0.74690688, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 11948, + "time_per_iteration": 2.384660005569458 + }, + { + "auxiliary_loss_clip": 0.01061122, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01641357, + "balance_loss_mlp": 1.02058387, + "epoch": 0.7184127461295656, + "flos": 22231678158720.0, + "grad_norm": 1.5718970795532037, + "language_loss": 0.71623898, + "learning_rate": 7.329117985109908e-07, + "loss": 0.73714787, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.40625, + "step": 11949, + "time_per_iteration": 2.4121084213256836 + }, + { + "auxiliary_loss_clip": 0.0105707, + "auxiliary_loss_mlp": 0.01026064, + "balance_loss_clip": 1.01481104, + "balance_loss_mlp": 1.01898956, + "epoch": 0.7184728693822335, + "flos": 27343276738560.0, + "grad_norm": 1.9471853447602447, + "language_loss": 0.76044285, + "learning_rate": 7.326194975012759e-07, + "loss": 0.7812742, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38085938, + "step": 11950, + "time_per_iteration": 2.448060989379883 + }, + { + "auxiliary_loss_clip": 0.01060608, + "auxiliary_loss_mlp": 0.01023789, + "balance_loss_clip": 1.01084912, + "balance_loss_mlp": 1.02009892, + "epoch": 0.7185329926349016, + "flos": 16507597864320.0, + "grad_norm": 2.228422006159032, + "language_loss": 0.67736298, + "learning_rate": 7.323272417211095e-07, + "loss": 0.6982069, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 11951, + "time_per_iteration": 2.368119955062866 + }, + { + "auxiliary_loss_clip": 0.01062589, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.0145874, + "balance_loss_mlp": 1.02118826, + "epoch": 0.7185931158875695, + "flos": 23949928454400.0, + "grad_norm": 2.062190580951287, + "language_loss": 0.78166676, + "learning_rate": 7.320350311809238e-07, + "loss": 0.80257082, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4140625, + "step": 11952, + "time_per_iteration": 2.4075698852539062 + }, + { + "auxiliary_loss_clip": 0.01058441, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.01606822, + "balance_loss_mlp": 1.01868987, + "epoch": 0.7186532391402375, + "flos": 26540798071680.0, + "grad_norm": 1.971483224983397, + "language_loss": 0.74728096, + "learning_rate": 7.317428658911456e-07, + "loss": 0.76814246, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3984375, + "step": 11953, + "time_per_iteration": 2.428452491760254 + }, + { + "auxiliary_loss_clip": 0.01058239, + "auxiliary_loss_mlp": 0.01026711, + "balance_loss_clip": 1.01439095, + "balance_loss_mlp": 1.01883101, + "epoch": 0.7187133623929055, + "flos": 22381944117120.0, + "grad_norm": 3.747281146547953, + "language_loss": 0.73433471, + "learning_rate": 7.314507458622033e-07, + "loss": 0.75518417, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 11954, + "time_per_iteration": 5.254071950912476 + }, + { + "auxiliary_loss_clip": 0.01059554, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.01259661, + "balance_loss_mlp": 1.02024901, + "epoch": 0.7187734856455734, + "flos": 15778646254080.0, + "grad_norm": 1.594850171250398, + "language_loss": 0.74412489, + "learning_rate": 7.311586711045197e-07, + "loss": 0.76496816, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39257812, + "step": 11955, + "time_per_iteration": 2.3774542808532715 + }, + { + "auxiliary_loss_clip": 0.01059045, + "auxiliary_loss_mlp": 0.01026919, + "balance_loss_clip": 1.01528966, + "balance_loss_mlp": 1.0193795, + "epoch": 0.7188336088982414, + "flos": 31758253493760.0, + "grad_norm": 6.456714036037005, + "language_loss": 0.73080111, + "learning_rate": 7.308666416285198e-07, + "loss": 0.7516607, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39648438, + "step": 11956, + "time_per_iteration": 2.466381311416626 + }, + { + "auxiliary_loss_clip": 0.01058104, + "auxiliary_loss_mlp": 0.0102202, + "balance_loss_clip": 1.00995636, + "balance_loss_mlp": 1.01889634, + "epoch": 0.7188937321509093, + "flos": 21464287274880.0, + "grad_norm": 1.6406964128121497, + "language_loss": 0.71003592, + "learning_rate": 7.305746574446256e-07, + "loss": 0.73083711, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39257812, + "step": 11957, + "time_per_iteration": 2.4107253551483154 + }, + { + "auxiliary_loss_clip": 0.01058514, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02128553, + "balance_loss_mlp": 1.01854765, + "epoch": 0.7189538554035774, + "flos": 27270273352320.0, + "grad_norm": 1.9095951533647193, + "language_loss": 0.73179591, + "learning_rate": 7.302827185632552e-07, + "loss": 0.75271881, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40039062, + "step": 11958, + "time_per_iteration": 2.4252402782440186 + }, + { + "auxiliary_loss_clip": 0.01056355, + "auxiliary_loss_mlp": 0.01025811, + "balance_loss_clip": 1.01395547, + "balance_loss_mlp": 1.01917171, + "epoch": 0.7190139786562453, + "flos": 21579535272960.0, + "grad_norm": 1.6746934584540165, + "language_loss": 0.83940589, + "learning_rate": 7.29990824994829e-07, + "loss": 0.86022747, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.37109375, + "step": 11959, + "time_per_iteration": 2.388685703277588 + }, + { + "auxiliary_loss_clip": 0.01062655, + "auxiliary_loss_mlp": 0.01027338, + "balance_loss_clip": 1.01457667, + "balance_loss_mlp": 1.02206826, + "epoch": 0.7190741019089133, + "flos": 26720112147840.0, + "grad_norm": 1.7317745813487102, + "language_loss": 0.82173771, + "learning_rate": 7.296989767497635e-07, + "loss": 0.84263766, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40625, + "step": 11960, + "time_per_iteration": 2.4177987575531006 + }, + { + "auxiliary_loss_clip": 0.01057208, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01872301, + "balance_loss_mlp": 1.01916361, + "epoch": 0.7191342251615812, + "flos": 26103545804160.0, + "grad_norm": 1.7074826139975816, + "language_loss": 0.77663928, + "learning_rate": 7.294071738384739e-07, + "loss": 0.79751569, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 11961, + "time_per_iteration": 2.425579071044922 + }, + { + "auxiliary_loss_clip": 0.01055427, + "auxiliary_loss_mlp": 0.01024421, + "balance_loss_clip": 1.01348329, + "balance_loss_mlp": 1.0179925, + "epoch": 0.7191943484142492, + "flos": 22965901384320.0, + "grad_norm": 1.3300157946388453, + "language_loss": 0.75117475, + "learning_rate": 7.291154162713733e-07, + "loss": 0.77197325, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.375, + "step": 11962, + "time_per_iteration": 2.4050185680389404 + }, + { + "auxiliary_loss_clip": 0.01058717, + "auxiliary_loss_mlp": 0.01025035, + "balance_loss_clip": 1.01175487, + "balance_loss_mlp": 1.01879466, + "epoch": 0.7192544716669171, + "flos": 22564225658880.0, + "grad_norm": 1.6063596412367862, + "language_loss": 0.75412363, + "learning_rate": 7.28823704058875e-07, + "loss": 0.77496111, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.3984375, + "step": 11963, + "time_per_iteration": 3.8054726123809814 + }, + { + "auxiliary_loss_clip": 0.01060778, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01619291, + "balance_loss_mlp": 1.01940084, + "epoch": 0.7193145949195852, + "flos": 18981404092800.0, + "grad_norm": 2.4875511854024133, + "language_loss": 0.8078934, + "learning_rate": 7.285320372113888e-07, + "loss": 0.82879817, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4140625, + "step": 11964, + "time_per_iteration": 2.3767282962799072 + }, + { + "auxiliary_loss_clip": 0.0105722, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.01875758, + "balance_loss_mlp": 1.01819921, + "epoch": 0.7193747181722531, + "flos": 18003277042560.0, + "grad_norm": 1.6170346624699499, + "language_loss": 0.7492671, + "learning_rate": 7.282404157393239e-07, + "loss": 0.77014452, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.390625, + "step": 11965, + "time_per_iteration": 2.3926968574523926 + }, + { + "auxiliary_loss_clip": 0.01058238, + "auxiliary_loss_mlp": 0.01028003, + "balance_loss_clip": 1.01500297, + "balance_loss_mlp": 1.01853538, + "epoch": 0.7194348414249211, + "flos": 24388262974080.0, + "grad_norm": 1.6780718834276973, + "language_loss": 0.77397156, + "learning_rate": 7.279488396530862e-07, + "loss": 0.79483396, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.39648438, + "step": 11966, + "time_per_iteration": 2.4341628551483154 + }, + { + "auxiliary_loss_clip": 0.01058242, + "auxiliary_loss_mlp": 0.01023717, + "balance_loss_clip": 1.01182604, + "balance_loss_mlp": 1.0187428, + "epoch": 0.7194949646775891, + "flos": 22162375376640.0, + "grad_norm": 1.9956851305833196, + "language_loss": 0.73168004, + "learning_rate": 7.276573089630837e-07, + "loss": 0.7524997, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 11967, + "time_per_iteration": 2.416109085083008 + }, + { + "auxiliary_loss_clip": 0.01056645, + "auxiliary_loss_mlp": 0.01023087, + "balance_loss_clip": 1.01315713, + "balance_loss_mlp": 1.02114367, + "epoch": 0.719555087930257, + "flos": 20265334675200.0, + "grad_norm": 1.8085484737678774, + "language_loss": 0.81693906, + "learning_rate": 7.273658236797176e-07, + "loss": 0.83773637, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.35546875, + "step": 11968, + "time_per_iteration": 2.3923723697662354 + }, + { + "auxiliary_loss_clip": 0.01056017, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.0129106, + "balance_loss_mlp": 1.01796293, + "epoch": 0.719615211182925, + "flos": 24715189745280.0, + "grad_norm": 1.7668271437017942, + "language_loss": 0.76310396, + "learning_rate": 7.270743838133923e-07, + "loss": 0.78391182, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38085938, + "step": 11969, + "time_per_iteration": 2.4093809127807617 + }, + { + "auxiliary_loss_clip": 0.01056747, + "auxiliary_loss_mlp": 0.01024768, + "balance_loss_clip": 1.01279342, + "balance_loss_mlp": 1.0172348, + "epoch": 0.719675334435593, + "flos": 20008653292800.0, + "grad_norm": 1.5417279373396262, + "language_loss": 0.70834327, + "learning_rate": 7.267829893745075e-07, + "loss": 0.7291584, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 11970, + "time_per_iteration": 2.3993561267852783 + }, + { + "auxiliary_loss_clip": 0.01007472, + "auxiliary_loss_mlp": 0.01001265, + "balance_loss_clip": 1.00041223, + "balance_loss_mlp": 1.00106108, + "epoch": 0.719735457688261, + "flos": 44197177178880.0, + "grad_norm": 0.9146593838764074, + "language_loss": 0.60828614, + "learning_rate": 7.264916403734638e-07, + "loss": 0.6283735, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.06396484, + "step": 11971, + "time_per_iteration": 2.9681596755981445 + }, + { + "auxiliary_loss_clip": 0.01063285, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01381063, + "balance_loss_mlp": 1.02076554, + "epoch": 0.7197955809409289, + "flos": 16801880647680.0, + "grad_norm": 1.94017235369428, + "language_loss": 0.79838967, + "learning_rate": 7.262003368206571e-07, + "loss": 0.81929022, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.42578125, + "step": 11972, + "time_per_iteration": 2.3945977687835693 + }, + { + "auxiliary_loss_clip": 0.01059551, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.01610637, + "balance_loss_mlp": 1.0193038, + "epoch": 0.7198557041935969, + "flos": 24534234835200.0, + "grad_norm": 1.939002930190736, + "language_loss": 0.75760615, + "learning_rate": 7.25909078726483e-07, + "loss": 0.77848387, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 11973, + "time_per_iteration": 2.4266672134399414 + }, + { + "auxiliary_loss_clip": 0.01007118, + "auxiliary_loss_mlp": 0.01000717, + "balance_loss_clip": 0.99988884, + "balance_loss_mlp": 1.00073409, + "epoch": 0.7199158274462648, + "flos": 70707811835520.0, + "grad_norm": 0.8641116870803104, + "language_loss": 0.57275069, + "learning_rate": 7.256178661013376e-07, + "loss": 0.59282899, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.00830078, + "router_z_loss_mlp": 0.06396484, + "step": 11974, + "time_per_iteration": 2.9186785221099854 + }, + { + "auxiliary_loss_clip": 0.01059107, + "auxiliary_loss_mlp": 0.01027315, + "balance_loss_clip": 1.01448762, + "balance_loss_mlp": 1.0178169, + "epoch": 0.7199759506989328, + "flos": 29346802686720.0, + "grad_norm": 5.8573589947950015, + "language_loss": 0.61186117, + "learning_rate": 7.253266989556115e-07, + "loss": 0.63272536, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41210938, + "step": 11975, + "time_per_iteration": 2.4538021087646484 + }, + { + "auxiliary_loss_clip": 0.01055449, + "auxiliary_loss_mlp": 0.0102396, + "balance_loss_clip": 1.0126946, + "balance_loss_mlp": 1.01754344, + "epoch": 0.7200360739516007, + "flos": 24639428361600.0, + "grad_norm": 1.868661767325094, + "language_loss": 0.67696398, + "learning_rate": 7.250355772996972e-07, + "loss": 0.69775808, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 11976, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.01057577, + "auxiliary_loss_mlp": 0.0102614, + "balance_loss_clip": 1.01462424, + "balance_loss_mlp": 1.0182296, + "epoch": 0.7200961972042688, + "flos": 20811830186880.0, + "grad_norm": 2.002839750965841, + "language_loss": 0.67281651, + "learning_rate": 7.247445011439836e-07, + "loss": 0.6936537, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 11977, + "time_per_iteration": 2.3865177631378174 + }, + { + "auxiliary_loss_clip": 0.01059496, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.01548934, + "balance_loss_mlp": 1.01951528, + "epoch": 0.7201563204569367, + "flos": 31244646349440.0, + "grad_norm": 1.637248268026412, + "language_loss": 0.75456834, + "learning_rate": 7.244534704988582e-07, + "loss": 0.77544701, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40039062, + "step": 11978, + "time_per_iteration": 2.4721145629882812 + }, + { + "auxiliary_loss_clip": 0.01056305, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.01691222, + "balance_loss_mlp": 1.01778531, + "epoch": 0.7202164437096047, + "flos": 26650180961280.0, + "grad_norm": 1.8366776717619302, + "language_loss": 0.7887454, + "learning_rate": 7.24162485374707e-07, + "loss": 0.80959952, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38476562, + "step": 11979, + "time_per_iteration": 2.4273784160614014 + }, + { + "auxiliary_loss_clip": 0.01058881, + "auxiliary_loss_mlp": 0.01024157, + "balance_loss_clip": 1.01184225, + "balance_loss_mlp": 1.01952517, + "epoch": 0.7202765669622727, + "flos": 25958376904320.0, + "grad_norm": 1.6773214255615907, + "language_loss": 0.64876032, + "learning_rate": 7.238715457819154e-07, + "loss": 0.66959071, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 11980, + "time_per_iteration": 2.4523487091064453 + }, + { + "auxiliary_loss_clip": 0.01059553, + "auxiliary_loss_mlp": 0.01024103, + "balance_loss_clip": 1.01150882, + "balance_loss_mlp": 1.01989019, + "epoch": 0.7203366902149406, + "flos": 28511086538880.0, + "grad_norm": 2.046751336783734, + "language_loss": 0.67539644, + "learning_rate": 7.235806517308656e-07, + "loss": 0.69623309, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39648438, + "step": 11981, + "time_per_iteration": 2.4774057865142822 + }, + { + "auxiliary_loss_clip": 0.01057855, + "auxiliary_loss_mlp": 0.01028578, + "balance_loss_clip": 1.01626372, + "balance_loss_mlp": 1.01868916, + "epoch": 0.7203968134676086, + "flos": 21104960895360.0, + "grad_norm": 1.7692226555091193, + "language_loss": 0.73863244, + "learning_rate": 7.232898032319392e-07, + "loss": 0.75949681, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 11982, + "time_per_iteration": 2.4296557903289795 + }, + { + "auxiliary_loss_clip": 0.0105691, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.02137518, + "balance_loss_mlp": 1.01785159, + "epoch": 0.7204569367202766, + "flos": 18331181331840.0, + "grad_norm": 1.712521155224582, + "language_loss": 0.659338, + "learning_rate": 7.229990002955148e-07, + "loss": 0.68023551, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.390625, + "step": 11983, + "time_per_iteration": 3.8592514991760254 + }, + { + "auxiliary_loss_clip": 0.01059644, + "auxiliary_loss_mlp": 0.01025542, + "balance_loss_clip": 1.01306105, + "balance_loss_mlp": 1.02010465, + "epoch": 0.7205170599729446, + "flos": 23254074679680.0, + "grad_norm": 1.4980473168089616, + "language_loss": 0.66256249, + "learning_rate": 7.227082429319726e-07, + "loss": 0.68341434, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 11984, + "time_per_iteration": 2.4188828468322754 + }, + { + "auxiliary_loss_clip": 0.01061702, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.01203012, + "balance_loss_mlp": 1.02041769, + "epoch": 0.7205771832256125, + "flos": 20119851573120.0, + "grad_norm": 1.8021113413558505, + "language_loss": 0.73627591, + "learning_rate": 7.224175311516865e-07, + "loss": 0.75714678, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.4140625, + "step": 11985, + "time_per_iteration": 2.3857791423797607 + }, + { + "auxiliary_loss_clip": 0.01059324, + "auxiliary_loss_mlp": 0.01021853, + "balance_loss_clip": 1.01010466, + "balance_loss_mlp": 1.02068508, + "epoch": 0.7206373064782805, + "flos": 27702184181760.0, + "grad_norm": 1.6575610664011828, + "language_loss": 0.62667507, + "learning_rate": 7.221268649650328e-07, + "loss": 0.64748681, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 11986, + "time_per_iteration": 2.4376134872436523 + }, + { + "auxiliary_loss_clip": 0.01059448, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.01371622, + "balance_loss_mlp": 1.02006698, + "epoch": 0.7206974297309484, + "flos": 17967176830080.0, + "grad_norm": 1.5695099732761857, + "language_loss": 0.72446734, + "learning_rate": 7.218362443823842e-07, + "loss": 0.74533403, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.39453125, + "step": 11987, + "time_per_iteration": 2.4218151569366455 + }, + { + "auxiliary_loss_clip": 0.01060487, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.01324117, + "balance_loss_mlp": 1.01927197, + "epoch": 0.7207575529836164, + "flos": 16982207153280.0, + "grad_norm": 2.0910036579935958, + "language_loss": 0.78735811, + "learning_rate": 7.215456694141122e-07, + "loss": 0.80822062, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41210938, + "step": 11988, + "time_per_iteration": 2.377091884613037 + }, + { + "auxiliary_loss_clip": 0.01057333, + "auxiliary_loss_mlp": 0.0102541, + "balance_loss_clip": 1.01363766, + "balance_loss_mlp": 1.01848924, + "epoch": 0.7208176762362843, + "flos": 18726782480640.0, + "grad_norm": 1.7024421371483491, + "language_loss": 0.79091072, + "learning_rate": 7.212551400705868e-07, + "loss": 0.81173813, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38867188, + "step": 11989, + "time_per_iteration": 2.4017763137817383 + }, + { + "auxiliary_loss_clip": 0.01058521, + "auxiliary_loss_mlp": 0.01026569, + "balance_loss_clip": 1.01471972, + "balance_loss_mlp": 1.01933753, + "epoch": 0.7208777994889524, + "flos": 18733485461760.0, + "grad_norm": 1.6530073828299083, + "language_loss": 0.82533562, + "learning_rate": 7.209646563621754e-07, + "loss": 0.84618658, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 11990, + "time_per_iteration": 2.4172117710113525 + }, + { + "auxiliary_loss_clip": 0.01056744, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01626873, + "balance_loss_mlp": 1.01797283, + "epoch": 0.7209379227416203, + "flos": 14792559413760.0, + "grad_norm": 1.9259062136439042, + "language_loss": 0.76255953, + "learning_rate": 7.206742182992467e-07, + "loss": 0.78340828, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38867188, + "step": 11991, + "time_per_iteration": 2.3914425373077393 + }, + { + "auxiliary_loss_clip": 0.01056354, + "auxiliary_loss_mlp": 0.01020012, + "balance_loss_clip": 1.00836492, + "balance_loss_mlp": 1.01919568, + "epoch": 0.7209980459942883, + "flos": 29635744032000.0, + "grad_norm": 1.5667991959253835, + "language_loss": 0.72542298, + "learning_rate": 7.203838258921631e-07, + "loss": 0.74618661, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37109375, + "step": 11992, + "time_per_iteration": 2.4699673652648926 + }, + { + "auxiliary_loss_clip": 0.01058266, + "auxiliary_loss_mlp": 0.01022446, + "balance_loss_clip": 1.00989318, + "balance_loss_mlp": 1.01851523, + "epoch": 0.7210581692469563, + "flos": 23476052304000.0, + "grad_norm": 1.9822223268992185, + "language_loss": 0.66551638, + "learning_rate": 7.200934791512898e-07, + "loss": 0.68632346, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39648438, + "step": 11993, + "time_per_iteration": 2.44653582572937 + }, + { + "auxiliary_loss_clip": 0.01057261, + "auxiliary_loss_mlp": 0.01024659, + "balance_loss_clip": 1.01273799, + "balance_loss_mlp": 1.01805854, + "epoch": 0.7211182924996242, + "flos": 26098762947840.0, + "grad_norm": 1.9812096563550325, + "language_loss": 0.66188562, + "learning_rate": 7.198031780869878e-07, + "loss": 0.68270481, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 11994, + "time_per_iteration": 5.363102436065674 + }, + { + "auxiliary_loss_clip": 0.01059343, + "auxiliary_loss_mlp": 0.01025092, + "balance_loss_clip": 1.01171112, + "balance_loss_mlp": 1.01917315, + "epoch": 0.7211784157522922, + "flos": 17711123852160.0, + "grad_norm": 1.6470892432713322, + "language_loss": 0.67087352, + "learning_rate": 7.195129227096172e-07, + "loss": 0.69171786, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.40234375, + "step": 11995, + "time_per_iteration": 2.369818925857544 + }, + { + "auxiliary_loss_clip": 0.01056859, + "auxiliary_loss_mlp": 0.010206, + "balance_loss_clip": 1.00841713, + "balance_loss_mlp": 1.01908851, + "epoch": 0.7212385390049602, + "flos": 24422547795840.0, + "grad_norm": 1.818748953083688, + "language_loss": 0.79622787, + "learning_rate": 7.192227130295363e-07, + "loss": 0.81700253, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.37890625, + "step": 11996, + "time_per_iteration": 2.4341049194335938 + }, + { + "auxiliary_loss_clip": 0.01059516, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.01533401, + "balance_loss_mlp": 1.01965666, + "epoch": 0.7212986622576282, + "flos": 28145999784960.0, + "grad_norm": 2.302056833068386, + "language_loss": 0.81656331, + "learning_rate": 7.189325490571025e-07, + "loss": 0.83743155, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3984375, + "step": 11997, + "time_per_iteration": 2.438490867614746 + }, + { + "auxiliary_loss_clip": 0.01059123, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.01690006, + "balance_loss_mlp": 1.01856446, + "epoch": 0.7213587855102961, + "flos": 21834680555520.0, + "grad_norm": 2.2204399212863337, + "language_loss": 0.66970384, + "learning_rate": 7.18642430802671e-07, + "loss": 0.69058514, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40625, + "step": 11998, + "time_per_iteration": 2.42875337600708 + }, + { + "auxiliary_loss_clip": 0.01056813, + "auxiliary_loss_mlp": 0.01025336, + "balance_loss_clip": 1.01453543, + "balance_loss_mlp": 1.01912105, + "epoch": 0.7214189087629641, + "flos": 14610661896960.0, + "grad_norm": 2.01280695458227, + "language_loss": 0.72668803, + "learning_rate": 7.183523582765952e-07, + "loss": 0.74750954, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37695312, + "step": 11999, + "time_per_iteration": 2.393481492996216 + }, + { + "auxiliary_loss_clip": 0.0105867, + "auxiliary_loss_mlp": 0.01022859, + "balance_loss_clip": 1.01056826, + "balance_loss_mlp": 1.01906931, + "epoch": 0.721479032015632, + "flos": 19389852622080.0, + "grad_norm": 2.376474479573387, + "language_loss": 0.82915246, + "learning_rate": 7.18062331489226e-07, + "loss": 0.84996772, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 12000, + "time_per_iteration": 2.401214361190796 + }, + { + "auxiliary_loss_clip": 0.01061165, + "auxiliary_loss_mlp": 0.01026319, + "balance_loss_clip": 1.01449907, + "balance_loss_mlp": 1.02039528, + "epoch": 0.7215391552683, + "flos": 18511961685120.0, + "grad_norm": 1.9680204630692437, + "language_loss": 0.77170587, + "learning_rate": 7.177723504509161e-07, + "loss": 0.79258072, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40625, + "step": 12001, + "time_per_iteration": 2.384063720703125 + }, + { + "auxiliary_loss_clip": 0.01060002, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.0163008, + "balance_loss_mlp": 1.02133012, + "epoch": 0.721599278520968, + "flos": 23257600727040.0, + "grad_norm": 1.7033414820971489, + "language_loss": 0.81635135, + "learning_rate": 7.17482415172012e-07, + "loss": 0.83723181, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 12002, + "time_per_iteration": 2.4270153045654297 + }, + { + "auxiliary_loss_clip": 0.01059866, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.01375914, + "balance_loss_mlp": 1.01979411, + "epoch": 0.721659401773636, + "flos": 39197581706880.0, + "grad_norm": 1.9427377603368723, + "language_loss": 0.62250417, + "learning_rate": 7.171925256628609e-07, + "loss": 0.64336199, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40234375, + "step": 12003, + "time_per_iteration": 3.9934234619140625 + }, + { + "auxiliary_loss_clip": 0.01057575, + "auxiliary_loss_mlp": 0.0102298, + "balance_loss_clip": 1.01044571, + "balance_loss_mlp": 1.01811516, + "epoch": 0.7217195250263039, + "flos": 14939020033920.0, + "grad_norm": 1.9548147355926322, + "language_loss": 0.68489063, + "learning_rate": 7.169026819338099e-07, + "loss": 0.70569611, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 12004, + "time_per_iteration": 2.382181406021118 + }, + { + "auxiliary_loss_clip": 0.01007141, + "auxiliary_loss_mlp": 0.01001239, + "balance_loss_clip": 1.0002141, + "balance_loss_mlp": 1.00086534, + "epoch": 0.7217796482789719, + "flos": 70865828115840.0, + "grad_norm": 0.8659941651706805, + "language_loss": 0.54170167, + "learning_rate": 7.166128839952006e-07, + "loss": 0.56178552, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.0625, + "step": 12005, + "time_per_iteration": 2.962143898010254 + }, + { + "auxiliary_loss_clip": 0.01060493, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.01405334, + "balance_loss_mlp": 1.02051127, + "epoch": 0.7218397715316398, + "flos": 37450004002560.0, + "grad_norm": 1.5346765078252764, + "language_loss": 0.63109577, + "learning_rate": 7.163231318573766e-07, + "loss": 0.65196455, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40039062, + "step": 12006, + "time_per_iteration": 2.5455422401428223 + }, + { + "auxiliary_loss_clip": 0.01057601, + "auxiliary_loss_mlp": 0.01022475, + "balance_loss_clip": 1.01085854, + "balance_loss_mlp": 1.01998353, + "epoch": 0.7218998947843078, + "flos": 22709569115520.0, + "grad_norm": 1.6377767856046606, + "language_loss": 0.92068768, + "learning_rate": 7.160334255306775e-07, + "loss": 0.94148844, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37695312, + "step": 12007, + "time_per_iteration": 2.413604736328125 + }, + { + "auxiliary_loss_clip": 0.01059905, + "auxiliary_loss_mlp": 0.01023446, + "balance_loss_clip": 1.01053548, + "balance_loss_mlp": 1.01959145, + "epoch": 0.7219600180369758, + "flos": 12166357633920.0, + "grad_norm": 2.051200881576478, + "language_loss": 0.63979471, + "learning_rate": 7.15743765025444e-07, + "loss": 0.6606282, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 12008, + "time_per_iteration": 2.418485164642334 + }, + { + "auxiliary_loss_clip": 0.01060638, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.01340199, + "balance_loss_mlp": 1.01963758, + "epoch": 0.7220201412896438, + "flos": 22595612837760.0, + "grad_norm": 4.983165780394398, + "language_loss": 0.78647053, + "learning_rate": 7.154541503520109e-07, + "loss": 0.80733526, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 12009, + "time_per_iteration": 2.4814281463623047 + }, + { + "auxiliary_loss_clip": 0.01059937, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.01661909, + "balance_loss_mlp": 1.01931274, + "epoch": 0.7220802645423118, + "flos": 26717598529920.0, + "grad_norm": 1.6813302391480391, + "language_loss": 0.64001405, + "learning_rate": 7.151645815207152e-07, + "loss": 0.66091824, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.40625, + "step": 12010, + "time_per_iteration": 2.583998441696167 + }, + { + "auxiliary_loss_clip": 0.01058008, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.01447439, + "balance_loss_mlp": 1.01829088, + "epoch": 0.7221403877949797, + "flos": 24419545418880.0, + "grad_norm": 1.8622451188861526, + "language_loss": 0.77124625, + "learning_rate": 7.14875058541891e-07, + "loss": 0.79209757, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3984375, + "step": 12011, + "time_per_iteration": 2.429408550262451 + }, + { + "auxiliary_loss_clip": 0.01057798, + "auxiliary_loss_mlp": 0.0102263, + "balance_loss_clip": 1.01116848, + "balance_loss_mlp": 1.01941919, + "epoch": 0.7222005110476477, + "flos": 23513234768640.0, + "grad_norm": 1.6809886853995715, + "language_loss": 0.79163802, + "learning_rate": 7.145855814258699e-07, + "loss": 0.8124423, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3828125, + "step": 12012, + "time_per_iteration": 2.4381308555603027 + }, + { + "auxiliary_loss_clip": 0.01060108, + "auxiliary_loss_mlp": 0.01025538, + "balance_loss_clip": 1.01390314, + "balance_loss_mlp": 1.02014291, + "epoch": 0.7222606343003156, + "flos": 23111419397760.0, + "grad_norm": 1.7925904292160229, + "language_loss": 0.75863516, + "learning_rate": 7.142961501829825e-07, + "loss": 0.7794916, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3984375, + "step": 12013, + "time_per_iteration": 2.5585880279541016 + }, + { + "auxiliary_loss_clip": 0.01058621, + "auxiliary_loss_mlp": 0.01021177, + "balance_loss_clip": 1.00928032, + "balance_loss_mlp": 1.01947987, + "epoch": 0.7223207575529836, + "flos": 24350068080000.0, + "grad_norm": 1.6689566652285066, + "language_loss": 0.75717914, + "learning_rate": 7.140067648235588e-07, + "loss": 0.77797705, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 12014, + "time_per_iteration": 2.4506614208221436 + }, + { + "auxiliary_loss_clip": 0.01058933, + "auxiliary_loss_mlp": 0.01024991, + "balance_loss_clip": 1.01292741, + "balance_loss_mlp": 1.01881564, + "epoch": 0.7223808808056515, + "flos": 28328909731200.0, + "grad_norm": 2.1760700589918076, + "language_loss": 0.86602032, + "learning_rate": 7.137174253579257e-07, + "loss": 0.88685954, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40234375, + "step": 12015, + "time_per_iteration": 2.443923234939575 + }, + { + "auxiliary_loss_clip": 0.01058151, + "auxiliary_loss_mlp": 0.01025038, + "balance_loss_clip": 1.01383793, + "balance_loss_mlp": 1.01993704, + "epoch": 0.7224410040583196, + "flos": 21068371923840.0, + "grad_norm": 3.7611342473407743, + "language_loss": 0.78205895, + "learning_rate": 7.134281317964091e-07, + "loss": 0.8028909, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3828125, + "step": 12016, + "time_per_iteration": 2.423051357269287 + }, + { + "auxiliary_loss_clip": 0.0105759, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.01511812, + "balance_loss_mlp": 1.01968861, + "epoch": 0.7225011273109875, + "flos": 26794267608960.0, + "grad_norm": 1.4981801190954154, + "language_loss": 0.77912629, + "learning_rate": 7.131388841493327e-07, + "loss": 0.79997128, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 12017, + "time_per_iteration": 2.460642099380493 + }, + { + "auxiliary_loss_clip": 0.01056739, + "auxiliary_loss_mlp": 0.01023434, + "balance_loss_clip": 1.01246071, + "balance_loss_mlp": 1.01972771, + "epoch": 0.7225612505636555, + "flos": 23582537550720.0, + "grad_norm": 2.302918346770682, + "language_loss": 0.74205923, + "learning_rate": 7.128496824270196e-07, + "loss": 0.76286101, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 12018, + "time_per_iteration": 2.4178214073181152 + }, + { + "auxiliary_loss_clip": 0.0105803, + "auxiliary_loss_mlp": 0.01025516, + "balance_loss_clip": 1.01462567, + "balance_loss_mlp": 1.02105021, + "epoch": 0.7226213738163234, + "flos": 20776567847040.0, + "grad_norm": 1.6712296599841507, + "language_loss": 0.77799153, + "learning_rate": 7.125605266397903e-07, + "loss": 0.79882693, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36914062, + "step": 12019, + "time_per_iteration": 2.397578477859497 + }, + { + "auxiliary_loss_clip": 0.01056861, + "auxiliary_loss_mlp": 0.01022687, + "balance_loss_clip": 1.01039624, + "balance_loss_mlp": 1.01829433, + "epoch": 0.7226814970689914, + "flos": 32634433774080.0, + "grad_norm": 1.7268073657553367, + "language_loss": 0.74233502, + "learning_rate": 7.122714167979635e-07, + "loss": 0.76313049, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38671875, + "step": 12020, + "time_per_iteration": 2.486300468444824 + }, + { + "auxiliary_loss_clip": 0.01060101, + "auxiliary_loss_mlp": 0.01025984, + "balance_loss_clip": 1.01381266, + "balance_loss_mlp": 1.01874673, + "epoch": 0.7227416203216595, + "flos": 22453306669440.0, + "grad_norm": 2.015557317457274, + "language_loss": 0.77463031, + "learning_rate": 7.119823529118587e-07, + "loss": 0.79549116, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.4140625, + "step": 12021, + "time_per_iteration": 2.4411346912384033 + }, + { + "auxiliary_loss_clip": 0.01058016, + "auxiliary_loss_mlp": 0.01024865, + "balance_loss_clip": 1.01285434, + "balance_loss_mlp": 1.01931357, + "epoch": 0.7228017435743274, + "flos": 21651246938880.0, + "grad_norm": 1.6358083050024217, + "language_loss": 0.68348444, + "learning_rate": 7.116933349917892e-07, + "loss": 0.70431328, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 12022, + "time_per_iteration": 2.431593656539917 + }, + { + "auxiliary_loss_clip": 0.01059146, + "auxiliary_loss_mlp": 0.01022288, + "balance_loss_clip": 1.01005709, + "balance_loss_mlp": 1.01954091, + "epoch": 0.7228618668269954, + "flos": 29532191339520.0, + "grad_norm": 1.7131666719531091, + "language_loss": 0.69548023, + "learning_rate": 7.114043630480713e-07, + "loss": 0.71629459, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39648438, + "step": 12023, + "time_per_iteration": 3.9342262744903564 + }, + { + "auxiliary_loss_clip": 0.01055997, + "auxiliary_loss_mlp": 0.01022698, + "balance_loss_clip": 1.01115274, + "balance_loss_mlp": 1.01737881, + "epoch": 0.7229219900796633, + "flos": 27452589805440.0, + "grad_norm": 1.5184650471258145, + "language_loss": 0.71086586, + "learning_rate": 7.111154370910164e-07, + "loss": 0.73165274, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38671875, + "step": 12024, + "time_per_iteration": 2.478302001953125 + }, + { + "auxiliary_loss_clip": 0.01056892, + "auxiliary_loss_mlp": 0.01027654, + "balance_loss_clip": 1.01591778, + "balance_loss_mlp": 1.01851201, + "epoch": 0.7229821133323313, + "flos": 16288448060160.0, + "grad_norm": 2.108419034070096, + "language_loss": 0.75725687, + "learning_rate": 7.108265571309376e-07, + "loss": 0.77810234, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 12025, + "time_per_iteration": 2.415452718734741 + }, + { + "auxiliary_loss_clip": 0.01056786, + "auxiliary_loss_mlp": 0.01022157, + "balance_loss_clip": 1.01105857, + "balance_loss_mlp": 1.01930463, + "epoch": 0.7230422365849992, + "flos": 20411306536320.0, + "grad_norm": 2.017786880982829, + "language_loss": 0.76742399, + "learning_rate": 7.105377231781414e-07, + "loss": 0.78821343, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.375, + "step": 12026, + "time_per_iteration": 2.409289836883545 + }, + { + "auxiliary_loss_clip": 0.01058856, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.01920235, + "balance_loss_mlp": 1.01951361, + "epoch": 0.7231023598376672, + "flos": 25592312632320.0, + "grad_norm": 1.8427135140071373, + "language_loss": 0.67626905, + "learning_rate": 7.102489352429375e-07, + "loss": 0.69716775, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 12027, + "time_per_iteration": 2.4960010051727295 + }, + { + "auxiliary_loss_clip": 0.01061093, + "auxiliary_loss_mlp": 0.010257, + "balance_loss_clip": 1.01297414, + "balance_loss_mlp": 1.02093363, + "epoch": 0.7231624830903352, + "flos": 25148601763200.0, + "grad_norm": 2.581494166600324, + "language_loss": 0.79236162, + "learning_rate": 7.099601933356314e-07, + "loss": 0.8132295, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 12028, + "time_per_iteration": 2.432420492172241 + }, + { + "auxiliary_loss_clip": 0.01056348, + "auxiliary_loss_mlp": 0.01024659, + "balance_loss_clip": 1.01271462, + "balance_loss_mlp": 1.01782465, + "epoch": 0.7232226063430032, + "flos": 21724669261440.0, + "grad_norm": 1.7832546054681813, + "language_loss": 0.82549787, + "learning_rate": 7.096714974665279e-07, + "loss": 0.84630793, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38476562, + "step": 12029, + "time_per_iteration": 2.4512298107147217 + }, + { + "auxiliary_loss_clip": 0.01007012, + "auxiliary_loss_mlp": 0.01001972, + "balance_loss_clip": 1.00104773, + "balance_loss_mlp": 1.00070775, + "epoch": 0.7232827295956711, + "flos": 68433149335680.0, + "grad_norm": 0.8005885443875672, + "language_loss": 0.61774272, + "learning_rate": 7.093828476459287e-07, + "loss": 0.63783252, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06298828, + "step": 12030, + "time_per_iteration": 3.1028072834014893 + }, + { + "auxiliary_loss_clip": 0.01056436, + "auxiliary_loss_mlp": 0.010239, + "balance_loss_clip": 1.01178813, + "balance_loss_mlp": 1.01927817, + "epoch": 0.7233428528483391, + "flos": 20191633061760.0, + "grad_norm": 1.671259323263015, + "language_loss": 0.77649355, + "learning_rate": 7.090942438841365e-07, + "loss": 0.797297, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.37109375, + "step": 12031, + "time_per_iteration": 2.4150431156158447 + }, + { + "auxiliary_loss_clip": 0.01059948, + "auxiliary_loss_mlp": 0.01025152, + "balance_loss_clip": 1.01227164, + "balance_loss_mlp": 1.01912212, + "epoch": 0.723402976101007, + "flos": 23948392354560.0, + "grad_norm": 1.8621357888104708, + "language_loss": 0.81119573, + "learning_rate": 7.088056861914509e-07, + "loss": 0.83204675, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40820312, + "step": 12032, + "time_per_iteration": 2.4602198600769043 + }, + { + "auxiliary_loss_clip": 0.0106103, + "auxiliary_loss_mlp": 0.01024646, + "balance_loss_clip": 1.01225436, + "balance_loss_mlp": 1.01935649, + "epoch": 0.723463099353675, + "flos": 20812353857280.0, + "grad_norm": 1.866904008769928, + "language_loss": 0.75157213, + "learning_rate": 7.085171745781676e-07, + "loss": 0.77242887, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41796875, + "step": 12033, + "time_per_iteration": 2.4521899223327637 + }, + { + "auxiliary_loss_clip": 0.01055455, + "auxiliary_loss_mlp": 0.01022228, + "balance_loss_clip": 1.01102245, + "balance_loss_mlp": 1.01838613, + "epoch": 0.723523222606343, + "flos": 19097036115840.0, + "grad_norm": 1.5827689558520401, + "language_loss": 0.74977565, + "learning_rate": 7.082287090545848e-07, + "loss": 0.7705524, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37109375, + "step": 12034, + "time_per_iteration": 5.434307813644409 + }, + { + "auxiliary_loss_clip": 0.01056883, + "auxiliary_loss_mlp": 0.01021268, + "balance_loss_clip": 1.00999069, + "balance_loss_mlp": 1.01864004, + "epoch": 0.723583345859011, + "flos": 26505745200000.0, + "grad_norm": 1.6773849343010834, + "language_loss": 0.80661994, + "learning_rate": 7.079402896309967e-07, + "loss": 0.8274014, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3828125, + "step": 12035, + "time_per_iteration": 2.4295990467071533 + }, + { + "auxiliary_loss_clip": 0.01058152, + "auxiliary_loss_mlp": 0.01025754, + "balance_loss_clip": 1.01322532, + "balance_loss_mlp": 1.01923537, + "epoch": 0.723643469111679, + "flos": 16032953664000.0, + "grad_norm": 5.028736796794771, + "language_loss": 0.61163765, + "learning_rate": 7.07651916317696e-07, + "loss": 0.63247669, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 12036, + "time_per_iteration": 2.3845133781433105 + }, + { + "auxiliary_loss_clip": 0.01056875, + "auxiliary_loss_mlp": 0.01024141, + "balance_loss_clip": 1.01341844, + "balance_loss_mlp": 1.01913571, + "epoch": 0.7237035923643469, + "flos": 21944447470080.0, + "grad_norm": 1.8480387001993435, + "language_loss": 0.67351973, + "learning_rate": 7.073635891249734e-07, + "loss": 0.69432998, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37890625, + "step": 12037, + "time_per_iteration": 2.40641188621521 + }, + { + "auxiliary_loss_clip": 0.01057685, + "auxiliary_loss_mlp": 0.01027588, + "balance_loss_clip": 1.0156076, + "balance_loss_mlp": 1.01932788, + "epoch": 0.7237637156170149, + "flos": 23582083703040.0, + "grad_norm": 1.7368269810316947, + "language_loss": 0.72760004, + "learning_rate": 7.070753080631207e-07, + "loss": 0.74845278, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38476562, + "step": 12038, + "time_per_iteration": 2.4283456802368164 + }, + { + "auxiliary_loss_clip": 0.01057311, + "auxiliary_loss_mlp": 0.01023977, + "balance_loss_clip": 1.01038074, + "balance_loss_mlp": 1.01823258, + "epoch": 0.7238238388696828, + "flos": 20593657900800.0, + "grad_norm": 1.648912770472305, + "language_loss": 0.71996629, + "learning_rate": 7.06787073142423e-07, + "loss": 0.74077916, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.390625, + "step": 12039, + "time_per_iteration": 2.3960776329040527 + }, + { + "auxiliary_loss_clip": 0.01059798, + "auxiliary_loss_mlp": 0.01025084, + "balance_loss_clip": 1.01342499, + "balance_loss_mlp": 1.01909757, + "epoch": 0.7238839621223508, + "flos": 24205876698240.0, + "grad_norm": 4.128610212195205, + "language_loss": 0.75701535, + "learning_rate": 7.06498884373169e-07, + "loss": 0.77786416, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40625, + "step": 12040, + "time_per_iteration": 2.460969924926758 + }, + { + "auxiliary_loss_clip": 0.01057265, + "auxiliary_loss_mlp": 0.01025794, + "balance_loss_clip": 1.01325321, + "balance_loss_mlp": 1.01802015, + "epoch": 0.7239440853750188, + "flos": 14208881437440.0, + "grad_norm": 1.566250696406553, + "language_loss": 0.76816815, + "learning_rate": 7.062107417656416e-07, + "loss": 0.78899878, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 12041, + "time_per_iteration": 2.4253854751586914 + }, + { + "auxiliary_loss_clip": 0.01058736, + "auxiliary_loss_mlp": 0.0102226, + "balance_loss_clip": 1.01013613, + "balance_loss_mlp": 1.0188874, + "epoch": 0.7240042086276868, + "flos": 21613785183360.0, + "grad_norm": 2.0963009073312495, + "language_loss": 0.63415051, + "learning_rate": 7.059226453301264e-07, + "loss": 0.65496039, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 12042, + "time_per_iteration": 3.954350233078003 + }, + { + "auxiliary_loss_clip": 0.01060586, + "auxiliary_loss_mlp": 0.01025697, + "balance_loss_clip": 1.01270366, + "balance_loss_mlp": 1.01994705, + "epoch": 0.7240643318803547, + "flos": 23330324822400.0, + "grad_norm": 2.8414509766780705, + "language_loss": 0.83502269, + "learning_rate": 7.056345950769016e-07, + "loss": 0.85588551, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 12043, + "time_per_iteration": 2.407780647277832 + }, + { + "auxiliary_loss_clip": 0.0106005, + "auxiliary_loss_mlp": 0.01024126, + "balance_loss_clip": 1.01157331, + "balance_loss_mlp": 1.01969409, + "epoch": 0.7241244551330227, + "flos": 24023699890560.0, + "grad_norm": 1.4504378389059718, + "language_loss": 0.64192879, + "learning_rate": 7.053465910162494e-07, + "loss": 0.66277051, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 12044, + "time_per_iteration": 2.442500591278076 + }, + { + "auxiliary_loss_clip": 0.01059061, + "auxiliary_loss_mlp": 0.01023735, + "balance_loss_clip": 1.01130772, + "balance_loss_mlp": 1.01982927, + "epoch": 0.7241845783856906, + "flos": 18729435744000.0, + "grad_norm": 1.5700695236997277, + "language_loss": 0.81499851, + "learning_rate": 7.050586331584472e-07, + "loss": 0.83582652, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39257812, + "step": 12045, + "time_per_iteration": 2.389601945877075 + }, + { + "auxiliary_loss_clip": 0.01056459, + "auxiliary_loss_mlp": 0.01023485, + "balance_loss_clip": 1.01173663, + "balance_loss_mlp": 1.01866746, + "epoch": 0.7242447016383586, + "flos": 19497699411840.0, + "grad_norm": 1.7836786089417873, + "language_loss": 0.72072363, + "learning_rate": 7.047707215137712e-07, + "loss": 0.74152303, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37890625, + "step": 12046, + "time_per_iteration": 2.4426615238189697 + }, + { + "auxiliary_loss_clip": 0.01057517, + "auxiliary_loss_mlp": 0.01022782, + "balance_loss_clip": 1.01047325, + "balance_loss_mlp": 1.0183177, + "epoch": 0.7243048248910267, + "flos": 22162410288000.0, + "grad_norm": 1.9585413442155113, + "language_loss": 0.71058846, + "learning_rate": 7.044828560924967e-07, + "loss": 0.73139137, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 12047, + "time_per_iteration": 2.4645140171051025 + }, + { + "auxiliary_loss_clip": 0.01061338, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01630485, + "balance_loss_mlp": 1.01913095, + "epoch": 0.7243649481436946, + "flos": 27671530141440.0, + "grad_norm": 1.9979042633972603, + "language_loss": 0.70003587, + "learning_rate": 7.041950369048964e-07, + "loss": 0.7209416, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.421875, + "step": 12048, + "time_per_iteration": 2.4909207820892334 + }, + { + "auxiliary_loss_clip": 0.01006813, + "auxiliary_loss_mlp": 0.01001541, + "balance_loss_clip": 1.00062895, + "balance_loss_mlp": 1.00055695, + "epoch": 0.7244250713963626, + "flos": 63241355629440.0, + "grad_norm": 0.8227193732171996, + "language_loss": 0.53770971, + "learning_rate": 7.03907263961242e-07, + "loss": 0.55779326, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.0625, + "step": 12049, + "time_per_iteration": 3.052288293838501 + }, + { + "auxiliary_loss_clip": 0.01061392, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.01017237, + "balance_loss_mlp": 1.01941776, + "epoch": 0.7244851946490305, + "flos": 17966164400640.0, + "grad_norm": 2.8254847339086537, + "language_loss": 0.60490584, + "learning_rate": 7.036195372718028e-07, + "loss": 0.6257627, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.41992188, + "step": 12050, + "time_per_iteration": 2.384951591491699 + }, + { + "auxiliary_loss_clip": 0.01006851, + "auxiliary_loss_mlp": 0.01001402, + "balance_loss_clip": 1.00047231, + "balance_loss_mlp": 1.00047135, + "epoch": 0.7245453179016985, + "flos": 70940227956480.0, + "grad_norm": 0.7427251280605452, + "language_loss": 0.53343344, + "learning_rate": 7.033318568468482e-07, + "loss": 0.55351603, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06347656, + "step": 12051, + "time_per_iteration": 3.068305015563965 + }, + { + "auxiliary_loss_clip": 0.01057751, + "auxiliary_loss_mlp": 0.01023468, + "balance_loss_clip": 1.01208305, + "balance_loss_mlp": 1.01947355, + "epoch": 0.7246054411543664, + "flos": 24567402493440.0, + "grad_norm": 2.3124395654174545, + "language_loss": 0.72132975, + "learning_rate": 7.030442226966445e-07, + "loss": 0.74214196, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3828125, + "step": 12052, + "time_per_iteration": 2.4545087814331055 + }, + { + "auxiliary_loss_clip": 0.01058396, + "auxiliary_loss_mlp": 0.01025293, + "balance_loss_clip": 1.01215649, + "balance_loss_mlp": 1.01968372, + "epoch": 0.7246655644070344, + "flos": 32337078791040.0, + "grad_norm": 1.8084619348689706, + "language_loss": 0.74277002, + "learning_rate": 7.02756634831456e-07, + "loss": 0.76360685, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.38671875, + "step": 12053, + "time_per_iteration": 2.5674571990966797 + }, + { + "auxiliary_loss_clip": 0.01057956, + "auxiliary_loss_mlp": 0.0102677, + "balance_loss_clip": 1.0142951, + "balance_loss_mlp": 1.01863647, + "epoch": 0.7247256876597024, + "flos": 21871374261120.0, + "grad_norm": 1.8618538440889, + "language_loss": 0.78433305, + "learning_rate": 7.024690932615458e-07, + "loss": 0.80518031, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 12054, + "time_per_iteration": 2.4239158630371094 + }, + { + "auxiliary_loss_clip": 0.01059368, + "auxiliary_loss_mlp": 0.01024437, + "balance_loss_clip": 1.01208103, + "balance_loss_mlp": 1.01950228, + "epoch": 0.7247858109123704, + "flos": 16212267740160.0, + "grad_norm": 2.8943683954302335, + "language_loss": 0.69188255, + "learning_rate": 7.021815979971772e-07, + "loss": 0.71272063, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.3984375, + "step": 12055, + "time_per_iteration": 2.3543131351470947 + }, + { + "auxiliary_loss_clip": 0.01055212, + "auxiliary_loss_mlp": 0.01024954, + "balance_loss_clip": 1.01365948, + "balance_loss_mlp": 1.01788402, + "epoch": 0.7248459341650383, + "flos": 20849641056000.0, + "grad_norm": 1.640734202160545, + "language_loss": 0.73690736, + "learning_rate": 7.018941490486079e-07, + "loss": 0.75770903, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37304688, + "step": 12056, + "time_per_iteration": 2.407367706298828 + }, + { + "auxiliary_loss_clip": 0.01055302, + "auxiliary_loss_mlp": 0.01019836, + "balance_loss_clip": 1.00870752, + "balance_loss_mlp": 1.01840138, + "epoch": 0.7249060574177063, + "flos": 25920600946560.0, + "grad_norm": 1.4963189643404298, + "language_loss": 0.71010983, + "learning_rate": 7.016067464260977e-07, + "loss": 0.73086119, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36914062, + "step": 12057, + "time_per_iteration": 2.4554102420806885 + }, + { + "auxiliary_loss_clip": 0.01060216, + "auxiliary_loss_mlp": 0.01027278, + "balance_loss_clip": 1.01504135, + "balance_loss_mlp": 1.02041447, + "epoch": 0.7249661806703742, + "flos": 17344640643840.0, + "grad_norm": 1.9489068860401313, + "language_loss": 0.62574154, + "learning_rate": 7.013193901399024e-07, + "loss": 0.64661646, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3984375, + "step": 12058, + "time_per_iteration": 2.496142864227295 + }, + { + "auxiliary_loss_clip": 0.0105934, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.0170033, + "balance_loss_mlp": 1.02027929, + "epoch": 0.7250263039230422, + "flos": 19573111681920.0, + "grad_norm": 1.7568096628678562, + "language_loss": 0.74583411, + "learning_rate": 7.010320802002785e-07, + "loss": 0.76672149, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 12059, + "time_per_iteration": 2.473097562789917 + }, + { + "auxiliary_loss_clip": 0.01055702, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.01658118, + "balance_loss_mlp": 1.01827741, + "epoch": 0.7250864271757103, + "flos": 21975695003520.0, + "grad_norm": 1.609508319609084, + "language_loss": 0.78977823, + "learning_rate": 7.007448166174772e-07, + "loss": 0.81061518, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.375, + "step": 12060, + "time_per_iteration": 2.4425556659698486 + }, + { + "auxiliary_loss_clip": 0.01057686, + "auxiliary_loss_mlp": 0.0102296, + "balance_loss_clip": 1.01000762, + "balance_loss_mlp": 1.01946032, + "epoch": 0.7251465504283782, + "flos": 25011357742080.0, + "grad_norm": 1.7825182989395671, + "language_loss": 0.76381367, + "learning_rate": 7.004575994017521e-07, + "loss": 0.78462017, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.3828125, + "step": 12061, + "time_per_iteration": 2.444241523742676 + }, + { + "auxiliary_loss_clip": 0.01055576, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_clip": 1.01247907, + "balance_loss_mlp": 1.01884246, + "epoch": 0.7252066736810462, + "flos": 16689216090240.0, + "grad_norm": 1.856784354342466, + "language_loss": 0.81105012, + "learning_rate": 7.00170428563353e-07, + "loss": 0.83183813, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.3671875, + "step": 12062, + "time_per_iteration": 2.413235902786255 + }, + { + "auxiliary_loss_clip": 0.01061045, + "auxiliary_loss_mlp": 0.0102707, + "balance_loss_clip": 1.01347423, + "balance_loss_mlp": 1.02044582, + "epoch": 0.7252667969337141, + "flos": 25701835167360.0, + "grad_norm": 2.4737574678323715, + "language_loss": 0.69223106, + "learning_rate": 6.998833041125263e-07, + "loss": 0.71311212, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.40625, + "step": 12063, + "time_per_iteration": 3.8505003452301025 + }, + { + "auxiliary_loss_clip": 0.01056128, + "auxiliary_loss_mlp": 0.01023311, + "balance_loss_clip": 1.01193213, + "balance_loss_mlp": 1.01873302, + "epoch": 0.7253269201863821, + "flos": 18258945995520.0, + "grad_norm": 1.7911256280334917, + "language_loss": 0.694646, + "learning_rate": 6.995962260595207e-07, + "loss": 0.71544039, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.375, + "step": 12064, + "time_per_iteration": 2.385999917984009 + }, + { + "auxiliary_loss_clip": 0.01060633, + "auxiliary_loss_mlp": 0.01023317, + "balance_loss_clip": 1.01156855, + "balance_loss_mlp": 1.02159619, + "epoch": 0.72538704343905, + "flos": 20410782865920.0, + "grad_norm": 1.6559067537020782, + "language_loss": 0.90329373, + "learning_rate": 6.99309194414581e-07, + "loss": 0.92413324, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 12065, + "time_per_iteration": 2.4268176555633545 + }, + { + "auxiliary_loss_clip": 0.01006709, + "auxiliary_loss_mlp": 0.01001803, + "balance_loss_clip": 1.00087368, + "balance_loss_mlp": 1.00034094, + "epoch": 0.725447166691718, + "flos": 70147524470400.0, + "grad_norm": 0.6506760756021324, + "language_loss": 0.56058002, + "learning_rate": 6.990222091879506e-07, + "loss": 0.58066523, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06347656, + "step": 12066, + "time_per_iteration": 3.1948585510253906 + }, + { + "auxiliary_loss_clip": 0.01056658, + "auxiliary_loss_mlp": 0.01025024, + "balance_loss_clip": 1.0132463, + "balance_loss_mlp": 1.01835489, + "epoch": 0.725507289944386, + "flos": 27051123548160.0, + "grad_norm": 2.1413672379066857, + "language_loss": 0.7700069, + "learning_rate": 6.987352703898699e-07, + "loss": 0.7908237, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 12067, + "time_per_iteration": 2.4376120567321777 + }, + { + "auxiliary_loss_clip": 0.01006903, + "auxiliary_loss_mlp": 0.01000558, + "balance_loss_clip": 0.99955678, + "balance_loss_mlp": 1.0004791, + "epoch": 0.725567413197054, + "flos": 62918583310080.0, + "grad_norm": 0.8069635957898997, + "language_loss": 0.5701232, + "learning_rate": 6.984483780305812e-07, + "loss": 0.5901978, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.06445312, + "step": 12068, + "time_per_iteration": 2.960752248764038 + }, + { + "auxiliary_loss_clip": 0.01007158, + "auxiliary_loss_mlp": 0.01002902, + "balance_loss_clip": 1.00193083, + "balance_loss_mlp": 1.00064778, + "epoch": 0.7256275364497219, + "flos": 60292660821120.0, + "grad_norm": 0.8106542183040983, + "language_loss": 0.55360818, + "learning_rate": 6.981615321203216e-07, + "loss": 0.57370877, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.06494141, + "step": 12069, + "time_per_iteration": 3.0850534439086914 + }, + { + "auxiliary_loss_clip": 0.01055672, + "auxiliary_loss_mlp": 0.01023947, + "balance_loss_clip": 1.01288438, + "balance_loss_mlp": 1.01893234, + "epoch": 0.7256876597023899, + "flos": 24497366572800.0, + "grad_norm": 1.8737097461501877, + "language_loss": 0.79926026, + "learning_rate": 6.978747326693283e-07, + "loss": 0.82005638, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3671875, + "step": 12070, + "time_per_iteration": 2.4756252765655518 + }, + { + "auxiliary_loss_clip": 0.01055112, + "auxiliary_loss_mlp": 0.0102699, + "balance_loss_clip": 1.01578999, + "balance_loss_mlp": 1.01908755, + "epoch": 0.7257477829550578, + "flos": 24351604179840.0, + "grad_norm": 1.7658697248289759, + "language_loss": 0.71757478, + "learning_rate": 6.975879796878357e-07, + "loss": 0.73839581, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.359375, + "step": 12071, + "time_per_iteration": 2.462167978286743 + }, + { + "auxiliary_loss_clip": 0.0105742, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01910496, + "balance_loss_mlp": 1.01950049, + "epoch": 0.7258079062077258, + "flos": 17201252223360.0, + "grad_norm": 1.7747448227082918, + "language_loss": 0.70386994, + "learning_rate": 6.973012731860792e-07, + "loss": 0.72475284, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37890625, + "step": 12072, + "time_per_iteration": 2.4051828384399414 + }, + { + "auxiliary_loss_clip": 0.01058859, + "auxiliary_loss_mlp": 0.01024943, + "balance_loss_clip": 1.0125159, + "balance_loss_mlp": 1.01879644, + "epoch": 0.7258680294603939, + "flos": 21579255982080.0, + "grad_norm": 2.5444355847339097, + "language_loss": 0.78429258, + "learning_rate": 6.97014613174288e-07, + "loss": 0.80513054, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40039062, + "step": 12073, + "time_per_iteration": 4.017388105392456 + }, + { + "auxiliary_loss_clip": 0.01057379, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.01454806, + "balance_loss_mlp": 1.01984096, + "epoch": 0.7259281527130618, + "flos": 34854107149440.0, + "grad_norm": 1.3631214514427386, + "language_loss": 0.70075691, + "learning_rate": 6.967279996626943e-07, + "loss": 0.72159338, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.375, + "step": 12074, + "time_per_iteration": 2.5387113094329834 + }, + { + "auxiliary_loss_clip": 0.01058794, + "auxiliary_loss_mlp": 0.01024676, + "balance_loss_clip": 1.01211154, + "balance_loss_mlp": 1.01930964, + "epoch": 0.7259882759657298, + "flos": 25403642311680.0, + "grad_norm": 1.8264831073545598, + "language_loss": 0.74071848, + "learning_rate": 6.964414326615251e-07, + "loss": 0.76155317, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 12075, + "time_per_iteration": 2.4355695247650146 + }, + { + "auxiliary_loss_clip": 0.01057853, + "auxiliary_loss_mlp": 0.01024624, + "balance_loss_clip": 1.01220238, + "balance_loss_mlp": 1.01905394, + "epoch": 0.7260483992183977, + "flos": 62951438632320.0, + "grad_norm": 1.4531045812230976, + "language_loss": 0.63388419, + "learning_rate": 6.961549121810095e-07, + "loss": 0.65470898, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38671875, + "step": 12076, + "time_per_iteration": 2.8121190071105957 + }, + { + "auxiliary_loss_clip": 0.01058222, + "auxiliary_loss_mlp": 0.01023133, + "balance_loss_clip": 1.01095605, + "balance_loss_mlp": 1.01892114, + "epoch": 0.7261085224710657, + "flos": 26466363319680.0, + "grad_norm": 8.152955626975452, + "language_loss": 0.78921211, + "learning_rate": 6.958684382313704e-07, + "loss": 0.81002569, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39257812, + "step": 12077, + "time_per_iteration": 2.4363934993743896 + }, + { + "auxiliary_loss_clip": 0.01006893, + "auxiliary_loss_mlp": 0.01002203, + "balance_loss_clip": 1.00124383, + "balance_loss_mlp": 1.00050759, + "epoch": 0.7261686457237336, + "flos": 66769748519040.0, + "grad_norm": 0.8706810797844009, + "language_loss": 0.64917117, + "learning_rate": 6.955820108228314e-07, + "loss": 0.66926217, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.06396484, + "step": 12078, + "time_per_iteration": 3.0663349628448486 + }, + { + "auxiliary_loss_clip": 0.01006924, + "auxiliary_loss_mlp": 0.01001861, + "balance_loss_clip": 1.00091898, + "balance_loss_mlp": 1.00055099, + "epoch": 0.7262287689764017, + "flos": 69996071525760.0, + "grad_norm": 0.7444590251041284, + "language_loss": 0.5928371, + "learning_rate": 6.952956299656166e-07, + "loss": 0.61292493, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.06347656, + "step": 12079, + "time_per_iteration": 3.2041549682617188 + }, + { + "auxiliary_loss_clip": 0.01056896, + "auxiliary_loss_mlp": 0.0102598, + "balance_loss_clip": 1.01409519, + "balance_loss_mlp": 1.01791096, + "epoch": 0.7262888922290696, + "flos": 23804305706880.0, + "grad_norm": 1.9342093867022763, + "language_loss": 0.64870846, + "learning_rate": 6.950092956699432e-07, + "loss": 0.66953719, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.390625, + "step": 12080, + "time_per_iteration": 2.419490337371826 + }, + { + "auxiliary_loss_clip": 0.01058509, + "auxiliary_loss_mlp": 0.01024744, + "balance_loss_clip": 1.01243567, + "balance_loss_mlp": 1.01898468, + "epoch": 0.7263490154817376, + "flos": 19499305334400.0, + "grad_norm": 2.3801840593749968, + "language_loss": 0.69778693, + "learning_rate": 6.947230079460317e-07, + "loss": 0.71861953, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39453125, + "step": 12081, + "time_per_iteration": 3.8599820137023926 + }, + { + "auxiliary_loss_clip": 0.01061069, + "auxiliary_loss_mlp": 0.01027716, + "balance_loss_clip": 1.01531196, + "balance_loss_mlp": 1.02055001, + "epoch": 0.7264091387344055, + "flos": 16285410771840.0, + "grad_norm": 1.7756182011852357, + "language_loss": 0.81325877, + "learning_rate": 6.944367668040987e-07, + "loss": 0.83414662, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40429688, + "step": 12082, + "time_per_iteration": 2.4006354808807373 + }, + { + "auxiliary_loss_clip": 0.0106163, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.01361513, + "balance_loss_mlp": 1.01950502, + "epoch": 0.7264692619870735, + "flos": 24350905952640.0, + "grad_norm": 1.8335902934041204, + "language_loss": 0.76985931, + "learning_rate": 6.941505722543592e-07, + "loss": 0.79074514, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.421875, + "step": 12083, + "time_per_iteration": 2.4312055110931396 + }, + { + "auxiliary_loss_clip": 0.01060274, + "auxiliary_loss_mlp": 0.01024327, + "balance_loss_clip": 1.01200104, + "balance_loss_mlp": 1.02049196, + "epoch": 0.7265293852397414, + "flos": 25118296836480.0, + "grad_norm": 2.339171914357877, + "language_loss": 0.729146, + "learning_rate": 6.93864424307026e-07, + "loss": 0.74999201, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 12084, + "time_per_iteration": 2.444794178009033 + }, + { + "auxiliary_loss_clip": 0.01060573, + "auxiliary_loss_mlp": 0.01028677, + "balance_loss_clip": 1.01666701, + "balance_loss_mlp": 1.02020216, + "epoch": 0.7265895084924094, + "flos": 22637124311040.0, + "grad_norm": 1.9064477813952097, + "language_loss": 0.77772546, + "learning_rate": 6.935783229723125e-07, + "loss": 0.79861796, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 12085, + "time_per_iteration": 2.4358198642730713 + }, + { + "auxiliary_loss_clip": 0.01058587, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.01590776, + "balance_loss_mlp": 1.01887083, + "epoch": 0.7266496317450775, + "flos": 23367088350720.0, + "grad_norm": 1.7369186566950028, + "language_loss": 0.72457278, + "learning_rate": 6.932922682604279e-07, + "loss": 0.74543142, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3984375, + "step": 12086, + "time_per_iteration": 2.4313042163848877 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.01604402, + "balance_loss_mlp": 1.01906776, + "epoch": 0.7267097549977454, + "flos": 28073345512320.0, + "grad_norm": 1.684696331163727, + "language_loss": 0.69139594, + "learning_rate": 6.930062601815811e-07, + "loss": 0.71225935, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 12087, + "time_per_iteration": 2.48289155960083 + }, + { + "auxiliary_loss_clip": 0.01060875, + "auxiliary_loss_mlp": 0.01027648, + "balance_loss_clip": 1.01480317, + "balance_loss_mlp": 1.02033401, + "epoch": 0.7267698782504134, + "flos": 22194565516800.0, + "grad_norm": 2.8246344617069266, + "language_loss": 0.79224879, + "learning_rate": 6.927202987459781e-07, + "loss": 0.81313401, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40625, + "step": 12088, + "time_per_iteration": 2.4004294872283936 + }, + { + "auxiliary_loss_clip": 0.01057429, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.01378632, + "balance_loss_mlp": 1.01812398, + "epoch": 0.7268300015030813, + "flos": 18513881809920.0, + "grad_norm": 2.4441267285600814, + "language_loss": 0.73617709, + "learning_rate": 6.924343839638264e-07, + "loss": 0.75700551, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 12089, + "time_per_iteration": 2.44063663482666 + }, + { + "auxiliary_loss_clip": 0.01057934, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01503038, + "balance_loss_mlp": 1.01931012, + "epoch": 0.7268901247557493, + "flos": 23984946414720.0, + "grad_norm": 1.6938544514555793, + "language_loss": 0.71026564, + "learning_rate": 6.921485158453268e-07, + "loss": 0.73110884, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38671875, + "step": 12090, + "time_per_iteration": 2.4349353313446045 + }, + { + "auxiliary_loss_clip": 0.01058073, + "auxiliary_loss_mlp": 0.01028139, + "balance_loss_clip": 1.01521647, + "balance_loss_mlp": 1.01845288, + "epoch": 0.7269502480084172, + "flos": 32086716364800.0, + "grad_norm": 2.2756745822142896, + "language_loss": 0.77429122, + "learning_rate": 6.918626944006831e-07, + "loss": 0.79515326, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39648438, + "step": 12091, + "time_per_iteration": 2.5116069316864014 + }, + { + "auxiliary_loss_clip": 0.01056855, + "auxiliary_loss_mlp": 0.01023388, + "balance_loss_clip": 1.0112226, + "balance_loss_mlp": 1.01790941, + "epoch": 0.7270103712610853, + "flos": 19061773776000.0, + "grad_norm": 7.363177801674721, + "language_loss": 0.71159399, + "learning_rate": 6.915769196400956e-07, + "loss": 0.73239642, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38867188, + "step": 12092, + "time_per_iteration": 2.419755697250366 + }, + { + "auxiliary_loss_clip": 0.01056965, + "auxiliary_loss_mlp": 0.0102152, + "balance_loss_clip": 1.00956929, + "balance_loss_mlp": 1.01924467, + "epoch": 0.7270704945137532, + "flos": 34531474475520.0, + "grad_norm": 1.7069977564572172, + "language_loss": 0.60709786, + "learning_rate": 6.912911915737607e-07, + "loss": 0.62788272, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.37695312, + "step": 12093, + "time_per_iteration": 2.52085018157959 + }, + { + "auxiliary_loss_clip": 0.01058664, + "auxiliary_loss_mlp": 0.01022778, + "balance_loss_clip": 1.0111078, + "balance_loss_mlp": 1.01958394, + "epoch": 0.7271306177664212, + "flos": 21506496975360.0, + "grad_norm": 1.466935400744639, + "language_loss": 0.75487924, + "learning_rate": 6.910055102118775e-07, + "loss": 0.7756936, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 12094, + "time_per_iteration": 2.427574872970581 + }, + { + "auxiliary_loss_clip": 0.01055993, + "auxiliary_loss_mlp": 0.01025292, + "balance_loss_clip": 1.01449728, + "balance_loss_mlp": 1.01893795, + "epoch": 0.7271907410190891, + "flos": 22271374241280.0, + "grad_norm": 6.308672629462427, + "language_loss": 0.85232842, + "learning_rate": 6.907198755646397e-07, + "loss": 0.87314123, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37109375, + "step": 12095, + "time_per_iteration": 2.4020988941192627 + }, + { + "auxiliary_loss_clip": 0.01056663, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01343763, + "balance_loss_mlp": 1.01787186, + "epoch": 0.7272508642717571, + "flos": 22892025214080.0, + "grad_norm": 1.5676749392494755, + "language_loss": 0.77639973, + "learning_rate": 6.904342876422433e-07, + "loss": 0.79722351, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38867188, + "step": 12096, + "time_per_iteration": 2.4092462062835693 + }, + { + "auxiliary_loss_clip": 0.01058428, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.01689339, + "balance_loss_mlp": 1.01982772, + "epoch": 0.727310987524425, + "flos": 11655089550720.0, + "grad_norm": 2.1815794291145143, + "language_loss": 0.72275853, + "learning_rate": 6.90148746454877e-07, + "loss": 0.74362546, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 12097, + "time_per_iteration": 2.377249002456665 + }, + { + "auxiliary_loss_clip": 0.01057946, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.01888466, + "balance_loss_mlp": 1.01887894, + "epoch": 0.727371110777093, + "flos": 24534165012480.0, + "grad_norm": 1.7377161125899927, + "language_loss": 0.76698554, + "learning_rate": 6.898632520127334e-07, + "loss": 0.78787684, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 12098, + "time_per_iteration": 2.426664352416992 + }, + { + "auxiliary_loss_clip": 0.01058086, + "auxiliary_loss_mlp": 0.0102549, + "balance_loss_clip": 1.01297951, + "balance_loss_mlp": 1.01806629, + "epoch": 0.7274312340297611, + "flos": 74737278691200.0, + "grad_norm": 1.811236247322373, + "language_loss": 0.67860067, + "learning_rate": 6.895778043260001e-07, + "loss": 0.69943643, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40039062, + "step": 12099, + "time_per_iteration": 2.813098430633545 + }, + { + "auxiliary_loss_clip": 0.0106059, + "auxiliary_loss_mlp": 0.01024703, + "balance_loss_clip": 1.01101208, + "balance_loss_mlp": 1.01904476, + "epoch": 0.727491357282429, + "flos": 22341864009600.0, + "grad_norm": 1.6015029591435352, + "language_loss": 0.72707665, + "learning_rate": 6.892924034048644e-07, + "loss": 0.74792963, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.41601562, + "step": 12100, + "time_per_iteration": 2.4009757041931152 + }, + { + "auxiliary_loss_clip": 0.01057097, + "auxiliary_loss_mlp": 0.01024236, + "balance_loss_clip": 1.01284516, + "balance_loss_mlp": 1.01874685, + "epoch": 0.727551480535097, + "flos": 23296354202880.0, + "grad_norm": 1.3957093644681513, + "language_loss": 0.74574935, + "learning_rate": 6.890070492595104e-07, + "loss": 0.7665627, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.3828125, + "step": 12101, + "time_per_iteration": 2.456162214279175 + }, + { + "auxiliary_loss_clip": 0.01059048, + "auxiliary_loss_mlp": 0.01023817, + "balance_loss_clip": 1.01298118, + "balance_loss_mlp": 1.02046061, + "epoch": 0.7276116037877649, + "flos": 21469489067520.0, + "grad_norm": 1.7493395687768973, + "language_loss": 0.8281464, + "learning_rate": 6.887217419001232e-07, + "loss": 0.84897506, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.38671875, + "step": 12102, + "time_per_iteration": 3.857746124267578 + }, + { + "auxiliary_loss_clip": 0.01055397, + "auxiliary_loss_mlp": 0.01026654, + "balance_loss_clip": 1.01467931, + "balance_loss_mlp": 1.01744127, + "epoch": 0.7276717270404329, + "flos": 21463170111360.0, + "grad_norm": 2.297293787539673, + "language_loss": 0.81519306, + "learning_rate": 6.884364813368841e-07, + "loss": 0.83601362, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37890625, + "step": 12103, + "time_per_iteration": 2.4539427757263184 + }, + { + "auxiliary_loss_clip": 0.0105886, + "auxiliary_loss_mlp": 0.01020669, + "balance_loss_clip": 1.00852156, + "balance_loss_mlp": 1.01997542, + "epoch": 0.7277318502931008, + "flos": 16836270203520.0, + "grad_norm": 1.546113355368156, + "language_loss": 0.66564882, + "learning_rate": 6.881512675799735e-07, + "loss": 0.6864441, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38867188, + "step": 12104, + "time_per_iteration": 2.4529025554656982 + }, + { + "auxiliary_loss_clip": 0.01054911, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.01845288, + "balance_loss_mlp": 1.01829958, + "epoch": 0.7277919735457689, + "flos": 33399171394560.0, + "grad_norm": 1.7791158218699976, + "language_loss": 0.66155624, + "learning_rate": 6.878661006395687e-07, + "loss": 0.68241364, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.3671875, + "step": 12105, + "time_per_iteration": 2.5232350826263428 + }, + { + "auxiliary_loss_clip": 0.01058635, + "auxiliary_loss_mlp": 0.01023248, + "balance_loss_clip": 1.01120174, + "balance_loss_mlp": 1.01963127, + "epoch": 0.7278520967984368, + "flos": 19205546221440.0, + "grad_norm": 2.0485399053464133, + "language_loss": 0.75657213, + "learning_rate": 6.875809805258488e-07, + "loss": 0.77739096, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 12106, + "time_per_iteration": 2.4050400257110596 + }, + { + "auxiliary_loss_clip": 0.01059646, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.01774669, + "balance_loss_mlp": 1.01978397, + "epoch": 0.7279122200511048, + "flos": 34093244689920.0, + "grad_norm": 3.726580778798343, + "language_loss": 0.70373112, + "learning_rate": 6.872959072489872e-07, + "loss": 0.72462928, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 12107, + "time_per_iteration": 2.4932475090026855 + }, + { + "auxiliary_loss_clip": 0.01057214, + "auxiliary_loss_mlp": 0.01026911, + "balance_loss_clip": 1.01506162, + "balance_loss_mlp": 1.01897156, + "epoch": 0.7279723433037727, + "flos": 54597071928960.0, + "grad_norm": 1.6502177552781432, + "language_loss": 0.75289166, + "learning_rate": 6.870108808191574e-07, + "loss": 0.7737329, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 12108, + "time_per_iteration": 2.700761079788208 + }, + { + "auxiliary_loss_clip": 0.01057777, + "auxiliary_loss_mlp": 0.01025975, + "balance_loss_clip": 1.01396465, + "balance_loss_mlp": 1.01840329, + "epoch": 0.7280324665564407, + "flos": 36136012872960.0, + "grad_norm": 1.522872065238769, + "language_loss": 0.67166889, + "learning_rate": 6.867259012465331e-07, + "loss": 0.69250643, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 12109, + "time_per_iteration": 2.517411231994629 + }, + { + "auxiliary_loss_clip": 0.01057766, + "auxiliary_loss_mlp": 0.01023711, + "balance_loss_clip": 1.0110209, + "balance_loss_mlp": 1.01875484, + "epoch": 0.7280925898091086, + "flos": 11617767440640.0, + "grad_norm": 1.9220233515751477, + "language_loss": 0.63401663, + "learning_rate": 6.864409685412822e-07, + "loss": 0.65483141, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 12110, + "time_per_iteration": 2.3724470138549805 + }, + { + "auxiliary_loss_clip": 0.01059271, + "auxiliary_loss_mlp": 0.0102507, + "balance_loss_clip": 1.01174283, + "balance_loss_mlp": 1.01847589, + "epoch": 0.7281527130617766, + "flos": 34275665877120.0, + "grad_norm": 1.820329176167351, + "language_loss": 0.58181047, + "learning_rate": 6.861560827135746e-07, + "loss": 0.60265386, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40820312, + "step": 12111, + "time_per_iteration": 2.499248504638672 + }, + { + "auxiliary_loss_clip": 0.01057313, + "auxiliary_loss_mlp": 0.01024996, + "balance_loss_clip": 1.01244938, + "balance_loss_mlp": 1.0182333, + "epoch": 0.7282128363144446, + "flos": 13917182094720.0, + "grad_norm": 2.953987382350149, + "language_loss": 0.73998642, + "learning_rate": 6.858712437735761e-07, + "loss": 0.76080954, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 12112, + "time_per_iteration": 2.3639674186706543 + }, + { + "auxiliary_loss_clip": 0.01055247, + "auxiliary_loss_mlp": 0.01025414, + "balance_loss_clip": 1.01391578, + "balance_loss_mlp": 1.01827288, + "epoch": 0.7282729595671126, + "flos": 20776567847040.0, + "grad_norm": 1.9066303531442834, + "language_loss": 0.66772777, + "learning_rate": 6.855864517314541e-07, + "loss": 0.68853438, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36914062, + "step": 12113, + "time_per_iteration": 5.3532795906066895 + }, + { + "auxiliary_loss_clip": 0.01057489, + "auxiliary_loss_mlp": 0.01024023, + "balance_loss_clip": 1.01226938, + "balance_loss_mlp": 1.0187372, + "epoch": 0.7283330828197806, + "flos": 16324513361280.0, + "grad_norm": 1.564628965060063, + "language_loss": 0.73145366, + "learning_rate": 6.853017065973692e-07, + "loss": 0.75226879, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38867188, + "step": 12114, + "time_per_iteration": 2.408719301223755 + }, + { + "auxiliary_loss_clip": 0.010561, + "auxiliary_loss_mlp": 0.01022326, + "balance_loss_clip": 1.01069105, + "balance_loss_mlp": 1.01759493, + "epoch": 0.7283932060724485, + "flos": 27488969308800.0, + "grad_norm": 1.9137535973754543, + "language_loss": 0.68114829, + "learning_rate": 6.850170083814852e-07, + "loss": 0.70193255, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38476562, + "step": 12115, + "time_per_iteration": 2.4411768913269043 + }, + { + "auxiliary_loss_clip": 0.01059576, + "auxiliary_loss_mlp": 0.01023132, + "balance_loss_clip": 1.01062107, + "balance_loss_mlp": 1.01852548, + "epoch": 0.7284533293251165, + "flos": 18366932430720.0, + "grad_norm": 1.65795650143952, + "language_loss": 0.60263252, + "learning_rate": 6.847323570939616e-07, + "loss": 0.62345958, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 12116, + "time_per_iteration": 2.406975507736206 + }, + { + "auxiliary_loss_clip": 0.01056364, + "auxiliary_loss_mlp": 0.01024858, + "balance_loss_clip": 1.01309216, + "balance_loss_mlp": 1.01819634, + "epoch": 0.7285134525777844, + "flos": 21724459793280.0, + "grad_norm": 1.8034270277998983, + "language_loss": 0.72781372, + "learning_rate": 6.844477527449568e-07, + "loss": 0.74862599, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.3828125, + "step": 12117, + "time_per_iteration": 2.409393072128296 + }, + { + "auxiliary_loss_clip": 0.01058477, + "auxiliary_loss_mlp": 0.0102057, + "balance_loss_clip": 1.00881636, + "balance_loss_mlp": 1.01966166, + "epoch": 0.7285735758304525, + "flos": 20740293077760.0, + "grad_norm": 1.9768095004249797, + "language_loss": 0.69516563, + "learning_rate": 6.841631953446272e-07, + "loss": 0.71595609, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38867188, + "step": 12118, + "time_per_iteration": 2.4372470378875732 + }, + { + "auxiliary_loss_clip": 0.0105425, + "auxiliary_loss_mlp": 0.01023684, + "balance_loss_clip": 1.01264548, + "balance_loss_mlp": 1.01767421, + "epoch": 0.7286336990831204, + "flos": 17310006708480.0, + "grad_norm": 1.6145274240406948, + "language_loss": 0.83908707, + "learning_rate": 6.838786849031291e-07, + "loss": 0.85986638, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36523438, + "step": 12119, + "time_per_iteration": 2.3759801387786865 + }, + { + "auxiliary_loss_clip": 0.01057869, + "auxiliary_loss_mlp": 0.01022259, + "balance_loss_clip": 1.01069546, + "balance_loss_mlp": 1.01856577, + "epoch": 0.7286938223357884, + "flos": 19786501111680.0, + "grad_norm": 2.074978576731114, + "language_loss": 0.81491053, + "learning_rate": 6.835942214306151e-07, + "loss": 0.83571184, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39257812, + "step": 12120, + "time_per_iteration": 2.386312484741211 + }, + { + "auxiliary_loss_clip": 0.01058449, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.01455116, + "balance_loss_mlp": 1.01981997, + "epoch": 0.7287539455884563, + "flos": 15339962620800.0, + "grad_norm": 1.8658847695277636, + "language_loss": 0.78335696, + "learning_rate": 6.833098049372375e-07, + "loss": 0.80421245, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38671875, + "step": 12121, + "time_per_iteration": 3.7819395065307617 + }, + { + "auxiliary_loss_clip": 0.0105783, + "auxiliary_loss_mlp": 0.01026109, + "balance_loss_clip": 1.01333618, + "balance_loss_mlp": 1.01802635, + "epoch": 0.7288140688411243, + "flos": 25191300222720.0, + "grad_norm": 1.5671021919387405, + "language_loss": 0.69087589, + "learning_rate": 6.830254354331458e-07, + "loss": 0.71171522, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.3984375, + "step": 12122, + "time_per_iteration": 2.4430956840515137 + }, + { + "auxiliary_loss_clip": 0.01055466, + "auxiliary_loss_mlp": 0.01022019, + "balance_loss_clip": 1.01071787, + "balance_loss_mlp": 1.01850677, + "epoch": 0.7288741920937922, + "flos": 23983131024000.0, + "grad_norm": 1.788892454269446, + "language_loss": 0.87049395, + "learning_rate": 6.827411129284886e-07, + "loss": 0.89126885, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36914062, + "step": 12123, + "time_per_iteration": 2.475998878479004 + }, + { + "auxiliary_loss_clip": 0.0105627, + "auxiliary_loss_mlp": 0.01023623, + "balance_loss_clip": 1.01275134, + "balance_loss_mlp": 1.01782441, + "epoch": 0.7289343153464602, + "flos": 22743888848640.0, + "grad_norm": 1.9042047495147854, + "language_loss": 0.82609373, + "learning_rate": 6.824568374334125e-07, + "loss": 0.84689271, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.3828125, + "step": 12124, + "time_per_iteration": 2.402543544769287 + }, + { + "auxiliary_loss_clip": 0.01056133, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.01452208, + "balance_loss_mlp": 1.01864243, + "epoch": 0.7289944385991282, + "flos": 24898867741440.0, + "grad_norm": 1.7949963014814492, + "language_loss": 0.67269576, + "learning_rate": 6.821726089580624e-07, + "loss": 0.69351935, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.375, + "step": 12125, + "time_per_iteration": 2.4380135536193848 + }, + { + "auxiliary_loss_clip": 0.01059438, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.02058816, + "balance_loss_mlp": 1.01802552, + "epoch": 0.7290545618517962, + "flos": 22965936295680.0, + "grad_norm": 2.2231319609131055, + "language_loss": 0.75149679, + "learning_rate": 6.818884275125831e-07, + "loss": 0.7724278, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 12126, + "time_per_iteration": 2.444546937942505 + }, + { + "auxiliary_loss_clip": 0.01006787, + "auxiliary_loss_mlp": 0.01001815, + "balance_loss_clip": 1.0008142, + "balance_loss_mlp": 1.00037491, + "epoch": 0.7291146851044642, + "flos": 61898176736640.0, + "grad_norm": 0.8148629641102407, + "language_loss": 0.58657372, + "learning_rate": 6.816042931071142e-07, + "loss": 0.60665977, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.06445312, + "step": 12127, + "time_per_iteration": 2.988057851791382 + }, + { + "auxiliary_loss_clip": 0.0105651, + "auxiliary_loss_mlp": 0.0102396, + "balance_loss_clip": 1.0127182, + "balance_loss_mlp": 1.01931405, + "epoch": 0.7291748083571321, + "flos": 23329836063360.0, + "grad_norm": 2.053117709690317, + "language_loss": 0.68521869, + "learning_rate": 6.813202057517973e-07, + "loss": 0.70602334, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37109375, + "step": 12128, + "time_per_iteration": 2.406209945678711 + }, + { + "auxiliary_loss_clip": 0.01058468, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.01412797, + "balance_loss_mlp": 1.01894975, + "epoch": 0.7292349316098001, + "flos": 28328735174400.0, + "grad_norm": 2.0705333940362687, + "language_loss": 0.65316057, + "learning_rate": 6.810361654567695e-07, + "loss": 0.6740135, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 12129, + "time_per_iteration": 2.4781129360198975 + }, + { + "auxiliary_loss_clip": 0.01054328, + "auxiliary_loss_mlp": 0.0101993, + "balance_loss_clip": 1.00823009, + "balance_loss_mlp": 1.01788807, + "epoch": 0.729295054862468, + "flos": 24131127744000.0, + "grad_norm": 2.277529328017726, + "language_loss": 0.74251628, + "learning_rate": 6.807521722321697e-07, + "loss": 0.76325887, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.36328125, + "step": 12130, + "time_per_iteration": 2.4176132678985596 + }, + { + "auxiliary_loss_clip": 0.01056807, + "auxiliary_loss_mlp": 0.01023195, + "balance_loss_clip": 1.01109481, + "balance_loss_mlp": 1.01837599, + "epoch": 0.7293551781151361, + "flos": 22815251400960.0, + "grad_norm": 1.6891423445117983, + "language_loss": 0.7004692, + "learning_rate": 6.804682260881298e-07, + "loss": 0.72126925, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38476562, + "step": 12131, + "time_per_iteration": 2.4347240924835205 + }, + { + "auxiliary_loss_clip": 0.01057806, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01386499, + "balance_loss_mlp": 1.0201118, + "epoch": 0.729415301367804, + "flos": 22125611848320.0, + "grad_norm": 1.6393381239485105, + "language_loss": 0.7756238, + "learning_rate": 6.801843270347854e-07, + "loss": 0.79645097, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37890625, + "step": 12132, + "time_per_iteration": 2.417330503463745 + }, + { + "auxiliary_loss_clip": 0.01059022, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.0203979, + "balance_loss_mlp": 1.01973951, + "epoch": 0.729475424620472, + "flos": 12348778821120.0, + "grad_norm": 2.08858599226155, + "language_loss": 0.7916882, + "learning_rate": 6.799004750822672e-07, + "loss": 0.81260711, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39257812, + "step": 12133, + "time_per_iteration": 2.407796859741211 + }, + { + "auxiliary_loss_clip": 0.01057064, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.01657963, + "balance_loss_mlp": 1.01799405, + "epoch": 0.7295355478731399, + "flos": 22194356048640.0, + "grad_norm": 1.906960312683527, + "language_loss": 0.72154331, + "learning_rate": 6.796166702407055e-07, + "loss": 0.74239802, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.390625, + "step": 12134, + "time_per_iteration": 2.3872268199920654 + }, + { + "auxiliary_loss_clip": 0.01058728, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.01883018, + "balance_loss_mlp": 1.01946282, + "epoch": 0.7295956711258079, + "flos": 23220907021440.0, + "grad_norm": 3.3408041779505857, + "language_loss": 0.71492177, + "learning_rate": 6.793329125202278e-07, + "loss": 0.7358132, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39257812, + "step": 12135, + "time_per_iteration": 2.4251174926757812 + }, + { + "auxiliary_loss_clip": 0.01059124, + "auxiliary_loss_mlp": 0.01024788, + "balance_loss_clip": 1.01312923, + "balance_loss_mlp": 1.01926756, + "epoch": 0.7296557943784758, + "flos": 31867741117440.0, + "grad_norm": 1.817606474102737, + "language_loss": 0.61997604, + "learning_rate": 6.790492019309628e-07, + "loss": 0.64081514, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3984375, + "step": 12136, + "time_per_iteration": 2.49261474609375 + }, + { + "auxiliary_loss_clip": 0.01054869, + "auxiliary_loss_mlp": 0.01028275, + "balance_loss_clip": 1.01729572, + "balance_loss_mlp": 1.01741362, + "epoch": 0.7297159176311439, + "flos": 26650495163520.0, + "grad_norm": 2.732169375707953, + "language_loss": 0.74455589, + "learning_rate": 6.787655384830328e-07, + "loss": 0.7653873, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.375, + "step": 12137, + "time_per_iteration": 2.431112289428711 + }, + { + "auxiliary_loss_clip": 0.01059255, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.01879501, + "balance_loss_mlp": 1.02016091, + "epoch": 0.7297760408838118, + "flos": 24748531960320.0, + "grad_norm": 1.8715440070700557, + "language_loss": 0.74025762, + "learning_rate": 6.784819221865619e-07, + "loss": 0.76115274, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.390625, + "step": 12138, + "time_per_iteration": 2.4384446144104004 + }, + { + "auxiliary_loss_clip": 0.01055901, + "auxiliary_loss_mlp": 0.01022426, + "balance_loss_clip": 1.01119685, + "balance_loss_mlp": 1.01846862, + "epoch": 0.7298361641364798, + "flos": 18072894026880.0, + "grad_norm": 2.1857687071116394, + "language_loss": 0.71573877, + "learning_rate": 6.781983530516722e-07, + "loss": 0.73652208, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37304688, + "step": 12139, + "time_per_iteration": 2.374129295349121 + }, + { + "auxiliary_loss_clip": 0.01054861, + "auxiliary_loss_mlp": 0.01021891, + "balance_loss_clip": 1.01120365, + "balance_loss_mlp": 1.019099, + "epoch": 0.7298962873891478, + "flos": 29894380450560.0, + "grad_norm": 1.5910533770826145, + "language_loss": 0.73572612, + "learning_rate": 6.779148310884832e-07, + "loss": 0.75649363, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.35742188, + "step": 12140, + "time_per_iteration": 2.5200369358062744 + }, + { + "auxiliary_loss_clip": 0.01057943, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.01551414, + "balance_loss_mlp": 1.01907086, + "epoch": 0.7299564106418157, + "flos": 32264843454720.0, + "grad_norm": 1.9180808399287488, + "language_loss": 0.55484319, + "learning_rate": 6.776313563071132e-07, + "loss": 0.57570362, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 12141, + "time_per_iteration": 2.4869332313537598 + }, + { + "auxiliary_loss_clip": 0.01056164, + "auxiliary_loss_mlp": 0.01020406, + "balance_loss_clip": 1.00945091, + "balance_loss_mlp": 1.01867008, + "epoch": 0.7300165338944837, + "flos": 22929172767360.0, + "grad_norm": 1.331051528981529, + "language_loss": 0.72516918, + "learning_rate": 6.77347928717678e-07, + "loss": 0.7459349, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.375, + "step": 12142, + "time_per_iteration": 3.8506991863250732 + }, + { + "auxiliary_loss_clip": 0.01056597, + "auxiliary_loss_mlp": 0.01024755, + "balance_loss_clip": 1.01403785, + "balance_loss_mlp": 1.01902425, + "epoch": 0.7300766571471516, + "flos": 19827768205440.0, + "grad_norm": 1.7822707449611306, + "language_loss": 0.73611861, + "learning_rate": 6.770645483302941e-07, + "loss": 0.75693214, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.375, + "step": 12143, + "time_per_iteration": 2.413691520690918 + }, + { + "auxiliary_loss_clip": 0.01057952, + "auxiliary_loss_mlp": 0.01023605, + "balance_loss_clip": 1.01157629, + "balance_loss_mlp": 1.01910257, + "epoch": 0.7301367803998197, + "flos": 24346821323520.0, + "grad_norm": 2.3599604000928327, + "language_loss": 0.66353333, + "learning_rate": 6.767812151550722e-07, + "loss": 0.68434894, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38867188, + "step": 12144, + "time_per_iteration": 2.437361478805542 + }, + { + "auxiliary_loss_clip": 0.01055567, + "auxiliary_loss_mlp": 0.01022664, + "balance_loss_clip": 1.01115417, + "balance_loss_mlp": 1.01808238, + "epoch": 0.7301969036524876, + "flos": 15303618028800.0, + "grad_norm": 1.529206196578569, + "language_loss": 0.79083085, + "learning_rate": 6.764979292021256e-07, + "loss": 0.8116132, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.375, + "step": 12145, + "time_per_iteration": 2.4142439365386963 + }, + { + "auxiliary_loss_clip": 0.01056641, + "auxiliary_loss_mlp": 0.01023434, + "balance_loss_clip": 1.01190698, + "balance_loss_mlp": 1.01950431, + "epoch": 0.7302570269051556, + "flos": 23506322319360.0, + "grad_norm": 1.6998270048905406, + "language_loss": 0.91411883, + "learning_rate": 6.762146904815629e-07, + "loss": 0.93491954, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37109375, + "step": 12146, + "time_per_iteration": 2.411301374435425 + }, + { + "auxiliary_loss_clip": 0.01055389, + "auxiliary_loss_mlp": 0.01022081, + "balance_loss_clip": 1.01060152, + "balance_loss_mlp": 1.01900268, + "epoch": 0.7303171501578235, + "flos": 20521981146240.0, + "grad_norm": 1.6851245437280753, + "language_loss": 0.62065983, + "learning_rate": 6.759314990034939e-07, + "loss": 0.64143455, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.36328125, + "step": 12147, + "time_per_iteration": 2.4705607891082764 + }, + { + "auxiliary_loss_clip": 0.01057984, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.01753747, + "balance_loss_mlp": 1.01978707, + "epoch": 0.7303772734104915, + "flos": 18331635179520.0, + "grad_norm": 2.2995920325998638, + "language_loss": 0.75449538, + "learning_rate": 6.756483547780225e-07, + "loss": 0.77537119, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38085938, + "step": 12148, + "time_per_iteration": 2.416613817214966 + }, + { + "auxiliary_loss_clip": 0.01006999, + "auxiliary_loss_mlp": 0.01001251, + "balance_loss_clip": 1.00029182, + "balance_loss_mlp": 1.00066674, + "epoch": 0.7304373966631594, + "flos": 60651638087040.0, + "grad_norm": 0.7150357258081217, + "language_loss": 0.56791079, + "learning_rate": 6.753652578152555e-07, + "loss": 0.58799326, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.06347656, + "step": 12149, + "time_per_iteration": 3.024137020111084 + }, + { + "auxiliary_loss_clip": 0.01056741, + "auxiliary_loss_mlp": 0.01024196, + "balance_loss_clip": 1.01189339, + "balance_loss_mlp": 1.0176754, + "epoch": 0.7304975199158275, + "flos": 19827069978240.0, + "grad_norm": 1.9007986530708698, + "language_loss": 0.7382285, + "learning_rate": 6.75082208125295e-07, + "loss": 0.75903785, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.390625, + "step": 12150, + "time_per_iteration": 2.3873963356018066 + }, + { + "auxiliary_loss_clip": 0.01059489, + "auxiliary_loss_mlp": 0.0102607, + "balance_loss_clip": 1.01345813, + "balance_loss_mlp": 1.01899052, + "epoch": 0.7305576431684954, + "flos": 13223178622080.0, + "grad_norm": 2.019996028475269, + "language_loss": 0.84029919, + "learning_rate": 6.747992057182423e-07, + "loss": 0.86115479, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40625, + "step": 12151, + "time_per_iteration": 2.39165997505188 + }, + { + "auxiliary_loss_clip": 0.0105802, + "auxiliary_loss_mlp": 0.01027542, + "balance_loss_clip": 1.01445889, + "balance_loss_mlp": 1.01882136, + "epoch": 0.7306177664211634, + "flos": 24059346255360.0, + "grad_norm": 1.6950631199644135, + "language_loss": 0.77993143, + "learning_rate": 6.745162506041972e-07, + "loss": 0.80078709, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.39257812, + "step": 12152, + "time_per_iteration": 2.464449882507324 + }, + { + "auxiliary_loss_clip": 0.01059906, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.0122062, + "balance_loss_mlp": 1.02056193, + "epoch": 0.7306778896738314, + "flos": 27087887076480.0, + "grad_norm": 1.5610149484775242, + "language_loss": 0.79597902, + "learning_rate": 6.742333427932577e-07, + "loss": 0.81681848, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 12153, + "time_per_iteration": 4.046667575836182 + }, + { + "auxiliary_loss_clip": 0.01058517, + "auxiliary_loss_mlp": 0.01025172, + "balance_loss_clip": 1.01366282, + "balance_loss_mlp": 1.01849318, + "epoch": 0.7307380129264993, + "flos": 16689739760640.0, + "grad_norm": 1.929783874770788, + "language_loss": 0.67400038, + "learning_rate": 6.739504822955195e-07, + "loss": 0.69483727, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40039062, + "step": 12154, + "time_per_iteration": 2.3749823570251465 + }, + { + "auxiliary_loss_clip": 0.010561, + "auxiliary_loss_mlp": 0.01027392, + "balance_loss_clip": 1.01550102, + "balance_loss_mlp": 1.01755905, + "epoch": 0.7307981361791673, + "flos": 21724669261440.0, + "grad_norm": 1.7662336050041378, + "language_loss": 0.6584695, + "learning_rate": 6.736676691210772e-07, + "loss": 0.67930442, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38671875, + "step": 12155, + "time_per_iteration": 2.4054813385009766 + }, + { + "auxiliary_loss_clip": 0.01054714, + "auxiliary_loss_mlp": 0.01022228, + "balance_loss_clip": 1.01061678, + "balance_loss_mlp": 1.01763034, + "epoch": 0.7308582594318352, + "flos": 18039691457280.0, + "grad_norm": 1.736606316876608, + "language_loss": 0.81868398, + "learning_rate": 6.733849032800247e-07, + "loss": 0.83945346, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37109375, + "step": 12156, + "time_per_iteration": 2.3886520862579346 + }, + { + "auxiliary_loss_clip": 0.01054764, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.01411903, + "balance_loss_mlp": 1.01782513, + "epoch": 0.7309183826845033, + "flos": 13844108885760.0, + "grad_norm": 2.1925410427445304, + "language_loss": 0.75210649, + "learning_rate": 6.731021847824528e-07, + "loss": 0.77290368, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36914062, + "step": 12157, + "time_per_iteration": 2.3835504055023193 + }, + { + "auxiliary_loss_clip": 0.01052991, + "auxiliary_loss_mlp": 0.01023094, + "balance_loss_clip": 1.01258612, + "balance_loss_mlp": 1.01718283, + "epoch": 0.7309785059371712, + "flos": 17018272454400.0, + "grad_norm": 1.9356341480280583, + "language_loss": 0.72962332, + "learning_rate": 6.728195136384502e-07, + "loss": 0.75038415, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.35742188, + "step": 12158, + "time_per_iteration": 2.3670454025268555 + }, + { + "auxiliary_loss_clip": 0.01058227, + "auxiliary_loss_mlp": 0.01026348, + "balance_loss_clip": 1.01473069, + "balance_loss_mlp": 1.01960969, + "epoch": 0.7310386291898392, + "flos": 26501276545920.0, + "grad_norm": 1.5224829129843418, + "language_loss": 0.85557669, + "learning_rate": 6.725368898581049e-07, + "loss": 0.87642241, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 12159, + "time_per_iteration": 2.443847417831421 + }, + { + "auxiliary_loss_clip": 0.01058973, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.01360178, + "balance_loss_mlp": 1.01835227, + "epoch": 0.7310987524425071, + "flos": 16944989777280.0, + "grad_norm": 1.893289846931089, + "language_loss": 0.73462242, + "learning_rate": 6.722543134515046e-07, + "loss": 0.7554704, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40625, + "step": 12160, + "time_per_iteration": 3.8254055976867676 + }, + { + "auxiliary_loss_clip": 0.01059196, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.01538539, + "balance_loss_mlp": 1.01925826, + "epoch": 0.7311588756951751, + "flos": 13844423088000.0, + "grad_norm": 1.8294125705785265, + "language_loss": 0.72416961, + "learning_rate": 6.719717844287314e-07, + "loss": 0.74504185, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3984375, + "step": 12161, + "time_per_iteration": 2.3701956272125244 + }, + { + "auxiliary_loss_clip": 0.01061117, + "auxiliary_loss_mlp": 0.01024527, + "balance_loss_clip": 1.0115335, + "balance_loss_mlp": 1.01925576, + "epoch": 0.731218998947843, + "flos": 28766615846400.0, + "grad_norm": 2.4103949390416224, + "language_loss": 0.69663548, + "learning_rate": 6.716893027998695e-07, + "loss": 0.71749192, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41796875, + "step": 12162, + "time_per_iteration": 2.4579970836639404 + }, + { + "auxiliary_loss_clip": 0.01061135, + "auxiliary_loss_mlp": 0.0102516, + "balance_loss_clip": 1.01269639, + "balance_loss_mlp": 1.02062726, + "epoch": 0.7312791222005111, + "flos": 27087572874240.0, + "grad_norm": 1.6616914170718455, + "language_loss": 0.66755474, + "learning_rate": 6.71406868574999e-07, + "loss": 0.68841767, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40625, + "step": 12163, + "time_per_iteration": 2.434591770172119 + }, + { + "auxiliary_loss_clip": 0.01058235, + "auxiliary_loss_mlp": 0.01023497, + "balance_loss_clip": 1.01174283, + "balance_loss_mlp": 1.01952004, + "epoch": 0.731339245453179, + "flos": 20922958644480.0, + "grad_norm": 1.525289911751317, + "language_loss": 0.68447256, + "learning_rate": 6.71124481764201e-07, + "loss": 0.70528996, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 12164, + "time_per_iteration": 2.4047727584838867 + }, + { + "auxiliary_loss_clip": 0.01056683, + "auxiliary_loss_mlp": 0.01023639, + "balance_loss_clip": 1.01224887, + "balance_loss_mlp": 1.01995826, + "epoch": 0.731399368705847, + "flos": 23074586046720.0, + "grad_norm": 1.7146463211126453, + "language_loss": 0.73013353, + "learning_rate": 6.708421423775507e-07, + "loss": 0.75093675, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3671875, + "step": 12165, + "time_per_iteration": 2.4321727752685547 + }, + { + "auxiliary_loss_clip": 0.01059198, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.01637113, + "balance_loss_mlp": 1.01937795, + "epoch": 0.731459491958515, + "flos": 23581664766720.0, + "grad_norm": 1.7736447012383365, + "language_loss": 0.76650131, + "learning_rate": 6.705598504251262e-07, + "loss": 0.78738338, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3984375, + "step": 12166, + "time_per_iteration": 2.44063663482666 + }, + { + "auxiliary_loss_clip": 0.01056679, + "auxiliary_loss_mlp": 0.01025104, + "balance_loss_clip": 1.01326036, + "balance_loss_mlp": 1.01868129, + "epoch": 0.7315196152111829, + "flos": 22378278424320.0, + "grad_norm": 1.7315717937563553, + "language_loss": 0.79597187, + "learning_rate": 6.702776059170014e-07, + "loss": 0.81678969, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38085938, + "step": 12167, + "time_per_iteration": 2.4115347862243652 + }, + { + "auxiliary_loss_clip": 0.01057281, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.01921582, + "balance_loss_mlp": 1.01960087, + "epoch": 0.7315797384638509, + "flos": 26175850963200.0, + "grad_norm": 1.7538591833924904, + "language_loss": 0.77479267, + "learning_rate": 6.699954088632471e-07, + "loss": 0.79567647, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.37695312, + "step": 12168, + "time_per_iteration": 2.4571404457092285 + }, + { + "auxiliary_loss_clip": 0.01058279, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.01549733, + "balance_loss_mlp": 1.01846886, + "epoch": 0.7316398617165188, + "flos": 21505275077760.0, + "grad_norm": 1.6620403425157497, + "language_loss": 0.81579441, + "learning_rate": 6.697132592739363e-07, + "loss": 0.83666277, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.3984375, + "step": 12169, + "time_per_iteration": 2.407305955886841 + }, + { + "auxiliary_loss_clip": 0.01059122, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.01639891, + "balance_loss_mlp": 1.01895571, + "epoch": 0.7316999849691869, + "flos": 30481235360640.0, + "grad_norm": 1.5927452388581762, + "language_loss": 0.70075566, + "learning_rate": 6.694311571591371e-07, + "loss": 0.72163308, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 12170, + "time_per_iteration": 2.4704179763793945 + }, + { + "auxiliary_loss_clip": 0.01055927, + "auxiliary_loss_mlp": 0.0102371, + "balance_loss_clip": 1.01114583, + "balance_loss_mlp": 1.01849461, + "epoch": 0.7317601082218548, + "flos": 21542701921920.0, + "grad_norm": 2.452730622116455, + "language_loss": 0.69637871, + "learning_rate": 6.691491025289173e-07, + "loss": 0.71717507, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.375, + "step": 12171, + "time_per_iteration": 2.4160735607147217 + }, + { + "auxiliary_loss_clip": 0.01057345, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.01839745, + "balance_loss_mlp": 1.02010202, + "epoch": 0.7318202314745228, + "flos": 33250301890560.0, + "grad_norm": 1.8558663646440612, + "language_loss": 0.65625215, + "learning_rate": 6.688670953933422e-07, + "loss": 0.67712438, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37109375, + "step": 12172, + "time_per_iteration": 2.513596534729004 + }, + { + "auxiliary_loss_clip": 0.01058797, + "auxiliary_loss_mlp": 0.01021995, + "balance_loss_clip": 1.01000237, + "balance_loss_mlp": 1.01922119, + "epoch": 0.7318803547271907, + "flos": 20156021608320.0, + "grad_norm": 2.5248084750332023, + "language_loss": 0.60788971, + "learning_rate": 6.685851357624769e-07, + "loss": 0.62869763, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39648438, + "step": 12173, + "time_per_iteration": 2.4442641735076904 + }, + { + "auxiliary_loss_clip": 0.01057004, + "auxiliary_loss_mlp": 0.01020857, + "balance_loss_clip": 1.00924599, + "balance_loss_mlp": 1.0189029, + "epoch": 0.7319404779798587, + "flos": 20557487865600.0, + "grad_norm": 2.6843676335285322, + "language_loss": 0.74025053, + "learning_rate": 6.683032236463833e-07, + "loss": 0.76102918, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38085938, + "step": 12174, + "time_per_iteration": 2.40861439704895 + }, + { + "auxiliary_loss_clip": 0.0105527, + "auxiliary_loss_mlp": 0.01023795, + "balance_loss_clip": 1.01258886, + "balance_loss_mlp": 1.01828742, + "epoch": 0.7320006012325266, + "flos": 28694101219200.0, + "grad_norm": 1.4916766630798053, + "language_loss": 0.80658871, + "learning_rate": 6.680213590551222e-07, + "loss": 0.82737935, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37109375, + "step": 12175, + "time_per_iteration": 2.5119428634643555 + }, + { + "auxiliary_loss_clip": 0.01056675, + "auxiliary_loss_mlp": 0.01024307, + "balance_loss_clip": 1.01197529, + "balance_loss_mlp": 1.01822782, + "epoch": 0.7320607244851947, + "flos": 16361765648640.0, + "grad_norm": 2.080598641141152, + "language_loss": 0.77206206, + "learning_rate": 6.67739541998752e-07, + "loss": 0.79287189, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38476562, + "step": 12176, + "time_per_iteration": 2.410691738128662 + }, + { + "auxiliary_loss_clip": 0.01057182, + "auxiliary_loss_mlp": 0.01026229, + "balance_loss_clip": 1.01409924, + "balance_loss_mlp": 1.01893198, + "epoch": 0.7321208477378626, + "flos": 20954171266560.0, + "grad_norm": 1.4411506885128529, + "language_loss": 0.76576161, + "learning_rate": 6.674577724873316e-07, + "loss": 0.78659576, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3828125, + "step": 12177, + "time_per_iteration": 2.525519609451294 + }, + { + "auxiliary_loss_clip": 0.01057206, + "auxiliary_loss_mlp": 0.01023753, + "balance_loss_clip": 1.01280355, + "balance_loss_mlp": 1.01878059, + "epoch": 0.7321809709905306, + "flos": 13844213619840.0, + "grad_norm": 2.4404310297618315, + "language_loss": 0.7402817, + "learning_rate": 6.671760505309143e-07, + "loss": 0.76109135, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3828125, + "step": 12178, + "time_per_iteration": 2.389409065246582 + }, + { + "auxiliary_loss_clip": 0.0105918, + "auxiliary_loss_mlp": 0.01021696, + "balance_loss_clip": 1.00984097, + "balance_loss_mlp": 1.01913166, + "epoch": 0.7322410942431986, + "flos": 26978713655040.0, + "grad_norm": 2.432410624912958, + "language_loss": 0.8271122, + "learning_rate": 6.66894376139556e-07, + "loss": 0.84792095, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40234375, + "step": 12179, + "time_per_iteration": 2.4234888553619385 + }, + { + "auxiliary_loss_clip": 0.01054674, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.0135231, + "balance_loss_mlp": 1.01899147, + "epoch": 0.7323012174958665, + "flos": 17638748870400.0, + "grad_norm": 1.5182115082021415, + "language_loss": 0.74219179, + "learning_rate": 6.666127493233084e-07, + "loss": 0.76297748, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.35742188, + "step": 12180, + "time_per_iteration": 2.366661548614502 + }, + { + "auxiliary_loss_clip": 0.0105863, + "auxiliary_loss_mlp": 0.01022566, + "balance_loss_clip": 1.00963163, + "balance_loss_mlp": 1.017694, + "epoch": 0.7323613407485345, + "flos": 32341407799680.0, + "grad_norm": 2.8852414957639194, + "language_loss": 0.79911458, + "learning_rate": 6.663311700922218e-07, + "loss": 0.81992656, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 12181, + "time_per_iteration": 3.9004061222076416 + }, + { + "auxiliary_loss_clip": 0.01059749, + "auxiliary_loss_mlp": 0.01024692, + "balance_loss_clip": 1.01262248, + "balance_loss_mlp": 1.0199275, + "epoch": 0.7324214640012024, + "flos": 18361975017600.0, + "grad_norm": 1.7356823017227685, + "language_loss": 0.71739435, + "learning_rate": 6.660496384563452e-07, + "loss": 0.73823881, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.3984375, + "step": 12182, + "time_per_iteration": 2.3955211639404297 + }, + { + "auxiliary_loss_clip": 0.01060854, + "auxiliary_loss_mlp": 0.01025482, + "balance_loss_clip": 1.01332879, + "balance_loss_mlp": 1.01987076, + "epoch": 0.7324815872538705, + "flos": 30810920129280.0, + "grad_norm": 1.5183529454469356, + "language_loss": 0.70894825, + "learning_rate": 6.657681544257249e-07, + "loss": 0.72981167, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.41015625, + "step": 12183, + "time_per_iteration": 2.4702823162078857 + }, + { + "auxiliary_loss_clip": 0.01060174, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.01841593, + "balance_loss_mlp": 1.02031279, + "epoch": 0.7325417105065384, + "flos": 21504053180160.0, + "grad_norm": 1.9643458516227297, + "language_loss": 0.7255609, + "learning_rate": 6.654867180104085e-07, + "loss": 0.74647218, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3984375, + "step": 12184, + "time_per_iteration": 2.3833987712860107 + }, + { + "auxiliary_loss_clip": 0.01006827, + "auxiliary_loss_mlp": 0.01001311, + "balance_loss_clip": 1.00036335, + "balance_loss_mlp": 1.00046015, + "epoch": 0.7326018337592064, + "flos": 67254447191040.0, + "grad_norm": 0.7856746612572021, + "language_loss": 0.65222108, + "learning_rate": 6.652053292204371e-07, + "loss": 0.67230237, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.06347656, + "step": 12185, + "time_per_iteration": 3.0148074626922607 + }, + { + "auxiliary_loss_clip": 0.01056921, + "auxiliary_loss_mlp": 0.01023525, + "balance_loss_clip": 1.01091838, + "balance_loss_mlp": 1.01755261, + "epoch": 0.7326619570118743, + "flos": 22855959912960.0, + "grad_norm": 1.988179143341366, + "language_loss": 0.80633676, + "learning_rate": 6.649239880658546e-07, + "loss": 0.82714117, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39453125, + "step": 12186, + "time_per_iteration": 2.4229841232299805 + }, + { + "auxiliary_loss_clip": 0.01057241, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.01602101, + "balance_loss_mlp": 1.01880527, + "epoch": 0.7327220802645423, + "flos": 23326484572800.0, + "grad_norm": 1.7065209922038207, + "language_loss": 0.73348391, + "learning_rate": 6.646426945567008e-07, + "loss": 0.7543453, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.38476562, + "step": 12187, + "time_per_iteration": 2.4892325401306152 + }, + { + "auxiliary_loss_clip": 0.01060166, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.0197705, + "balance_loss_mlp": 1.02004516, + "epoch": 0.7327822035172102, + "flos": 23179674839040.0, + "grad_norm": 1.7489214166038292, + "language_loss": 0.89069206, + "learning_rate": 6.64361448703014e-07, + "loss": 0.91161621, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.40234375, + "step": 12188, + "time_per_iteration": 2.4273340702056885 + }, + { + "auxiliary_loss_clip": 0.01060772, + "auxiliary_loss_mlp": 0.01024587, + "balance_loss_clip": 1.01162291, + "balance_loss_mlp": 1.01982629, + "epoch": 0.7328423267698783, + "flos": 21065613926400.0, + "grad_norm": 1.9699334005106715, + "language_loss": 0.74463558, + "learning_rate": 6.64080250514831e-07, + "loss": 0.76548916, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.41015625, + "step": 12189, + "time_per_iteration": 2.4053096771240234 + }, + { + "auxiliary_loss_clip": 0.0105764, + "auxiliary_loss_mlp": 0.01023593, + "balance_loss_clip": 1.0110817, + "balance_loss_mlp": 1.01815975, + "epoch": 0.7329024500225462, + "flos": 21688499226240.0, + "grad_norm": 1.500628218727467, + "language_loss": 0.82362616, + "learning_rate": 6.637991000021883e-07, + "loss": 0.84443849, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 12190, + "time_per_iteration": 2.4129340648651123 + }, + { + "auxiliary_loss_clip": 0.01060279, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.0164957, + "balance_loss_mlp": 1.01953185, + "epoch": 0.7329625732752142, + "flos": 24163073504640.0, + "grad_norm": 2.03939410480496, + "language_loss": 0.71768075, + "learning_rate": 6.635179971751184e-07, + "loss": 0.73858273, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.40820312, + "step": 12191, + "time_per_iteration": 2.436431884765625 + }, + { + "auxiliary_loss_clip": 0.01059685, + "auxiliary_loss_mlp": 0.01028545, + "balance_loss_clip": 1.01627254, + "balance_loss_mlp": 1.01984262, + "epoch": 0.7330226965278822, + "flos": 30076696903680.0, + "grad_norm": 1.5846178928348054, + "language_loss": 0.69293422, + "learning_rate": 6.632369420436532e-07, + "loss": 0.71381658, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 12192, + "time_per_iteration": 5.399645805358887 + }, + { + "auxiliary_loss_clip": 0.01055219, + "auxiliary_loss_mlp": 0.01022861, + "balance_loss_clip": 1.01100564, + "balance_loss_mlp": 1.01731253, + "epoch": 0.7330828197805501, + "flos": 23367158173440.0, + "grad_norm": 1.530347175564547, + "language_loss": 0.73439938, + "learning_rate": 6.629559346178226e-07, + "loss": 0.75518024, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 12193, + "time_per_iteration": 2.419410467147827 + }, + { + "auxiliary_loss_clip": 0.01058102, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.0123471, + "balance_loss_mlp": 1.01867938, + "epoch": 0.7331429430332181, + "flos": 21031748040960.0, + "grad_norm": 1.9016458447195872, + "language_loss": 0.76908535, + "learning_rate": 6.626749749076566e-07, + "loss": 0.78992015, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.39453125, + "step": 12194, + "time_per_iteration": 2.3694605827331543 + }, + { + "auxiliary_loss_clip": 0.0105637, + "auxiliary_loss_mlp": 0.0102556, + "balance_loss_clip": 1.01341891, + "balance_loss_mlp": 1.01820564, + "epoch": 0.733203066285886, + "flos": 14647006488960.0, + "grad_norm": 1.9563167187744142, + "language_loss": 0.70445156, + "learning_rate": 6.623940629231793e-07, + "loss": 0.72527081, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38085938, + "step": 12195, + "time_per_iteration": 2.3515748977661133 + }, + { + "auxiliary_loss_clip": 0.01059757, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.01731551, + "balance_loss_mlp": 1.01963377, + "epoch": 0.7332631895385541, + "flos": 17164349049600.0, + "grad_norm": 6.3445269913854485, + "language_loss": 0.72198194, + "learning_rate": 6.621131986744179e-07, + "loss": 0.74287093, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40234375, + "step": 12196, + "time_per_iteration": 2.375838279724121 + }, + { + "auxiliary_loss_clip": 0.01058987, + "auxiliary_loss_mlp": 0.01025493, + "balance_loss_clip": 1.01301146, + "balance_loss_mlp": 1.02012658, + "epoch": 0.733323312791222, + "flos": 28656883843200.0, + "grad_norm": 1.617737438051895, + "language_loss": 0.71477115, + "learning_rate": 6.618323821713956e-07, + "loss": 0.73561597, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 12197, + "time_per_iteration": 2.4744904041290283 + }, + { + "auxiliary_loss_clip": 0.01060152, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.01724648, + "balance_loss_mlp": 1.01878953, + "epoch": 0.73338343604389, + "flos": 16617469512960.0, + "grad_norm": 2.050366458235755, + "language_loss": 0.7794522, + "learning_rate": 6.615516134241321e-07, + "loss": 0.80036306, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.4140625, + "step": 12198, + "time_per_iteration": 2.382197141647339 + }, + { + "auxiliary_loss_clip": 0.01058266, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.01442599, + "balance_loss_mlp": 1.01940799, + "epoch": 0.7334435592965579, + "flos": 21141026196480.0, + "grad_norm": 1.9993166478436049, + "language_loss": 0.7435627, + "learning_rate": 6.612708924426496e-07, + "loss": 0.76440156, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38867188, + "step": 12199, + "time_per_iteration": 2.395162582397461 + }, + { + "auxiliary_loss_clip": 0.01060426, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.01751113, + "balance_loss_mlp": 1.0201534, + "epoch": 0.7335036825492259, + "flos": 17124478410240.0, + "grad_norm": 2.1681916641144876, + "language_loss": 0.77307659, + "learning_rate": 6.609902192369643e-07, + "loss": 0.79398888, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40234375, + "step": 12200, + "time_per_iteration": 3.8770785331726074 + }, + { + "auxiliary_loss_clip": 0.01059244, + "auxiliary_loss_mlp": 0.01023784, + "balance_loss_clip": 1.01205349, + "balance_loss_mlp": 1.02008927, + "epoch": 0.7335638058018938, + "flos": 23730708827520.0, + "grad_norm": 1.5843465584838052, + "language_loss": 0.61258167, + "learning_rate": 6.60709593817095e-07, + "loss": 0.633412, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 12201, + "time_per_iteration": 2.449751853942871 + }, + { + "auxiliary_loss_clip": 0.01057006, + "auxiliary_loss_mlp": 0.01018629, + "balance_loss_clip": 1.0073216, + "balance_loss_mlp": 1.01899648, + "epoch": 0.7336239290545619, + "flos": 34931858480640.0, + "grad_norm": 1.7065698145925887, + "language_loss": 0.51793277, + "learning_rate": 6.604290161930541e-07, + "loss": 0.53868914, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 12202, + "time_per_iteration": 2.521649122238159 + }, + { + "auxiliary_loss_clip": 0.01058704, + "auxiliary_loss_mlp": 0.01024375, + "balance_loss_clip": 1.01190567, + "balance_loss_mlp": 1.02017546, + "epoch": 0.7336840523072298, + "flos": 21102063252480.0, + "grad_norm": 1.866391677419385, + "language_loss": 0.68686855, + "learning_rate": 6.601484863748565e-07, + "loss": 0.7076993, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38476562, + "step": 12203, + "time_per_iteration": 2.3949978351593018 + }, + { + "auxiliary_loss_clip": 0.01060132, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01435781, + "balance_loss_mlp": 1.01835966, + "epoch": 0.7337441755598978, + "flos": 24023280954240.0, + "grad_norm": 2.266328209481374, + "language_loss": 0.65952003, + "learning_rate": 6.598680043725129e-07, + "loss": 0.68039012, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41796875, + "step": 12204, + "time_per_iteration": 2.4140193462371826 + }, + { + "auxiliary_loss_clip": 0.01057866, + "auxiliary_loss_mlp": 0.01026578, + "balance_loss_clip": 1.01447773, + "balance_loss_mlp": 1.0197053, + "epoch": 0.7338042988125658, + "flos": 22710197520000.0, + "grad_norm": 2.1101504405723417, + "language_loss": 0.76988077, + "learning_rate": 6.59587570196033e-07, + "loss": 0.79072523, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38085938, + "step": 12205, + "time_per_iteration": 2.4227383136749268 + }, + { + "auxiliary_loss_clip": 0.01058875, + "auxiliary_loss_mlp": 0.01019388, + "balance_loss_clip": 1.00835502, + "balance_loss_mlp": 1.02075112, + "epoch": 0.7338644220652337, + "flos": 21359931621120.0, + "grad_norm": 2.5626305282347412, + "language_loss": 0.79721588, + "learning_rate": 6.593071838554239e-07, + "loss": 0.81799853, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38085938, + "step": 12206, + "time_per_iteration": 2.4001522064208984 + }, + { + "auxiliary_loss_clip": 0.01057526, + "auxiliary_loss_mlp": 0.01021822, + "balance_loss_clip": 1.00934696, + "balance_loss_mlp": 1.01896238, + "epoch": 0.7339245453179017, + "flos": 30918906564480.0, + "grad_norm": 2.03280487481077, + "language_loss": 0.71938932, + "learning_rate": 6.590268453606936e-07, + "loss": 0.74018276, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.38671875, + "step": 12207, + "time_per_iteration": 2.5088624954223633 + }, + { + "auxiliary_loss_clip": 0.01006548, + "auxiliary_loss_mlp": 0.01000543, + "balance_loss_clip": 0.99968451, + "balance_loss_mlp": 1.00016999, + "epoch": 0.7339846685705697, + "flos": 67896535605120.0, + "grad_norm": 0.7835750670723087, + "language_loss": 0.54905593, + "learning_rate": 6.587465547218456e-07, + "loss": 0.56912684, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.00860596, + "router_z_loss_mlp": 0.06396484, + "step": 12208, + "time_per_iteration": 3.113116979598999 + }, + { + "auxiliary_loss_clip": 0.01055848, + "auxiliary_loss_mlp": 0.01024414, + "balance_loss_clip": 1.01342845, + "balance_loss_mlp": 1.01827955, + "epoch": 0.7340447918232377, + "flos": 22235658053760.0, + "grad_norm": 1.3367692521686039, + "language_loss": 0.80438733, + "learning_rate": 6.584663119488832e-07, + "loss": 0.82518995, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37695312, + "step": 12209, + "time_per_iteration": 2.454413414001465 + }, + { + "auxiliary_loss_clip": 0.01054655, + "auxiliary_loss_mlp": 0.01023815, + "balance_loss_clip": 1.01266944, + "balance_loss_mlp": 1.01699901, + "epoch": 0.7341049150759056, + "flos": 23763771751680.0, + "grad_norm": 1.4503927359073898, + "language_loss": 0.78836453, + "learning_rate": 6.581861170518064e-07, + "loss": 0.80914927, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 12210, + "time_per_iteration": 2.4260454177856445 + }, + { + "auxiliary_loss_clip": 0.01054996, + "auxiliary_loss_mlp": 0.01023149, + "balance_loss_clip": 1.01187205, + "balance_loss_mlp": 1.01798487, + "epoch": 0.7341650383285736, + "flos": 17235641779200.0, + "grad_norm": 1.7710755556273263, + "language_loss": 0.76341939, + "learning_rate": 6.579059700406171e-07, + "loss": 0.78420091, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37109375, + "step": 12211, + "time_per_iteration": 2.4361770153045654 + }, + { + "auxiliary_loss_clip": 0.01057971, + "auxiliary_loss_mlp": 0.01024123, + "balance_loss_clip": 1.01195192, + "balance_loss_mlp": 1.01910949, + "epoch": 0.7342251615812415, + "flos": 23402839449600.0, + "grad_norm": 2.1925882161055235, + "language_loss": 0.72180223, + "learning_rate": 6.576258709253106e-07, + "loss": 0.74262321, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38867188, + "step": 12212, + "time_per_iteration": 2.4378764629364014 + }, + { + "auxiliary_loss_clip": 0.01057548, + "auxiliary_loss_mlp": 0.01024667, + "balance_loss_clip": 1.01301455, + "balance_loss_mlp": 1.0188961, + "epoch": 0.7342852848339095, + "flos": 22746088264320.0, + "grad_norm": 1.4812840467907442, + "language_loss": 0.68497914, + "learning_rate": 6.573458197158833e-07, + "loss": 0.70580131, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38671875, + "step": 12213, + "time_per_iteration": 2.4564502239227295 + }, + { + "auxiliary_loss_clip": 0.0100665, + "auxiliary_loss_mlp": 0.01000472, + "balance_loss_clip": 0.99953049, + "balance_loss_mlp": 1.00043941, + "epoch": 0.7343454080865774, + "flos": 53939376270720.0, + "grad_norm": 0.7232728269407199, + "language_loss": 0.54277658, + "learning_rate": 6.570658164223311e-07, + "loss": 0.56284773, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.06201172, + "step": 12214, + "time_per_iteration": 2.970184564590454 + }, + { + "auxiliary_loss_clip": 0.01058435, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.01335466, + "balance_loss_mlp": 1.01822495, + "epoch": 0.7344055313392455, + "flos": 12166043431680.0, + "grad_norm": 1.9575005052451595, + "language_loss": 0.70029759, + "learning_rate": 6.567858610546442e-07, + "loss": 0.72113574, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.40234375, + "step": 12215, + "time_per_iteration": 2.3866004943847656 + }, + { + "auxiliary_loss_clip": 0.01055896, + "auxiliary_loss_mlp": 0.01021877, + "balance_loss_clip": 1.01131511, + "balance_loss_mlp": 1.01889002, + "epoch": 0.7344656545919134, + "flos": 18549109238400.0, + "grad_norm": 1.5712302887725762, + "language_loss": 0.72834539, + "learning_rate": 6.565059536228153e-07, + "loss": 0.74912316, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.37109375, + "step": 12216, + "time_per_iteration": 2.3784830570220947 + }, + { + "auxiliary_loss_clip": 0.01061065, + "auxiliary_loss_mlp": 0.01026233, + "balance_loss_clip": 1.01196361, + "balance_loss_mlp": 1.01917672, + "epoch": 0.7345257778445814, + "flos": 23660463438720.0, + "grad_norm": 1.7691580794638708, + "language_loss": 0.78582293, + "learning_rate": 6.562260941368325e-07, + "loss": 0.80669594, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.41796875, + "step": 12217, + "time_per_iteration": 2.452493906021118 + }, + { + "auxiliary_loss_clip": 0.01055752, + "auxiliary_loss_mlp": 0.01022452, + "balance_loss_clip": 1.01138341, + "balance_loss_mlp": 1.01820731, + "epoch": 0.7345859010972494, + "flos": 13807799205120.0, + "grad_norm": 2.193712008025406, + "language_loss": 0.77864206, + "learning_rate": 6.55946282606685e-07, + "loss": 0.79942405, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.375, + "step": 12218, + "time_per_iteration": 2.3618016242980957 + }, + { + "auxiliary_loss_clip": 0.01056226, + "auxiliary_loss_mlp": 0.01024732, + "balance_loss_clip": 1.0127213, + "balance_loss_mlp": 1.01906466, + "epoch": 0.7346460243499173, + "flos": 22271653532160.0, + "grad_norm": 1.8208431532012654, + "language_loss": 0.72179651, + "learning_rate": 6.556665190423562e-07, + "loss": 0.74260604, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37109375, + "step": 12219, + "time_per_iteration": 2.4190218448638916 + }, + { + "auxiliary_loss_clip": 0.01055334, + "auxiliary_loss_mlp": 0.01024272, + "balance_loss_clip": 1.01311946, + "balance_loss_mlp": 1.01785994, + "epoch": 0.7347061476025853, + "flos": 23254214325120.0, + "grad_norm": 1.6888816140486393, + "language_loss": 0.74729294, + "learning_rate": 6.553868034538319e-07, + "loss": 0.76808894, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.375, + "step": 12220, + "time_per_iteration": 2.411688804626465 + }, + { + "auxiliary_loss_clip": 0.01059247, + "auxiliary_loss_mlp": 0.01021429, + "balance_loss_clip": 1.00952625, + "balance_loss_mlp": 1.01950598, + "epoch": 0.7347662708552533, + "flos": 15266679943680.0, + "grad_norm": 1.644118962921263, + "language_loss": 0.6587106, + "learning_rate": 6.55107135851094e-07, + "loss": 0.67951733, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 12221, + "time_per_iteration": 3.790140390396118 + }, + { + "auxiliary_loss_clip": 0.01057523, + "auxiliary_loss_mlp": 0.01024476, + "balance_loss_clip": 1.01246548, + "balance_loss_mlp": 1.0192318, + "epoch": 0.7348263941079213, + "flos": 24858927279360.0, + "grad_norm": 2.0756460031243766, + "language_loss": 0.69519162, + "learning_rate": 6.548275162441228e-07, + "loss": 0.71601158, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 12222, + "time_per_iteration": 2.4269402027130127 + }, + { + "auxiliary_loss_clip": 0.01056612, + "auxiliary_loss_mlp": 0.01025733, + "balance_loss_clip": 1.01531458, + "balance_loss_mlp": 1.01886189, + "epoch": 0.7348865173605892, + "flos": 24350975775360.0, + "grad_norm": 2.4294949303647995, + "language_loss": 0.65948337, + "learning_rate": 6.545479446428965e-07, + "loss": 0.68030685, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.37695312, + "step": 12223, + "time_per_iteration": 2.4506702423095703 + }, + { + "auxiliary_loss_clip": 0.01058046, + "auxiliary_loss_mlp": 0.01023715, + "balance_loss_clip": 1.01239634, + "balance_loss_mlp": 1.01909733, + "epoch": 0.7349466406132572, + "flos": 20003765702400.0, + "grad_norm": 1.7597166625123855, + "language_loss": 0.72337776, + "learning_rate": 6.542684210573948e-07, + "loss": 0.74419534, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38867188, + "step": 12224, + "time_per_iteration": 2.4174914360046387 + }, + { + "auxiliary_loss_clip": 0.0105917, + "auxiliary_loss_mlp": 0.01023215, + "balance_loss_clip": 1.01075792, + "balance_loss_mlp": 1.01993418, + "epoch": 0.7350067638659251, + "flos": 29823785948160.0, + "grad_norm": 5.536150580835948, + "language_loss": 0.72737914, + "learning_rate": 6.5398894549759e-07, + "loss": 0.74820298, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39257812, + "step": 12225, + "time_per_iteration": 2.4828662872314453 + }, + { + "auxiliary_loss_clip": 0.0105952, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.01287889, + "balance_loss_mlp": 1.0185678, + "epoch": 0.7350668871185931, + "flos": 21865229861760.0, + "grad_norm": 1.7589148919448896, + "language_loss": 0.76343644, + "learning_rate": 6.53709517973458e-07, + "loss": 0.78429842, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.41015625, + "step": 12226, + "time_per_iteration": 2.404454469680786 + }, + { + "auxiliary_loss_clip": 0.01055977, + "auxiliary_loss_mlp": 0.01022489, + "balance_loss_clip": 1.01066959, + "balance_loss_mlp": 1.01896596, + "epoch": 0.735127010371261, + "flos": 22564993708800.0, + "grad_norm": 1.5965850409611757, + "language_loss": 0.67058969, + "learning_rate": 6.534301384949703e-07, + "loss": 0.6913743, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.36914062, + "step": 12227, + "time_per_iteration": 2.438338279724121 + }, + { + "auxiliary_loss_clip": 0.0105711, + "auxiliary_loss_mlp": 0.01027189, + "balance_loss_clip": 1.01517856, + "balance_loss_mlp": 1.01823306, + "epoch": 0.7351871336239291, + "flos": 25883174102400.0, + "grad_norm": 1.4010363340113776, + "language_loss": 0.63569474, + "learning_rate": 6.531508070720972e-07, + "loss": 0.65653777, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38867188, + "step": 12228, + "time_per_iteration": 2.4734928607940674 + }, + { + "auxiliary_loss_clip": 0.01056331, + "auxiliary_loss_mlp": 0.01026365, + "balance_loss_clip": 1.01527262, + "balance_loss_mlp": 1.01790667, + "epoch": 0.735247256876597, + "flos": 17931181351680.0, + "grad_norm": 1.5166267693500521, + "language_loss": 0.63325703, + "learning_rate": 6.528715237148073e-07, + "loss": 0.65408397, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.3828125, + "step": 12229, + "time_per_iteration": 2.384789228439331 + }, + { + "auxiliary_loss_clip": 0.01059975, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.01850891, + "balance_loss_mlp": 1.01990485, + "epoch": 0.735307380129265, + "flos": 28873938965760.0, + "grad_norm": 2.0194892256459425, + "language_loss": 0.73265922, + "learning_rate": 6.525922884330668e-07, + "loss": 0.75356305, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40234375, + "step": 12230, + "time_per_iteration": 2.4410037994384766 + }, + { + "auxiliary_loss_clip": 0.01059284, + "auxiliary_loss_mlp": 0.01025408, + "balance_loss_clip": 1.01280177, + "balance_loss_mlp": 1.02028108, + "epoch": 0.7353675033819329, + "flos": 13624819436160.0, + "grad_norm": 4.195923302905663, + "language_loss": 0.71759367, + "learning_rate": 6.523131012368428e-07, + "loss": 0.73844057, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 12231, + "time_per_iteration": 3.860565185546875 + }, + { + "auxiliary_loss_clip": 0.01058383, + "auxiliary_loss_mlp": 0.01027436, + "balance_loss_clip": 1.01372683, + "balance_loss_mlp": 1.01861811, + "epoch": 0.7354276266346009, + "flos": 19462087958400.0, + "grad_norm": 2.1699586761522527, + "language_loss": 0.64013839, + "learning_rate": 6.520339621360964e-07, + "loss": 0.66099656, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.3984375, + "step": 12232, + "time_per_iteration": 3.8204996585845947 + }, + { + "auxiliary_loss_clip": 0.01057882, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.01456428, + "balance_loss_mlp": 1.01890206, + "epoch": 0.735487749887269, + "flos": 15771140311680.0, + "grad_norm": 2.2004382778040634, + "language_loss": 0.75179827, + "learning_rate": 6.51754871140791e-07, + "loss": 0.77264738, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.390625, + "step": 12233, + "time_per_iteration": 2.3725509643554688 + }, + { + "auxiliary_loss_clip": 0.01059194, + "auxiliary_loss_mlp": 0.0102358, + "balance_loss_clip": 1.01105165, + "balance_loss_mlp": 1.0197382, + "epoch": 0.7355478731399369, + "flos": 18259644222720.0, + "grad_norm": 1.4476426288111621, + "language_loss": 0.73609209, + "learning_rate": 6.514758282608856e-07, + "loss": 0.75691986, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 12234, + "time_per_iteration": 2.3913426399230957 + }, + { + "auxiliary_loss_clip": 0.01058957, + "auxiliary_loss_mlp": 0.01026249, + "balance_loss_clip": 1.01423228, + "balance_loss_mlp": 1.02005816, + "epoch": 0.7356079963926049, + "flos": 26540832983040.0, + "grad_norm": 1.9667131701809237, + "language_loss": 0.60150361, + "learning_rate": 6.511968335063405e-07, + "loss": 0.6223557, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38867188, + "step": 12235, + "time_per_iteration": 2.465308904647827 + }, + { + "auxiliary_loss_clip": 0.01057707, + "auxiliary_loss_mlp": 0.01023316, + "balance_loss_clip": 1.01147246, + "balance_loss_mlp": 1.019804, + "epoch": 0.7356681196452728, + "flos": 10777896840960.0, + "grad_norm": 2.943774885067505, + "language_loss": 0.85509509, + "learning_rate": 6.509178868871092e-07, + "loss": 0.87590528, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 12236, + "time_per_iteration": 2.3844776153564453 + }, + { + "auxiliary_loss_clip": 0.01057665, + "auxiliary_loss_mlp": 0.01020419, + "balance_loss_clip": 1.00893879, + "balance_loss_mlp": 1.01890171, + "epoch": 0.7357282428979408, + "flos": 19717687088640.0, + "grad_norm": 1.6852799870768425, + "language_loss": 0.69492882, + "learning_rate": 6.506389884131494e-07, + "loss": 0.71570969, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38671875, + "step": 12237, + "time_per_iteration": 2.4045326709747314 + }, + { + "auxiliary_loss_clip": 0.01057245, + "auxiliary_loss_mlp": 0.01020326, + "balance_loss_clip": 1.00883389, + "balance_loss_mlp": 1.0179745, + "epoch": 0.7357883661506087, + "flos": 19462995653760.0, + "grad_norm": 1.6869186459274068, + "language_loss": 0.71918416, + "learning_rate": 6.503601380944128e-07, + "loss": 0.73995984, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 12238, + "time_per_iteration": 2.376610517501831 + }, + { + "auxiliary_loss_clip": 0.01059357, + "auxiliary_loss_mlp": 0.01025633, + "balance_loss_clip": 1.01257992, + "balance_loss_mlp": 1.01820064, + "epoch": 0.7358484894032767, + "flos": 27121857696000.0, + "grad_norm": 1.8698562671589594, + "language_loss": 0.72591114, + "learning_rate": 6.500813359408513e-07, + "loss": 0.74676102, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.41210938, + "step": 12239, + "time_per_iteration": 3.8793461322784424 + }, + { + "auxiliary_loss_clip": 0.01055436, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.01343608, + "balance_loss_mlp": 1.01858914, + "epoch": 0.7359086126559446, + "flos": 24501032265600.0, + "grad_norm": 1.321428635205359, + "language_loss": 0.79155004, + "learning_rate": 6.498025819624138e-07, + "loss": 0.81235552, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.36914062, + "step": 12240, + "time_per_iteration": 2.587630271911621 + }, + { + "auxiliary_loss_clip": 0.01056718, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.01425838, + "balance_loss_mlp": 1.01940656, + "epoch": 0.7359687359086127, + "flos": 23330150265600.0, + "grad_norm": 1.762534019549688, + "language_loss": 0.83008599, + "learning_rate": 6.495238761690503e-07, + "loss": 0.8509109, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37304688, + "step": 12241, + "time_per_iteration": 2.4342875480651855 + }, + { + "auxiliary_loss_clip": 0.01056481, + "auxiliary_loss_mlp": 0.01022117, + "balance_loss_clip": 1.01050568, + "balance_loss_mlp": 1.01836109, + "epoch": 0.7360288591612806, + "flos": 20192366200320.0, + "grad_norm": 2.2068239564372947, + "language_loss": 0.78924406, + "learning_rate": 6.492452185707052e-07, + "loss": 0.81002998, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38085938, + "step": 12242, + "time_per_iteration": 2.388817071914673 + }, + { + "auxiliary_loss_clip": 0.01059217, + "auxiliary_loss_mlp": 0.01023958, + "balance_loss_clip": 1.01281166, + "balance_loss_mlp": 1.02013516, + "epoch": 0.7360889824139486, + "flos": 24971626748160.0, + "grad_norm": 1.9596849586119105, + "language_loss": 0.79318613, + "learning_rate": 6.489666091773231e-07, + "loss": 0.81401789, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.390625, + "step": 12243, + "time_per_iteration": 2.444636821746826 + }, + { + "auxiliary_loss_clip": 0.01058054, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.01522589, + "balance_loss_mlp": 1.01846957, + "epoch": 0.7361491056666165, + "flos": 15011429927040.0, + "grad_norm": 6.809347989651617, + "language_loss": 0.74231362, + "learning_rate": 6.486880479988481e-07, + "loss": 0.76316708, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 12244, + "time_per_iteration": 2.3841235637664795 + }, + { + "auxiliary_loss_clip": 0.0105665, + "auxiliary_loss_mlp": 0.01021448, + "balance_loss_clip": 1.00956321, + "balance_loss_mlp": 1.01782084, + "epoch": 0.7362092289192845, + "flos": 22929277501440.0, + "grad_norm": 1.7421539282578484, + "language_loss": 0.6972158, + "learning_rate": 6.484095350452205e-07, + "loss": 0.71799678, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38867188, + "step": 12245, + "time_per_iteration": 2.4210288524627686 + }, + { + "auxiliary_loss_clip": 0.01055537, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.01631975, + "balance_loss_mlp": 1.01811218, + "epoch": 0.7362693521719526, + "flos": 20701679247360.0, + "grad_norm": 1.4161433473379634, + "language_loss": 0.73117703, + "learning_rate": 6.4813107032638e-07, + "loss": 0.75201583, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.375, + "step": 12246, + "time_per_iteration": 2.4094126224517822 + }, + { + "auxiliary_loss_clip": 0.01053455, + "auxiliary_loss_mlp": 0.01019628, + "balance_loss_clip": 1.00856578, + "balance_loss_mlp": 1.01740599, + "epoch": 0.7363294754246205, + "flos": 13110653710080.0, + "grad_norm": 1.9928963219609632, + "language_loss": 0.73585695, + "learning_rate": 6.478526538522638e-07, + "loss": 0.75658774, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.359375, + "step": 12247, + "time_per_iteration": 2.3834590911865234 + }, + { + "auxiliary_loss_clip": 0.01056903, + "auxiliary_loss_mlp": 0.01022414, + "balance_loss_clip": 1.01079714, + "balance_loss_mlp": 1.02060795, + "epoch": 0.7363895986772885, + "flos": 14026564984320.0, + "grad_norm": 1.5962974258724754, + "language_loss": 0.7428261, + "learning_rate": 6.475742856328093e-07, + "loss": 0.76361924, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.36328125, + "step": 12248, + "time_per_iteration": 2.4040534496307373 + }, + { + "auxiliary_loss_clip": 0.01055803, + "auxiliary_loss_mlp": 0.01027509, + "balance_loss_clip": 1.01510501, + "balance_loss_mlp": 1.01766348, + "epoch": 0.7364497219299564, + "flos": 19718943897600.0, + "grad_norm": 1.5721745166107264, + "language_loss": 0.72040308, + "learning_rate": 6.472959656779482e-07, + "loss": 0.74123621, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38085938, + "step": 12249, + "time_per_iteration": 2.3858227729797363 + }, + { + "auxiliary_loss_clip": 0.01059936, + "auxiliary_loss_mlp": 0.01024902, + "balance_loss_clip": 1.01255751, + "balance_loss_mlp": 1.01982749, + "epoch": 0.7365098451826244, + "flos": 21360315646080.0, + "grad_norm": 1.715117500447878, + "language_loss": 0.77086389, + "learning_rate": 6.470176939976153e-07, + "loss": 0.79171222, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40039062, + "step": 12250, + "time_per_iteration": 2.4274628162384033 + }, + { + "auxiliary_loss_clip": 0.01059422, + "auxiliary_loss_mlp": 0.01022549, + "balance_loss_clip": 1.01003194, + "balance_loss_mlp": 1.01964402, + "epoch": 0.7365699684352923, + "flos": 23367088350720.0, + "grad_norm": 1.6156967454507751, + "language_loss": 0.7130326, + "learning_rate": 6.467394706017402e-07, + "loss": 0.73385233, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3984375, + "step": 12251, + "time_per_iteration": 2.4185190200805664 + }, + { + "auxiliary_loss_clip": 0.01055736, + "auxiliary_loss_mlp": 0.01025858, + "balance_loss_clip": 1.01402605, + "balance_loss_mlp": 1.01820636, + "epoch": 0.7366300916879603, + "flos": 59522758185600.0, + "grad_norm": 1.5265002243603172, + "language_loss": 0.65871966, + "learning_rate": 6.464612955002535e-07, + "loss": 0.67953557, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.375, + "step": 12252, + "time_per_iteration": 2.7734243869781494 + }, + { + "auxiliary_loss_clip": 0.01061223, + "auxiliary_loss_mlp": 0.01026105, + "balance_loss_clip": 1.01411223, + "balance_loss_mlp": 1.02103484, + "epoch": 0.7366902149406283, + "flos": 20922085860480.0, + "grad_norm": 1.5741860160404793, + "language_loss": 0.79289144, + "learning_rate": 6.461831687030801e-07, + "loss": 0.81376469, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 12253, + "time_per_iteration": 2.4123969078063965 + }, + { + "auxiliary_loss_clip": 0.01053967, + "auxiliary_loss_mlp": 0.01020656, + "balance_loss_clip": 1.00946784, + "balance_loss_mlp": 1.01794958, + "epoch": 0.7367503381932963, + "flos": 17347189173120.0, + "grad_norm": 2.1455390923232045, + "language_loss": 0.76314569, + "learning_rate": 6.459050902201477e-07, + "loss": 0.78389186, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.359375, + "step": 12254, + "time_per_iteration": 2.3496997356414795 + }, + { + "auxiliary_loss_clip": 0.0105649, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.01470661, + "balance_loss_mlp": 1.01832342, + "epoch": 0.7368104614459642, + "flos": 17820367096320.0, + "grad_norm": 3.1220026136769983, + "language_loss": 0.69268203, + "learning_rate": 6.456270600613795e-07, + "loss": 0.71351457, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.3828125, + "step": 12255, + "time_per_iteration": 2.3589117527008057 + }, + { + "auxiliary_loss_clip": 0.0105808, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.01697791, + "balance_loss_mlp": 1.01882493, + "epoch": 0.7368705846986322, + "flos": 24605003894400.0, + "grad_norm": 1.7102084057883526, + "language_loss": 0.75160986, + "learning_rate": 6.453490782366977e-07, + "loss": 0.77248633, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39257812, + "step": 12256, + "time_per_iteration": 2.41969895362854 + }, + { + "auxiliary_loss_clip": 0.01057062, + "auxiliary_loss_mlp": 0.0102402, + "balance_loss_clip": 1.01184297, + "balance_loss_mlp": 1.01856947, + "epoch": 0.7369307079513001, + "flos": 34968726743040.0, + "grad_norm": 1.8178538029896802, + "language_loss": 0.73569119, + "learning_rate": 6.450711447560227e-07, + "loss": 0.75650203, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38476562, + "step": 12257, + "time_per_iteration": 2.504638195037842 + }, + { + "auxiliary_loss_clip": 0.01054917, + "auxiliary_loss_mlp": 0.01025303, + "balance_loss_clip": 1.0141449, + "balance_loss_mlp": 1.01724923, + "epoch": 0.7369908312039681, + "flos": 21213540823680.0, + "grad_norm": 1.5064102662860597, + "language_loss": 0.78029913, + "learning_rate": 6.447932596292731e-07, + "loss": 0.80110133, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 12258, + "time_per_iteration": 2.4046809673309326 + }, + { + "auxiliary_loss_clip": 0.01060644, + "auxiliary_loss_mlp": 0.01026537, + "balance_loss_clip": 1.01404953, + "balance_loss_mlp": 1.02093041, + "epoch": 0.7370509544566362, + "flos": 23622512924160.0, + "grad_norm": 1.3964681216451262, + "language_loss": 0.66343427, + "learning_rate": 6.44515422866366e-07, + "loss": 0.68430609, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39648438, + "step": 12259, + "time_per_iteration": 2.4140267372131348 + }, + { + "auxiliary_loss_clip": 0.01057122, + "auxiliary_loss_mlp": 0.01025215, + "balance_loss_clip": 1.0132823, + "balance_loss_mlp": 1.0186317, + "epoch": 0.7371110777093041, + "flos": 24826527671040.0, + "grad_norm": 2.341845443945711, + "language_loss": 0.73039126, + "learning_rate": 6.442376344772165e-07, + "loss": 0.75121462, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38476562, + "step": 12260, + "time_per_iteration": 3.857858180999756 + }, + { + "auxiliary_loss_clip": 0.0105873, + "auxiliary_loss_mlp": 0.01022957, + "balance_loss_clip": 1.01164365, + "balance_loss_mlp": 1.01906335, + "epoch": 0.7371712009619721, + "flos": 23148357482880.0, + "grad_norm": 1.5690100114803718, + "language_loss": 0.73490334, + "learning_rate": 6.439598944717386e-07, + "loss": 0.7557202, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3984375, + "step": 12261, + "time_per_iteration": 2.4272451400756836 + }, + { + "auxiliary_loss_clip": 0.01058918, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.01098025, + "balance_loss_mlp": 1.01834702, + "epoch": 0.73723132421464, + "flos": 23111768511360.0, + "grad_norm": 1.9813761036173492, + "language_loss": 0.67246461, + "learning_rate": 6.436822028598441e-07, + "loss": 0.69329786, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.40625, + "step": 12262, + "time_per_iteration": 2.4040145874023438 + }, + { + "auxiliary_loss_clip": 0.01058438, + "auxiliary_loss_mlp": 0.01023235, + "balance_loss_clip": 1.01078916, + "balance_loss_mlp": 1.01897788, + "epoch": 0.737291447467308, + "flos": 19272544853760.0, + "grad_norm": 3.682773663181581, + "language_loss": 0.88755953, + "learning_rate": 6.434045596514431e-07, + "loss": 0.90837622, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 12263, + "time_per_iteration": 2.386225938796997 + }, + { + "auxiliary_loss_clip": 0.01053573, + "auxiliary_loss_mlp": 0.01022221, + "balance_loss_clip": 1.01186192, + "balance_loss_mlp": 1.01823676, + "epoch": 0.7373515707199759, + "flos": 25117109850240.0, + "grad_norm": 1.6098472376259079, + "language_loss": 0.67169547, + "learning_rate": 6.431269648564428e-07, + "loss": 0.69245338, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.35351562, + "step": 12264, + "time_per_iteration": 2.4625935554504395 + }, + { + "auxiliary_loss_clip": 0.01054506, + "auxiliary_loss_mlp": 0.01020396, + "balance_loss_clip": 1.0094645, + "balance_loss_mlp": 1.01801729, + "epoch": 0.737411693972644, + "flos": 32407324179840.0, + "grad_norm": 1.7952140767859484, + "language_loss": 0.80005455, + "learning_rate": 6.428494184847524e-07, + "loss": 0.82080364, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36523438, + "step": 12265, + "time_per_iteration": 2.4882190227508545 + }, + { + "auxiliary_loss_clip": 0.01059297, + "auxiliary_loss_mlp": 0.01023394, + "balance_loss_clip": 1.01111543, + "balance_loss_mlp": 1.01866651, + "epoch": 0.7374718172253119, + "flos": 24314666094720.0, + "grad_norm": 1.6161540387858548, + "language_loss": 0.7415086, + "learning_rate": 6.425719205462737e-07, + "loss": 0.76233554, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40625, + "step": 12266, + "time_per_iteration": 2.4206414222717285 + }, + { + "auxiliary_loss_clip": 0.01058031, + "auxiliary_loss_mlp": 0.01029732, + "balance_loss_clip": 1.01748943, + "balance_loss_mlp": 1.01933992, + "epoch": 0.7375319404779799, + "flos": 27155060265600.0, + "grad_norm": 1.5794466599854042, + "language_loss": 0.73314977, + "learning_rate": 6.422944710509121e-07, + "loss": 0.75402737, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38671875, + "step": 12267, + "time_per_iteration": 2.450191020965576 + }, + { + "auxiliary_loss_clip": 0.01054572, + "auxiliary_loss_mlp": 0.01024358, + "balance_loss_clip": 1.01313412, + "balance_loss_mlp": 1.01727653, + "epoch": 0.7375920637306478, + "flos": 18879003475200.0, + "grad_norm": 7.1713405147012566, + "language_loss": 0.82894856, + "learning_rate": 6.420170700085687e-07, + "loss": 0.84973776, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37304688, + "step": 12268, + "time_per_iteration": 2.3766262531280518 + }, + { + "auxiliary_loss_clip": 0.01056192, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.01354945, + "balance_loss_mlp": 1.01808047, + "epoch": 0.7376521869833158, + "flos": 15668844428160.0, + "grad_norm": 1.6484245844900796, + "language_loss": 0.73307216, + "learning_rate": 6.417397174291426e-07, + "loss": 0.75388527, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.38085938, + "step": 12269, + "time_per_iteration": 2.414579153060913 + }, + { + "auxiliary_loss_clip": 0.01059765, + "auxiliary_loss_mlp": 0.01024425, + "balance_loss_clip": 1.01191378, + "balance_loss_mlp": 1.01926947, + "epoch": 0.7377123102359837, + "flos": 36970611857280.0, + "grad_norm": 1.820424573804852, + "language_loss": 0.70577997, + "learning_rate": 6.414624133225317e-07, + "loss": 0.72662187, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40625, + "step": 12270, + "time_per_iteration": 2.5227010250091553 + }, + { + "auxiliary_loss_clip": 0.01055328, + "auxiliary_loss_mlp": 0.01023323, + "balance_loss_clip": 1.01276672, + "balance_loss_mlp": 1.01841402, + "epoch": 0.7377724334886517, + "flos": 24495202068480.0, + "grad_norm": 3.999049732733337, + "language_loss": 0.69708359, + "learning_rate": 6.411851576986331e-07, + "loss": 0.71787012, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36914062, + "step": 12271, + "time_per_iteration": 3.882479190826416 + }, + { + "auxiliary_loss_clip": 0.01057447, + "auxiliary_loss_mlp": 0.01022218, + "balance_loss_clip": 1.00964165, + "balance_loss_mlp": 1.01815355, + "epoch": 0.7378325567413198, + "flos": 24388856467200.0, + "grad_norm": 1.8915140089901354, + "language_loss": 0.67489684, + "learning_rate": 6.409079505673418e-07, + "loss": 0.69569355, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39257812, + "step": 12272, + "time_per_iteration": 3.8464713096618652 + }, + { + "auxiliary_loss_clip": 0.01055478, + "auxiliary_loss_mlp": 0.01024798, + "balance_loss_clip": 1.01332426, + "balance_loss_mlp": 1.01886451, + "epoch": 0.7378926799939877, + "flos": 17415549348480.0, + "grad_norm": 1.6469739132744035, + "language_loss": 0.782233, + "learning_rate": 6.406307919385483e-07, + "loss": 0.80303574, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3671875, + "step": 12273, + "time_per_iteration": 2.345674753189087 + }, + { + "auxiliary_loss_clip": 0.01055492, + "auxiliary_loss_mlp": 0.01025319, + "balance_loss_clip": 1.01380968, + "balance_loss_mlp": 1.0174005, + "epoch": 0.7379528032466557, + "flos": 18473347854720.0, + "grad_norm": 2.0501259105316727, + "language_loss": 0.84139442, + "learning_rate": 6.40353681822146e-07, + "loss": 0.86220253, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38085938, + "step": 12274, + "time_per_iteration": 2.381255626678467 + }, + { + "auxiliary_loss_clip": 0.01060944, + "auxiliary_loss_mlp": 0.01025952, + "balance_loss_clip": 1.01248741, + "balance_loss_mlp": 1.01961994, + "epoch": 0.7380129264993236, + "flos": 17821030412160.0, + "grad_norm": 1.8627592657234557, + "language_loss": 0.74172533, + "learning_rate": 6.400766202280232e-07, + "loss": 0.76259428, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.4140625, + "step": 12275, + "time_per_iteration": 2.362823247909546 + }, + { + "auxiliary_loss_clip": 0.01059936, + "auxiliary_loss_mlp": 0.01027247, + "balance_loss_clip": 1.0149442, + "balance_loss_mlp": 1.02013564, + "epoch": 0.7380730497519916, + "flos": 22996415779200.0, + "grad_norm": 1.9539488675480121, + "language_loss": 0.73541188, + "learning_rate": 6.397996071660676e-07, + "loss": 0.7562837, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 12276, + "time_per_iteration": 2.397045373916626 + }, + { + "auxiliary_loss_clip": 0.01057798, + "auxiliary_loss_mlp": 0.01023055, + "balance_loss_clip": 1.01074076, + "balance_loss_mlp": 1.01827967, + "epoch": 0.7381331730046595, + "flos": 20265229941120.0, + "grad_norm": 2.103117957931478, + "language_loss": 0.7731604, + "learning_rate": 6.395226426461646e-07, + "loss": 0.79396892, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39648438, + "step": 12277, + "time_per_iteration": 2.3703861236572266 + }, + { + "auxiliary_loss_clip": 0.01056064, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.0164628, + "balance_loss_mlp": 1.01911902, + "epoch": 0.7381932962573275, + "flos": 19753542921600.0, + "grad_norm": 4.434507553047378, + "language_loss": 0.78485435, + "learning_rate": 6.392457266781996e-07, + "loss": 0.80570436, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.36914062, + "step": 12278, + "time_per_iteration": 3.7896523475646973 + }, + { + "auxiliary_loss_clip": 0.01057269, + "auxiliary_loss_mlp": 0.01022162, + "balance_loss_clip": 1.01003289, + "balance_loss_mlp": 1.01836669, + "epoch": 0.7382534195099955, + "flos": 17304525624960.0, + "grad_norm": 3.0724120154192778, + "language_loss": 0.71876264, + "learning_rate": 6.389688592720543e-07, + "loss": 0.73955691, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38867188, + "step": 12279, + "time_per_iteration": 2.355325698852539 + }, + { + "auxiliary_loss_clip": 0.01057759, + "auxiliary_loss_mlp": 0.01025382, + "balance_loss_clip": 1.01259637, + "balance_loss_mlp": 1.01746893, + "epoch": 0.7383135427626635, + "flos": 18696372819840.0, + "grad_norm": 2.0911112329727177, + "language_loss": 0.85812998, + "learning_rate": 6.386920404376095e-07, + "loss": 0.87896132, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40234375, + "step": 12280, + "time_per_iteration": 2.3553061485290527 + }, + { + "auxiliary_loss_clip": 0.01057085, + "auxiliary_loss_mlp": 0.01023462, + "balance_loss_clip": 1.01155901, + "balance_loss_mlp": 1.01899981, + "epoch": 0.7383736660153314, + "flos": 20880399830400.0, + "grad_norm": 1.9444941375173148, + "language_loss": 0.72408456, + "learning_rate": 6.384152701847434e-07, + "loss": 0.74488997, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38085938, + "step": 12281, + "time_per_iteration": 2.436616897583008 + }, + { + "auxiliary_loss_clip": 0.01056723, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.01500928, + "balance_loss_mlp": 1.01788759, + "epoch": 0.7384337892679994, + "flos": 20662297367040.0, + "grad_norm": 1.6997557319447893, + "language_loss": 0.6113013, + "learning_rate": 6.38138548523335e-07, + "loss": 0.63214302, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.38867188, + "step": 12282, + "time_per_iteration": 2.4137699604034424 + }, + { + "auxiliary_loss_clip": 0.01058278, + "auxiliary_loss_mlp": 0.01026024, + "balance_loss_clip": 1.01325083, + "balance_loss_mlp": 1.01871264, + "epoch": 0.7384939125206673, + "flos": 29168326483200.0, + "grad_norm": 1.7369817261042586, + "language_loss": 0.67940092, + "learning_rate": 6.378618754632576e-07, + "loss": 0.70024395, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 12283, + "time_per_iteration": 2.4969756603240967 + }, + { + "auxiliary_loss_clip": 0.01058777, + "auxiliary_loss_mlp": 0.01024241, + "balance_loss_clip": 1.01215887, + "balance_loss_mlp": 1.02022099, + "epoch": 0.7385540357733353, + "flos": 36311556522240.0, + "grad_norm": 1.8520950918980175, + "language_loss": 0.7584427, + "learning_rate": 6.375852510143867e-07, + "loss": 0.77927285, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38476562, + "step": 12284, + "time_per_iteration": 2.5530524253845215 + }, + { + "auxiliary_loss_clip": 0.01057932, + "auxiliary_loss_mlp": 0.01025554, + "balance_loss_clip": 1.01206565, + "balance_loss_mlp": 1.0183394, + "epoch": 0.7386141590260034, + "flos": 20301574533120.0, + "grad_norm": 1.7941586175747968, + "language_loss": 0.69200397, + "learning_rate": 6.373086751865935e-07, + "loss": 0.71283883, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.39453125, + "step": 12285, + "time_per_iteration": 2.3973727226257324 + }, + { + "auxiliary_loss_clip": 0.01060769, + "auxiliary_loss_mlp": 0.01025585, + "balance_loss_clip": 1.01240063, + "balance_loss_mlp": 1.01907229, + "epoch": 0.7386742822786713, + "flos": 25482615540480.0, + "grad_norm": 1.8604477990498662, + "language_loss": 0.79850292, + "learning_rate": 6.370321479897485e-07, + "loss": 0.81936646, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41796875, + "step": 12286, + "time_per_iteration": 2.4738271236419678 + }, + { + "auxiliary_loss_clip": 0.01053983, + "auxiliary_loss_mlp": 0.01025526, + "balance_loss_clip": 1.01376581, + "balance_loss_mlp": 1.01652217, + "epoch": 0.7387344055313393, + "flos": 13771105499520.0, + "grad_norm": 1.705569908531915, + "language_loss": 0.65588391, + "learning_rate": 6.367556694337199e-07, + "loss": 0.67667896, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.375, + "step": 12287, + "time_per_iteration": 2.3838133811950684 + }, + { + "auxiliary_loss_clip": 0.01055608, + "auxiliary_loss_mlp": 0.0102447, + "balance_loss_clip": 1.01312768, + "balance_loss_mlp": 1.01864123, + "epoch": 0.7387945287840072, + "flos": 27853951328640.0, + "grad_norm": 1.7897277987233207, + "language_loss": 0.76807213, + "learning_rate": 6.364792395283744e-07, + "loss": 0.78887296, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36914062, + "step": 12288, + "time_per_iteration": 2.447788953781128 + }, + { + "auxiliary_loss_clip": 0.01060036, + "auxiliary_loss_mlp": 0.01025202, + "balance_loss_clip": 1.01160002, + "balance_loss_mlp": 1.01947749, + "epoch": 0.7388546520366752, + "flos": 44231463866880.0, + "grad_norm": 1.7557369584382014, + "language_loss": 0.58284956, + "learning_rate": 6.362028582835788e-07, + "loss": 0.60370195, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.40625, + "step": 12289, + "time_per_iteration": 2.5875697135925293 + }, + { + "auxiliary_loss_clip": 0.01007266, + "auxiliary_loss_mlp": 0.01001307, + "balance_loss_clip": 1.00035298, + "balance_loss_mlp": 1.00092554, + "epoch": 0.7389147752893431, + "flos": 70677681465600.0, + "grad_norm": 0.6376696004512017, + "language_loss": 0.49320418, + "learning_rate": 6.359265257091937e-07, + "loss": 0.51328987, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.06347656, + "step": 12290, + "time_per_iteration": 3.1791086196899414 + }, + { + "auxiliary_loss_clip": 0.01059913, + "auxiliary_loss_mlp": 0.01024853, + "balance_loss_clip": 1.01194298, + "balance_loss_mlp": 1.02009487, + "epoch": 0.7389748985420111, + "flos": 25993778889600.0, + "grad_norm": 2.3521868064822296, + "language_loss": 0.67510551, + "learning_rate": 6.356502418150827e-07, + "loss": 0.69595313, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3984375, + "step": 12291, + "time_per_iteration": 2.4766652584075928 + }, + { + "auxiliary_loss_clip": 0.01006892, + "auxiliary_loss_mlp": 0.01002581, + "balance_loss_clip": 1.00157964, + "balance_loss_mlp": 1.00042641, + "epoch": 0.7390350217946791, + "flos": 54401033445120.0, + "grad_norm": 0.9650535761506028, + "language_loss": 0.60807794, + "learning_rate": 6.353740066111051e-07, + "loss": 0.62817264, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.06445312, + "step": 12292, + "time_per_iteration": 2.770267963409424 + }, + { + "auxiliary_loss_clip": 0.01058286, + "auxiliary_loss_mlp": 0.01029133, + "balance_loss_clip": 1.01790321, + "balance_loss_mlp": 1.01944339, + "epoch": 0.7390951450473471, + "flos": 32195610495360.0, + "grad_norm": 1.6867395811959693, + "language_loss": 0.66951382, + "learning_rate": 6.350978201071189e-07, + "loss": 0.69038802, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38867188, + "step": 12293, + "time_per_iteration": 2.476315975189209 + }, + { + "auxiliary_loss_clip": 0.01056226, + "auxiliary_loss_mlp": 0.01025528, + "balance_loss_clip": 1.01329672, + "balance_loss_mlp": 1.01781178, + "epoch": 0.739155268300015, + "flos": 16683490627200.0, + "grad_norm": 1.9525569284853732, + "language_loss": 0.8335458, + "learning_rate": 6.3482168231298e-07, + "loss": 0.85436332, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3828125, + "step": 12294, + "time_per_iteration": 2.3888323307037354 + }, + { + "auxiliary_loss_clip": 0.01055833, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.01435602, + "balance_loss_mlp": 1.01829576, + "epoch": 0.739215391552683, + "flos": 31648416756480.0, + "grad_norm": 1.7218586999497538, + "language_loss": 0.72813118, + "learning_rate": 6.345455932385442e-07, + "loss": 0.74895656, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.375, + "step": 12295, + "time_per_iteration": 2.4781906604766846 + }, + { + "auxiliary_loss_clip": 0.01055963, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.01519859, + "balance_loss_mlp": 1.0192765, + "epoch": 0.7392755148053509, + "flos": 29717161056000.0, + "grad_norm": 1.702006317528613, + "language_loss": 0.78531194, + "learning_rate": 6.342695528936637e-07, + "loss": 0.80613649, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3671875, + "step": 12296, + "time_per_iteration": 2.5622215270996094 + }, + { + "auxiliary_loss_clip": 0.01059119, + "auxiliary_loss_mlp": 0.01025308, + "balance_loss_clip": 1.01244593, + "balance_loss_mlp": 1.02005672, + "epoch": 0.7393356380580189, + "flos": 37048956681600.0, + "grad_norm": 1.9427133626203583, + "language_loss": 0.67157459, + "learning_rate": 6.3399356128819e-07, + "loss": 0.69241887, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.390625, + "step": 12297, + "time_per_iteration": 2.522812843322754 + }, + { + "auxiliary_loss_clip": 0.01059216, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.01435483, + "balance_loss_mlp": 1.02024782, + "epoch": 0.739395761310687, + "flos": 19718594784000.0, + "grad_norm": 1.655486502568218, + "language_loss": 0.69093645, + "learning_rate": 6.337176184319715e-07, + "loss": 0.71179765, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 12298, + "time_per_iteration": 2.456219434738159 + }, + { + "auxiliary_loss_clip": 0.01057099, + "auxiliary_loss_mlp": 0.01028154, + "balance_loss_clip": 1.01551759, + "balance_loss_mlp": 1.01748443, + "epoch": 0.7394558845633549, + "flos": 11800712298240.0, + "grad_norm": 2.1350132114989706, + "language_loss": 0.74991894, + "learning_rate": 6.33441724334858e-07, + "loss": 0.77077138, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39648438, + "step": 12299, + "time_per_iteration": 2.5152647495269775 + }, + { + "auxiliary_loss_clip": 0.01006715, + "auxiliary_loss_mlp": 0.01002392, + "balance_loss_clip": 1.00145018, + "balance_loss_mlp": 1.00044918, + "epoch": 0.7395160078160229, + "flos": 66192668789760.0, + "grad_norm": 0.7375620484339258, + "language_loss": 0.60915715, + "learning_rate": 6.33165879006693e-07, + "loss": 0.6292482, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.0625, + "step": 12300, + "time_per_iteration": 4.551513433456421 + }, + { + "auxiliary_loss_clip": 0.01057003, + "auxiliary_loss_mlp": 0.01027033, + "balance_loss_clip": 1.0151422, + "balance_loss_mlp": 1.01875424, + "epoch": 0.7395761310686908, + "flos": 21248698429440.0, + "grad_norm": 1.7347589429382981, + "language_loss": 0.80020165, + "learning_rate": 6.328900824573222e-07, + "loss": 0.82104194, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 12301, + "time_per_iteration": 2.431077480316162 + }, + { + "auxiliary_loss_clip": 0.01056104, + "auxiliary_loss_mlp": 0.01021548, + "balance_loss_clip": 1.01056314, + "balance_loss_mlp": 1.01832449, + "epoch": 0.7396362543213588, + "flos": 25954187541120.0, + "grad_norm": 1.7024253968069687, + "language_loss": 0.73306102, + "learning_rate": 6.326143346965887e-07, + "loss": 0.75383747, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37695312, + "step": 12302, + "time_per_iteration": 2.435908079147339 + }, + { + "auxiliary_loss_clip": 0.01058365, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.02063966, + "balance_loss_mlp": 1.01902413, + "epoch": 0.7396963775740267, + "flos": 27376793510400.0, + "grad_norm": 1.7177577287701258, + "language_loss": 0.74034929, + "learning_rate": 6.323386357343308e-07, + "loss": 0.76126707, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 12303, + "time_per_iteration": 2.452777147293091 + }, + { + "auxiliary_loss_clip": 0.01054687, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.01638103, + "balance_loss_mlp": 1.01782846, + "epoch": 0.7397565008266948, + "flos": 25518960132480.0, + "grad_norm": 1.827305509733028, + "language_loss": 0.54664117, + "learning_rate": 6.320629855803897e-07, + "loss": 0.56745946, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.3671875, + "step": 12304, + "time_per_iteration": 2.436325788497925 + }, + { + "auxiliary_loss_clip": 0.01057878, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.0159198, + "balance_loss_mlp": 1.019261, + "epoch": 0.7398166240793627, + "flos": 23726763843840.0, + "grad_norm": 1.8830362324002319, + "language_loss": 0.84184009, + "learning_rate": 6.317873842446011e-07, + "loss": 0.862692, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 12305, + "time_per_iteration": 2.445017099380493 + }, + { + "auxiliary_loss_clip": 0.010068, + "auxiliary_loss_mlp": 0.01001024, + "balance_loss_clip": 1.00004017, + "balance_loss_mlp": 1.0004642, + "epoch": 0.7398767473320307, + "flos": 67264012903680.0, + "grad_norm": 0.8763730575471884, + "language_loss": 0.57698536, + "learning_rate": 6.315118317368027e-07, + "loss": 0.59706366, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.06347656, + "step": 12306, + "time_per_iteration": 3.0450072288513184 + }, + { + "auxiliary_loss_clip": 0.01059593, + "auxiliary_loss_mlp": 0.01026209, + "balance_loss_clip": 1.0131079, + "balance_loss_mlp": 1.0196147, + "epoch": 0.7399368705846986, + "flos": 22017590501760.0, + "grad_norm": 2.1934813660588093, + "language_loss": 0.82012337, + "learning_rate": 6.312363280668253e-07, + "loss": 0.84098136, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.3984375, + "step": 12307, + "time_per_iteration": 2.428062677383423 + }, + { + "auxiliary_loss_clip": 0.01056449, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.01347661, + "balance_loss_mlp": 1.01951659, + "epoch": 0.7399969938373666, + "flos": 14172990693120.0, + "grad_norm": 1.7246127720952993, + "language_loss": 0.69896334, + "learning_rate": 6.309608732445035e-07, + "loss": 0.71977365, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36914062, + "step": 12308, + "time_per_iteration": 2.4169278144836426 + }, + { + "auxiliary_loss_clip": 0.01055068, + "auxiliary_loss_mlp": 0.01021395, + "balance_loss_clip": 1.01032639, + "balance_loss_mlp": 1.01813018, + "epoch": 0.7400571170900345, + "flos": 25300299087360.0, + "grad_norm": 1.681647414806817, + "language_loss": 0.81172776, + "learning_rate": 6.306854672796664e-07, + "loss": 0.83249247, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36914062, + "step": 12309, + "time_per_iteration": 2.441969871520996 + }, + { + "auxiliary_loss_clip": 0.01058329, + "auxiliary_loss_mlp": 0.01031997, + "balance_loss_clip": 1.01920021, + "balance_loss_mlp": 1.01881313, + "epoch": 0.7401172403427025, + "flos": 22710232431360.0, + "grad_norm": 1.6576042900792942, + "language_loss": 0.72937143, + "learning_rate": 6.304101101821426e-07, + "loss": 0.75027466, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 12310, + "time_per_iteration": 2.422379970550537 + }, + { + "auxiliary_loss_clip": 0.0105828, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.02287745, + "balance_loss_mlp": 1.01887202, + "epoch": 0.7401773635953706, + "flos": 18066749627520.0, + "grad_norm": 2.4664347726593543, + "language_loss": 0.88443315, + "learning_rate": 6.301348019617585e-07, + "loss": 0.90537214, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 12311, + "time_per_iteration": 5.280548572540283 + }, + { + "auxiliary_loss_clip": 0.0105522, + "auxiliary_loss_mlp": 0.01025077, + "balance_loss_clip": 1.01365066, + "balance_loss_mlp": 1.0176332, + "epoch": 0.7402374868480385, + "flos": 22711000481280.0, + "grad_norm": 2.0009889286929616, + "language_loss": 0.80257154, + "learning_rate": 6.298595426283399e-07, + "loss": 0.82337451, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.375, + "step": 12312, + "time_per_iteration": 2.4538824558258057 + }, + { + "auxiliary_loss_clip": 0.01057429, + "auxiliary_loss_mlp": 0.01023284, + "balance_loss_clip": 1.01164877, + "balance_loss_mlp": 1.01921201, + "epoch": 0.7402976101007065, + "flos": 22855575888000.0, + "grad_norm": 1.6562197136408638, + "language_loss": 0.78051263, + "learning_rate": 6.295843321917102e-07, + "loss": 0.80131972, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 12313, + "time_per_iteration": 2.4247887134552 + }, + { + "auxiliary_loss_clip": 0.01056601, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.01350021, + "balance_loss_mlp": 1.01716042, + "epoch": 0.7403577333533744, + "flos": 12345078216960.0, + "grad_norm": 2.096241206668565, + "language_loss": 0.78580081, + "learning_rate": 6.293091706616905e-07, + "loss": 0.8066299, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 12314, + "time_per_iteration": 2.379382371902466 + }, + { + "auxiliary_loss_clip": 0.0100715, + "auxiliary_loss_mlp": 0.01000922, + "balance_loss_clip": 0.99999851, + "balance_loss_mlp": 1.0009675, + "epoch": 0.7404178566060424, + "flos": 60324117822720.0, + "grad_norm": 0.8688760304840859, + "language_loss": 0.55125374, + "learning_rate": 6.290340580480997e-07, + "loss": 0.57133448, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06176758, + "step": 12315, + "time_per_iteration": 3.006643056869507 + }, + { + "auxiliary_loss_clip": 0.01055587, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01552391, + "balance_loss_mlp": 1.01870513, + "epoch": 0.7404779798587103, + "flos": 32013294042240.0, + "grad_norm": 1.526184989234868, + "language_loss": 0.79326558, + "learning_rate": 6.287589943607584e-07, + "loss": 0.8140859, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36914062, + "step": 12316, + "time_per_iteration": 2.486974000930786 + }, + { + "auxiliary_loss_clip": 0.01056575, + "auxiliary_loss_mlp": 0.01025436, + "balance_loss_clip": 1.01352096, + "balance_loss_mlp": 1.01834679, + "epoch": 0.7405381031113784, + "flos": 12889060110720.0, + "grad_norm": 1.6254155511476462, + "language_loss": 0.75066906, + "learning_rate": 6.284839796094806e-07, + "loss": 0.77148914, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 12317, + "time_per_iteration": 2.3846380710601807 + }, + { + "auxiliary_loss_clip": 0.01056164, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.01461124, + "balance_loss_mlp": 1.0196085, + "epoch": 0.7405982263640463, + "flos": 20228117299200.0, + "grad_norm": 1.7277359247958726, + "language_loss": 0.81570876, + "learning_rate": 6.28209013804081e-07, + "loss": 0.83653092, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.36523438, + "step": 12318, + "time_per_iteration": 3.833743095397949 + }, + { + "auxiliary_loss_clip": 0.01057731, + "auxiliary_loss_mlp": 0.01022109, + "balance_loss_clip": 1.01008701, + "balance_loss_mlp": 1.0189116, + "epoch": 0.7406583496167143, + "flos": 17566234243200.0, + "grad_norm": 1.9158271588536822, + "language_loss": 0.76992428, + "learning_rate": 6.279340969543742e-07, + "loss": 0.79072273, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38867188, + "step": 12319, + "time_per_iteration": 2.3845832347869873 + }, + { + "auxiliary_loss_clip": 0.01056121, + "auxiliary_loss_mlp": 0.01023233, + "balance_loss_clip": 1.01252174, + "balance_loss_mlp": 1.01905274, + "epoch": 0.7407184728693822, + "flos": 18294766917120.0, + "grad_norm": 1.846809815814494, + "language_loss": 0.63066882, + "learning_rate": 6.27659229070169e-07, + "loss": 0.65146238, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.37109375, + "step": 12320, + "time_per_iteration": 2.402940273284912 + }, + { + "auxiliary_loss_clip": 0.01058524, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.01875818, + "balance_loss_mlp": 1.0183953, + "epoch": 0.7407785961220502, + "flos": 16689635026560.0, + "grad_norm": 2.359415046726604, + "language_loss": 0.82566977, + "learning_rate": 6.273844101612765e-07, + "loss": 0.84657013, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40039062, + "step": 12321, + "time_per_iteration": 2.3531064987182617 + }, + { + "auxiliary_loss_clip": 0.01061263, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.01850963, + "balance_loss_mlp": 1.01904511, + "epoch": 0.7408387193747181, + "flos": 22087312220160.0, + "grad_norm": 1.90642214158763, + "language_loss": 0.72312558, + "learning_rate": 6.271096402375027e-07, + "loss": 0.7440632, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.421875, + "step": 12322, + "time_per_iteration": 2.4217522144317627 + }, + { + "auxiliary_loss_clip": 0.01056389, + "auxiliary_loss_mlp": 0.01026769, + "balance_loss_clip": 1.01508033, + "balance_loss_mlp": 1.01852274, + "epoch": 0.7408988426273861, + "flos": 24235762688640.0, + "grad_norm": 1.6749524853152467, + "language_loss": 0.63933873, + "learning_rate": 6.268349193086557e-07, + "loss": 0.66017032, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37890625, + "step": 12323, + "time_per_iteration": 2.42825984954834 + }, + { + "auxiliary_loss_clip": 0.01057241, + "auxiliary_loss_mlp": 0.01025104, + "balance_loss_clip": 1.01297474, + "balance_loss_mlp": 1.01802754, + "epoch": 0.7409589658800542, + "flos": 29021726217600.0, + "grad_norm": 1.4785936951910505, + "language_loss": 0.75915974, + "learning_rate": 6.26560247384537e-07, + "loss": 0.77998322, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.39257812, + "step": 12324, + "time_per_iteration": 2.4966323375701904 + }, + { + "auxiliary_loss_clip": 0.01057903, + "auxiliary_loss_mlp": 0.01021234, + "balance_loss_clip": 1.0099566, + "balance_loss_mlp": 1.0189594, + "epoch": 0.7410190891327221, + "flos": 19350435830400.0, + "grad_norm": 1.6751525116645112, + "language_loss": 0.75192046, + "learning_rate": 6.262856244749508e-07, + "loss": 0.77271187, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38867188, + "step": 12325, + "time_per_iteration": 2.3957717418670654 + }, + { + "auxiliary_loss_clip": 0.01059044, + "auxiliary_loss_mlp": 0.01025579, + "balance_loss_clip": 1.01327109, + "balance_loss_mlp": 1.01826882, + "epoch": 0.7410792123853901, + "flos": 22746542112000.0, + "grad_norm": 2.664615288313408, + "language_loss": 0.68517298, + "learning_rate": 6.260110505896971e-07, + "loss": 0.70601922, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 12326, + "time_per_iteration": 2.458138942718506 + }, + { + "auxiliary_loss_clip": 0.01058734, + "auxiliary_loss_mlp": 0.01024809, + "balance_loss_clip": 1.01292372, + "balance_loss_mlp": 1.01971412, + "epoch": 0.741139335638058, + "flos": 25371312526080.0, + "grad_norm": 1.6109024811849502, + "language_loss": 0.8938818, + "learning_rate": 6.257365257385748e-07, + "loss": 0.9147172, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.390625, + "step": 12327, + "time_per_iteration": 2.4392991065979004 + }, + { + "auxiliary_loss_clip": 0.01057718, + "auxiliary_loss_mlp": 0.01023882, + "balance_loss_clip": 1.01219392, + "balance_loss_mlp": 1.01893497, + "epoch": 0.741199458890726, + "flos": 18584720691840.0, + "grad_norm": 1.7480911383453495, + "language_loss": 0.85638469, + "learning_rate": 6.2546204993138e-07, + "loss": 0.87720066, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38867188, + "step": 12328, + "time_per_iteration": 2.450634717941284 + }, + { + "auxiliary_loss_clip": 0.01055663, + "auxiliary_loss_mlp": 0.01023661, + "balance_loss_clip": 1.01120973, + "balance_loss_mlp": 1.01767206, + "epoch": 0.7412595821433939, + "flos": 22125995873280.0, + "grad_norm": 1.8157080336055644, + "language_loss": 0.66811109, + "learning_rate": 6.251876231779103e-07, + "loss": 0.68890429, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.38085938, + "step": 12329, + "time_per_iteration": 2.4363558292388916 + }, + { + "auxiliary_loss_clip": 0.01062116, + "auxiliary_loss_mlp": 0.01025596, + "balance_loss_clip": 1.01323402, + "balance_loss_mlp": 1.02017808, + "epoch": 0.741319705396062, + "flos": 29168396305920.0, + "grad_norm": 1.6222564121268936, + "language_loss": 0.6661613, + "learning_rate": 6.249132454879564e-07, + "loss": 0.68703848, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41992188, + "step": 12330, + "time_per_iteration": 2.501163959503174 + }, + { + "auxiliary_loss_clip": 0.0106266, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_clip": 1.00998938, + "balance_loss_mlp": 1.02077615, + "epoch": 0.7413798286487299, + "flos": 20666451818880.0, + "grad_norm": 4.136651559630429, + "language_loss": 0.85398185, + "learning_rate": 6.246389168713127e-07, + "loss": 0.87485158, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.41796875, + "step": 12331, + "time_per_iteration": 2.4134392738342285 + }, + { + "auxiliary_loss_clip": 0.01057675, + "auxiliary_loss_mlp": 0.01022261, + "balance_loss_clip": 1.01050711, + "balance_loss_mlp": 1.01816607, + "epoch": 0.7414399519013979, + "flos": 16397970595200.0, + "grad_norm": 1.7592990669603203, + "language_loss": 0.69574016, + "learning_rate": 6.243646373377678e-07, + "loss": 0.7165395, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 12332, + "time_per_iteration": 2.3794429302215576 + }, + { + "auxiliary_loss_clip": 0.01058993, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.01356721, + "balance_loss_mlp": 1.01970804, + "epoch": 0.7415000751540658, + "flos": 25629041249280.0, + "grad_norm": 2.2074080251393884, + "language_loss": 0.6345489, + "learning_rate": 6.240904068971107e-07, + "loss": 0.65539807, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 12333, + "time_per_iteration": 2.4985556602478027 + }, + { + "auxiliary_loss_clip": 0.01057567, + "auxiliary_loss_mlp": 0.01023703, + "balance_loss_clip": 1.01203871, + "balance_loss_mlp": 1.02092338, + "epoch": 0.7415601984067338, + "flos": 24498553559040.0, + "grad_norm": 1.6566754983158798, + "language_loss": 0.71755081, + "learning_rate": 6.238162255591275e-07, + "loss": 0.7383635, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3671875, + "step": 12334, + "time_per_iteration": 2.462841510772705 + }, + { + "auxiliary_loss_clip": 0.01055991, + "auxiliary_loss_mlp": 0.0102186, + "balance_loss_clip": 1.01063085, + "balance_loss_mlp": 1.01795292, + "epoch": 0.7416203216594017, + "flos": 20886090382080.0, + "grad_norm": 1.7696766652231557, + "language_loss": 0.82357663, + "learning_rate": 6.235420933336026e-07, + "loss": 0.84435511, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37890625, + "step": 12335, + "time_per_iteration": 2.390516996383667 + }, + { + "auxiliary_loss_clip": 0.01057759, + "auxiliary_loss_mlp": 0.01025362, + "balance_loss_clip": 1.01223159, + "balance_loss_mlp": 1.01853037, + "epoch": 0.7416804449120697, + "flos": 15623597439360.0, + "grad_norm": 7.808204561490157, + "language_loss": 0.75108266, + "learning_rate": 6.232680102303212e-07, + "loss": 0.77191389, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.39257812, + "step": 12336, + "time_per_iteration": 2.3426358699798584 + }, + { + "auxiliary_loss_clip": 0.01058595, + "auxiliary_loss_mlp": 0.01024119, + "balance_loss_clip": 1.01163745, + "balance_loss_mlp": 1.01941657, + "epoch": 0.7417405681647377, + "flos": 17119765376640.0, + "grad_norm": 1.809315081417527, + "language_loss": 0.73326236, + "learning_rate": 6.229939762590617e-07, + "loss": 0.75408947, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 12337, + "time_per_iteration": 2.399001359939575 + }, + { + "auxiliary_loss_clip": 0.01055195, + "auxiliary_loss_mlp": 0.01026077, + "balance_loss_clip": 1.01436436, + "balance_loss_mlp": 1.01864552, + "epoch": 0.7418006914174057, + "flos": 18879317677440.0, + "grad_norm": 1.5751932043587487, + "language_loss": 0.76734555, + "learning_rate": 6.22719991429606e-07, + "loss": 0.7881583, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.36523438, + "step": 12338, + "time_per_iteration": 2.3888423442840576 + }, + { + "auxiliary_loss_clip": 0.01056371, + "auxiliary_loss_mlp": 0.01020013, + "balance_loss_clip": 1.00909281, + "balance_loss_mlp": 1.01820016, + "epoch": 0.7418608146700737, + "flos": 21579640007040.0, + "grad_norm": 1.7072320283614204, + "language_loss": 0.73010516, + "learning_rate": 6.224460557517301e-07, + "loss": 0.75086904, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3828125, + "step": 12339, + "time_per_iteration": 3.8276798725128174 + }, + { + "auxiliary_loss_clip": 0.01057711, + "auxiliary_loss_mlp": 0.01024266, + "balance_loss_clip": 1.01195729, + "balance_loss_mlp": 1.01916313, + "epoch": 0.7419209379227416, + "flos": 22339524948480.0, + "grad_norm": 1.6665583289008203, + "language_loss": 0.79635239, + "learning_rate": 6.221721692352123e-07, + "loss": 0.81717217, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38671875, + "step": 12340, + "time_per_iteration": 2.4212918281555176 + }, + { + "auxiliary_loss_clip": 0.01056443, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.01493979, + "balance_loss_mlp": 1.017645, + "epoch": 0.7419810611754096, + "flos": 16761381603840.0, + "grad_norm": 1.4726870096362905, + "language_loss": 0.75684661, + "learning_rate": 6.218983318898243e-07, + "loss": 0.77767968, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 12341, + "time_per_iteration": 2.393800735473633 + }, + { + "auxiliary_loss_clip": 0.01053955, + "auxiliary_loss_mlp": 0.01021894, + "balance_loss_clip": 1.01118302, + "balance_loss_mlp": 1.01942468, + "epoch": 0.7420411844280775, + "flos": 26211776618880.0, + "grad_norm": 1.4070302941854285, + "language_loss": 0.80297142, + "learning_rate": 6.216245437253407e-07, + "loss": 0.82372993, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.34570312, + "step": 12342, + "time_per_iteration": 2.4832074642181396 + }, + { + "auxiliary_loss_clip": 0.01056007, + "auxiliary_loss_mlp": 0.0102416, + "balance_loss_clip": 1.0127573, + "balance_loss_mlp": 1.01821852, + "epoch": 0.7421013076807456, + "flos": 68527208004480.0, + "grad_norm": 1.6406397974922882, + "language_loss": 0.77476346, + "learning_rate": 6.213508047515314e-07, + "loss": 0.79556513, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37890625, + "step": 12343, + "time_per_iteration": 2.795880079269409 + }, + { + "auxiliary_loss_clip": 0.0105388, + "auxiliary_loss_mlp": 0.01024282, + "balance_loss_clip": 1.01274228, + "balance_loss_mlp": 1.01641977, + "epoch": 0.7421614309334135, + "flos": 24424188629760.0, + "grad_norm": 2.0673992909520504, + "language_loss": 0.67429996, + "learning_rate": 6.210771149781655e-07, + "loss": 0.69508159, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.375, + "step": 12344, + "time_per_iteration": 2.4243431091308594 + }, + { + "auxiliary_loss_clip": 0.01059321, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01410866, + "balance_loss_mlp": 1.01999784, + "epoch": 0.7422215541860815, + "flos": 12310304636160.0, + "grad_norm": 2.1938801060633346, + "language_loss": 0.81072026, + "learning_rate": 6.208034744150099e-07, + "loss": 0.83158332, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39453125, + "step": 12345, + "time_per_iteration": 2.353243827819824 + }, + { + "auxiliary_loss_clip": 0.01058361, + "auxiliary_loss_mlp": 0.01023216, + "balance_loss_clip": 1.01153362, + "balance_loss_mlp": 1.01990187, + "epoch": 0.7422816774387494, + "flos": 19644578968320.0, + "grad_norm": 2.0919326803644522, + "language_loss": 0.70789522, + "learning_rate": 6.205298830718317e-07, + "loss": 0.72871095, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38476562, + "step": 12346, + "time_per_iteration": 2.408140182495117 + }, + { + "auxiliary_loss_clip": 0.01060019, + "auxiliary_loss_mlp": 0.01022925, + "balance_loss_clip": 1.01106405, + "balance_loss_mlp": 1.02071464, + "epoch": 0.7423418006914174, + "flos": 32014585762560.0, + "grad_norm": 1.635056384845561, + "language_loss": 0.73568451, + "learning_rate": 6.202563409583931e-07, + "loss": 0.75651395, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.39257812, + "step": 12347, + "time_per_iteration": 2.4806532859802246 + }, + { + "auxiliary_loss_clip": 0.01059576, + "auxiliary_loss_mlp": 0.01025005, + "balance_loss_clip": 1.012995, + "balance_loss_mlp": 1.0192678, + "epoch": 0.7424019239440853, + "flos": 18915941560320.0, + "grad_norm": 2.717455056676881, + "language_loss": 0.81100976, + "learning_rate": 6.199828480844558e-07, + "loss": 0.83185554, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40234375, + "step": 12348, + "time_per_iteration": 2.387248992919922 + }, + { + "auxiliary_loss_clip": 0.01056339, + "auxiliary_loss_mlp": 0.01024002, + "balance_loss_clip": 1.0125339, + "balance_loss_mlp": 1.01830733, + "epoch": 0.7424620471967533, + "flos": 35875211950080.0, + "grad_norm": 1.5445628588042228, + "language_loss": 0.72164547, + "learning_rate": 6.197094044597814e-07, + "loss": 0.74244887, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38085938, + "step": 12349, + "time_per_iteration": 2.5275566577911377 + }, + { + "auxiliary_loss_clip": 0.01060557, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.01511931, + "balance_loss_mlp": 1.01858616, + "epoch": 0.7425221704494213, + "flos": 27015372449280.0, + "grad_norm": 2.0598036122098424, + "language_loss": 0.78934741, + "learning_rate": 6.19436010094128e-07, + "loss": 0.81024015, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.41992188, + "step": 12350, + "time_per_iteration": 3.9366872310638428 + }, + { + "auxiliary_loss_clip": 0.01055862, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.01478684, + "balance_loss_mlp": 1.018224, + "epoch": 0.7425822937020893, + "flos": 34165724405760.0, + "grad_norm": 6.875041075827828, + "language_loss": 0.75282007, + "learning_rate": 6.191626649972521e-07, + "loss": 0.7736491, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.37695312, + "step": 12351, + "time_per_iteration": 3.9144082069396973 + }, + { + "auxiliary_loss_clip": 0.01057786, + "auxiliary_loss_mlp": 0.01021839, + "balance_loss_clip": 1.01035893, + "balance_loss_mlp": 1.01944923, + "epoch": 0.7426424169547573, + "flos": 21282634137600.0, + "grad_norm": 2.003432299632619, + "language_loss": 0.81237531, + "learning_rate": 6.188893691789081e-07, + "loss": 0.83317155, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3828125, + "step": 12352, + "time_per_iteration": 2.393056869506836 + }, + { + "auxiliary_loss_clip": 0.01056148, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.01484287, + "balance_loss_mlp": 1.01909781, + "epoch": 0.7427025402074252, + "flos": 22447546295040.0, + "grad_norm": 1.8024188198876214, + "language_loss": 0.71649683, + "learning_rate": 6.186161226488511e-07, + "loss": 0.73732781, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.37109375, + "step": 12353, + "time_per_iteration": 2.43319034576416 + }, + { + "auxiliary_loss_clip": 0.01055883, + "auxiliary_loss_mlp": 0.01024825, + "balance_loss_clip": 1.01354218, + "balance_loss_mlp": 1.01888704, + "epoch": 0.7427626634600932, + "flos": 22523621880960.0, + "grad_norm": 4.268052129053393, + "language_loss": 0.72323388, + "learning_rate": 6.183429254168302e-07, + "loss": 0.74404097, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36914062, + "step": 12354, + "time_per_iteration": 2.5371196269989014 + }, + { + "auxiliary_loss_clip": 0.01007011, + "auxiliary_loss_mlp": 0.01000894, + "balance_loss_clip": 0.99986899, + "balance_loss_mlp": 1.00083137, + "epoch": 0.7428227867127611, + "flos": 67206512027520.0, + "grad_norm": 0.6981686782146401, + "language_loss": 0.59524077, + "learning_rate": 6.180697774925967e-07, + "loss": 0.61531979, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.06176758, + "step": 12355, + "time_per_iteration": 3.213822603225708 + }, + { + "auxiliary_loss_clip": 0.01056492, + "auxiliary_loss_mlp": 0.01024936, + "balance_loss_clip": 1.01276422, + "balance_loss_mlp": 1.01808834, + "epoch": 0.7428829099654292, + "flos": 14720324077440.0, + "grad_norm": 1.5649466724794978, + "language_loss": 0.67692912, + "learning_rate": 6.177966788858977e-07, + "loss": 0.69774342, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38476562, + "step": 12356, + "time_per_iteration": 2.3821346759796143 + }, + { + "auxiliary_loss_clip": 0.0105731, + "auxiliary_loss_mlp": 0.01026932, + "balance_loss_clip": 1.01520216, + "balance_loss_mlp": 1.01678634, + "epoch": 0.7429430332180971, + "flos": 48644834699520.0, + "grad_norm": 1.669876431674229, + "language_loss": 0.72342062, + "learning_rate": 6.175236296064807e-07, + "loss": 0.74426305, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40625, + "step": 12357, + "time_per_iteration": 4.047618865966797 + }, + { + "auxiliary_loss_clip": 0.01056136, + "auxiliary_loss_mlp": 0.0102062, + "balance_loss_clip": 1.00931346, + "balance_loss_mlp": 1.01981413, + "epoch": 0.7430031564707651, + "flos": 16763127171840.0, + "grad_norm": 2.009028962343229, + "language_loss": 0.65110809, + "learning_rate": 6.172506296640883e-07, + "loss": 0.6718756, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36328125, + "step": 12358, + "time_per_iteration": 2.3972136974334717 + }, + { + "auxiliary_loss_clip": 0.010585, + "auxiliary_loss_mlp": 0.01023803, + "balance_loss_clip": 1.01094079, + "balance_loss_mlp": 1.01934385, + "epoch": 0.743063279723433, + "flos": 23869663505280.0, + "grad_norm": 1.8314603791946706, + "language_loss": 0.80984306, + "learning_rate": 6.169776790684644e-07, + "loss": 0.83066607, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.390625, + "step": 12359, + "time_per_iteration": 2.4128732681274414 + }, + { + "auxiliary_loss_clip": 0.01054811, + "auxiliary_loss_mlp": 0.01020509, + "balance_loss_clip": 1.00933897, + "balance_loss_mlp": 1.01679587, + "epoch": 0.743123402976101, + "flos": 14390848776960.0, + "grad_norm": 2.0005674402632843, + "language_loss": 0.68161547, + "learning_rate": 6.167047778293497e-07, + "loss": 0.70236862, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 12360, + "time_per_iteration": 2.3534505367279053 + }, + { + "auxiliary_loss_clip": 0.01056264, + "auxiliary_loss_mlp": 0.01024877, + "balance_loss_clip": 1.01295042, + "balance_loss_mlp": 1.01793587, + "epoch": 0.7431835262287689, + "flos": 27453078564480.0, + "grad_norm": 1.8849744720522066, + "language_loss": 0.72542673, + "learning_rate": 6.164319259564834e-07, + "loss": 0.74623811, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 12361, + "time_per_iteration": 2.431988477706909 + }, + { + "auxiliary_loss_clip": 0.01057732, + "auxiliary_loss_mlp": 0.01024183, + "balance_loss_clip": 1.01204133, + "balance_loss_mlp": 1.01888525, + "epoch": 0.743243649481437, + "flos": 20958465363840.0, + "grad_norm": 2.8491975928436393, + "language_loss": 0.55980593, + "learning_rate": 6.161591234596024e-07, + "loss": 0.58062506, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38867188, + "step": 12362, + "time_per_iteration": 2.4197089672088623 + }, + { + "auxiliary_loss_clip": 0.01057427, + "auxiliary_loss_mlp": 0.01023468, + "balance_loss_clip": 1.01180935, + "balance_loss_mlp": 1.01969135, + "epoch": 0.7433037727341049, + "flos": 22782083742720.0, + "grad_norm": 1.7310941638931565, + "language_loss": 0.77614909, + "learning_rate": 6.158863703484427e-07, + "loss": 0.79695803, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37695312, + "step": 12363, + "time_per_iteration": 2.3912434577941895 + }, + { + "auxiliary_loss_clip": 0.01055759, + "auxiliary_loss_mlp": 0.01022094, + "balance_loss_clip": 1.01120472, + "balance_loss_mlp": 1.01826882, + "epoch": 0.7433638959867729, + "flos": 22195717591680.0, + "grad_norm": 1.557071741485324, + "language_loss": 0.77958113, + "learning_rate": 6.156136666327383e-07, + "loss": 0.80035967, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.375, + "step": 12364, + "time_per_iteration": 2.405129909515381 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01023046, + "balance_loss_clip": 1.01212084, + "balance_loss_mlp": 1.01825857, + "epoch": 0.7434240192394409, + "flos": 23295586152960.0, + "grad_norm": 1.5935748205085078, + "language_loss": 0.78614551, + "learning_rate": 6.153410123222202e-07, + "loss": 0.8069334, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.375, + "step": 12365, + "time_per_iteration": 2.4228734970092773 + }, + { + "auxiliary_loss_clip": 0.01058925, + "auxiliary_loss_mlp": 0.01026113, + "balance_loss_clip": 1.01453209, + "balance_loss_mlp": 1.01921117, + "epoch": 0.7434841424921088, + "flos": 54007773223680.0, + "grad_norm": 2.152660575667599, + "language_loss": 0.7712481, + "learning_rate": 6.150684074266203e-07, + "loss": 0.79209846, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3984375, + "step": 12366, + "time_per_iteration": 2.7159767150878906 + }, + { + "auxiliary_loss_clip": 0.0105502, + "auxiliary_loss_mlp": 0.01022816, + "balance_loss_clip": 1.01270056, + "balance_loss_mlp": 1.01852572, + "epoch": 0.7435442657447768, + "flos": 21432900096000.0, + "grad_norm": 1.4225271736420508, + "language_loss": 0.71414202, + "learning_rate": 6.147958519556664e-07, + "loss": 0.73492038, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.36523438, + "step": 12367, + "time_per_iteration": 2.3996353149414062 + }, + { + "auxiliary_loss_clip": 0.01058849, + "auxiliary_loss_mlp": 0.01025497, + "balance_loss_clip": 1.01356435, + "balance_loss_mlp": 1.01948881, + "epoch": 0.7436043889974447, + "flos": 24498239356800.0, + "grad_norm": 2.555668721687124, + "language_loss": 0.7453931, + "learning_rate": 6.145233459190855e-07, + "loss": 0.76623654, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 12368, + "time_per_iteration": 2.431136131286621 + }, + { + "auxiliary_loss_clip": 0.01055608, + "auxiliary_loss_mlp": 0.010246, + "balance_loss_clip": 1.01407981, + "balance_loss_mlp": 1.01906848, + "epoch": 0.7436645122501128, + "flos": 40696786932480.0, + "grad_norm": 1.6350956493109963, + "language_loss": 0.69048089, + "learning_rate": 6.142508893266019e-07, + "loss": 0.71128297, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.36523438, + "step": 12369, + "time_per_iteration": 2.55206036567688 + }, + { + "auxiliary_loss_clip": 0.01055995, + "auxiliary_loss_mlp": 0.01022461, + "balance_loss_clip": 1.01101661, + "balance_loss_mlp": 1.01824784, + "epoch": 0.7437246355027807, + "flos": 18908051592960.0, + "grad_norm": 1.9271415474915132, + "language_loss": 0.75510842, + "learning_rate": 6.139784821879406e-07, + "loss": 0.77589297, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37890625, + "step": 12370, + "time_per_iteration": 2.427562713623047 + }, + { + "auxiliary_loss_clip": 0.01059645, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.01653767, + "balance_loss_mlp": 1.0208652, + "epoch": 0.7437847587554487, + "flos": 21542736833280.0, + "grad_norm": 1.3775049567415036, + "language_loss": 0.78289521, + "learning_rate": 6.137061245128208e-07, + "loss": 0.80377132, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 12371, + "time_per_iteration": 2.3936381340026855 + }, + { + "auxiliary_loss_clip": 0.010572, + "auxiliary_loss_mlp": 0.01021513, + "balance_loss_clip": 1.00962782, + "balance_loss_mlp": 1.01863241, + "epoch": 0.7438448820081166, + "flos": 27781227233280.0, + "grad_norm": 1.649456314731079, + "language_loss": 0.69665974, + "learning_rate": 6.13433816310964e-07, + "loss": 0.71744686, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 12372, + "time_per_iteration": 2.453582525253296 + }, + { + "auxiliary_loss_clip": 0.01053875, + "auxiliary_loss_mlp": 0.01022996, + "balance_loss_clip": 1.01151562, + "balance_loss_mlp": 1.01767147, + "epoch": 0.7439050052607846, + "flos": 17966862627840.0, + "grad_norm": 2.467514809604808, + "language_loss": 0.75059032, + "learning_rate": 6.131615575920879e-07, + "loss": 0.77135909, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.36328125, + "step": 12373, + "time_per_iteration": 2.368101119995117 + }, + { + "auxiliary_loss_clip": 0.0105837, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.01454616, + "balance_loss_mlp": 1.01820123, + "epoch": 0.7439651285134525, + "flos": 22957801948800.0, + "grad_norm": 1.7166257133000664, + "language_loss": 0.77061707, + "learning_rate": 6.128893483659081e-07, + "loss": 0.79147518, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 12374, + "time_per_iteration": 2.4025416374206543 + }, + { + "auxiliary_loss_clip": 0.01057316, + "auxiliary_loss_mlp": 0.01023827, + "balance_loss_clip": 1.01164961, + "balance_loss_mlp": 1.01897383, + "epoch": 0.7440252517661206, + "flos": 18805790620800.0, + "grad_norm": 1.94098865589259, + "language_loss": 0.76381803, + "learning_rate": 6.126171886421389e-07, + "loss": 0.78462946, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3828125, + "step": 12375, + "time_per_iteration": 2.367199659347534 + }, + { + "auxiliary_loss_clip": 0.01053154, + "auxiliary_loss_mlp": 0.01021642, + "balance_loss_clip": 1.01145601, + "balance_loss_mlp": 1.01696658, + "epoch": 0.7440853750187885, + "flos": 20265264852480.0, + "grad_norm": 1.9861392238067161, + "language_loss": 0.77496332, + "learning_rate": 6.123450784304942e-07, + "loss": 0.79571128, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.36132812, + "step": 12376, + "time_per_iteration": 2.374833583831787 + }, + { + "auxiliary_loss_clip": 0.01058157, + "auxiliary_loss_mlp": 0.01020324, + "balance_loss_clip": 1.00805175, + "balance_loss_mlp": 1.01847386, + "epoch": 0.7441454982714565, + "flos": 25336120008960.0, + "grad_norm": 2.0432326272722965, + "language_loss": 0.79599941, + "learning_rate": 6.120730177406848e-07, + "loss": 0.81678414, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.39648438, + "step": 12377, + "time_per_iteration": 2.4384782314300537 + }, + { + "auxiliary_loss_clip": 0.01058505, + "auxiliary_loss_mlp": 0.01022567, + "balance_loss_clip": 1.01070571, + "balance_loss_mlp": 1.01919389, + "epoch": 0.7442056215241245, + "flos": 64478819191680.0, + "grad_norm": 1.9875864615115826, + "language_loss": 0.67412001, + "learning_rate": 6.118010065824177e-07, + "loss": 0.69493067, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39257812, + "step": 12378, + "time_per_iteration": 2.8294119834899902 + }, + { + "auxiliary_loss_clip": 0.01058242, + "auxiliary_loss_mlp": 0.0102384, + "balance_loss_clip": 1.01119161, + "balance_loss_mlp": 1.01871347, + "epoch": 0.7442657447767924, + "flos": 31284028229760.0, + "grad_norm": 1.650196675133388, + "language_loss": 0.71803856, + "learning_rate": 6.115290449654027e-07, + "loss": 0.7388593, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 12379, + "time_per_iteration": 3.880797863006592 + }, + { + "auxiliary_loss_clip": 0.010564, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.01553345, + "balance_loss_mlp": 1.01788998, + "epoch": 0.7443258680294604, + "flos": 20807012419200.0, + "grad_norm": 1.911249981148327, + "language_loss": 0.65375018, + "learning_rate": 6.112571328993443e-07, + "loss": 0.67459178, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38476562, + "step": 12380, + "time_per_iteration": 2.4053828716278076 + }, + { + "auxiliary_loss_clip": 0.01059007, + "auxiliary_loss_mlp": 0.01024485, + "balance_loss_clip": 1.01200974, + "balance_loss_mlp": 1.01988697, + "epoch": 0.7443859912821283, + "flos": 22198754880000.0, + "grad_norm": 1.8419184533794877, + "language_loss": 0.78294295, + "learning_rate": 6.109852703939466e-07, + "loss": 0.80377793, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.390625, + "step": 12381, + "time_per_iteration": 2.376652956008911 + }, + { + "auxiliary_loss_clip": 0.01059142, + "auxiliary_loss_mlp": 0.01024272, + "balance_loss_clip": 1.01236343, + "balance_loss_mlp": 1.02040386, + "epoch": 0.7444461145347964, + "flos": 22016752629120.0, + "grad_norm": 1.9182249838713399, + "language_loss": 0.72744048, + "learning_rate": 6.107134574589111e-07, + "loss": 0.74827468, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 12382, + "time_per_iteration": 2.4117119312286377 + }, + { + "auxiliary_loss_clip": 0.01007043, + "auxiliary_loss_mlp": 0.01001195, + "balance_loss_clip": 1.00035429, + "balance_loss_mlp": 1.00067818, + "epoch": 0.7445062377874643, + "flos": 70555170038400.0, + "grad_norm": 0.6508762837637454, + "language_loss": 0.54622966, + "learning_rate": 6.104416941039392e-07, + "loss": 0.56631207, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.00842285, + "router_z_loss_mlp": 0.06347656, + "step": 12383, + "time_per_iteration": 3.2016940116882324 + }, + { + "auxiliary_loss_clip": 0.01055932, + "auxiliary_loss_mlp": 0.01022897, + "balance_loss_clip": 1.01075006, + "balance_loss_mlp": 1.0174551, + "epoch": 0.7445663610401323, + "flos": 22163317983360.0, + "grad_norm": 1.9975573625911343, + "language_loss": 0.76621777, + "learning_rate": 6.101699803387288e-07, + "loss": 0.78700608, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38476562, + "step": 12384, + "time_per_iteration": 2.395061731338501 + }, + { + "auxiliary_loss_clip": 0.01056254, + "auxiliary_loss_mlp": 0.01023585, + "balance_loss_clip": 1.01190233, + "balance_loss_mlp": 1.01881886, + "epoch": 0.7446264842928002, + "flos": 24169113169920.0, + "grad_norm": 2.007115576814623, + "language_loss": 0.79200447, + "learning_rate": 6.098983161729769e-07, + "loss": 0.81280279, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.375, + "step": 12385, + "time_per_iteration": 2.4434943199157715 + }, + { + "auxiliary_loss_clip": 0.01060618, + "auxiliary_loss_mlp": 0.01025379, + "balance_loss_clip": 1.012725, + "balance_loss_mlp": 1.01916003, + "epoch": 0.7446866075454682, + "flos": 24133396982400.0, + "grad_norm": 1.7108248652349136, + "language_loss": 0.81555939, + "learning_rate": 6.096267016163777e-07, + "loss": 0.83641934, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.4140625, + "step": 12386, + "time_per_iteration": 2.4121978282928467 + }, + { + "auxiliary_loss_clip": 0.01055695, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.01645327, + "balance_loss_mlp": 1.01825476, + "epoch": 0.7447467307981361, + "flos": 23546995920000.0, + "grad_norm": 2.206026540640643, + "language_loss": 0.70897639, + "learning_rate": 6.09355136678626e-07, + "loss": 0.72980905, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.375, + "step": 12387, + "time_per_iteration": 2.402820587158203 + }, + { + "auxiliary_loss_clip": 0.01057718, + "auxiliary_loss_mlp": 0.01024308, + "balance_loss_clip": 1.0129236, + "balance_loss_mlp": 1.01872182, + "epoch": 0.7448068540508042, + "flos": 19566967282560.0, + "grad_norm": 2.058134170476825, + "language_loss": 0.70399171, + "learning_rate": 6.090836213694115e-07, + "loss": 0.72481197, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.390625, + "step": 12388, + "time_per_iteration": 2.379838466644287 + }, + { + "auxiliary_loss_clip": 0.01056776, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.01567101, + "balance_loss_mlp": 1.01885068, + "epoch": 0.7448669773034721, + "flos": 21838520805120.0, + "grad_norm": 2.816245319169908, + "language_loss": 0.85104841, + "learning_rate": 6.088121556984249e-07, + "loss": 0.87188613, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 12389, + "time_per_iteration": 3.886741876602173 + }, + { + "auxiliary_loss_clip": 0.01055596, + "auxiliary_loss_mlp": 0.01026205, + "balance_loss_clip": 1.01472509, + "balance_loss_mlp": 1.01749015, + "epoch": 0.7449271005561401, + "flos": 25154222492160.0, + "grad_norm": 1.8014610011382324, + "language_loss": 0.79378527, + "learning_rate": 6.085407396753541e-07, + "loss": 0.81460333, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38085938, + "step": 12390, + "time_per_iteration": 3.907315492630005 + }, + { + "auxiliary_loss_clip": 0.01058617, + "auxiliary_loss_mlp": 0.01021724, + "balance_loss_clip": 1.0098207, + "balance_loss_mlp": 1.01977372, + "epoch": 0.7449872238088081, + "flos": 22272247025280.0, + "grad_norm": 1.987708898804844, + "language_loss": 0.85010582, + "learning_rate": 6.082693733098851e-07, + "loss": 0.87090921, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38867188, + "step": 12391, + "time_per_iteration": 2.420247793197632 + }, + { + "auxiliary_loss_clip": 0.01056787, + "auxiliary_loss_mlp": 0.01024244, + "balance_loss_clip": 1.01311624, + "balance_loss_mlp": 1.0192616, + "epoch": 0.745047347061476, + "flos": 20593762634880.0, + "grad_norm": 1.7583452641406079, + "language_loss": 0.73775458, + "learning_rate": 6.079980566117022e-07, + "loss": 0.75856489, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.375, + "step": 12392, + "time_per_iteration": 2.401505470275879 + }, + { + "auxiliary_loss_clip": 0.01054396, + "auxiliary_loss_mlp": 0.01024309, + "balance_loss_clip": 1.01328254, + "balance_loss_mlp": 1.01828384, + "epoch": 0.745107470314144, + "flos": 22126449720960.0, + "grad_norm": 1.6652267068295388, + "language_loss": 0.76457644, + "learning_rate": 6.077267895904872e-07, + "loss": 0.78536344, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36132812, + "step": 12393, + "time_per_iteration": 2.4219231605529785 + }, + { + "auxiliary_loss_clip": 0.01057765, + "auxiliary_loss_mlp": 0.01027608, + "balance_loss_clip": 1.01482928, + "balance_loss_mlp": 1.0190866, + "epoch": 0.745167593566812, + "flos": 22235413674240.0, + "grad_norm": 2.388308038952665, + "language_loss": 0.72531903, + "learning_rate": 6.074555722559232e-07, + "loss": 0.74617279, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.38671875, + "step": 12394, + "time_per_iteration": 2.3981447219848633 + }, + { + "auxiliary_loss_clip": 0.01058461, + "auxiliary_loss_mlp": 0.01024776, + "balance_loss_clip": 1.01278949, + "balance_loss_mlp": 1.01941037, + "epoch": 0.74522771681948, + "flos": 20665229921280.0, + "grad_norm": 1.6600515504192355, + "language_loss": 0.81571728, + "learning_rate": 6.071844046176863e-07, + "loss": 0.83654964, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 12395, + "time_per_iteration": 2.3974742889404297 + }, + { + "auxiliary_loss_clip": 0.01060488, + "auxiliary_loss_mlp": 0.01024056, + "balance_loss_clip": 1.01176524, + "balance_loss_mlp": 1.02021396, + "epoch": 0.7452878400721479, + "flos": 21105903502080.0, + "grad_norm": 2.005981645238816, + "language_loss": 0.71447396, + "learning_rate": 6.069132866854561e-07, + "loss": 0.73531944, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 12396, + "time_per_iteration": 2.386715888977051 + }, + { + "auxiliary_loss_clip": 0.01058574, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.01463819, + "balance_loss_mlp": 1.01870179, + "epoch": 0.7453479633248159, + "flos": 26686839755520.0, + "grad_norm": 2.3511776462944747, + "language_loss": 0.6858269, + "learning_rate": 6.06642218468907e-07, + "loss": 0.70667815, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 12397, + "time_per_iteration": 3.8712832927703857 + }, + { + "auxiliary_loss_clip": 0.01055846, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.01640344, + "balance_loss_mlp": 1.018296, + "epoch": 0.7454080865774838, + "flos": 17015200254720.0, + "grad_norm": 1.8380681032764796, + "language_loss": 0.7690351, + "learning_rate": 6.063711999777132e-07, + "loss": 0.78987563, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.375, + "step": 12398, + "time_per_iteration": 2.369433641433716 + }, + { + "auxiliary_loss_clip": 0.01054873, + "auxiliary_loss_mlp": 0.01021709, + "balance_loss_clip": 1.00959182, + "balance_loss_mlp": 1.01819825, + "epoch": 0.7454682098301518, + "flos": 21322853890560.0, + "grad_norm": 2.17861397414956, + "language_loss": 0.77489972, + "learning_rate": 6.061002312215457e-07, + "loss": 0.7956655, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3671875, + "step": 12399, + "time_per_iteration": 2.379012107849121 + }, + { + "auxiliary_loss_clip": 0.01052964, + "auxiliary_loss_mlp": 0.01022008, + "balance_loss_clip": 1.01150584, + "balance_loss_mlp": 1.01722753, + "epoch": 0.7455283330828197, + "flos": 17857375004160.0, + "grad_norm": 1.876839850385442, + "language_loss": 0.73680091, + "learning_rate": 6.058293122100761e-07, + "loss": 0.75755072, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.35742188, + "step": 12400, + "time_per_iteration": 2.3653640747070312 + }, + { + "auxiliary_loss_clip": 0.01054264, + "auxiliary_loss_mlp": 0.01021992, + "balance_loss_clip": 1.01087618, + "balance_loss_mlp": 1.01791239, + "epoch": 0.7455884563354878, + "flos": 30772934703360.0, + "grad_norm": 2.0718217879930885, + "language_loss": 0.8203249, + "learning_rate": 6.055584429529721e-07, + "loss": 0.84108746, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36328125, + "step": 12401, + "time_per_iteration": 2.460930109024048 + }, + { + "auxiliary_loss_clip": 0.01055673, + "auxiliary_loss_mlp": 0.01023649, + "balance_loss_clip": 1.01280713, + "balance_loss_mlp": 1.01879478, + "epoch": 0.7456485795881557, + "flos": 23184422784000.0, + "grad_norm": 1.6475964362403976, + "language_loss": 0.65045416, + "learning_rate": 6.052876234599003e-07, + "loss": 0.67124736, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36914062, + "step": 12402, + "time_per_iteration": 2.425457239151001 + }, + { + "auxiliary_loss_clip": 0.01054303, + "auxiliary_loss_mlp": 0.01025237, + "balance_loss_clip": 1.01485372, + "balance_loss_mlp": 1.01742315, + "epoch": 0.7457087028408237, + "flos": 38725625681280.0, + "grad_norm": 1.631716490981115, + "language_loss": 0.73155034, + "learning_rate": 6.050168537405249e-07, + "loss": 0.75234574, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.36914062, + "step": 12403, + "time_per_iteration": 2.5344767570495605 + }, + { + "auxiliary_loss_clip": 0.01007395, + "auxiliary_loss_mlp": 0.01001011, + "balance_loss_clip": 1.00020087, + "balance_loss_mlp": 1.00096488, + "epoch": 0.7457688260934917, + "flos": 56041113473280.0, + "grad_norm": 0.8296002205859611, + "language_loss": 0.58850193, + "learning_rate": 6.04746133804511e-07, + "loss": 0.60858601, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.00811768, + "router_z_loss_mlp": 0.06445312, + "step": 12404, + "time_per_iteration": 2.8764400482177734 + }, + { + "auxiliary_loss_clip": 0.01054902, + "auxiliary_loss_mlp": 0.01024979, + "balance_loss_clip": 1.01342177, + "balance_loss_mlp": 1.01789951, + "epoch": 0.7458289493461596, + "flos": 20115243273600.0, + "grad_norm": 1.97964245662388, + "language_loss": 0.77796853, + "learning_rate": 6.044754636615172e-07, + "loss": 0.79876733, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.36914062, + "step": 12405, + "time_per_iteration": 2.42181396484375 + }, + { + "auxiliary_loss_clip": 0.01055193, + "auxiliary_loss_mlp": 0.01025984, + "balance_loss_clip": 1.01464117, + "balance_loss_mlp": 1.01866519, + "epoch": 0.7458890725988276, + "flos": 20192051998080.0, + "grad_norm": 1.5134843454951041, + "language_loss": 0.69092971, + "learning_rate": 6.042048433212052e-07, + "loss": 0.71174151, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36523438, + "step": 12406, + "time_per_iteration": 2.391042709350586 + }, + { + "auxiliary_loss_clip": 0.01059038, + "auxiliary_loss_mlp": 0.01022355, + "balance_loss_clip": 1.01019573, + "balance_loss_mlp": 1.01843762, + "epoch": 0.7459491958514956, + "flos": 17017818606720.0, + "grad_norm": 1.6930361564949232, + "language_loss": 0.6454041, + "learning_rate": 6.039342727932319e-07, + "loss": 0.66621804, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40625, + "step": 12407, + "time_per_iteration": 2.3512187004089355 + }, + { + "auxiliary_loss_clip": 0.01057988, + "auxiliary_loss_mlp": 0.01025159, + "balance_loss_clip": 1.01226032, + "balance_loss_mlp": 1.0187763, + "epoch": 0.7460093191041636, + "flos": 25077658147200.0, + "grad_norm": 1.6894317815247217, + "language_loss": 0.71763426, + "learning_rate": 6.036637520872531e-07, + "loss": 0.73846573, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39257812, + "step": 12408, + "time_per_iteration": 2.429546356201172 + }, + { + "auxiliary_loss_clip": 0.01060184, + "auxiliary_loss_mlp": 0.01026492, + "balance_loss_clip": 1.01374245, + "balance_loss_mlp": 1.02008367, + "epoch": 0.7460694423568315, + "flos": 21907439562240.0, + "grad_norm": 1.7362634076275676, + "language_loss": 0.72761774, + "learning_rate": 6.033932812129234e-07, + "loss": 0.74848443, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40039062, + "step": 12409, + "time_per_iteration": 2.4037108421325684 + }, + { + "auxiliary_loss_clip": 0.01054044, + "auxiliary_loss_mlp": 0.01024444, + "balance_loss_clip": 1.01373315, + "balance_loss_mlp": 1.01806247, + "epoch": 0.7461295656094995, + "flos": 21214657987200.0, + "grad_norm": 2.8730697885515335, + "language_loss": 0.66113281, + "learning_rate": 6.031228601798944e-07, + "loss": 0.68191767, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.359375, + "step": 12410, + "time_per_iteration": 2.397968053817749 + }, + { + "auxiliary_loss_clip": 0.01054653, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_clip": 1.01222539, + "balance_loss_mlp": 1.01852155, + "epoch": 0.7461896888621674, + "flos": 22345739170560.0, + "grad_norm": 2.2572983783036307, + "language_loss": 0.84550506, + "learning_rate": 6.028524889978184e-07, + "loss": 0.86628699, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36132812, + "step": 12411, + "time_per_iteration": 2.381276845932007 + }, + { + "auxiliary_loss_clip": 0.01058348, + "auxiliary_loss_mlp": 0.01021408, + "balance_loss_clip": 1.00921333, + "balance_loss_mlp": 1.0182296, + "epoch": 0.7462498121148354, + "flos": 25481777667840.0, + "grad_norm": 1.4294658438203707, + "language_loss": 0.77803957, + "learning_rate": 6.025821676763421e-07, + "loss": 0.79883718, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 12412, + "time_per_iteration": 2.433680772781372 + }, + { + "auxiliary_loss_clip": 0.01056844, + "auxiliary_loss_mlp": 0.01024747, + "balance_loss_clip": 1.01391065, + "balance_loss_mlp": 1.01893473, + "epoch": 0.7463099353675033, + "flos": 33946539690240.0, + "grad_norm": 1.6982838273234406, + "language_loss": 0.62333238, + "learning_rate": 6.023118962251141e-07, + "loss": 0.64414835, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37890625, + "step": 12413, + "time_per_iteration": 2.500378370285034 + }, + { + "auxiliary_loss_clip": 0.0100708, + "auxiliary_loss_mlp": 0.01001771, + "balance_loss_clip": 1.00079346, + "balance_loss_mlp": 1.00073242, + "epoch": 0.7463700586201714, + "flos": 62764616747520.0, + "grad_norm": 0.7173224074374578, + "language_loss": 0.54903448, + "learning_rate": 6.020416746537793e-07, + "loss": 0.56912303, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.06347656, + "step": 12414, + "time_per_iteration": 3.0679023265838623 + }, + { + "auxiliary_loss_clip": 0.01055654, + "auxiliary_loss_mlp": 0.01024577, + "balance_loss_clip": 1.01271546, + "balance_loss_mlp": 1.01748598, + "epoch": 0.7464301818728393, + "flos": 33431396446080.0, + "grad_norm": 1.6884036640279367, + "language_loss": 0.55006754, + "learning_rate": 6.01771502971981e-07, + "loss": 0.5708698, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38085938, + "step": 12415, + "time_per_iteration": 2.4853134155273438 + }, + { + "auxiliary_loss_clip": 0.0105632, + "auxiliary_loss_mlp": 0.01023649, + "balance_loss_clip": 1.01225305, + "balance_loss_mlp": 1.01893771, + "epoch": 0.7464903051255073, + "flos": 26868667449600.0, + "grad_norm": 1.7092046957900082, + "language_loss": 0.75102234, + "learning_rate": 6.015013811893608e-07, + "loss": 0.77182198, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.375, + "step": 12416, + "time_per_iteration": 2.4469215869903564 + }, + { + "auxiliary_loss_clip": 0.01056688, + "auxiliary_loss_mlp": 0.01024607, + "balance_loss_clip": 1.01261497, + "balance_loss_mlp": 1.01870286, + "epoch": 0.7465504283781753, + "flos": 44085387271680.0, + "grad_norm": 1.7450561735554138, + "language_loss": 0.69424546, + "learning_rate": 6.012313093155598e-07, + "loss": 0.71505845, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38085938, + "step": 12417, + "time_per_iteration": 2.5781562328338623 + }, + { + "auxiliary_loss_clip": 0.01057871, + "auxiliary_loss_mlp": 0.0102041, + "balance_loss_clip": 1.00869727, + "balance_loss_mlp": 1.01967144, + "epoch": 0.7466105516308432, + "flos": 19675267920000.0, + "grad_norm": 1.8021453295728935, + "language_loss": 0.73836237, + "learning_rate": 6.009612873602143e-07, + "loss": 0.75914526, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 12418, + "time_per_iteration": 2.406820058822632 + }, + { + "auxiliary_loss_clip": 0.01056666, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.01653576, + "balance_loss_mlp": 1.01875865, + "epoch": 0.7466706748835112, + "flos": 20519711907840.0, + "grad_norm": 1.5112936683423757, + "language_loss": 0.69012904, + "learning_rate": 6.006913153329623e-07, + "loss": 0.71097934, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 12419, + "time_per_iteration": 3.921302318572998 + }, + { + "auxiliary_loss_clip": 0.01053442, + "auxiliary_loss_mlp": 0.01021627, + "balance_loss_clip": 1.01064181, + "balance_loss_mlp": 1.01814151, + "epoch": 0.7467307981361792, + "flos": 21689162542080.0, + "grad_norm": 1.6387711746671023, + "language_loss": 0.74694836, + "learning_rate": 6.004213932434373e-07, + "loss": 0.76769912, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.35351562, + "step": 12420, + "time_per_iteration": 2.3867690563201904 + }, + { + "auxiliary_loss_clip": 0.01058407, + "auxiliary_loss_mlp": 0.01023496, + "balance_loss_clip": 1.01110983, + "balance_loss_mlp": 1.01990604, + "epoch": 0.7467909213888472, + "flos": 19572657834240.0, + "grad_norm": 2.051219530952272, + "language_loss": 0.67568505, + "learning_rate": 6.001515211012736e-07, + "loss": 0.69650412, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38476562, + "step": 12421, + "time_per_iteration": 2.3775784969329834 + }, + { + "auxiliary_loss_clip": 0.01059222, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.01532054, + "balance_loss_mlp": 1.01917362, + "epoch": 0.7468510446415151, + "flos": 23694119856000.0, + "grad_norm": 1.9555660079137087, + "language_loss": 0.72877342, + "learning_rate": 5.998816989161008e-07, + "loss": 0.74964571, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 12422, + "time_per_iteration": 2.4215755462646484 + }, + { + "auxiliary_loss_clip": 0.01056383, + "auxiliary_loss_mlp": 0.01022204, + "balance_loss_clip": 1.01059294, + "balance_loss_mlp": 1.01921296, + "epoch": 0.7469111678941831, + "flos": 29314472901120.0, + "grad_norm": 2.2022274839874263, + "language_loss": 0.74669921, + "learning_rate": 5.996119266975479e-07, + "loss": 0.76748508, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37109375, + "step": 12423, + "time_per_iteration": 2.4479918479919434 + }, + { + "auxiliary_loss_clip": 0.01053874, + "auxiliary_loss_mlp": 0.01019955, + "balance_loss_clip": 1.0097034, + "balance_loss_mlp": 1.01722419, + "epoch": 0.746971291146851, + "flos": 21797567913600.0, + "grad_norm": 1.3245367644197366, + "language_loss": 0.6783309, + "learning_rate": 5.993422044552445e-07, + "loss": 0.69906926, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.3671875, + "step": 12424, + "time_per_iteration": 2.4143190383911133 + }, + { + "auxiliary_loss_clip": 0.01058903, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.01700866, + "balance_loss_mlp": 1.01996863, + "epoch": 0.747031414399519, + "flos": 36243650194560.0, + "grad_norm": 2.295076317181677, + "language_loss": 0.74493802, + "learning_rate": 5.990725321988137e-07, + "loss": 0.76581943, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38867188, + "step": 12425, + "time_per_iteration": 2.5271358489990234 + }, + { + "auxiliary_loss_clip": 0.01056228, + "auxiliary_loss_mlp": 0.01021574, + "balance_loss_clip": 1.00995088, + "balance_loss_mlp": 1.01870215, + "epoch": 0.7470915376521869, + "flos": 19973879712000.0, + "grad_norm": 2.742475073676574, + "language_loss": 0.69248021, + "learning_rate": 5.988029099378811e-07, + "loss": 0.71325821, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 12426, + "time_per_iteration": 2.414034128189087 + }, + { + "auxiliary_loss_clip": 0.01053806, + "auxiliary_loss_mlp": 0.01026481, + "balance_loss_clip": 1.01493001, + "balance_loss_mlp": 1.01751089, + "epoch": 0.747151660904855, + "flos": 20083262601600.0, + "grad_norm": 1.3839856947636244, + "language_loss": 0.65494144, + "learning_rate": 5.985333376820679e-07, + "loss": 0.6757443, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36328125, + "step": 12427, + "time_per_iteration": 2.3918328285217285 + }, + { + "auxiliary_loss_clip": 0.01056804, + "auxiliary_loss_mlp": 0.01022882, + "balance_loss_clip": 1.01106262, + "balance_loss_mlp": 1.01856279, + "epoch": 0.7472117841575229, + "flos": 16289425578240.0, + "grad_norm": 1.621248407365296, + "language_loss": 0.74853194, + "learning_rate": 5.982638154409958e-07, + "loss": 0.76932877, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 12428, + "time_per_iteration": 2.4028708934783936 + }, + { + "auxiliary_loss_clip": 0.01060124, + "auxiliary_loss_mlp": 0.01022558, + "balance_loss_clip": 1.00995743, + "balance_loss_mlp": 1.01964736, + "epoch": 0.7472719074101909, + "flos": 21389084472960.0, + "grad_norm": 8.548688990359487, + "language_loss": 0.58338076, + "learning_rate": 5.979943432242814e-07, + "loss": 0.60420758, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.40429688, + "step": 12429, + "time_per_iteration": 5.250476837158203 + }, + { + "auxiliary_loss_clip": 0.01059019, + "auxiliary_loss_mlp": 0.01025345, + "balance_loss_clip": 1.01366234, + "balance_loss_mlp": 1.01920629, + "epoch": 0.7473320306628589, + "flos": 29641993165440.0, + "grad_norm": 2.2896590855701984, + "language_loss": 0.60638458, + "learning_rate": 5.977249210415429e-07, + "loss": 0.62722826, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3984375, + "step": 12430, + "time_per_iteration": 2.4484288692474365 + }, + { + "auxiliary_loss_clip": 0.0105735, + "auxiliary_loss_mlp": 0.01023291, + "balance_loss_clip": 1.0120616, + "balance_loss_mlp": 1.01931214, + "epoch": 0.7473921539155268, + "flos": 24134898170880.0, + "grad_norm": 1.4784352724231347, + "language_loss": 0.75431705, + "learning_rate": 5.974555489023951e-07, + "loss": 0.77512348, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38085938, + "step": 12431, + "time_per_iteration": 2.4507498741149902 + }, + { + "auxiliary_loss_clip": 0.01058021, + "auxiliary_loss_mlp": 0.01025997, + "balance_loss_clip": 1.01455927, + "balance_loss_mlp": 1.01939857, + "epoch": 0.7474522771681948, + "flos": 17487156280320.0, + "grad_norm": 1.8818895745903261, + "language_loss": 0.752702, + "learning_rate": 5.971862268164511e-07, + "loss": 0.77354217, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 12432, + "time_per_iteration": 2.361717939376831 + }, + { + "auxiliary_loss_clip": 0.01058821, + "auxiliary_loss_mlp": 0.01026535, + "balance_loss_clip": 1.01348734, + "balance_loss_mlp": 1.01823604, + "epoch": 0.7475124004208628, + "flos": 16726363643520.0, + "grad_norm": 2.100755629121845, + "language_loss": 0.70314771, + "learning_rate": 5.969169547933213e-07, + "loss": 0.72400129, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 12433, + "time_per_iteration": 2.413780927658081 + }, + { + "auxiliary_loss_clip": 0.01057213, + "auxiliary_loss_mlp": 0.01022738, + "balance_loss_clip": 1.01046538, + "balance_loss_mlp": 1.01846874, + "epoch": 0.7475725236735308, + "flos": 19719188277120.0, + "grad_norm": 1.5983901897507355, + "language_loss": 0.66920877, + "learning_rate": 5.966477328426176e-07, + "loss": 0.69000834, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38867188, + "step": 12434, + "time_per_iteration": 2.4347877502441406 + }, + { + "auxiliary_loss_clip": 0.01053579, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.01784492, + "balance_loss_mlp": 1.01877642, + "epoch": 0.7476326469261987, + "flos": 26284814916480.0, + "grad_norm": 1.4475781679244697, + "language_loss": 0.79563797, + "learning_rate": 5.963785609739453e-07, + "loss": 0.81645685, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.34765625, + "step": 12435, + "time_per_iteration": 2.442237377166748 + }, + { + "auxiliary_loss_clip": 0.01056857, + "auxiliary_loss_mlp": 0.01019184, + "balance_loss_clip": 1.00746012, + "balance_loss_mlp": 1.01803756, + "epoch": 0.7476927701788667, + "flos": 31830488830080.0, + "grad_norm": 1.7997775126420714, + "language_loss": 0.68159485, + "learning_rate": 5.961094391969121e-07, + "loss": 0.70235527, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38867188, + "step": 12436, + "time_per_iteration": 2.460559844970703 + }, + { + "auxiliary_loss_clip": 0.01058888, + "auxiliary_loss_mlp": 0.01026942, + "balance_loss_clip": 1.01530147, + "balance_loss_mlp": 1.01935279, + "epoch": 0.7477528934315346, + "flos": 31794144238080.0, + "grad_norm": 1.907577431109213, + "language_loss": 0.71000141, + "learning_rate": 5.958403675211219e-07, + "loss": 0.73085976, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39453125, + "step": 12437, + "time_per_iteration": 3.897904396057129 + }, + { + "auxiliary_loss_clip": 0.01052954, + "auxiliary_loss_mlp": 0.01022241, + "balance_loss_clip": 1.01198936, + "balance_loss_mlp": 1.01754308, + "epoch": 0.7478130166842026, + "flos": 20371051872000.0, + "grad_norm": 1.660823626207045, + "language_loss": 0.67360628, + "learning_rate": 5.955713459561768e-07, + "loss": 0.69435823, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.35351562, + "step": 12438, + "time_per_iteration": 2.3953704833984375 + }, + { + "auxiliary_loss_clip": 0.01055361, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.01307571, + "balance_loss_mlp": 1.01788259, + "epoch": 0.7478731399368705, + "flos": 18147992094720.0, + "grad_norm": 1.6485330545835213, + "language_loss": 0.80697167, + "learning_rate": 5.953023745116781e-07, + "loss": 0.82777119, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.375, + "step": 12439, + "time_per_iteration": 2.367274045944214 + }, + { + "auxiliary_loss_clip": 0.01056587, + "auxiliary_loss_mlp": 0.01025118, + "balance_loss_clip": 1.01428199, + "balance_loss_mlp": 1.01829386, + "epoch": 0.7479332631895386, + "flos": 15266994145920.0, + "grad_norm": 1.9144377922399538, + "language_loss": 0.71071792, + "learning_rate": 5.950334531972234e-07, + "loss": 0.73153496, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3828125, + "step": 12440, + "time_per_iteration": 2.3492560386657715 + }, + { + "auxiliary_loss_clip": 0.0105665, + "auxiliary_loss_mlp": 0.0102225, + "balance_loss_clip": 1.01101446, + "balance_loss_mlp": 1.01984, + "epoch": 0.7479933864422065, + "flos": 21141445132800.0, + "grad_norm": 1.7883044282262799, + "language_loss": 0.74165678, + "learning_rate": 5.947645820224123e-07, + "loss": 0.76244581, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.36914062, + "step": 12441, + "time_per_iteration": 2.4425036907196045 + }, + { + "auxiliary_loss_clip": 0.0105965, + "auxiliary_loss_mlp": 0.01027943, + "balance_loss_clip": 1.01542616, + "balance_loss_mlp": 1.02014923, + "epoch": 0.7480535096948745, + "flos": 14391162979200.0, + "grad_norm": 2.014931752445199, + "language_loss": 0.82476461, + "learning_rate": 5.94495760996837e-07, + "loss": 0.8456406, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 12442, + "time_per_iteration": 2.3910655975341797 + }, + { + "auxiliary_loss_clip": 0.0105755, + "auxiliary_loss_mlp": 0.01028677, + "balance_loss_clip": 1.01536155, + "balance_loss_mlp": 1.0188036, + "epoch": 0.7481136329475425, + "flos": 27343451295360.0, + "grad_norm": 2.2153900951761933, + "language_loss": 0.6236943, + "learning_rate": 5.942269901300934e-07, + "loss": 0.64455652, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.38671875, + "step": 12443, + "time_per_iteration": 2.436368465423584 + }, + { + "auxiliary_loss_clip": 0.01054803, + "auxiliary_loss_mlp": 0.01021096, + "balance_loss_clip": 1.01020646, + "balance_loss_mlp": 1.01797366, + "epoch": 0.7481737562002104, + "flos": 19530587779200.0, + "grad_norm": 2.714636290082376, + "language_loss": 0.71739352, + "learning_rate": 5.939582694317717e-07, + "loss": 0.7381525, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.3671875, + "step": 12444, + "time_per_iteration": 2.3736183643341064 + }, + { + "auxiliary_loss_clip": 0.01054859, + "auxiliary_loss_mlp": 0.01020606, + "balance_loss_clip": 1.0088098, + "balance_loss_mlp": 1.01785827, + "epoch": 0.7482338794528784, + "flos": 21759023905920.0, + "grad_norm": 2.8170674421278634, + "language_loss": 0.68899679, + "learning_rate": 5.936895989114641e-07, + "loss": 0.70975143, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37109375, + "step": 12445, + "time_per_iteration": 2.4175615310668945 + }, + { + "auxiliary_loss_clip": 0.01054483, + "auxiliary_loss_mlp": 0.0101881, + "balance_loss_clip": 1.00779474, + "balance_loss_mlp": 1.01799464, + "epoch": 0.7482940027055464, + "flos": 18696372819840.0, + "grad_norm": 1.7246400020780046, + "language_loss": 0.7444061, + "learning_rate": 5.934209785787559e-07, + "loss": 0.76513898, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36523438, + "step": 12446, + "time_per_iteration": 2.375856637954712 + }, + { + "auxiliary_loss_clip": 0.01058567, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_clip": 1.0109812, + "balance_loss_mlp": 1.01795363, + "epoch": 0.7483541259582144, + "flos": 15997097831040.0, + "grad_norm": 2.0540734526644644, + "language_loss": 0.73457772, + "learning_rate": 5.931524084432353e-07, + "loss": 0.75540221, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 12447, + "time_per_iteration": 2.3890676498413086 + }, + { + "auxiliary_loss_clip": 0.01054825, + "auxiliary_loss_mlp": 0.01025394, + "balance_loss_clip": 1.01502275, + "balance_loss_mlp": 1.01719618, + "epoch": 0.7484142492108823, + "flos": 25555130167680.0, + "grad_norm": 2.716499898707456, + "language_loss": 0.7221207, + "learning_rate": 5.928838885144864e-07, + "loss": 0.7429229, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.375, + "step": 12448, + "time_per_iteration": 2.420755386352539 + }, + { + "auxiliary_loss_clip": 0.01058133, + "auxiliary_loss_mlp": 0.01026365, + "balance_loss_clip": 1.01500428, + "balance_loss_mlp": 1.019557, + "epoch": 0.7484743724635503, + "flos": 22886788510080.0, + "grad_norm": 4.087385555518913, + "language_loss": 0.73612267, + "learning_rate": 5.926154188020922e-07, + "loss": 0.75696766, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 12449, + "time_per_iteration": 2.4348607063293457 + }, + { + "auxiliary_loss_clip": 0.01061209, + "auxiliary_loss_mlp": 0.01027169, + "balance_loss_clip": 1.01483703, + "balance_loss_mlp": 1.02075028, + "epoch": 0.7485344957162182, + "flos": 25299147012480.0, + "grad_norm": 4.019707177965092, + "language_loss": 0.60725361, + "learning_rate": 5.923469993156327e-07, + "loss": 0.62813735, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 12450, + "time_per_iteration": 2.420968770980835 + }, + { + "auxiliary_loss_clip": 0.01052001, + "auxiliary_loss_mlp": 0.01019407, + "balance_loss_clip": 1.00917304, + "balance_loss_mlp": 1.01694894, + "epoch": 0.7485946189688862, + "flos": 27051786864000.0, + "grad_norm": 1.6519782945188262, + "language_loss": 0.69972384, + "learning_rate": 5.920786300646892e-07, + "loss": 0.72043794, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.34960938, + "step": 12451, + "time_per_iteration": 2.4439597129821777 + }, + { + "auxiliary_loss_clip": 0.01058343, + "auxiliary_loss_mlp": 0.01026127, + "balance_loss_clip": 1.01415229, + "balance_loss_mlp": 1.01864076, + "epoch": 0.7486547422215541, + "flos": 26905535712000.0, + "grad_norm": 2.20785876339996, + "language_loss": 0.80567765, + "learning_rate": 5.918103110588364e-07, + "loss": 0.82652235, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39648438, + "step": 12452, + "time_per_iteration": 2.4360134601593018 + }, + { + "auxiliary_loss_clip": 0.01061185, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.01520801, + "balance_loss_mlp": 1.02046418, + "epoch": 0.7487148654742222, + "flos": 22345180588800.0, + "grad_norm": 2.555279677868419, + "language_loss": 0.7105875, + "learning_rate": 5.91542042307652e-07, + "loss": 0.73147428, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 12453, + "time_per_iteration": 2.4080681800842285 + }, + { + "auxiliary_loss_clip": 0.01056728, + "auxiliary_loss_mlp": 0.01022592, + "balance_loss_clip": 1.01165485, + "balance_loss_mlp": 1.01845086, + "epoch": 0.7487749887268901, + "flos": 23037717784320.0, + "grad_norm": 1.4849236929402747, + "language_loss": 0.73918617, + "learning_rate": 5.912738238207091e-07, + "loss": 0.75997931, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3828125, + "step": 12454, + "time_per_iteration": 2.4002649784088135 + }, + { + "auxiliary_loss_clip": 0.01006694, + "auxiliary_loss_mlp": 0.01000908, + "balance_loss_clip": 1.00000226, + "balance_loss_mlp": 1.0004127, + "epoch": 0.7488351119795581, + "flos": 71521915985280.0, + "grad_norm": 0.7353264201218488, + "language_loss": 0.59429348, + "learning_rate": 5.9100565560758e-07, + "loss": 0.61436951, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.0625, + "step": 12455, + "time_per_iteration": 3.190803289413452 + }, + { + "auxiliary_loss_clip": 0.01056858, + "auxiliary_loss_mlp": 0.01023416, + "balance_loss_clip": 1.01223433, + "balance_loss_mlp": 1.01898289, + "epoch": 0.748895235232226, + "flos": 17195456937600.0, + "grad_norm": 1.6876290758214765, + "language_loss": 0.79413301, + "learning_rate": 5.907375376778343e-07, + "loss": 0.81493574, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37890625, + "step": 12456, + "time_per_iteration": 2.3483664989471436 + }, + { + "auxiliary_loss_clip": 0.01007058, + "auxiliary_loss_mlp": 0.01000992, + "balance_loss_clip": 1.00005054, + "balance_loss_mlp": 1.00076234, + "epoch": 0.748955358484894, + "flos": 58976086250880.0, + "grad_norm": 0.8416095561889513, + "language_loss": 0.6144104, + "learning_rate": 5.904694700410404e-07, + "loss": 0.63449085, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.06298828, + "step": 12457, + "time_per_iteration": 2.9740002155303955 + }, + { + "auxiliary_loss_clip": 0.01058644, + "auxiliary_loss_mlp": 0.01026207, + "balance_loss_clip": 1.01414299, + "balance_loss_mlp": 1.01948965, + "epoch": 0.749015481737562, + "flos": 11359724515200.0, + "grad_norm": 2.0898359031042, + "language_loss": 0.67529595, + "learning_rate": 5.902014527067667e-07, + "loss": 0.69614446, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.390625, + "step": 12458, + "time_per_iteration": 3.77606201171875 + }, + { + "auxiliary_loss_clip": 0.01055044, + "auxiliary_loss_mlp": 0.01026799, + "balance_loss_clip": 1.01605868, + "balance_loss_mlp": 1.01770854, + "epoch": 0.74907560499023, + "flos": 21105414743040.0, + "grad_norm": 1.828324076557754, + "language_loss": 0.80368805, + "learning_rate": 5.899334856845753e-07, + "loss": 0.82450652, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.375, + "step": 12459, + "time_per_iteration": 2.3760852813720703 + }, + { + "auxiliary_loss_clip": 0.01054038, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.01582313, + "balance_loss_mlp": 1.01800346, + "epoch": 0.749135728242898, + "flos": 22267080144000.0, + "grad_norm": 1.6307341617030122, + "language_loss": 0.8049835, + "learning_rate": 5.896655689840313e-07, + "loss": 0.82579178, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36132812, + "step": 12460, + "time_per_iteration": 2.4163081645965576 + }, + { + "auxiliary_loss_clip": 0.01056129, + "auxiliary_loss_mlp": 0.01024283, + "balance_loss_clip": 1.01267219, + "balance_loss_mlp": 1.01847148, + "epoch": 0.7491958514955659, + "flos": 24056483523840.0, + "grad_norm": 1.676936220165959, + "language_loss": 0.86277974, + "learning_rate": 5.893977026146955e-07, + "loss": 0.88358384, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 12461, + "time_per_iteration": 2.424964189529419 + }, + { + "auxiliary_loss_clip": 0.01054065, + "auxiliary_loss_mlp": 0.01022546, + "balance_loss_clip": 1.01229429, + "balance_loss_mlp": 1.01816845, + "epoch": 0.7492559747482339, + "flos": 24491152350720.0, + "grad_norm": 1.7672188924721626, + "language_loss": 0.69562376, + "learning_rate": 5.89129886586127e-07, + "loss": 0.71638989, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.359375, + "step": 12462, + "time_per_iteration": 2.432504415512085 + }, + { + "auxiliary_loss_clip": 0.01056256, + "auxiliary_loss_mlp": 0.01025136, + "balance_loss_clip": 1.01255965, + "balance_loss_mlp": 1.01864278, + "epoch": 0.7493160980009018, + "flos": 27744114591360.0, + "grad_norm": 1.7748549331290997, + "language_loss": 0.70223391, + "learning_rate": 5.888621209078833e-07, + "loss": 0.72304785, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.375, + "step": 12463, + "time_per_iteration": 2.433098316192627 + }, + { + "auxiliary_loss_clip": 0.01057377, + "auxiliary_loss_mlp": 0.01023059, + "balance_loss_clip": 1.01220524, + "balance_loss_mlp": 1.01960504, + "epoch": 0.7493762212535698, + "flos": 30224903091840.0, + "grad_norm": 1.66548366439414, + "language_loss": 0.7744081, + "learning_rate": 5.885944055895208e-07, + "loss": 0.79521251, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37695312, + "step": 12464, + "time_per_iteration": 2.4725582599639893 + }, + { + "auxiliary_loss_clip": 0.01054368, + "auxiliary_loss_mlp": 0.0102213, + "balance_loss_clip": 1.01118016, + "balance_loss_mlp": 1.01749444, + "epoch": 0.7494363445062378, + "flos": 21943400129280.0, + "grad_norm": 1.8320308831973684, + "language_loss": 0.78323555, + "learning_rate": 5.883267406405938e-07, + "loss": 0.80400056, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36914062, + "step": 12465, + "time_per_iteration": 2.3944058418273926 + }, + { + "auxiliary_loss_clip": 0.01061427, + "auxiliary_loss_mlp": 0.01024249, + "balance_loss_clip": 1.01097465, + "balance_loss_mlp": 1.01918149, + "epoch": 0.7494964677589058, + "flos": 12489653623680.0, + "grad_norm": 4.257867501802662, + "language_loss": 0.74677658, + "learning_rate": 5.88059126070654e-07, + "loss": 0.76763332, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.421875, + "step": 12466, + "time_per_iteration": 2.374910831451416 + }, + { + "auxiliary_loss_clip": 0.01056691, + "auxiliary_loss_mlp": 0.01025488, + "balance_loss_clip": 1.01418662, + "balance_loss_mlp": 1.01833832, + "epoch": 0.7495565910115737, + "flos": 21651980077440.0, + "grad_norm": 1.8773237995802485, + "language_loss": 0.66027343, + "learning_rate": 5.877915618892521e-07, + "loss": 0.68109524, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 12467, + "time_per_iteration": 2.385901689529419 + }, + { + "auxiliary_loss_clip": 0.01056613, + "auxiliary_loss_mlp": 0.01023594, + "balance_loss_clip": 1.01194704, + "balance_loss_mlp": 1.01866269, + "epoch": 0.7496167142642417, + "flos": 15267622550400.0, + "grad_norm": 2.9936863182120956, + "language_loss": 0.76502591, + "learning_rate": 5.875240481059367e-07, + "loss": 0.78582799, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37890625, + "step": 12468, + "time_per_iteration": 3.8734042644500732 + }, + { + "auxiliary_loss_clip": 0.01056847, + "auxiliary_loss_mlp": 0.01021835, + "balance_loss_clip": 1.01021242, + "balance_loss_mlp": 1.01930726, + "epoch": 0.7496768375169096, + "flos": 22053830359680.0, + "grad_norm": 2.3785414479104015, + "language_loss": 0.69530332, + "learning_rate": 5.872565847302547e-07, + "loss": 0.71609008, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 12469, + "time_per_iteration": 2.384430170059204 + }, + { + "auxiliary_loss_clip": 0.0105705, + "auxiliary_loss_mlp": 0.01025061, + "balance_loss_clip": 1.01321769, + "balance_loss_mlp": 1.01863253, + "epoch": 0.7497369607695776, + "flos": 19056187958400.0, + "grad_norm": 1.8936844499196686, + "language_loss": 0.69378519, + "learning_rate": 5.869891717717505e-07, + "loss": 0.71460629, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3828125, + "step": 12470, + "time_per_iteration": 2.4098339080810547 + }, + { + "auxiliary_loss_clip": 0.01058249, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.01466382, + "balance_loss_mlp": 1.01757622, + "epoch": 0.7497970840222457, + "flos": 21616333712640.0, + "grad_norm": 1.8808277348682094, + "language_loss": 0.7543726, + "learning_rate": 5.867218092399688e-07, + "loss": 0.77523196, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40625, + "step": 12471, + "time_per_iteration": 2.396634817123413 + }, + { + "auxiliary_loss_clip": 0.01058423, + "auxiliary_loss_mlp": 0.01022273, + "balance_loss_clip": 1.01038742, + "balance_loss_mlp": 1.01943982, + "epoch": 0.7498572072749136, + "flos": 13734725996160.0, + "grad_norm": 2.1314651459119407, + "language_loss": 0.77462125, + "learning_rate": 5.864544971444503e-07, + "loss": 0.79542816, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.390625, + "step": 12472, + "time_per_iteration": 2.415847063064575 + }, + { + "auxiliary_loss_clip": 0.01055018, + "auxiliary_loss_mlp": 0.01025489, + "balance_loss_clip": 1.01470661, + "balance_loss_mlp": 1.01706982, + "epoch": 0.7499173305275816, + "flos": 22965412625280.0, + "grad_norm": 1.6107295446772105, + "language_loss": 0.84504509, + "learning_rate": 5.861872354947345e-07, + "loss": 0.86585021, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37890625, + "step": 12473, + "time_per_iteration": 2.400257110595703 + }, + { + "auxiliary_loss_clip": 0.01060163, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.0161469, + "balance_loss_mlp": 1.01935685, + "epoch": 0.7499774537802495, + "flos": 22739559840000.0, + "grad_norm": 2.1160005664548978, + "language_loss": 0.73834813, + "learning_rate": 5.859200243003592e-07, + "loss": 0.75924307, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.40820312, + "step": 12474, + "time_per_iteration": 2.414358139038086 + }, + { + "auxiliary_loss_clip": 0.01059681, + "auxiliary_loss_mlp": 0.01026135, + "balance_loss_clip": 1.01371312, + "balance_loss_mlp": 1.01859391, + "epoch": 0.7500375770329175, + "flos": 18295569878400.0, + "grad_norm": 2.641432827614488, + "language_loss": 0.74488968, + "learning_rate": 5.856528635708619e-07, + "loss": 0.76574779, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.41015625, + "step": 12475, + "time_per_iteration": 2.389249563217163 + }, + { + "auxiliary_loss_clip": 0.01059894, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.01594353, + "balance_loss_mlp": 1.01886535, + "epoch": 0.7500977002855854, + "flos": 19169027072640.0, + "grad_norm": 1.730607441613958, + "language_loss": 0.79768127, + "learning_rate": 5.853857533157747e-07, + "loss": 0.81856775, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41015625, + "step": 12476, + "time_per_iteration": 3.779188394546509 + }, + { + "auxiliary_loss_clip": 0.01056126, + "auxiliary_loss_mlp": 0.01024205, + "balance_loss_clip": 1.01254594, + "balance_loss_mlp": 1.01801467, + "epoch": 0.7501578235382534, + "flos": 22162794312960.0, + "grad_norm": 2.548588191116197, + "language_loss": 0.77827179, + "learning_rate": 5.851186935446316e-07, + "loss": 0.79907513, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38085938, + "step": 12477, + "time_per_iteration": 2.4082863330841064 + }, + { + "auxiliary_loss_clip": 0.01006667, + "auxiliary_loss_mlp": 0.01000941, + "balance_loss_clip": 0.9999876, + "balance_loss_mlp": 1.00040603, + "epoch": 0.7502179467909214, + "flos": 64462374074880.0, + "grad_norm": 0.8080583996904289, + "language_loss": 0.54451638, + "learning_rate": 5.848516842669626e-07, + "loss": 0.56459248, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.0625, + "step": 12478, + "time_per_iteration": 3.0148112773895264 + }, + { + "auxiliary_loss_clip": 0.01054851, + "auxiliary_loss_mlp": 0.01023961, + "balance_loss_clip": 1.01251698, + "balance_loss_mlp": 1.01847219, + "epoch": 0.7502780700435894, + "flos": 20477432384640.0, + "grad_norm": 1.9888857148438421, + "language_loss": 0.79995573, + "learning_rate": 5.845847254922971e-07, + "loss": 0.8207438, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.36328125, + "step": 12479, + "time_per_iteration": 2.4173004627227783 + }, + { + "auxiliary_loss_clip": 0.01059072, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.0145359, + "balance_loss_mlp": 1.01911592, + "epoch": 0.7503381932962573, + "flos": 20444334549120.0, + "grad_norm": 2.0737822299084736, + "language_loss": 0.62834251, + "learning_rate": 5.843178172301613e-07, + "loss": 0.64920366, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3984375, + "step": 12480, + "time_per_iteration": 2.409273147583008 + }, + { + "auxiliary_loss_clip": 0.01054629, + "auxiliary_loss_mlp": 0.01022685, + "balance_loss_clip": 1.01221263, + "balance_loss_mlp": 1.01872635, + "epoch": 0.7503983165489253, + "flos": 22380861864960.0, + "grad_norm": 1.854424955749656, + "language_loss": 0.77688646, + "learning_rate": 5.840509594900813e-07, + "loss": 0.79765964, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.359375, + "step": 12481, + "time_per_iteration": 2.400726556777954 + }, + { + "auxiliary_loss_clip": 0.01057821, + "auxiliary_loss_mlp": 0.01025022, + "balance_loss_clip": 1.01354218, + "balance_loss_mlp": 1.01800609, + "epoch": 0.7504584398015932, + "flos": 24898309159680.0, + "grad_norm": 2.989636614016745, + "language_loss": 0.71215779, + "learning_rate": 5.837841522815805e-07, + "loss": 0.73298621, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3984375, + "step": 12482, + "time_per_iteration": 2.566549062728882 + }, + { + "auxiliary_loss_clip": 0.01057426, + "auxiliary_loss_mlp": 0.01025171, + "balance_loss_clip": 1.01335144, + "balance_loss_mlp": 1.01937282, + "epoch": 0.7505185630542612, + "flos": 25884046886400.0, + "grad_norm": 1.4844877003981942, + "language_loss": 0.70384765, + "learning_rate": 5.835173956141805e-07, + "loss": 0.72467363, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38085938, + "step": 12483, + "time_per_iteration": 2.4435060024261475 + }, + { + "auxiliary_loss_clip": 0.01056098, + "auxiliary_loss_mlp": 0.01022482, + "balance_loss_clip": 1.01153815, + "balance_loss_mlp": 1.01825953, + "epoch": 0.7505786863069293, + "flos": 23142876399360.0, + "grad_norm": 3.459994488860213, + "language_loss": 0.74231136, + "learning_rate": 5.83250689497401e-07, + "loss": 0.76309711, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37890625, + "step": 12484, + "time_per_iteration": 2.449193239212036 + }, + { + "auxiliary_loss_clip": 0.01055302, + "auxiliary_loss_mlp": 0.0101957, + "balance_loss_clip": 1.00812614, + "balance_loss_mlp": 1.01810694, + "epoch": 0.7506388095595972, + "flos": 16982416621440.0, + "grad_norm": 2.2685156247132046, + "language_loss": 0.84200084, + "learning_rate": 5.829840339407599e-07, + "loss": 0.86274958, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37109375, + "step": 12485, + "time_per_iteration": 2.4017796516418457 + }, + { + "auxiliary_loss_clip": 0.01055503, + "auxiliary_loss_mlp": 0.01022789, + "balance_loss_clip": 1.01149404, + "balance_loss_mlp": 1.01811099, + "epoch": 0.7506989328122652, + "flos": 22343923779840.0, + "grad_norm": 1.472746158186576, + "language_loss": 0.78099018, + "learning_rate": 5.827174289537738e-07, + "loss": 0.80177307, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 12486, + "time_per_iteration": 2.537281036376953 + }, + { + "auxiliary_loss_clip": 0.01058706, + "auxiliary_loss_mlp": 0.01023818, + "balance_loss_clip": 1.01197433, + "balance_loss_mlp": 1.01915574, + "epoch": 0.7507590560649331, + "flos": 25774873464960.0, + "grad_norm": 1.851261613458631, + "language_loss": 0.72724116, + "learning_rate": 5.824508745459562e-07, + "loss": 0.74806631, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.39648438, + "step": 12487, + "time_per_iteration": 2.525874614715576 + }, + { + "auxiliary_loss_clip": 0.01056548, + "auxiliary_loss_mlp": 0.01020199, + "balance_loss_clip": 1.00913, + "balance_loss_mlp": 1.01796091, + "epoch": 0.7508191793176011, + "flos": 24278286591360.0, + "grad_norm": 3.178779639690473, + "language_loss": 0.67047584, + "learning_rate": 5.82184370726821e-07, + "loss": 0.69124329, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.38671875, + "step": 12488, + "time_per_iteration": 2.4335105419158936 + }, + { + "auxiliary_loss_clip": 0.01056352, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_clip": 1.01292169, + "balance_loss_mlp": 1.01869047, + "epoch": 0.750879302570269, + "flos": 19898607087360.0, + "grad_norm": 1.7190006080708438, + "language_loss": 0.77145022, + "learning_rate": 5.819179175058789e-07, + "loss": 0.79226446, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.37695312, + "step": 12489, + "time_per_iteration": 2.3694334030151367 + }, + { + "auxiliary_loss_clip": 0.01055259, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.01392877, + "balance_loss_mlp": 1.01838851, + "epoch": 0.750939425822937, + "flos": 29204391784320.0, + "grad_norm": 1.5866730981080304, + "language_loss": 0.74985331, + "learning_rate": 5.816515148926384e-07, + "loss": 0.77065712, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.36914062, + "step": 12490, + "time_per_iteration": 2.4518163204193115 + }, + { + "auxiliary_loss_clip": 0.01056271, + "auxiliary_loss_mlp": 0.01023679, + "balance_loss_clip": 1.01236629, + "balance_loss_mlp": 1.01863027, + "epoch": 0.750999549075605, + "flos": 21141235664640.0, + "grad_norm": 1.5230858401528649, + "language_loss": 0.78275704, + "learning_rate": 5.813851628966062e-07, + "loss": 0.80355656, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 12491, + "time_per_iteration": 2.3809070587158203 + }, + { + "auxiliary_loss_clip": 0.01053112, + "auxiliary_loss_mlp": 0.01023146, + "balance_loss_clip": 1.01282239, + "balance_loss_mlp": 1.01710999, + "epoch": 0.751059672328273, + "flos": 23546681717760.0, + "grad_norm": 2.1436413784615675, + "language_loss": 0.76367271, + "learning_rate": 5.811188615272899e-07, + "loss": 0.78443527, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.359375, + "step": 12492, + "time_per_iteration": 2.410649299621582 + }, + { + "auxiliary_loss_clip": 0.01054424, + "auxiliary_loss_mlp": 0.01024251, + "balance_loss_clip": 1.01299763, + "balance_loss_mlp": 1.01700628, + "epoch": 0.7511197955809409, + "flos": 18988735478400.0, + "grad_norm": 1.9082961458981553, + "language_loss": 0.71402478, + "learning_rate": 5.808526107941902e-07, + "loss": 0.73481154, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37304688, + "step": 12493, + "time_per_iteration": 2.381715774536133 + }, + { + "auxiliary_loss_clip": 0.01056988, + "auxiliary_loss_mlp": 0.01021734, + "balance_loss_clip": 1.0101831, + "balance_loss_mlp": 1.01921082, + "epoch": 0.7511799188336089, + "flos": 22046080037760.0, + "grad_norm": 1.5941481528320547, + "language_loss": 0.80655336, + "learning_rate": 5.80586410706811e-07, + "loss": 0.82734054, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37890625, + "step": 12494, + "time_per_iteration": 2.400967597961426 + }, + { + "auxiliary_loss_clip": 0.01058605, + "auxiliary_loss_mlp": 0.01024035, + "balance_loss_clip": 1.01156521, + "balance_loss_mlp": 1.01942086, + "epoch": 0.7512400420862768, + "flos": 16466330770560.0, + "grad_norm": 2.05599779538208, + "language_loss": 0.78561825, + "learning_rate": 5.803202612746518e-07, + "loss": 0.80644464, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 12495, + "time_per_iteration": 2.37457275390625 + }, + { + "auxiliary_loss_clip": 0.01057503, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.01560998, + "balance_loss_mlp": 1.01910961, + "epoch": 0.7513001653389448, + "flos": 20447302014720.0, + "grad_norm": 2.109059836166139, + "language_loss": 0.71044612, + "learning_rate": 5.800541625072104e-07, + "loss": 0.73129225, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3828125, + "step": 12496, + "time_per_iteration": 2.3938565254211426 + }, + { + "auxiliary_loss_clip": 0.0105749, + "auxiliary_loss_mlp": 0.01021692, + "balance_loss_clip": 1.01025355, + "balance_loss_mlp": 1.0191406, + "epoch": 0.7513602885916129, + "flos": 23475703190400.0, + "grad_norm": 1.5393067270983758, + "language_loss": 0.78026038, + "learning_rate": 5.797881144139829e-07, + "loss": 0.80105215, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 12497, + "time_per_iteration": 2.430467128753662 + }, + { + "auxiliary_loss_clip": 0.01058004, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.01618934, + "balance_loss_mlp": 1.01870668, + "epoch": 0.7514204118442808, + "flos": 26796013176960.0, + "grad_norm": 1.5264340188228118, + "language_loss": 0.77003694, + "learning_rate": 5.795221170044648e-07, + "loss": 0.79090178, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.39257812, + "step": 12498, + "time_per_iteration": 3.923457384109497 + }, + { + "auxiliary_loss_clip": 0.01055384, + "auxiliary_loss_mlp": 0.01022978, + "balance_loss_clip": 1.0120523, + "balance_loss_mlp": 1.01928532, + "epoch": 0.7514805350969488, + "flos": 19864601556480.0, + "grad_norm": 1.7927354698847753, + "language_loss": 0.74327886, + "learning_rate": 5.792561702881493e-07, + "loss": 0.76406252, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36132812, + "step": 12499, + "time_per_iteration": 2.4322073459625244 + }, + { + "auxiliary_loss_clip": 0.01053961, + "auxiliary_loss_mlp": 0.01019115, + "balance_loss_clip": 1.00854731, + "balance_loss_mlp": 1.01700234, + "epoch": 0.7515406583496167, + "flos": 24570404870400.0, + "grad_norm": 1.9553588556083004, + "language_loss": 0.7938534, + "learning_rate": 5.789902742745251e-07, + "loss": 0.8145842, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36914062, + "step": 12500, + "time_per_iteration": 2.430222272872925 + }, + { + "auxiliary_loss_clip": 0.01062043, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.01339054, + "balance_loss_mlp": 1.02005553, + "epoch": 0.7516007816022847, + "flos": 20119223168640.0, + "grad_norm": 1.8570690813832453, + "language_loss": 0.73717159, + "learning_rate": 5.787244289730835e-07, + "loss": 0.75806952, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.41992188, + "step": 12501, + "time_per_iteration": 2.4050345420837402 + }, + { + "auxiliary_loss_clip": 0.01055667, + "auxiliary_loss_mlp": 0.01016692, + "balance_loss_clip": 1.00546837, + "balance_loss_mlp": 1.01821446, + "epoch": 0.7516609048549526, + "flos": 22783515108480.0, + "grad_norm": 1.6236841091774012, + "language_loss": 0.78407258, + "learning_rate": 5.784586343933111e-07, + "loss": 0.80479616, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.375, + "step": 12502, + "time_per_iteration": 2.4594266414642334 + }, + { + "auxiliary_loss_clip": 0.0105787, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.01578975, + "balance_loss_mlp": 1.01892531, + "epoch": 0.7517210281076206, + "flos": 10633251611520.0, + "grad_norm": 2.4302466181475046, + "language_loss": 0.70732009, + "learning_rate": 5.781928905446933e-07, + "loss": 0.72818327, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 12503, + "time_per_iteration": 2.3747963905334473 + }, + { + "auxiliary_loss_clip": 0.01006988, + "auxiliary_loss_mlp": 0.01001391, + "balance_loss_clip": 1.00052047, + "balance_loss_mlp": 1.00048268, + "epoch": 0.7517811513602886, + "flos": 66049001078400.0, + "grad_norm": 0.9667953277360066, + "language_loss": 0.63969803, + "learning_rate": 5.779271974367132e-07, + "loss": 0.65978181, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.00872803, + "router_z_loss_mlp": 0.06494141, + "step": 12504, + "time_per_iteration": 2.9742655754089355 + }, + { + "auxiliary_loss_clip": 0.01053411, + "auxiliary_loss_mlp": 0.01023212, + "balance_loss_clip": 1.01233983, + "balance_loss_mlp": 1.01734293, + "epoch": 0.7518412746129566, + "flos": 37266849676800.0, + "grad_norm": 1.4790480946729387, + "language_loss": 0.68623775, + "learning_rate": 5.776615550788548e-07, + "loss": 0.70700395, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.359375, + "step": 12505, + "time_per_iteration": 2.572373628616333 + }, + { + "auxiliary_loss_clip": 0.01006679, + "auxiliary_loss_mlp": 0.01001074, + "balance_loss_clip": 1.00013793, + "balance_loss_mlp": 1.00030994, + "epoch": 0.7519013978656245, + "flos": 60515162115840.0, + "grad_norm": 0.6420994830555833, + "language_loss": 0.55036688, + "learning_rate": 5.773959634805956e-07, + "loss": 0.57044435, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.06347656, + "step": 12506, + "time_per_iteration": 3.1733505725860596 + }, + { + "auxiliary_loss_clip": 0.01059893, + "auxiliary_loss_mlp": 0.01024527, + "balance_loss_clip": 1.01261806, + "balance_loss_mlp": 1.02031434, + "epoch": 0.7519615211182925, + "flos": 18805895354880.0, + "grad_norm": 2.6769162333539733, + "language_loss": 0.78074539, + "learning_rate": 5.771304226514155e-07, + "loss": 0.80158955, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 12507, + "time_per_iteration": 3.95412278175354 + }, + { + "auxiliary_loss_clip": 0.01056299, + "auxiliary_loss_mlp": 0.01023574, + "balance_loss_clip": 1.01268411, + "balance_loss_mlp": 1.01973867, + "epoch": 0.7520216443709604, + "flos": 14574352216320.0, + "grad_norm": 1.624114200693809, + "language_loss": 0.71397471, + "learning_rate": 5.768649326007902e-07, + "loss": 0.7347734, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.3671875, + "step": 12508, + "time_per_iteration": 3.834461212158203 + }, + { + "auxiliary_loss_clip": 0.01054282, + "auxiliary_loss_mlp": 0.01020528, + "balance_loss_clip": 1.00966239, + "balance_loss_mlp": 1.01766157, + "epoch": 0.7520817676236284, + "flos": 17055629475840.0, + "grad_norm": 1.6707879691842127, + "language_loss": 0.71880114, + "learning_rate": 5.765994933381957e-07, + "loss": 0.73954922, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36523438, + "step": 12509, + "time_per_iteration": 2.3688201904296875 + }, + { + "auxiliary_loss_clip": 0.010589, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02078152, + "balance_loss_mlp": 1.0187149, + "epoch": 0.7521418908762965, + "flos": 25665211284480.0, + "grad_norm": 1.6355516002176202, + "language_loss": 0.62082398, + "learning_rate": 5.763341048731028e-07, + "loss": 0.64175361, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40234375, + "step": 12510, + "time_per_iteration": 2.4838812351226807 + }, + { + "auxiliary_loss_clip": 0.01053383, + "auxiliary_loss_mlp": 0.01024198, + "balance_loss_clip": 1.01236069, + "balance_loss_mlp": 1.01758194, + "epoch": 0.7522020141289644, + "flos": 20885706357120.0, + "grad_norm": 1.7649445581602556, + "language_loss": 0.81535184, + "learning_rate": 5.760687672149842e-07, + "loss": 0.83612764, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.35742188, + "step": 12511, + "time_per_iteration": 2.4053075313568115 + }, + { + "auxiliary_loss_clip": 0.01059193, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.01614904, + "balance_loss_mlp": 1.01871014, + "epoch": 0.7522621373816324, + "flos": 12639500645760.0, + "grad_norm": 1.734557964195035, + "language_loss": 0.76022029, + "learning_rate": 5.758034803733085e-07, + "loss": 0.78109753, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40429688, + "step": 12512, + "time_per_iteration": 2.3960232734680176 + }, + { + "auxiliary_loss_clip": 0.01052128, + "auxiliary_loss_mlp": 0.01020318, + "balance_loss_clip": 1.01016784, + "balance_loss_mlp": 1.01706243, + "epoch": 0.7523222606343003, + "flos": 25625061354240.0, + "grad_norm": 1.590548244436563, + "language_loss": 0.82114136, + "learning_rate": 5.755382443575429e-07, + "loss": 0.84186578, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.3515625, + "step": 12513, + "time_per_iteration": 2.4352457523345947 + }, + { + "auxiliary_loss_clip": 0.01058627, + "auxiliary_loss_mlp": 0.01023257, + "balance_loss_clip": 1.01132369, + "balance_loss_mlp": 1.01898623, + "epoch": 0.7523823838869683, + "flos": 20447860596480.0, + "grad_norm": 2.301595479175504, + "language_loss": 0.74665534, + "learning_rate": 5.752730591771535e-07, + "loss": 0.76747417, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39648438, + "step": 12514, + "time_per_iteration": 2.419009208679199 + }, + { + "auxiliary_loss_clip": 0.01054381, + "auxiliary_loss_mlp": 0.01022624, + "balance_loss_clip": 1.01215148, + "balance_loss_mlp": 1.01898921, + "epoch": 0.7524425071396362, + "flos": 14719730584320.0, + "grad_norm": 2.3381879209779277, + "language_loss": 0.69090831, + "learning_rate": 5.750079248416031e-07, + "loss": 0.71167833, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.35351562, + "step": 12515, + "time_per_iteration": 2.366093397140503 + }, + { + "auxiliary_loss_clip": 0.01057215, + "auxiliary_loss_mlp": 0.0102501, + "balance_loss_clip": 1.01369381, + "balance_loss_mlp": 1.01934826, + "epoch": 0.7525026303923043, + "flos": 30590722984320.0, + "grad_norm": 5.833890387336426, + "language_loss": 0.70238167, + "learning_rate": 5.747428413603554e-07, + "loss": 0.7232039, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 12516, + "time_per_iteration": 3.881223678588867 + }, + { + "auxiliary_loss_clip": 0.01055122, + "auxiliary_loss_mlp": 0.01023326, + "balance_loss_clip": 1.01291895, + "balance_loss_mlp": 1.01937103, + "epoch": 0.7525627536449722, + "flos": 24790567104000.0, + "grad_norm": 1.6401981816518376, + "language_loss": 0.89386976, + "learning_rate": 5.744778087428686e-07, + "loss": 0.9146542, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.35742188, + "step": 12517, + "time_per_iteration": 2.420232057571411 + }, + { + "auxiliary_loss_clip": 0.01057786, + "auxiliary_loss_mlp": 0.01028567, + "balance_loss_clip": 1.01695609, + "balance_loss_mlp": 1.01884627, + "epoch": 0.7526228768976402, + "flos": 20778662528640.0, + "grad_norm": 1.690279268017393, + "language_loss": 0.74521482, + "learning_rate": 5.742128269986022e-07, + "loss": 0.76607835, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 12518, + "time_per_iteration": 2.406212568283081 + }, + { + "auxiliary_loss_clip": 0.01006833, + "auxiliary_loss_mlp": 0.01000734, + "balance_loss_clip": 0.99991155, + "balance_loss_mlp": 1.00049901, + "epoch": 0.7526830001503081, + "flos": 66556114709760.0, + "grad_norm": 0.7046537304474692, + "language_loss": 0.55747223, + "learning_rate": 5.739478961370126e-07, + "loss": 0.57754791, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.00823975, + "router_z_loss_mlp": 0.06347656, + "step": 12519, + "time_per_iteration": 3.0884997844696045 + }, + { + "auxiliary_loss_clip": 0.0105491, + "auxiliary_loss_mlp": 0.01023177, + "balance_loss_clip": 1.0125494, + "balance_loss_mlp": 1.01796675, + "epoch": 0.7527431234029761, + "flos": 23476750531200.0, + "grad_norm": 1.4682062301936096, + "language_loss": 0.7783829, + "learning_rate": 5.736830161675544e-07, + "loss": 0.7991637, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37109375, + "step": 12520, + "time_per_iteration": 2.4851319789886475 + }, + { + "auxiliary_loss_clip": 0.01055937, + "auxiliary_loss_mlp": 0.01020855, + "balance_loss_clip": 1.00980401, + "balance_loss_mlp": 1.01849747, + "epoch": 0.752803246655644, + "flos": 22048593655680.0, + "grad_norm": 2.277460846629054, + "language_loss": 0.73908108, + "learning_rate": 5.734181870996797e-07, + "loss": 0.75984901, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.375, + "step": 12521, + "time_per_iteration": 2.395730495452881 + }, + { + "auxiliary_loss_clip": 0.01059636, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01824808, + "balance_loss_mlp": 1.01916003, + "epoch": 0.752863369908312, + "flos": 30152493198720.0, + "grad_norm": 2.135782666804369, + "language_loss": 0.66934264, + "learning_rate": 5.731534089428413e-07, + "loss": 0.69024622, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40429688, + "step": 12522, + "time_per_iteration": 2.4526360034942627 + }, + { + "auxiliary_loss_clip": 0.01059483, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.01852942, + "balance_loss_mlp": 1.0200398, + "epoch": 0.7529234931609801, + "flos": 24566599532160.0, + "grad_norm": 1.536296703001226, + "language_loss": 0.78358138, + "learning_rate": 5.728886817064866e-07, + "loss": 0.804479, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 12523, + "time_per_iteration": 2.4470043182373047 + }, + { + "auxiliary_loss_clip": 0.01055426, + "auxiliary_loss_mlp": 0.01024331, + "balance_loss_clip": 1.01319695, + "balance_loss_mlp": 1.01825404, + "epoch": 0.752983616413648, + "flos": 23111279752320.0, + "grad_norm": 2.0537598010557883, + "language_loss": 0.80631113, + "learning_rate": 5.726240054000644e-07, + "loss": 0.82710874, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37109375, + "step": 12524, + "time_per_iteration": 2.4456875324249268 + }, + { + "auxiliary_loss_clip": 0.01054437, + "auxiliary_loss_mlp": 0.01021919, + "balance_loss_clip": 1.01126194, + "balance_loss_mlp": 1.01845646, + "epoch": 0.753043739666316, + "flos": 24315783258240.0, + "grad_norm": 1.9664379410068713, + "language_loss": 0.74044734, + "learning_rate": 5.723593800330191e-07, + "loss": 0.76121092, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.359375, + "step": 12525, + "time_per_iteration": 2.427631378173828 + }, + { + "auxiliary_loss_clip": 0.01057087, + "auxiliary_loss_mlp": 0.01020405, + "balance_loss_clip": 1.00950384, + "balance_loss_mlp": 1.01915693, + "epoch": 0.7531038629189839, + "flos": 24242151467520.0, + "grad_norm": 1.8582170584043594, + "language_loss": 0.71972263, + "learning_rate": 5.720948056147965e-07, + "loss": 0.74049759, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37890625, + "step": 12526, + "time_per_iteration": 2.4316165447235107 + }, + { + "auxiliary_loss_clip": 0.01055295, + "auxiliary_loss_mlp": 0.0102251, + "balance_loss_clip": 1.01130438, + "balance_loss_mlp": 1.01826108, + "epoch": 0.7531639861716519, + "flos": 30187546070400.0, + "grad_norm": 1.6616484403624512, + "language_loss": 0.72068113, + "learning_rate": 5.71830282154836e-07, + "loss": 0.74145919, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37109375, + "step": 12527, + "time_per_iteration": 2.470968008041382 + }, + { + "auxiliary_loss_clip": 0.01054879, + "auxiliary_loss_mlp": 0.01019757, + "balance_loss_clip": 1.00944543, + "balance_loss_mlp": 1.01860285, + "epoch": 0.7532241094243198, + "flos": 18222217378560.0, + "grad_norm": 2.3027486881328243, + "language_loss": 0.81055927, + "learning_rate": 5.715658096625797e-07, + "loss": 0.83130562, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.36328125, + "step": 12528, + "time_per_iteration": 2.3823180198669434 + }, + { + "auxiliary_loss_clip": 0.01056189, + "auxiliary_loss_mlp": 0.01025472, + "balance_loss_clip": 1.01279438, + "balance_loss_mlp": 1.01799774, + "epoch": 0.7532842326769879, + "flos": 20880155450880.0, + "grad_norm": 1.8543028414242595, + "language_loss": 0.85446739, + "learning_rate": 5.71301388147466e-07, + "loss": 0.87528396, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3828125, + "step": 12529, + "time_per_iteration": 2.396714925765991 + }, + { + "auxiliary_loss_clip": 0.01059028, + "auxiliary_loss_mlp": 0.01025406, + "balance_loss_clip": 1.0135628, + "balance_loss_mlp": 1.01991177, + "epoch": 0.7533443559296558, + "flos": 18077676883200.0, + "grad_norm": 1.7113605647687107, + "language_loss": 0.73446757, + "learning_rate": 5.710370176189292e-07, + "loss": 0.75531185, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39257812, + "step": 12530, + "time_per_iteration": 2.4284374713897705 + }, + { + "auxiliary_loss_clip": 0.01006844, + "auxiliary_loss_mlp": 0.0100052, + "balance_loss_clip": 0.99957836, + "balance_loss_mlp": 1.00039089, + "epoch": 0.7534044791823238, + "flos": 50252024494080.0, + "grad_norm": 0.826137482577428, + "language_loss": 0.6362738, + "learning_rate": 5.707726980864062e-07, + "loss": 0.65634739, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.06445312, + "step": 12531, + "time_per_iteration": 2.9051976203918457 + }, + { + "auxiliary_loss_clip": 0.01058858, + "auxiliary_loss_mlp": 0.01022677, + "balance_loss_clip": 1.01041019, + "balance_loss_mlp": 1.0184257, + "epoch": 0.7534646024349917, + "flos": 20849222119680.0, + "grad_norm": 2.0322894257977806, + "language_loss": 0.69874144, + "learning_rate": 5.705084295593287e-07, + "loss": 0.71955681, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40429688, + "step": 12532, + "time_per_iteration": 2.388017177581787 + }, + { + "auxiliary_loss_clip": 0.01053567, + "auxiliary_loss_mlp": 0.01022644, + "balance_loss_clip": 1.01268983, + "balance_loss_mlp": 1.01855421, + "epoch": 0.7535247256876597, + "flos": 23070780708480.0, + "grad_norm": 1.685172547420883, + "language_loss": 0.77354115, + "learning_rate": 5.702442120471296e-07, + "loss": 0.7943033, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.34960938, + "step": 12533, + "time_per_iteration": 2.4353647232055664 + }, + { + "auxiliary_loss_clip": 0.01006717, + "auxiliary_loss_mlp": 0.01002485, + "balance_loss_clip": 1.00165617, + "balance_loss_mlp": 1.00043631, + "epoch": 0.7535848489403276, + "flos": 58620006627840.0, + "grad_norm": 0.7816356604457855, + "language_loss": 0.57347071, + "learning_rate": 5.699800455592354e-07, + "loss": 0.59356272, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.00830078, + "router_z_loss_mlp": 0.0625, + "step": 12534, + "time_per_iteration": 3.0990865230560303 + }, + { + "auxiliary_loss_clip": 0.01054588, + "auxiliary_loss_mlp": 0.01023032, + "balance_loss_clip": 1.01167095, + "balance_loss_mlp": 1.0176003, + "epoch": 0.7536449721929956, + "flos": 26576688816000.0, + "grad_norm": 2.161822243146877, + "language_loss": 0.71336067, + "learning_rate": 5.697159301050756e-07, + "loss": 0.73413688, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.36914062, + "step": 12535, + "time_per_iteration": 2.443918228149414 + }, + { + "auxiliary_loss_clip": 0.01056991, + "auxiliary_loss_mlp": 0.01023856, + "balance_loss_clip": 1.01198888, + "balance_loss_mlp": 1.01764691, + "epoch": 0.7537050954456637, + "flos": 25734898091520.0, + "grad_norm": 1.8566589207396418, + "language_loss": 0.80541414, + "learning_rate": 5.69451865694075e-07, + "loss": 0.8262226, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.39453125, + "step": 12536, + "time_per_iteration": 2.465869188308716 + }, + { + "auxiliary_loss_clip": 0.01051692, + "auxiliary_loss_mlp": 0.01018619, + "balance_loss_clip": 1.00835538, + "balance_loss_mlp": 1.01684284, + "epoch": 0.7537652186983316, + "flos": 30223192435200.0, + "grad_norm": 1.8749481079911268, + "language_loss": 0.74463344, + "learning_rate": 5.691878523356574e-07, + "loss": 0.76533651, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.34765625, + "step": 12537, + "time_per_iteration": 3.8899481296539307 + }, + { + "auxiliary_loss_clip": 0.01056188, + "auxiliary_loss_mlp": 0.01024553, + "balance_loss_clip": 1.0132699, + "balance_loss_mlp": 1.01829505, + "epoch": 0.7538253419509996, + "flos": 12640408341120.0, + "grad_norm": 1.7126890974525626, + "language_loss": 0.74271101, + "learning_rate": 5.689238900392445e-07, + "loss": 0.76351845, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 12538, + "time_per_iteration": 2.379102945327759 + }, + { + "auxiliary_loss_clip": 0.01055898, + "auxiliary_loss_mlp": 0.01024214, + "balance_loss_clip": 1.01226354, + "balance_loss_mlp": 1.01810789, + "epoch": 0.7538854652036675, + "flos": 23184841720320.0, + "grad_norm": 1.5274179561714152, + "language_loss": 0.67558956, + "learning_rate": 5.686599788142581e-07, + "loss": 0.69639069, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37890625, + "step": 12539, + "time_per_iteration": 2.4532840251922607 + }, + { + "auxiliary_loss_clip": 0.01055082, + "auxiliary_loss_mlp": 0.01020514, + "balance_loss_clip": 1.00904059, + "balance_loss_mlp": 1.01818657, + "epoch": 0.7539455884563355, + "flos": 23185086099840.0, + "grad_norm": 1.565285751466271, + "language_loss": 0.74405038, + "learning_rate": 5.683961186701138e-07, + "loss": 0.76480627, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3671875, + "step": 12540, + "time_per_iteration": 2.401994466781616 + }, + { + "auxiliary_loss_clip": 0.0105684, + "auxiliary_loss_mlp": 0.01023645, + "balance_loss_clip": 1.01198649, + "balance_loss_mlp": 1.01775527, + "epoch": 0.7540057117090034, + "flos": 13180515073920.0, + "grad_norm": 2.386558801999635, + "language_loss": 0.72851861, + "learning_rate": 5.6813230961623e-07, + "loss": 0.74932349, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.390625, + "step": 12541, + "time_per_iteration": 2.3462862968444824 + }, + { + "auxiliary_loss_clip": 0.01055707, + "auxiliary_loss_mlp": 0.01023657, + "balance_loss_clip": 1.0120461, + "balance_loss_mlp": 1.01855743, + "epoch": 0.7540658349616715, + "flos": 45476396593920.0, + "grad_norm": 1.5538035955554315, + "language_loss": 0.72210294, + "learning_rate": 5.678685516620206e-07, + "loss": 0.74289656, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37109375, + "step": 12542, + "time_per_iteration": 2.6061675548553467 + }, + { + "auxiliary_loss_clip": 0.01055486, + "auxiliary_loss_mlp": 0.01020722, + "balance_loss_clip": 1.00909901, + "balance_loss_mlp": 1.01757169, + "epoch": 0.7541259582143394, + "flos": 19929994266240.0, + "grad_norm": 2.120443072126911, + "language_loss": 0.84838325, + "learning_rate": 5.676048448168995e-07, + "loss": 0.86914527, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37890625, + "step": 12543, + "time_per_iteration": 2.4463610649108887 + }, + { + "auxiliary_loss_clip": 0.01059686, + "auxiliary_loss_mlp": 0.01025252, + "balance_loss_clip": 1.01371312, + "balance_loss_mlp": 1.02235651, + "epoch": 0.7541860814670074, + "flos": 27197025586560.0, + "grad_norm": 2.57014226312519, + "language_loss": 0.73601156, + "learning_rate": 5.673411890902766e-07, + "loss": 0.75686097, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37304688, + "step": 12544, + "time_per_iteration": 2.46722412109375 + }, + { + "auxiliary_loss_clip": 0.01056568, + "auxiliary_loss_mlp": 0.01024932, + "balance_loss_clip": 1.01374996, + "balance_loss_mlp": 1.01868677, + "epoch": 0.7542462047196753, + "flos": 21323098270080.0, + "grad_norm": 1.7909196482037304, + "language_loss": 0.68313086, + "learning_rate": 5.670775844915607e-07, + "loss": 0.70394588, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37890625, + "step": 12545, + "time_per_iteration": 2.4401121139526367 + }, + { + "auxiliary_loss_clip": 0.01054679, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.01673627, + "balance_loss_mlp": 1.01755166, + "epoch": 0.7543063279723433, + "flos": 11940330291840.0, + "grad_norm": 1.7854962793295646, + "language_loss": 0.69106698, + "learning_rate": 5.668140310301612e-07, + "loss": 0.71189392, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37109375, + "step": 12546, + "time_per_iteration": 2.3577346801757812 + }, + { + "auxiliary_loss_clip": 0.01055811, + "auxiliary_loss_mlp": 0.01026094, + "balance_loss_clip": 1.01447093, + "balance_loss_mlp": 1.01808381, + "epoch": 0.7543664512250112, + "flos": 22818882182400.0, + "grad_norm": 2.689522794898665, + "language_loss": 0.73136294, + "learning_rate": 5.665505287154812e-07, + "loss": 0.75218201, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37890625, + "step": 12547, + "time_per_iteration": 5.32967734336853 + }, + { + "auxiliary_loss_clip": 0.01056914, + "auxiliary_loss_mlp": 0.01022721, + "balance_loss_clip": 1.01074052, + "balance_loss_mlp": 1.0194056, + "epoch": 0.7544265744776792, + "flos": 20922784087680.0, + "grad_norm": 1.869369113869487, + "language_loss": 0.67918992, + "learning_rate": 5.662870775569262e-07, + "loss": 0.69998628, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.375, + "step": 12548, + "time_per_iteration": 2.418229579925537 + }, + { + "auxiliary_loss_clip": 0.01057426, + "auxiliary_loss_mlp": 0.01023383, + "balance_loss_clip": 1.01168251, + "balance_loss_mlp": 1.01865208, + "epoch": 0.7544866977303473, + "flos": 15194584252800.0, + "grad_norm": 1.7092958447974993, + "language_loss": 0.86778599, + "learning_rate": 5.660236775638971e-07, + "loss": 0.88859409, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38867188, + "step": 12549, + "time_per_iteration": 2.4147989749908447 + }, + { + "auxiliary_loss_clip": 0.01053856, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01684237, + "balance_loss_mlp": 1.01783979, + "epoch": 0.7545468209830152, + "flos": 27082615461120.0, + "grad_norm": 1.865528526792351, + "language_loss": 0.77149671, + "learning_rate": 5.657603287457946e-07, + "loss": 0.79231179, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.36132812, + "step": 12550, + "time_per_iteration": 2.443317413330078 + }, + { + "auxiliary_loss_clip": 0.01056185, + "auxiliary_loss_mlp": 0.01018595, + "balance_loss_clip": 1.0078007, + "balance_loss_mlp": 1.01827455, + "epoch": 0.7546069442356832, + "flos": 26870447928960.0, + "grad_norm": 1.4098363118717643, + "language_loss": 0.69461346, + "learning_rate": 5.654970311120159e-07, + "loss": 0.71536124, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37890625, + "step": 12551, + "time_per_iteration": 2.4715023040771484 + }, + { + "auxiliary_loss_clip": 0.01059076, + "auxiliary_loss_mlp": 0.01022859, + "balance_loss_clip": 1.01096797, + "balance_loss_mlp": 1.02082467, + "epoch": 0.7546670674883511, + "flos": 15742196928000.0, + "grad_norm": 2.9170007439980137, + "language_loss": 0.86151874, + "learning_rate": 5.65233784671959e-07, + "loss": 0.88233805, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 12552, + "time_per_iteration": 2.3798000812530518 + }, + { + "auxiliary_loss_clip": 0.01055999, + "auxiliary_loss_mlp": 0.01019693, + "balance_loss_clip": 1.00879765, + "balance_loss_mlp": 1.01810336, + "epoch": 0.7547271907410191, + "flos": 23476575974400.0, + "grad_norm": 2.115963555987892, + "language_loss": 0.77925766, + "learning_rate": 5.649705894350176e-07, + "loss": 0.80001462, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37890625, + "step": 12553, + "time_per_iteration": 2.4178318977355957 + }, + { + "auxiliary_loss_clip": 0.01056851, + "auxiliary_loss_mlp": 0.01023477, + "balance_loss_clip": 1.01183033, + "balance_loss_mlp": 1.01859951, + "epoch": 0.754787313993687, + "flos": 31721455054080.0, + "grad_norm": 4.305070399263682, + "language_loss": 0.63099194, + "learning_rate": 5.647074454105845e-07, + "loss": 0.65179515, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.3828125, + "step": 12554, + "time_per_iteration": 2.4958996772766113 + }, + { + "auxiliary_loss_clip": 0.01056921, + "auxiliary_loss_mlp": 0.01027226, + "balance_loss_clip": 1.01595521, + "balance_loss_mlp": 1.01847339, + "epoch": 0.7548474372463551, + "flos": 27561833049600.0, + "grad_norm": 1.6709415147149482, + "language_loss": 0.73094195, + "learning_rate": 5.6444435260805e-07, + "loss": 0.75178337, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38476562, + "step": 12555, + "time_per_iteration": 3.855452299118042 + }, + { + "auxiliary_loss_clip": 0.01057464, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.01585817, + "balance_loss_mlp": 1.01912475, + "epoch": 0.754907560499023, + "flos": 19317547463040.0, + "grad_norm": 1.7228301873613772, + "language_loss": 0.78487802, + "learning_rate": 5.64181311036805e-07, + "loss": 0.80572963, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 12556, + "time_per_iteration": 2.411290168762207 + }, + { + "auxiliary_loss_clip": 0.01006893, + "auxiliary_loss_mlp": 0.01000937, + "balance_loss_clip": 1.00003147, + "balance_loss_mlp": 1.00057769, + "epoch": 0.754967683751691, + "flos": 69741100800000.0, + "grad_norm": 0.7073797884135813, + "language_loss": 0.50448596, + "learning_rate": 5.639183207062346e-07, + "loss": 0.52456427, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06298828, + "step": 12557, + "time_per_iteration": 3.0201492309570312 + }, + { + "auxiliary_loss_clip": 0.0105826, + "auxiliary_loss_mlp": 0.01021491, + "balance_loss_clip": 1.00957024, + "balance_loss_mlp": 1.01950979, + "epoch": 0.7550278070043589, + "flos": 24420872050560.0, + "grad_norm": 1.6102612408885888, + "language_loss": 0.71484733, + "learning_rate": 5.636553816257257e-07, + "loss": 0.73564482, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38671875, + "step": 12558, + "time_per_iteration": 2.4496095180511475 + }, + { + "auxiliary_loss_clip": 0.010568, + "auxiliary_loss_mlp": 0.01022452, + "balance_loss_clip": 1.01145542, + "balance_loss_mlp": 1.01903379, + "epoch": 0.7550879302570269, + "flos": 32633246787840.0, + "grad_norm": 1.6680693390158643, + "language_loss": 0.80304772, + "learning_rate": 5.633924938046617e-07, + "loss": 0.82384026, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37695312, + "step": 12559, + "time_per_iteration": 2.4800424575805664 + }, + { + "auxiliary_loss_clip": 0.01057315, + "auxiliary_loss_mlp": 0.01019199, + "balance_loss_clip": 1.00756383, + "balance_loss_mlp": 1.01900768, + "epoch": 0.7551480535096948, + "flos": 21794565536640.0, + "grad_norm": 1.8007143409319855, + "language_loss": 0.7877112, + "learning_rate": 5.631296572524242e-07, + "loss": 0.80847633, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 12560, + "time_per_iteration": 2.4479188919067383 + }, + { + "auxiliary_loss_clip": 0.01054396, + "auxiliary_loss_mlp": 0.01024873, + "balance_loss_clip": 1.01384056, + "balance_loss_mlp": 1.0184381, + "epoch": 0.7552081767623628, + "flos": 18514126189440.0, + "grad_norm": 1.6146177126269583, + "language_loss": 0.70951396, + "learning_rate": 5.628668719783931e-07, + "loss": 0.73030668, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.359375, + "step": 12561, + "time_per_iteration": 2.3925490379333496 + }, + { + "auxiliary_loss_clip": 0.0105578, + "auxiliary_loss_mlp": 0.01022444, + "balance_loss_clip": 1.01057673, + "balance_loss_mlp": 1.01801419, + "epoch": 0.7552683000150308, + "flos": 27633614538240.0, + "grad_norm": 3.107749152443765, + "language_loss": 0.7542237, + "learning_rate": 5.62604137991946e-07, + "loss": 0.77500594, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.37890625, + "step": 12562, + "time_per_iteration": 2.4701926708221436 + }, + { + "auxiliary_loss_clip": 0.01055908, + "auxiliary_loss_mlp": 0.01023906, + "balance_loss_clip": 1.01155603, + "balance_loss_mlp": 1.01776683, + "epoch": 0.7553284232676988, + "flos": 20301888735360.0, + "grad_norm": 1.753866329979862, + "language_loss": 0.75920618, + "learning_rate": 5.62341455302461e-07, + "loss": 0.78000426, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3828125, + "step": 12563, + "time_per_iteration": 2.416424512863159 + }, + { + "auxiliary_loss_clip": 0.01061223, + "auxiliary_loss_mlp": 0.01025999, + "balance_loss_clip": 1.01237392, + "balance_loss_mlp": 1.01870513, + "epoch": 0.7553885465203668, + "flos": 33254072317440.0, + "grad_norm": 2.2195368105596085, + "language_loss": 0.6734165, + "learning_rate": 5.620788239193102e-07, + "loss": 0.69428867, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.42578125, + "step": 12564, + "time_per_iteration": 2.4963326454162598 + }, + { + "auxiliary_loss_clip": 0.01059067, + "auxiliary_loss_mlp": 0.0102649, + "balance_loss_clip": 1.01375806, + "balance_loss_mlp": 1.018942, + "epoch": 0.7554486697730347, + "flos": 21615181637760.0, + "grad_norm": 1.690589601027234, + "language_loss": 0.68778992, + "learning_rate": 5.618162438518678e-07, + "loss": 0.70864546, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40039062, + "step": 12565, + "time_per_iteration": 2.4365506172180176 + }, + { + "auxiliary_loss_clip": 0.01059055, + "auxiliary_loss_mlp": 0.01026975, + "balance_loss_clip": 1.01444042, + "balance_loss_mlp": 1.01855314, + "epoch": 0.7555087930257027, + "flos": 27631834058880.0, + "grad_norm": 1.6034827879542217, + "language_loss": 0.75231647, + "learning_rate": 5.615537151095044e-07, + "loss": 0.77317685, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40429688, + "step": 12566, + "time_per_iteration": 2.5045173168182373 + }, + { + "auxiliary_loss_clip": 0.0105871, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.01602113, + "balance_loss_mlp": 1.01914787, + "epoch": 0.7555689162783706, + "flos": 23620557888000.0, + "grad_norm": 2.885688533280564, + "language_loss": 0.74294561, + "learning_rate": 5.612912377015886e-07, + "loss": 0.76381761, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39453125, + "step": 12567, + "time_per_iteration": 2.405266523361206 + }, + { + "auxiliary_loss_clip": 0.01056187, + "auxiliary_loss_mlp": 0.0102066, + "balance_loss_clip": 1.0095613, + "balance_loss_mlp": 1.01841331, + "epoch": 0.7556290395310387, + "flos": 24861929656320.0, + "grad_norm": 2.0336500419178583, + "language_loss": 0.62736326, + "learning_rate": 5.610288116374873e-07, + "loss": 0.64813173, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 12568, + "time_per_iteration": 2.411311626434326 + }, + { + "auxiliary_loss_clip": 0.01055574, + "auxiliary_loss_mlp": 0.01025877, + "balance_loss_clip": 1.01458776, + "balance_loss_mlp": 1.01843548, + "epoch": 0.7556891627837066, + "flos": 43542103605120.0, + "grad_norm": 3.014258591118845, + "language_loss": 0.6681124, + "learning_rate": 5.607664369265668e-07, + "loss": 0.68892688, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37109375, + "step": 12569, + "time_per_iteration": 2.574190616607666 + }, + { + "auxiliary_loss_clip": 0.01059657, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.01303315, + "balance_loss_mlp": 1.01890898, + "epoch": 0.7557492860363746, + "flos": 26649727113600.0, + "grad_norm": 3.1071744995744703, + "language_loss": 0.67502099, + "learning_rate": 5.6050411357819e-07, + "loss": 0.69588089, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40820312, + "step": 12570, + "time_per_iteration": 2.4540836811065674 + }, + { + "auxiliary_loss_clip": 0.01057517, + "auxiliary_loss_mlp": 0.01023602, + "balance_loss_clip": 1.01094198, + "balance_loss_mlp": 1.01935935, + "epoch": 0.7558094092890425, + "flos": 55180889550720.0, + "grad_norm": 1.8825479521188813, + "language_loss": 0.71630138, + "learning_rate": 5.602418416017185e-07, + "loss": 0.73711258, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.3828125, + "step": 12571, + "time_per_iteration": 2.68269681930542 + }, + { + "auxiliary_loss_clip": 0.0105871, + "auxiliary_loss_mlp": 0.01021694, + "balance_loss_clip": 1.00920129, + "balance_loss_mlp": 1.01914215, + "epoch": 0.7558695325417105, + "flos": 23987145830400.0, + "grad_norm": 1.5346124070535554, + "language_loss": 0.73399341, + "learning_rate": 5.599796210065118e-07, + "loss": 0.75479746, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39648438, + "step": 12572, + "time_per_iteration": 2.4497432708740234 + }, + { + "auxiliary_loss_clip": 0.0105975, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.01720548, + "balance_loss_mlp": 1.02083707, + "epoch": 0.7559296557943784, + "flos": 14610382606080.0, + "grad_norm": 2.0407898773846997, + "language_loss": 0.7673291, + "learning_rate": 5.597174518019292e-07, + "loss": 0.78821647, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38867188, + "step": 12573, + "time_per_iteration": 2.3592655658721924 + }, + { + "auxiliary_loss_clip": 0.01059893, + "auxiliary_loss_mlp": 0.01027168, + "balance_loss_clip": 1.01484752, + "balance_loss_mlp": 1.019315, + "epoch": 0.7559897790470465, + "flos": 18549528174720.0, + "grad_norm": 1.6117920750632249, + "language_loss": 0.69105077, + "learning_rate": 5.594553339973254e-07, + "loss": 0.71192139, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40625, + "step": 12574, + "time_per_iteration": 2.400753974914551 + }, + { + "auxiliary_loss_clip": 0.01054557, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.01467168, + "balance_loss_mlp": 1.01759315, + "epoch": 0.7560499022997144, + "flos": 17966897539200.0, + "grad_norm": 2.0390158921305948, + "language_loss": 0.7195785, + "learning_rate": 5.591932676020545e-07, + "loss": 0.7403878, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.36914062, + "step": 12575, + "time_per_iteration": 2.3745503425598145 + }, + { + "auxiliary_loss_clip": 0.0105373, + "auxiliary_loss_mlp": 0.01024299, + "balance_loss_clip": 1.01365387, + "balance_loss_mlp": 1.01714826, + "epoch": 0.7561100255523824, + "flos": 15737030046720.0, + "grad_norm": 2.3990887104693246, + "language_loss": 0.72002077, + "learning_rate": 5.589312526254705e-07, + "loss": 0.7408011, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.36523438, + "step": 12576, + "time_per_iteration": 2.381631851196289 + }, + { + "auxiliary_loss_clip": 0.01056342, + "auxiliary_loss_mlp": 0.0102316, + "balance_loss_clip": 1.01134598, + "balance_loss_mlp": 1.01899314, + "epoch": 0.7561701488050504, + "flos": 15887191271040.0, + "grad_norm": 2.3229925687520008, + "language_loss": 0.67466766, + "learning_rate": 5.586692890769231e-07, + "loss": 0.6954627, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.375, + "step": 12577, + "time_per_iteration": 3.812256336212158 + }, + { + "auxiliary_loss_clip": 0.01055958, + "auxiliary_loss_mlp": 0.01024607, + "balance_loss_clip": 1.01336539, + "balance_loss_mlp": 1.01812506, + "epoch": 0.7562302720577183, + "flos": 20338128593280.0, + "grad_norm": 2.018564407452816, + "language_loss": 0.77763736, + "learning_rate": 5.584073769657613e-07, + "loss": 0.79844302, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 12578, + "time_per_iteration": 2.412731647491455 + }, + { + "auxiliary_loss_clip": 0.01056053, + "auxiliary_loss_mlp": 0.01024198, + "balance_loss_clip": 1.01206279, + "balance_loss_mlp": 1.01738548, + "epoch": 0.7562903953103863, + "flos": 20811201782400.0, + "grad_norm": 1.3899002854443012, + "language_loss": 0.75551426, + "learning_rate": 5.581455163013314e-07, + "loss": 0.77631676, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38671875, + "step": 12579, + "time_per_iteration": 2.438898801803589 + }, + { + "auxiliary_loss_clip": 0.01060006, + "auxiliary_loss_mlp": 0.01026852, + "balance_loss_clip": 1.01393604, + "balance_loss_mlp": 1.01906013, + "epoch": 0.7563505185630542, + "flos": 37595487104640.0, + "grad_norm": 1.8947590659536366, + "language_loss": 0.62709993, + "learning_rate": 5.5788370709298e-07, + "loss": 0.64796853, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41015625, + "step": 12580, + "time_per_iteration": 2.555629253387451 + }, + { + "auxiliary_loss_clip": 0.01056637, + "auxiliary_loss_mlp": 0.0102096, + "balance_loss_clip": 1.00930119, + "balance_loss_mlp": 1.01897609, + "epoch": 0.7564106418157223, + "flos": 20229932689920.0, + "grad_norm": 1.6255649765286073, + "language_loss": 0.74787199, + "learning_rate": 5.576219493500487e-07, + "loss": 0.76864791, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37695312, + "step": 12581, + "time_per_iteration": 2.3906352519989014 + }, + { + "auxiliary_loss_clip": 0.01056083, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.01299953, + "balance_loss_mlp": 1.01759326, + "epoch": 0.7564707650683902, + "flos": 24753698841600.0, + "grad_norm": 2.02841768398679, + "language_loss": 0.62560695, + "learning_rate": 5.573602430818803e-07, + "loss": 0.64640814, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38476562, + "step": 12582, + "time_per_iteration": 2.423943042755127 + }, + { + "auxiliary_loss_clip": 0.01006592, + "auxiliary_loss_mlp": 0.01000849, + "balance_loss_clip": 0.999991, + "balance_loss_mlp": 1.00031471, + "epoch": 0.7565308883210582, + "flos": 48527594887680.0, + "grad_norm": 0.937749197957306, + "language_loss": 0.60650444, + "learning_rate": 5.570985882978139e-07, + "loss": 0.62657887, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.00860596, + "router_z_loss_mlp": 0.0625, + "step": 12583, + "time_per_iteration": 2.800554037094116 + }, + { + "auxiliary_loss_clip": 0.01057002, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.01605034, + "balance_loss_mlp": 1.0183568, + "epoch": 0.7565910115737261, + "flos": 12494261923200.0, + "grad_norm": 2.8963566913378984, + "language_loss": 0.78761518, + "learning_rate": 5.568369850071872e-07, + "loss": 0.80846095, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38671875, + "step": 12584, + "time_per_iteration": 2.4139230251312256 + }, + { + "auxiliary_loss_clip": 0.01054951, + "auxiliary_loss_mlp": 0.01020911, + "balance_loss_clip": 1.00935948, + "balance_loss_mlp": 1.01841235, + "epoch": 0.7566511348263941, + "flos": 21172099173120.0, + "grad_norm": 2.407950779560251, + "language_loss": 0.77761024, + "learning_rate": 5.565754332193357e-07, + "loss": 0.79836881, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36523438, + "step": 12585, + "time_per_iteration": 2.401625871658325 + }, + { + "auxiliary_loss_clip": 0.01059831, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.01415992, + "balance_loss_mlp": 1.02022743, + "epoch": 0.756711258079062, + "flos": 21753961758720.0, + "grad_norm": 1.895939831724597, + "language_loss": 0.82980216, + "learning_rate": 5.563139329435948e-07, + "loss": 0.85065681, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.39648438, + "step": 12586, + "time_per_iteration": 3.835698127746582 + }, + { + "auxiliary_loss_clip": 0.01058873, + "auxiliary_loss_mlp": 0.01024351, + "balance_loss_clip": 1.01274633, + "balance_loss_mlp": 1.01937699, + "epoch": 0.75677138133173, + "flos": 22381804471680.0, + "grad_norm": 2.231044543898833, + "language_loss": 0.83634055, + "learning_rate": 5.560524841892959e-07, + "loss": 0.85717285, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39453125, + "step": 12587, + "time_per_iteration": 3.795560836791992 + }, + { + "auxiliary_loss_clip": 0.01056082, + "auxiliary_loss_mlp": 0.01019266, + "balance_loss_clip": 1.00797057, + "balance_loss_mlp": 1.01876104, + "epoch": 0.756831504584398, + "flos": 22707928281600.0, + "grad_norm": 1.6580837493508347, + "language_loss": 0.71033013, + "learning_rate": 5.557910869657696e-07, + "loss": 0.73108363, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37304688, + "step": 12588, + "time_per_iteration": 2.416828155517578 + }, + { + "auxiliary_loss_clip": 0.01057008, + "auxiliary_loss_mlp": 0.01021664, + "balance_loss_clip": 1.01027942, + "balance_loss_mlp": 1.01836586, + "epoch": 0.756891627837066, + "flos": 24097192035840.0, + "grad_norm": 1.9280005545848975, + "language_loss": 0.68971944, + "learning_rate": 5.555297412823444e-07, + "loss": 0.7105062, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 12589, + "time_per_iteration": 2.418278932571411 + }, + { + "auxiliary_loss_clip": 0.01055507, + "auxiliary_loss_mlp": 0.01021622, + "balance_loss_clip": 1.00951028, + "balance_loss_mlp": 1.01678705, + "epoch": 0.756951751089734, + "flos": 19748166572160.0, + "grad_norm": 2.0353498954988654, + "language_loss": 0.9213894, + "learning_rate": 5.552684471483471e-07, + "loss": 0.94216073, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38671875, + "step": 12590, + "time_per_iteration": 2.431641101837158 + }, + { + "auxiliary_loss_clip": 0.0105577, + "auxiliary_loss_mlp": 0.01023665, + "balance_loss_clip": 1.01231074, + "balance_loss_mlp": 1.01785851, + "epoch": 0.7570118743424019, + "flos": 35077830341760.0, + "grad_norm": 1.8030587036062367, + "language_loss": 0.65551436, + "learning_rate": 5.550072045731027e-07, + "loss": 0.67630869, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37890625, + "step": 12591, + "time_per_iteration": 2.498485565185547 + }, + { + "auxiliary_loss_clip": 0.01056248, + "auxiliary_loss_mlp": 0.01019579, + "balance_loss_clip": 1.00881457, + "balance_loss_mlp": 1.01904905, + "epoch": 0.7570719975950699, + "flos": 25593325061760.0, + "grad_norm": 2.434806014285968, + "language_loss": 0.73853308, + "learning_rate": 5.547460135659336e-07, + "loss": 0.75929135, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37109375, + "step": 12592, + "time_per_iteration": 2.4625613689422607 + }, + { + "auxiliary_loss_clip": 0.01056461, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.01618922, + "balance_loss_mlp": 1.01803327, + "epoch": 0.7571321208477378, + "flos": 10815463330560.0, + "grad_norm": 3.7644544027287954, + "language_loss": 0.71351033, + "learning_rate": 5.544848741361627e-07, + "loss": 0.73434579, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.38476562, + "step": 12593, + "time_per_iteration": 2.4081029891967773 + }, + { + "auxiliary_loss_clip": 0.01057745, + "auxiliary_loss_mlp": 0.01021861, + "balance_loss_clip": 1.0101192, + "balance_loss_mlp": 1.01860452, + "epoch": 0.7571922441004059, + "flos": 18259120552320.0, + "grad_norm": 1.9479135322893648, + "language_loss": 0.66300488, + "learning_rate": 5.542237862931074e-07, + "loss": 0.68380094, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.390625, + "step": 12594, + "time_per_iteration": 2.3762240409851074 + }, + { + "auxiliary_loss_clip": 0.01057048, + "auxiliary_loss_mlp": 0.01022818, + "balance_loss_clip": 1.01115966, + "balance_loss_mlp": 1.01835871, + "epoch": 0.7572523673530738, + "flos": 22889476684800.0, + "grad_norm": 1.7178325372428223, + "language_loss": 0.80883926, + "learning_rate": 5.539627500460866e-07, + "loss": 0.82963789, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.38671875, + "step": 12595, + "time_per_iteration": 3.820347547531128 + }, + { + "auxiliary_loss_clip": 0.01055893, + "auxiliary_loss_mlp": 0.01022822, + "balance_loss_clip": 1.01180685, + "balance_loss_mlp": 1.0183742, + "epoch": 0.7573124906057418, + "flos": 20995263803520.0, + "grad_norm": 1.9689675324024472, + "language_loss": 0.71814799, + "learning_rate": 5.537017654044152e-07, + "loss": 0.73893511, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 12596, + "time_per_iteration": 2.421865463256836 + }, + { + "auxiliary_loss_clip": 0.0105754, + "auxiliary_loss_mlp": 0.01018799, + "balance_loss_clip": 1.00770688, + "balance_loss_mlp": 1.01923287, + "epoch": 0.7573726138584097, + "flos": 20885252509440.0, + "grad_norm": 1.9216782302009572, + "language_loss": 0.80570501, + "learning_rate": 5.534408323774085e-07, + "loss": 0.82646841, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.3828125, + "step": 12597, + "time_per_iteration": 2.386061906814575 + }, + { + "auxiliary_loss_clip": 0.01062754, + "auxiliary_loss_mlp": 0.01026344, + "balance_loss_clip": 1.01373804, + "balance_loss_mlp": 1.02179039, + "epoch": 0.7574327371110777, + "flos": 24529486890240.0, + "grad_norm": 1.853885754633243, + "language_loss": 0.83083665, + "learning_rate": 5.531799509743762e-07, + "loss": 0.85172766, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.41015625, + "step": 12598, + "time_per_iteration": 2.4469833374023438 + }, + { + "auxiliary_loss_clip": 0.0105625, + "auxiliary_loss_mlp": 0.01020085, + "balance_loss_clip": 1.0095644, + "balance_loss_mlp": 1.01928926, + "epoch": 0.7574928603637456, + "flos": 23363492480640.0, + "grad_norm": 1.666381625801404, + "language_loss": 0.62620759, + "learning_rate": 5.529191212046305e-07, + "loss": 0.64697087, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.36914062, + "step": 12599, + "time_per_iteration": 2.3969945907592773 + }, + { + "auxiliary_loss_clip": 0.01059472, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_clip": 1.01073492, + "balance_loss_mlp": 1.01953804, + "epoch": 0.7575529836164137, + "flos": 13515436546560.0, + "grad_norm": 2.4164912712679016, + "language_loss": 0.63347048, + "learning_rate": 5.52658343077479e-07, + "loss": 0.65430605, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40039062, + "step": 12600, + "time_per_iteration": 2.3726117610931396 + }, + { + "auxiliary_loss_clip": 0.01056731, + "auxiliary_loss_mlp": 0.01022444, + "balance_loss_clip": 1.01124477, + "balance_loss_mlp": 1.01890445, + "epoch": 0.7576131068690816, + "flos": 19645556486400.0, + "grad_norm": 1.679356333639844, + "language_loss": 0.66445583, + "learning_rate": 5.523976166022282e-07, + "loss": 0.6852476, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37890625, + "step": 12601, + "time_per_iteration": 2.3805856704711914 + }, + { + "auxiliary_loss_clip": 0.01054908, + "auxiliary_loss_mlp": 0.01022576, + "balance_loss_clip": 1.01209188, + "balance_loss_mlp": 1.01842189, + "epoch": 0.7576732301217496, + "flos": 20047197300480.0, + "grad_norm": 7.4926753115350255, + "language_loss": 0.67785513, + "learning_rate": 5.521369417881823e-07, + "loss": 0.69862998, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.36523438, + "step": 12602, + "time_per_iteration": 2.4439666271209717 + }, + { + "auxiliary_loss_clip": 0.01057148, + "auxiliary_loss_mlp": 0.01025402, + "balance_loss_clip": 1.01334357, + "balance_loss_mlp": 1.0196172, + "epoch": 0.7577333533744176, + "flos": 15376202478720.0, + "grad_norm": 4.791972896566938, + "language_loss": 0.67459673, + "learning_rate": 5.518763186446451e-07, + "loss": 0.69542217, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.375, + "step": 12603, + "time_per_iteration": 2.391078233718872 + }, + { + "auxiliary_loss_clip": 0.01053228, + "auxiliary_loss_mlp": 0.01023959, + "balance_loss_clip": 1.01353371, + "balance_loss_mlp": 1.01687169, + "epoch": 0.7577934766270855, + "flos": 17893894152960.0, + "grad_norm": 2.075855140820898, + "language_loss": 0.61719453, + "learning_rate": 5.516157471809178e-07, + "loss": 0.63796639, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.36328125, + "step": 12604, + "time_per_iteration": 2.381815195083618 + }, + { + "auxiliary_loss_clip": 0.01056613, + "auxiliary_loss_mlp": 0.0102324, + "balance_loss_clip": 1.01120567, + "balance_loss_mlp": 1.01872778, + "epoch": 0.7578535998797535, + "flos": 21612772753920.0, + "grad_norm": 1.8397341782324108, + "language_loss": 0.80135667, + "learning_rate": 5.513552274062974e-07, + "loss": 0.82215524, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.37890625, + "step": 12605, + "time_per_iteration": 2.390231132507324 + }, + { + "auxiliary_loss_clip": 0.01058514, + "auxiliary_loss_mlp": 0.010258, + "balance_loss_clip": 1.01303256, + "balance_loss_mlp": 1.01822829, + "epoch": 0.7579137231324214, + "flos": 18477397572480.0, + "grad_norm": 2.3035591325527625, + "language_loss": 0.78927588, + "learning_rate": 5.510947593300832e-07, + "loss": 0.81011903, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40234375, + "step": 12606, + "time_per_iteration": 2.4166555404663086 + }, + { + "auxiliary_loss_clip": 0.01055072, + "auxiliary_loss_mlp": 0.01022844, + "balance_loss_clip": 1.01303339, + "balance_loss_mlp": 1.01904023, + "epoch": 0.7579738463850895, + "flos": 23254004856960.0, + "grad_norm": 1.5191145036314406, + "language_loss": 0.73168111, + "learning_rate": 5.508343429615703e-07, + "loss": 0.75246024, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 0.09814453, + "router_z_loss_mlp": 0.36132812, + "step": 12607, + "time_per_iteration": 2.398589849472046 + }, + { + "auxiliary_loss_clip": 0.01057262, + "auxiliary_loss_mlp": 0.01022427, + "balance_loss_clip": 1.01061904, + "balance_loss_mlp": 1.01764655, + "epoch": 0.7580339696377574, + "flos": 14026180959360.0, + "grad_norm": 1.8086414048849535, + "language_loss": 0.75180578, + "learning_rate": 5.505739783100516e-07, + "loss": 0.77260268, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39648438, + "step": 12608, + "time_per_iteration": 2.357600688934326 + }, + { + "auxiliary_loss_clip": 0.01056515, + "auxiliary_loss_mlp": 0.01024092, + "balance_loss_clip": 1.01231408, + "balance_loss_mlp": 1.01933157, + "epoch": 0.7580940928904254, + "flos": 25081603130880.0, + "grad_norm": 2.0959664707265904, + "language_loss": 0.71363813, + "learning_rate": 5.503136653848188e-07, + "loss": 0.7344442, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37109375, + "step": 12609, + "time_per_iteration": 2.437389373779297 + }, + { + "auxiliary_loss_clip": 0.01058601, + "auxiliary_loss_mlp": 0.0102256, + "balance_loss_clip": 1.01025796, + "balance_loss_mlp": 1.01872241, + "epoch": 0.7581542161430933, + "flos": 23835448506240.0, + "grad_norm": 1.8366100208089378, + "language_loss": 0.81672317, + "learning_rate": 5.500534041951637e-07, + "loss": 0.83753479, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 12610, + "time_per_iteration": 2.415102481842041 + }, + { + "auxiliary_loss_clip": 0.01058811, + "auxiliary_loss_mlp": 0.01023845, + "balance_loss_clip": 1.01241875, + "balance_loss_mlp": 1.01927948, + "epoch": 0.7582143393957613, + "flos": 22235902433280.0, + "grad_norm": 1.7904752726853048, + "language_loss": 0.70127583, + "learning_rate": 5.497931947503713e-07, + "loss": 0.72210228, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39453125, + "step": 12611, + "time_per_iteration": 2.4098236560821533 + }, + { + "auxiliary_loss_clip": 0.01056949, + "auxiliary_loss_mlp": 0.01021014, + "balance_loss_clip": 1.00958824, + "balance_loss_mlp": 1.01910508, + "epoch": 0.7582744626484292, + "flos": 21105310008960.0, + "grad_norm": 1.8335845293028354, + "language_loss": 0.71048802, + "learning_rate": 5.495330370597302e-07, + "loss": 0.73126769, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37890625, + "step": 12612, + "time_per_iteration": 2.4258642196655273 + }, + { + "auxiliary_loss_clip": 0.01058371, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.01631165, + "balance_loss_mlp": 1.01893878, + "epoch": 0.7583345859010973, + "flos": 24603712174080.0, + "grad_norm": 1.394057083166991, + "language_loss": 0.59561217, + "learning_rate": 5.492729311325232e-07, + "loss": 0.61647874, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39453125, + "step": 12613, + "time_per_iteration": 2.4543826580047607 + }, + { + "auxiliary_loss_clip": 0.01056592, + "auxiliary_loss_mlp": 0.01027509, + "balance_loss_clip": 1.01618981, + "balance_loss_mlp": 1.01840472, + "epoch": 0.7583947091537652, + "flos": 33545422546560.0, + "grad_norm": 1.5282714956001104, + "language_loss": 0.59703708, + "learning_rate": 5.490128769780351e-07, + "loss": 0.61787808, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38085938, + "step": 12614, + "time_per_iteration": 2.530689239501953 + }, + { + "auxiliary_loss_clip": 0.01056927, + "auxiliary_loss_mlp": 0.01022187, + "balance_loss_clip": 1.01058745, + "balance_loss_mlp": 1.01822317, + "epoch": 0.7584548324064332, + "flos": 20119956307200.0, + "grad_norm": 3.1047585910216062, + "language_loss": 0.73334694, + "learning_rate": 5.487528746055436e-07, + "loss": 0.75413805, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 12615, + "time_per_iteration": 2.4090635776519775 + }, + { + "auxiliary_loss_clip": 0.01007145, + "auxiliary_loss_mlp": 0.01000859, + "balance_loss_clip": 1.0000124, + "balance_loss_mlp": 1.00070107, + "epoch": 0.7585149556591012, + "flos": 70399004060160.0, + "grad_norm": 0.8050862545999121, + "language_loss": 0.67692792, + "learning_rate": 5.484929240243294e-07, + "loss": 0.69700789, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.00848389, + "router_z_loss_mlp": 0.06445312, + "step": 12616, + "time_per_iteration": 3.0989675521850586 + }, + { + "auxiliary_loss_clip": 0.01059311, + "auxiliary_loss_mlp": 0.01027377, + "balance_loss_clip": 1.01514637, + "balance_loss_mlp": 1.0198642, + "epoch": 0.7585750789117691, + "flos": 16142860224000.0, + "grad_norm": 1.794968224171684, + "language_loss": 0.84479547, + "learning_rate": 5.482330252436693e-07, + "loss": 0.86566234, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 12617, + "time_per_iteration": 3.863772392272949 + }, + { + "auxiliary_loss_clip": 0.01057463, + "auxiliary_loss_mlp": 0.01022894, + "balance_loss_clip": 1.01211154, + "balance_loss_mlp": 1.01909554, + "epoch": 0.7586352021644371, + "flos": 17492218427520.0, + "grad_norm": 2.3967912902257322, + "language_loss": 0.7509625, + "learning_rate": 5.479731782728381e-07, + "loss": 0.77176601, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.38476562, + "step": 12618, + "time_per_iteration": 2.365624189376831 + }, + { + "auxiliary_loss_clip": 0.0105873, + "auxiliary_loss_mlp": 0.01023685, + "balance_loss_clip": 1.01141858, + "balance_loss_mlp": 1.01858068, + "epoch": 0.758695325417105, + "flos": 17274220698240.0, + "grad_norm": 2.2746215397586442, + "language_loss": 0.77333957, + "learning_rate": 5.477133831211091e-07, + "loss": 0.7941637, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40039062, + "step": 12619, + "time_per_iteration": 2.3832969665527344 + }, + { + "auxiliary_loss_clip": 0.0105739, + "auxiliary_loss_mlp": 0.01023775, + "balance_loss_clip": 1.01205099, + "balance_loss_mlp": 1.01908779, + "epoch": 0.7587554486697731, + "flos": 29494415381760.0, + "grad_norm": 4.400950055373175, + "language_loss": 0.82586986, + "learning_rate": 5.474536397977529e-07, + "loss": 0.84668148, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 12620, + "time_per_iteration": 2.466381549835205 + }, + { + "auxiliary_loss_clip": 0.0105751, + "auxiliary_loss_mlp": 0.01029123, + "balance_loss_clip": 1.01593876, + "balance_loss_mlp": 1.0177381, + "epoch": 0.758815571922441, + "flos": 16100057030400.0, + "grad_norm": 2.11276594313232, + "language_loss": 0.82651675, + "learning_rate": 5.471939483120413e-07, + "loss": 0.84738308, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.3984375, + "step": 12621, + "time_per_iteration": 2.362452745437622 + }, + { + "auxiliary_loss_clip": 0.01056833, + "auxiliary_loss_mlp": 0.01023129, + "balance_loss_clip": 1.01150632, + "balance_loss_mlp": 1.01913333, + "epoch": 0.758875695175109, + "flos": 16142790401280.0, + "grad_norm": 3.2374550989559148, + "language_loss": 0.73496568, + "learning_rate": 5.469343086732396e-07, + "loss": 0.75576532, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37695312, + "step": 12622, + "time_per_iteration": 2.3792738914489746 + }, + { + "auxiliary_loss_clip": 0.01006725, + "auxiliary_loss_mlp": 0.01001282, + "balance_loss_clip": 1.00033426, + "balance_loss_mlp": 1.00051022, + "epoch": 0.7589358184277769, + "flos": 68458671406080.0, + "grad_norm": 0.9760277052165426, + "language_loss": 0.60914338, + "learning_rate": 5.466747208906151e-07, + "loss": 0.62922335, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.06201172, + "step": 12623, + "time_per_iteration": 3.0818049907684326 + }, + { + "auxiliary_loss_clip": 0.01056768, + "auxiliary_loss_mlp": 0.01025034, + "balance_loss_clip": 1.01438856, + "balance_loss_mlp": 1.01948154, + "epoch": 0.7589959416804449, + "flos": 20046289605120.0, + "grad_norm": 1.987050772947283, + "language_loss": 0.67005974, + "learning_rate": 5.464151849734313e-07, + "loss": 0.69087774, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.37304688, + "step": 12624, + "time_per_iteration": 2.4132137298583984 + }, + { + "auxiliary_loss_clip": 0.01055447, + "auxiliary_loss_mlp": 0.01021575, + "balance_loss_clip": 1.01026201, + "balance_loss_mlp": 1.01869678, + "epoch": 0.7590560649331128, + "flos": 18770772660480.0, + "grad_norm": 2.2747282586959234, + "language_loss": 0.76038253, + "learning_rate": 5.461557009309507e-07, + "loss": 0.78115284, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3671875, + "step": 12625, + "time_per_iteration": 3.8659541606903076 + }, + { + "auxiliary_loss_clip": 0.01057905, + "auxiliary_loss_mlp": 0.01025575, + "balance_loss_clip": 1.01402414, + "balance_loss_mlp": 1.02081048, + "epoch": 0.7591161881857809, + "flos": 29823995416320.0, + "grad_norm": 1.6980897902515402, + "language_loss": 0.6303364, + "learning_rate": 5.458962687724327e-07, + "loss": 0.65117121, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37109375, + "step": 12626, + "time_per_iteration": 3.914882183074951 + }, + { + "auxiliary_loss_clip": 0.01057374, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01477814, + "balance_loss_mlp": 1.01881242, + "epoch": 0.7591763114384488, + "flos": 20301679267200.0, + "grad_norm": 2.664975061732884, + "language_loss": 0.74006736, + "learning_rate": 5.456368885071377e-07, + "loss": 0.7609157, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38671875, + "step": 12627, + "time_per_iteration": 2.367540121078491 + }, + { + "auxiliary_loss_clip": 0.010602, + "auxiliary_loss_mlp": 0.01021812, + "balance_loss_clip": 1.01048088, + "balance_loss_mlp": 1.01991487, + "epoch": 0.7592364346911168, + "flos": 20812563325440.0, + "grad_norm": 4.036361411341729, + "language_loss": 0.68486041, + "learning_rate": 5.453775601443198e-07, + "loss": 0.70568049, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.40234375, + "step": 12628, + "time_per_iteration": 2.3889288902282715 + }, + { + "auxiliary_loss_clip": 0.01062056, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.01723766, + "balance_loss_mlp": 1.0201726, + "epoch": 0.7592965579437848, + "flos": 21250443997440.0, + "grad_norm": 1.9715266809186285, + "language_loss": 0.63058811, + "learning_rate": 5.451182836932357e-07, + "loss": 0.65151, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.41796875, + "step": 12629, + "time_per_iteration": 2.3988516330718994 + }, + { + "auxiliary_loss_clip": 0.01052939, + "auxiliary_loss_mlp": 0.01019588, + "balance_loss_clip": 1.00854874, + "balance_loss_mlp": 1.01720071, + "epoch": 0.7593566811964527, + "flos": 26212405023360.0, + "grad_norm": 1.6567114021710017, + "language_loss": 0.77566063, + "learning_rate": 5.448590591631371e-07, + "loss": 0.79638594, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.35742188, + "step": 12630, + "time_per_iteration": 2.4454710483551025 + }, + { + "auxiliary_loss_clip": 0.010067, + "auxiliary_loss_mlp": 0.01000586, + "balance_loss_clip": 0.99960208, + "balance_loss_mlp": 1.00024319, + "epoch": 0.7594168044491207, + "flos": 71233777601280.0, + "grad_norm": 1.25381410805551, + "language_loss": 0.61562049, + "learning_rate": 5.445998865632766e-07, + "loss": 0.63569331, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.06445312, + "step": 12631, + "time_per_iteration": 3.134965658187866 + }, + { + "auxiliary_loss_clip": 0.01061589, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.0168798, + "balance_loss_mlp": 1.02043128, + "epoch": 0.7594769277017887, + "flos": 26612160624000.0, + "grad_norm": 3.1429971414103957, + "language_loss": 0.74087232, + "learning_rate": 5.443407659029013e-07, + "loss": 0.76179111, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41210938, + "step": 12632, + "time_per_iteration": 2.5168590545654297 + }, + { + "auxiliary_loss_clip": 0.01058185, + "auxiliary_loss_mlp": 0.01022618, + "balance_loss_clip": 1.01059532, + "balance_loss_mlp": 1.01968431, + "epoch": 0.7595370509544567, + "flos": 17595177626880.0, + "grad_norm": 1.8795204301827686, + "language_loss": 0.74939179, + "learning_rate": 5.440816971912605e-07, + "loss": 0.77019989, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38476562, + "step": 12633, + "time_per_iteration": 2.347559690475464 + }, + { + "auxiliary_loss_clip": 0.0105562, + "auxiliary_loss_mlp": 0.01024056, + "balance_loss_clip": 1.01292801, + "balance_loss_mlp": 1.0180583, + "epoch": 0.7595971742071246, + "flos": 18002020233600.0, + "grad_norm": 1.8022284623672127, + "language_loss": 0.73287868, + "learning_rate": 5.438226804375991e-07, + "loss": 0.75367546, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.375, + "step": 12634, + "time_per_iteration": 3.8815176486968994 + }, + { + "auxiliary_loss_clip": 0.01057389, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.0132165, + "balance_loss_mlp": 1.01935971, + "epoch": 0.7596572974597926, + "flos": 28839060650880.0, + "grad_norm": 1.6534056686913052, + "language_loss": 0.64311993, + "learning_rate": 5.435637156511597e-07, + "loss": 0.66394901, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38085938, + "step": 12635, + "time_per_iteration": 2.429352045059204 + }, + { + "auxiliary_loss_clip": 0.01057915, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.01546133, + "balance_loss_mlp": 1.01849914, + "epoch": 0.7597174207124605, + "flos": 14281954646400.0, + "grad_norm": 1.6641435829235318, + "language_loss": 0.68704277, + "learning_rate": 5.43304802841185e-07, + "loss": 0.70790291, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.39453125, + "step": 12636, + "time_per_iteration": 2.378615140914917 + }, + { + "auxiliary_loss_clip": 0.01059283, + "auxiliary_loss_mlp": 0.01025256, + "balance_loss_clip": 1.01281071, + "balance_loss_mlp": 1.02023077, + "epoch": 0.7597775439651285, + "flos": 21687870821760.0, + "grad_norm": 1.9392075044886747, + "language_loss": 0.81038773, + "learning_rate": 5.430459420169154e-07, + "loss": 0.83123308, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 12637, + "time_per_iteration": 2.40956449508667 + }, + { + "auxiliary_loss_clip": 0.01055987, + "auxiliary_loss_mlp": 0.01024965, + "balance_loss_clip": 1.0135982, + "balance_loss_mlp": 1.01829219, + "epoch": 0.7598376672177964, + "flos": 36099773015040.0, + "grad_norm": 1.6992521409565777, + "language_loss": 0.66891706, + "learning_rate": 5.42787133187588e-07, + "loss": 0.68972659, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37695312, + "step": 12638, + "time_per_iteration": 2.52872633934021 + }, + { + "auxiliary_loss_clip": 0.01056725, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.01434112, + "balance_loss_mlp": 1.01818919, + "epoch": 0.7598977904704645, + "flos": 18331355888640.0, + "grad_norm": 2.0877064302154698, + "language_loss": 0.65609902, + "learning_rate": 5.425283763624388e-07, + "loss": 0.67693573, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.38671875, + "step": 12639, + "time_per_iteration": 2.356624126434326 + }, + { + "auxiliary_loss_clip": 0.01056729, + "auxiliary_loss_mlp": 0.01022629, + "balance_loss_clip": 1.01048708, + "balance_loss_mlp": 1.01876998, + "epoch": 0.7599579137231324, + "flos": 20191633061760.0, + "grad_norm": 1.8722730361368085, + "language_loss": 0.69857478, + "learning_rate": 5.422696715507036e-07, + "loss": 0.71936834, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.37890625, + "step": 12640, + "time_per_iteration": 2.3940181732177734 + }, + { + "auxiliary_loss_clip": 0.01058295, + "auxiliary_loss_mlp": 0.01024041, + "balance_loss_clip": 1.01138699, + "balance_loss_mlp": 1.01893651, + "epoch": 0.7600180369758004, + "flos": 24023699890560.0, + "grad_norm": 2.214929318856801, + "language_loss": 0.67117071, + "learning_rate": 5.420110187616138e-07, + "loss": 0.69199407, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39453125, + "step": 12641, + "time_per_iteration": 2.4163644313812256 + }, + { + "auxiliary_loss_clip": 0.01056446, + "auxiliary_loss_mlp": 0.01021768, + "balance_loss_clip": 1.00966203, + "balance_loss_mlp": 1.01725256, + "epoch": 0.7600781602284684, + "flos": 18988526010240.0, + "grad_norm": 2.3100559157101443, + "language_loss": 0.66909504, + "learning_rate": 5.417524180044007e-07, + "loss": 0.68987715, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39257812, + "step": 12642, + "time_per_iteration": 2.3825266361236572 + }, + { + "auxiliary_loss_clip": 0.01057214, + "auxiliary_loss_mlp": 0.01024734, + "balance_loss_clip": 1.01331973, + "balance_loss_mlp": 1.02018845, + "epoch": 0.7601382834811363, + "flos": 26066328428160.0, + "grad_norm": 1.7173469062434974, + "language_loss": 0.75456572, + "learning_rate": 5.414938692882918e-07, + "loss": 0.77538514, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37109375, + "step": 12643, + "time_per_iteration": 2.46781325340271 + }, + { + "auxiliary_loss_clip": 0.01058486, + "auxiliary_loss_mlp": 0.01026484, + "balance_loss_clip": 1.01415765, + "balance_loss_mlp": 1.01972032, + "epoch": 0.7601984067338043, + "flos": 18843217464960.0, + "grad_norm": 1.87732912191024, + "language_loss": 0.78837144, + "learning_rate": 5.412353726225165e-07, + "loss": 0.80922115, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.38671875, + "step": 12644, + "time_per_iteration": 2.388009548187256 + }, + { + "auxiliary_loss_clip": 0.01056823, + "auxiliary_loss_mlp": 0.01022217, + "balance_loss_clip": 1.01062381, + "balance_loss_mlp": 1.01846862, + "epoch": 0.7602585299864723, + "flos": 24645188736000.0, + "grad_norm": 1.6477786122300788, + "language_loss": 0.80104977, + "learning_rate": 5.409769280162971e-07, + "loss": 0.82184017, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3828125, + "step": 12645, + "time_per_iteration": 2.460785388946533 + }, + { + "auxiliary_loss_clip": 0.01059862, + "auxiliary_loss_mlp": 0.01025531, + "balance_loss_clip": 1.01331842, + "balance_loss_mlp": 1.02062774, + "epoch": 0.7603186532391403, + "flos": 23840964501120.0, + "grad_norm": 1.518710713714845, + "language_loss": 0.73099232, + "learning_rate": 5.407185354788584e-07, + "loss": 0.75184631, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39257812, + "step": 12646, + "time_per_iteration": 2.4421308040618896 + }, + { + "auxiliary_loss_clip": 0.01055238, + "auxiliary_loss_mlp": 0.01024308, + "balance_loss_clip": 1.01328683, + "balance_loss_mlp": 1.0183394, + "epoch": 0.7603787764918082, + "flos": 22198824702720.0, + "grad_norm": 1.9167774574279524, + "language_loss": 0.67722547, + "learning_rate": 5.40460195019421e-07, + "loss": 0.69802094, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36914062, + "step": 12647, + "time_per_iteration": 2.3963546752929688 + }, + { + "auxiliary_loss_clip": 0.01006743, + "auxiliary_loss_mlp": 0.01001071, + "balance_loss_clip": 1.00021851, + "balance_loss_mlp": 1.00053072, + "epoch": 0.7604388997444762, + "flos": 54083951677440.0, + "grad_norm": 0.6712847280857831, + "language_loss": 0.49001333, + "learning_rate": 5.402019066472061e-07, + "loss": 0.51009142, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.06201172, + "step": 12648, + "time_per_iteration": 3.0305590629577637 + }, + { + "auxiliary_loss_clip": 0.01055975, + "auxiliary_loss_mlp": 0.01022361, + "balance_loss_clip": 1.01140594, + "balance_loss_mlp": 1.01892257, + "epoch": 0.7604990229971441, + "flos": 19680923560320.0, + "grad_norm": 1.5064359514312804, + "language_loss": 0.7659511, + "learning_rate": 5.399436703714295e-07, + "loss": 0.78673446, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 12649, + "time_per_iteration": 2.4174959659576416 + }, + { + "auxiliary_loss_clip": 0.01056016, + "auxiliary_loss_mlp": 0.01023977, + "balance_loss_clip": 1.01256263, + "balance_loss_mlp": 1.01839745, + "epoch": 0.7605591462498121, + "flos": 25226876764800.0, + "grad_norm": 1.514791523342264, + "language_loss": 0.68582422, + "learning_rate": 5.39685486201307e-07, + "loss": 0.70662415, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37695312, + "step": 12650, + "time_per_iteration": 2.4157519340515137 + }, + { + "auxiliary_loss_clip": 0.01006806, + "auxiliary_loss_mlp": 0.01001552, + "balance_loss_clip": 1.0006516, + "balance_loss_mlp": 1.00059319, + "epoch": 0.76061926950248, + "flos": 66780466306560.0, + "grad_norm": 0.8693995634377066, + "language_loss": 0.62720084, + "learning_rate": 5.394273541460543e-07, + "loss": 0.64728439, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.06201172, + "step": 12651, + "time_per_iteration": 3.065166711807251 + }, + { + "auxiliary_loss_clip": 0.01055047, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.01247513, + "balance_loss_mlp": 1.01819396, + "epoch": 0.7606793927551481, + "flos": 25337167349760.0, + "grad_norm": 1.4431271274950717, + "language_loss": 0.71378136, + "learning_rate": 5.39169274214881e-07, + "loss": 0.7345717, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3671875, + "step": 12652, + "time_per_iteration": 2.4310829639434814 + }, + { + "auxiliary_loss_clip": 0.0105807, + "auxiliary_loss_mlp": 0.01026885, + "balance_loss_clip": 1.01501167, + "balance_loss_mlp": 1.01978374, + "epoch": 0.760739516007816, + "flos": 18222636314880.0, + "grad_norm": 3.1828715743355227, + "language_loss": 0.79987359, + "learning_rate": 5.389112464169994e-07, + "loss": 0.82072312, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 12653, + "time_per_iteration": 2.373729944229126 + }, + { + "auxiliary_loss_clip": 0.01060328, + "auxiliary_loss_mlp": 0.01023091, + "balance_loss_clip": 1.01139712, + "balance_loss_mlp": 1.01971817, + "epoch": 0.760799639260484, + "flos": 22558185993600.0, + "grad_norm": 1.584348405263309, + "language_loss": 0.80144751, + "learning_rate": 5.386532707616169e-07, + "loss": 0.82228166, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.40625, + "step": 12654, + "time_per_iteration": 2.4043970108032227 + }, + { + "auxiliary_loss_clip": 0.01057486, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.01788592, + "balance_loss_mlp": 1.0180254, + "epoch": 0.760859762513152, + "flos": 22308242503680.0, + "grad_norm": 2.5732795284630994, + "language_loss": 0.79234588, + "learning_rate": 5.383953472579401e-07, + "loss": 0.8132242, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.39453125, + "step": 12655, + "time_per_iteration": 2.4113359451293945 + }, + { + "auxiliary_loss_clip": 0.01054958, + "auxiliary_loss_mlp": 0.01020465, + "balance_loss_clip": 1.01051688, + "balance_loss_mlp": 1.01942313, + "epoch": 0.7609198857658199, + "flos": 24862732617600.0, + "grad_norm": 1.5946827331055147, + "language_loss": 0.68511564, + "learning_rate": 5.381374759151733e-07, + "loss": 0.70586985, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.35546875, + "step": 12656, + "time_per_iteration": 2.4855806827545166 + }, + { + "auxiliary_loss_clip": 0.01057726, + "auxiliary_loss_mlp": 0.0102391, + "balance_loss_clip": 1.01172113, + "balance_loss_mlp": 1.01745307, + "epoch": 0.760980009018488, + "flos": 16835851267200.0, + "grad_norm": 2.311347235422682, + "language_loss": 0.73874712, + "learning_rate": 5.378796567425198e-07, + "loss": 0.75956357, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 12657, + "time_per_iteration": 3.7803289890289307 + }, + { + "auxiliary_loss_clip": 0.0100674, + "auxiliary_loss_mlp": 0.01003462, + "balance_loss_clip": 1.00251997, + "balance_loss_mlp": 1.0006175, + "epoch": 0.7610401322711559, + "flos": 61227670475520.0, + "grad_norm": 0.8587116383025214, + "language_loss": 0.64953715, + "learning_rate": 5.376218897491809e-07, + "loss": 0.66963917, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.06103516, + "step": 12658, + "time_per_iteration": 2.996083974838257 + }, + { + "auxiliary_loss_clip": 0.01054844, + "auxiliary_loss_mlp": 0.01022568, + "balance_loss_clip": 1.01154149, + "balance_loss_mlp": 1.01868784, + "epoch": 0.7611002555238239, + "flos": 19639865934720.0, + "grad_norm": 1.586753715204382, + "language_loss": 0.79454201, + "learning_rate": 5.373641749443547e-07, + "loss": 0.81531608, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36132812, + "step": 12659, + "time_per_iteration": 2.381378173828125 + }, + { + "auxiliary_loss_clip": 0.01054329, + "auxiliary_loss_mlp": 0.0102407, + "balance_loss_clip": 1.01319242, + "balance_loss_mlp": 1.01699495, + "epoch": 0.7611603787764918, + "flos": 26870936688000.0, + "grad_norm": 1.7250283055410638, + "language_loss": 0.68806708, + "learning_rate": 5.371065123372383e-07, + "loss": 0.7088511, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37304688, + "step": 12660, + "time_per_iteration": 2.4399967193603516 + }, + { + "auxiliary_loss_clip": 0.01055797, + "auxiliary_loss_mlp": 0.01020254, + "balance_loss_clip": 1.0089469, + "balance_loss_mlp": 1.01791751, + "epoch": 0.7612205020291598, + "flos": 27343032359040.0, + "grad_norm": 1.613391357356361, + "language_loss": 0.7065469, + "learning_rate": 5.368489019370283e-07, + "loss": 0.72730744, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 12661, + "time_per_iteration": 2.4826343059539795 + }, + { + "auxiliary_loss_clip": 0.01055953, + "auxiliary_loss_mlp": 0.010216, + "balance_loss_clip": 1.01041842, + "balance_loss_mlp": 1.018188, + "epoch": 0.7612806252818277, + "flos": 29313320826240.0, + "grad_norm": 1.46873229578561, + "language_loss": 0.83349246, + "learning_rate": 5.365913437529166e-07, + "loss": 0.85426795, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37890625, + "step": 12662, + "time_per_iteration": 2.4499213695526123 + }, + { + "auxiliary_loss_clip": 0.01056626, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.01905251, + "balance_loss_mlp": 1.01838326, + "epoch": 0.7613407485344957, + "flos": 19025045159040.0, + "grad_norm": 1.605441824628566, + "language_loss": 0.68965638, + "learning_rate": 5.363338377940958e-07, + "loss": 0.71052802, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.3828125, + "step": 12663, + "time_per_iteration": 2.3755836486816406 + }, + { + "auxiliary_loss_clip": 0.01055292, + "auxiliary_loss_mlp": 0.01020014, + "balance_loss_clip": 1.00905228, + "balance_loss_mlp": 1.01812899, + "epoch": 0.7614008717871636, + "flos": 23254982375040.0, + "grad_norm": 1.5466716763293646, + "language_loss": 0.70511186, + "learning_rate": 5.360763840697553e-07, + "loss": 0.72586489, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37109375, + "step": 12664, + "time_per_iteration": 2.4190807342529297 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.01976562, + "balance_loss_mlp": 1.02039409, + "epoch": 0.7614609950398317, + "flos": 21578837045760.0, + "grad_norm": 1.9015136492688247, + "language_loss": 0.79326421, + "learning_rate": 5.358189825890833e-07, + "loss": 0.8142029, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 12665, + "time_per_iteration": 5.257839202880859 + }, + { + "auxiliary_loss_clip": 0.01056734, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.019207, + "balance_loss_mlp": 1.01954794, + "epoch": 0.7615211182924996, + "flos": 29276627120640.0, + "grad_norm": 1.6541129942881139, + "language_loss": 0.87459302, + "learning_rate": 5.355616333612651e-07, + "loss": 0.89546335, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37109375, + "step": 12666, + "time_per_iteration": 2.531552791595459 + }, + { + "auxiliary_loss_clip": 0.01057509, + "auxiliary_loss_mlp": 0.01024286, + "balance_loss_clip": 1.0127939, + "balance_loss_mlp": 1.01836956, + "epoch": 0.7615812415451676, + "flos": 13260291264000.0, + "grad_norm": 2.0287925009999745, + "language_loss": 0.83666891, + "learning_rate": 5.35304336395485e-07, + "loss": 0.85748684, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.390625, + "step": 12667, + "time_per_iteration": 2.3821961879730225 + }, + { + "auxiliary_loss_clip": 0.01056549, + "auxiliary_loss_mlp": 0.01027137, + "balance_loss_clip": 1.01622963, + "balance_loss_mlp": 1.01900244, + "epoch": 0.7616413647978356, + "flos": 18583847907840.0, + "grad_norm": 1.6467625653865616, + "language_loss": 0.78678668, + "learning_rate": 5.350470917009264e-07, + "loss": 0.80762351, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.375, + "step": 12668, + "time_per_iteration": 2.383173704147339 + }, + { + "auxiliary_loss_clip": 0.01054703, + "auxiliary_loss_mlp": 0.01021744, + "balance_loss_clip": 1.01079488, + "balance_loss_mlp": 1.0177449, + "epoch": 0.7617014880505035, + "flos": 18515173530240.0, + "grad_norm": 1.585471658423257, + "language_loss": 0.63781011, + "learning_rate": 5.347898992867677e-07, + "loss": 0.65857458, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36914062, + "step": 12669, + "time_per_iteration": 2.388807535171509 + }, + { + "auxiliary_loss_clip": 0.01058293, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.01248026, + "balance_loss_mlp": 1.01838934, + "epoch": 0.7617616113031715, + "flos": 24972010773120.0, + "grad_norm": 1.861646365475663, + "language_loss": 0.66443592, + "learning_rate": 5.345327591621891e-07, + "loss": 0.68527544, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.3984375, + "step": 12670, + "time_per_iteration": 2.418081283569336 + }, + { + "auxiliary_loss_clip": 0.01058076, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.01511812, + "balance_loss_mlp": 1.01900136, + "epoch": 0.7618217345558395, + "flos": 23293910407680.0, + "grad_norm": 1.633034832601029, + "language_loss": 0.73197931, + "learning_rate": 5.342756713363668e-07, + "loss": 0.75282526, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.390625, + "step": 12671, + "time_per_iteration": 2.4177775382995605 + }, + { + "auxiliary_loss_clip": 0.0105524, + "auxiliary_loss_mlp": 0.01023865, + "balance_loss_clip": 1.01277876, + "balance_loss_mlp": 1.01795745, + "epoch": 0.7618818578085075, + "flos": 25481742756480.0, + "grad_norm": 1.641463609122279, + "language_loss": 0.73443574, + "learning_rate": 5.340186358184753e-07, + "loss": 0.75522679, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37304688, + "step": 12672, + "time_per_iteration": 2.4521665573120117 + }, + { + "auxiliary_loss_clip": 0.01006893, + "auxiliary_loss_mlp": 0.01001623, + "balance_loss_clip": 1.000687, + "balance_loss_mlp": 1.00062585, + "epoch": 0.7619419810611754, + "flos": 61149220917120.0, + "grad_norm": 0.7618691325320736, + "language_loss": 0.56696427, + "learning_rate": 5.337616526176873e-07, + "loss": 0.58704937, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.0625, + "step": 12673, + "time_per_iteration": 3.0371246337890625 + }, + { + "auxiliary_loss_clip": 0.01057937, + "auxiliary_loss_mlp": 0.01024131, + "balance_loss_clip": 1.01247227, + "balance_loss_mlp": 1.01867485, + "epoch": 0.7620021043138434, + "flos": 23257530904320.0, + "grad_norm": 1.869483090997881, + "language_loss": 0.74954623, + "learning_rate": 5.33504721743175e-07, + "loss": 0.77036691, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39257812, + "step": 12674, + "time_per_iteration": 3.841740846633911 + }, + { + "auxiliary_loss_clip": 0.01055261, + "auxiliary_loss_mlp": 0.0102348, + "balance_loss_clip": 1.01308537, + "balance_loss_mlp": 1.01796055, + "epoch": 0.7620622275665113, + "flos": 25081323840000.0, + "grad_norm": 2.012195076325852, + "language_loss": 0.72288001, + "learning_rate": 5.332478432041065e-07, + "loss": 0.74366748, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.37304688, + "step": 12675, + "time_per_iteration": 2.4015309810638428 + }, + { + "auxiliary_loss_clip": 0.01056032, + "auxiliary_loss_mlp": 0.01022538, + "balance_loss_clip": 1.01192856, + "balance_loss_mlp": 1.01939273, + "epoch": 0.7621223508191793, + "flos": 20154031660800.0, + "grad_norm": 3.034728336904776, + "language_loss": 0.73094141, + "learning_rate": 5.329910170096499e-07, + "loss": 0.7517271, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.3671875, + "step": 12676, + "time_per_iteration": 2.3748412132263184 + }, + { + "auxiliary_loss_clip": 0.01060839, + "auxiliary_loss_mlp": 0.010252, + "balance_loss_clip": 1.01214683, + "balance_loss_mlp": 1.0195806, + "epoch": 0.7621824740718472, + "flos": 17999332058880.0, + "grad_norm": 2.822050900223289, + "language_loss": 0.54106647, + "learning_rate": 5.327342431689696e-07, + "loss": 0.56192684, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 12677, + "time_per_iteration": 2.351217031478882 + }, + { + "auxiliary_loss_clip": 0.01056854, + "auxiliary_loss_mlp": 0.0102339, + "balance_loss_clip": 1.01166534, + "balance_loss_mlp": 1.01829123, + "epoch": 0.7622425973245153, + "flos": 21724599438720.0, + "grad_norm": 1.9958721995266138, + "language_loss": 0.79207587, + "learning_rate": 5.324775216912312e-07, + "loss": 0.81287837, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 12678, + "time_per_iteration": 2.4085841178894043 + }, + { + "auxiliary_loss_clip": 0.01057877, + "auxiliary_loss_mlp": 0.01023212, + "balance_loss_clip": 1.01133847, + "balance_loss_mlp": 1.017977, + "epoch": 0.7623027205771832, + "flos": 19717547443200.0, + "grad_norm": 2.5484645755666313, + "language_loss": 0.77605468, + "learning_rate": 5.322208525855942e-07, + "loss": 0.79686558, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3984375, + "step": 12679, + "time_per_iteration": 2.391066789627075 + }, + { + "auxiliary_loss_clip": 0.01057439, + "auxiliary_loss_mlp": 0.01018273, + "balance_loss_clip": 1.00781846, + "balance_loss_mlp": 1.01915169, + "epoch": 0.7623628438298512, + "flos": 23987669500800.0, + "grad_norm": 1.3321999012577985, + "language_loss": 0.80076456, + "learning_rate": 5.319642358612191e-07, + "loss": 0.8215217, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.3828125, + "step": 12680, + "time_per_iteration": 2.467721462249756 + }, + { + "auxiliary_loss_clip": 0.01058991, + "auxiliary_loss_mlp": 0.01023881, + "balance_loss_clip": 1.01169193, + "balance_loss_mlp": 1.01957679, + "epoch": 0.7624229670825191, + "flos": 22344622007040.0, + "grad_norm": 1.767425688548251, + "language_loss": 0.62770331, + "learning_rate": 5.317076715272652e-07, + "loss": 0.64853203, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39453125, + "step": 12681, + "time_per_iteration": 2.4354090690612793 + }, + { + "auxiliary_loss_clip": 0.01055684, + "auxiliary_loss_mlp": 0.01024785, + "balance_loss_clip": 1.01387143, + "balance_loss_mlp": 1.01907122, + "epoch": 0.7624830903351871, + "flos": 22710651367680.0, + "grad_norm": 4.188361279771882, + "language_loss": 0.76170754, + "learning_rate": 5.314511595928867e-07, + "loss": 0.78251225, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36523438, + "step": 12682, + "time_per_iteration": 2.406252861022949 + }, + { + "auxiliary_loss_clip": 0.01055704, + "auxiliary_loss_mlp": 0.01019333, + "balance_loss_clip": 1.00884247, + "balance_loss_mlp": 1.01936388, + "epoch": 0.7625432135878552, + "flos": 25592522100480.0, + "grad_norm": 1.5047188690580482, + "language_loss": 0.68328679, + "learning_rate": 5.311947000672392e-07, + "loss": 0.70403719, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.36328125, + "step": 12683, + "time_per_iteration": 2.4313127994537354 + }, + { + "auxiliary_loss_clip": 0.01057405, + "auxiliary_loss_mlp": 0.01025667, + "balance_loss_clip": 1.01353109, + "balance_loss_mlp": 1.01889098, + "epoch": 0.7626033368405231, + "flos": 23986517425920.0, + "grad_norm": 2.0991211708534503, + "language_loss": 0.83292854, + "learning_rate": 5.309382929594739e-07, + "loss": 0.85375923, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38476562, + "step": 12684, + "time_per_iteration": 2.438108444213867 + }, + { + "auxiliary_loss_clip": 0.01058009, + "auxiliary_loss_mlp": 0.01026225, + "balance_loss_clip": 1.01424456, + "balance_loss_mlp": 1.01932061, + "epoch": 0.7626634600931911, + "flos": 12598443020160.0, + "grad_norm": 1.806791668129609, + "language_loss": 0.82477248, + "learning_rate": 5.306819382787433e-07, + "loss": 0.84561485, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 12685, + "time_per_iteration": 2.4030187129974365 + }, + { + "auxiliary_loss_clip": 0.01058046, + "auxiliary_loss_mlp": 0.01019784, + "balance_loss_clip": 1.00763047, + "balance_loss_mlp": 1.0188942, + "epoch": 0.762723583345859, + "flos": 26321403888000.0, + "grad_norm": 2.1470861765778944, + "language_loss": 0.77012801, + "learning_rate": 5.304256360341936e-07, + "loss": 0.79090631, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 12686, + "time_per_iteration": 2.4103105068206787 + }, + { + "auxiliary_loss_clip": 0.01059287, + "auxiliary_loss_mlp": 0.01025305, + "balance_loss_clip": 1.01216197, + "balance_loss_mlp": 1.01899838, + "epoch": 0.762783706598527, + "flos": 21906008196480.0, + "grad_norm": 1.6918569152114942, + "language_loss": 0.72512102, + "learning_rate": 5.301693862349734e-07, + "loss": 0.74596685, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40234375, + "step": 12687, + "time_per_iteration": 2.4041836261749268 + }, + { + "auxiliary_loss_clip": 0.01057028, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.0193634, + "balance_loss_mlp": 1.01885021, + "epoch": 0.7628438298511949, + "flos": 15338915280000.0, + "grad_norm": 1.9554492071622434, + "language_loss": 0.64642674, + "learning_rate": 5.299131888902271e-07, + "loss": 0.66731572, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3828125, + "step": 12688, + "time_per_iteration": 2.3533835411071777 + }, + { + "auxiliary_loss_clip": 0.0105511, + "auxiliary_loss_mlp": 0.01022191, + "balance_loss_clip": 1.01157522, + "balance_loss_mlp": 1.01838291, + "epoch": 0.7629039531038629, + "flos": 13005460183680.0, + "grad_norm": 1.8964200983348027, + "language_loss": 0.70808721, + "learning_rate": 5.296570440090973e-07, + "loss": 0.72886014, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.3671875, + "step": 12689, + "time_per_iteration": 2.3683223724365234 + }, + { + "auxiliary_loss_clip": 0.01060485, + "auxiliary_loss_mlp": 0.01026056, + "balance_loss_clip": 1.01355112, + "balance_loss_mlp": 1.01944661, + "epoch": 0.7629640763565309, + "flos": 26170614259200.0, + "grad_norm": 1.7960722650663925, + "language_loss": 0.70231104, + "learning_rate": 5.29400951600725e-07, + "loss": 0.72317654, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 12690, + "time_per_iteration": 2.423076868057251 + }, + { + "auxiliary_loss_clip": 0.01057784, + "auxiliary_loss_mlp": 0.01026175, + "balance_loss_clip": 1.01473713, + "balance_loss_mlp": 1.01893854, + "epoch": 0.7630241996091989, + "flos": 36792240387840.0, + "grad_norm": 2.448653782376371, + "language_loss": 0.69304818, + "learning_rate": 5.291449116742503e-07, + "loss": 0.71388781, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 12691, + "time_per_iteration": 2.506945848464966 + }, + { + "auxiliary_loss_clip": 0.01054934, + "auxiliary_loss_mlp": 0.01023028, + "balance_loss_clip": 1.01167333, + "balance_loss_mlp": 1.01772928, + "epoch": 0.7630843228618668, + "flos": 21834087062400.0, + "grad_norm": 2.1692725001268727, + "language_loss": 0.81635463, + "learning_rate": 5.288889242388105e-07, + "loss": 0.83713424, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37109375, + "step": 12692, + "time_per_iteration": 2.38686466217041 + }, + { + "auxiliary_loss_clip": 0.01061158, + "auxiliary_loss_mlp": 0.0102355, + "balance_loss_clip": 1.01187968, + "balance_loss_mlp": 1.02109957, + "epoch": 0.7631444461145348, + "flos": 12639710113920.0, + "grad_norm": 2.7693556624864386, + "language_loss": 0.77063686, + "learning_rate": 5.286329893035406e-07, + "loss": 0.79148394, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.40039062, + "step": 12693, + "time_per_iteration": 2.3519506454467773 + }, + { + "auxiliary_loss_clip": 0.01057337, + "auxiliary_loss_mlp": 0.01023435, + "balance_loss_clip": 1.01144242, + "balance_loss_mlp": 1.01872075, + "epoch": 0.7632045693672027, + "flos": 16835676710400.0, + "grad_norm": 2.1953017380758864, + "language_loss": 0.73097444, + "learning_rate": 5.283771068775747e-07, + "loss": 0.75178212, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38476562, + "step": 12694, + "time_per_iteration": 2.4040920734405518 + }, + { + "auxiliary_loss_clip": 0.01055071, + "auxiliary_loss_mlp": 0.0102529, + "balance_loss_clip": 1.01401865, + "balance_loss_mlp": 1.01793623, + "epoch": 0.7632646926198707, + "flos": 22016263870080.0, + "grad_norm": 1.7277880587737375, + "language_loss": 0.70801485, + "learning_rate": 5.281212769700442e-07, + "loss": 0.72881842, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37109375, + "step": 12695, + "time_per_iteration": 2.4326562881469727 + }, + { + "auxiliary_loss_clip": 0.01056706, + "auxiliary_loss_mlp": 0.01022938, + "balance_loss_clip": 1.01111841, + "balance_loss_mlp": 1.01885128, + "epoch": 0.7633248158725388, + "flos": 23112850763520.0, + "grad_norm": 1.6070775846937908, + "language_loss": 0.72062492, + "learning_rate": 5.278654995900793e-07, + "loss": 0.7414214, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 12696, + "time_per_iteration": 3.918768882751465 + }, + { + "auxiliary_loss_clip": 0.01058331, + "auxiliary_loss_mlp": 0.01026836, + "balance_loss_clip": 1.01468253, + "balance_loss_mlp": 1.01931679, + "epoch": 0.7633849391252067, + "flos": 10889060209920.0, + "grad_norm": 2.358236676087713, + "language_loss": 0.71436632, + "learning_rate": 5.276097747468074e-07, + "loss": 0.73521793, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 12697, + "time_per_iteration": 2.359100103378296 + }, + { + "auxiliary_loss_clip": 0.01057512, + "auxiliary_loss_mlp": 0.0102167, + "balance_loss_clip": 1.00989199, + "balance_loss_mlp": 1.01953518, + "epoch": 0.7634450623778747, + "flos": 20993169121920.0, + "grad_norm": 1.9628852869825606, + "language_loss": 0.669783, + "learning_rate": 5.273541024493565e-07, + "loss": 0.69057488, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 12698, + "time_per_iteration": 2.371586322784424 + }, + { + "auxiliary_loss_clip": 0.01056647, + "auxiliary_loss_mlp": 0.01022271, + "balance_loss_clip": 1.01111293, + "balance_loss_mlp": 1.01839483, + "epoch": 0.7635051856305426, + "flos": 18880993422720.0, + "grad_norm": 1.6825255106412247, + "language_loss": 0.72309959, + "learning_rate": 5.27098482706848e-07, + "loss": 0.7438888, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.3828125, + "step": 12699, + "time_per_iteration": 2.4170806407928467 + }, + { + "auxiliary_loss_clip": 0.01056023, + "auxiliary_loss_mlp": 0.01020875, + "balance_loss_clip": 1.00978255, + "balance_loss_mlp": 1.0187819, + "epoch": 0.7635653088832106, + "flos": 34785572417280.0, + "grad_norm": 3.137346480834247, + "language_loss": 0.73489714, + "learning_rate": 5.268429155284069e-07, + "loss": 0.75566614, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37304688, + "step": 12700, + "time_per_iteration": 2.5047898292541504 + }, + { + "auxiliary_loss_clip": 0.0105643, + "auxiliary_loss_mlp": 0.01020653, + "balance_loss_clip": 1.00945926, + "balance_loss_mlp": 1.0183773, + "epoch": 0.7636254321358785, + "flos": 23177510334720.0, + "grad_norm": 2.0312941716377213, + "language_loss": 0.69497883, + "learning_rate": 5.265874009231519e-07, + "loss": 0.71574962, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.38085938, + "step": 12701, + "time_per_iteration": 2.384995460510254 + }, + { + "auxiliary_loss_clip": 0.01057395, + "auxiliary_loss_mlp": 0.01022403, + "balance_loss_clip": 1.01110244, + "balance_loss_mlp": 1.01903617, + "epoch": 0.7636855553885465, + "flos": 21324145610880.0, + "grad_norm": 1.6562088380864313, + "language_loss": 0.64359188, + "learning_rate": 5.263319389002037e-07, + "loss": 0.66438997, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3828125, + "step": 12702, + "time_per_iteration": 2.407416582107544 + }, + { + "auxiliary_loss_clip": 0.01057424, + "auxiliary_loss_mlp": 0.0102115, + "balance_loss_clip": 1.01001048, + "balance_loss_mlp": 1.01870012, + "epoch": 0.7637456786412145, + "flos": 28656814020480.0, + "grad_norm": 1.936127111538503, + "language_loss": 0.653319, + "learning_rate": 5.260765294686767e-07, + "loss": 0.67410475, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38671875, + "step": 12703, + "time_per_iteration": 2.471559762954712 + }, + { + "auxiliary_loss_clip": 0.01055696, + "auxiliary_loss_mlp": 0.0101796, + "balance_loss_clip": 1.00727296, + "balance_loss_mlp": 1.01871204, + "epoch": 0.7638058018938825, + "flos": 21906217664640.0, + "grad_norm": 1.8333097379863454, + "language_loss": 0.83597004, + "learning_rate": 5.258211726376875e-07, + "loss": 0.85670662, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.36914062, + "step": 12704, + "time_per_iteration": 2.3900389671325684 + }, + { + "auxiliary_loss_clip": 0.01059286, + "auxiliary_loss_mlp": 0.01024349, + "balance_loss_clip": 1.01278543, + "balance_loss_mlp": 1.02022719, + "epoch": 0.7638659251465504, + "flos": 29642586658560.0, + "grad_norm": 1.5961768991790397, + "language_loss": 0.68098366, + "learning_rate": 5.255658684163488e-07, + "loss": 0.70181996, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.390625, + "step": 12705, + "time_per_iteration": 5.326321125030518 + }, + { + "auxiliary_loss_clip": 0.01055585, + "auxiliary_loss_mlp": 0.01022003, + "balance_loss_clip": 1.01082134, + "balance_loss_mlp": 1.01788008, + "epoch": 0.7639260483992184, + "flos": 26139960218880.0, + "grad_norm": 1.6766614057360305, + "language_loss": 0.73653382, + "learning_rate": 5.253106168137715e-07, + "loss": 0.75730973, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 12706, + "time_per_iteration": 2.4469082355499268 + }, + { + "auxiliary_loss_clip": 0.01057386, + "auxiliary_loss_mlp": 0.01027318, + "balance_loss_clip": 1.01639891, + "balance_loss_mlp": 1.01846075, + "epoch": 0.7639861716518863, + "flos": 20155672494720.0, + "grad_norm": 1.8734563575977992, + "language_loss": 0.7796821, + "learning_rate": 5.250554178390643e-07, + "loss": 0.80052912, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.390625, + "step": 12707, + "time_per_iteration": 2.3798186779022217 + }, + { + "auxiliary_loss_clip": 0.01055265, + "auxiliary_loss_mlp": 0.01018702, + "balance_loss_clip": 1.00775814, + "balance_loss_mlp": 1.017452, + "epoch": 0.7640462949045543, + "flos": 18582276896640.0, + "grad_norm": 1.7497321615076575, + "language_loss": 0.80004698, + "learning_rate": 5.248002715013358e-07, + "loss": 0.8207866, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37890625, + "step": 12708, + "time_per_iteration": 2.3972599506378174 + }, + { + "auxiliary_loss_clip": 0.01006757, + "auxiliary_loss_mlp": 0.01001183, + "balance_loss_clip": 1.00036633, + "balance_loss_mlp": 1.00054491, + "epoch": 0.7641064181572224, + "flos": 68318494830720.0, + "grad_norm": 0.8198005132990904, + "language_loss": 0.56372672, + "learning_rate": 5.245451778096914e-07, + "loss": 0.58380616, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.00817871, + "router_z_loss_mlp": 0.06201172, + "step": 12709, + "time_per_iteration": 3.097318172454834 + }, + { + "auxiliary_loss_clip": 0.01058991, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.01810408, + "balance_loss_mlp": 1.01852369, + "epoch": 0.7641665414098903, + "flos": 17967979791360.0, + "grad_norm": 1.676255127921163, + "language_loss": 0.63604915, + "learning_rate": 5.242901367732333e-07, + "loss": 0.65694892, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 12710, + "time_per_iteration": 2.3575994968414307 + }, + { + "auxiliary_loss_clip": 0.01057711, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.01508498, + "balance_loss_mlp": 1.01864481, + "epoch": 0.7642266646625583, + "flos": 21251002579200.0, + "grad_norm": 2.1129389947024375, + "language_loss": 0.64662546, + "learning_rate": 5.240351484010648e-07, + "loss": 0.66747069, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.390625, + "step": 12711, + "time_per_iteration": 2.4024839401245117 + }, + { + "auxiliary_loss_clip": 0.01056038, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.01696694, + "balance_loss_mlp": 1.01776361, + "epoch": 0.7642867879152262, + "flos": 22746681757440.0, + "grad_norm": 1.701042487897494, + "language_loss": 0.72546685, + "learning_rate": 5.237802127022853e-07, + "loss": 0.74630785, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.3828125, + "step": 12712, + "time_per_iteration": 2.3970847129821777 + }, + { + "auxiliary_loss_clip": 0.01055375, + "auxiliary_loss_mlp": 0.01022615, + "balance_loss_clip": 1.01162982, + "balance_loss_mlp": 1.01775753, + "epoch": 0.7643469111678942, + "flos": 23330988138240.0, + "grad_norm": 1.8001965892816343, + "language_loss": 0.80387056, + "learning_rate": 5.235253296859925e-07, + "loss": 0.82465041, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37695312, + "step": 12713, + "time_per_iteration": 2.3972043991088867 + }, + { + "auxiliary_loss_clip": 0.01057866, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.0143621, + "balance_loss_mlp": 1.01863074, + "epoch": 0.7644070344205621, + "flos": 19856292652800.0, + "grad_norm": 2.024564053419012, + "language_loss": 0.83238089, + "learning_rate": 5.232704993612822e-07, + "loss": 0.85322833, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 12714, + "time_per_iteration": 3.893205165863037 + }, + { + "auxiliary_loss_clip": 0.01058167, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.01883936, + "balance_loss_mlp": 1.0187695, + "epoch": 0.7644671576732301, + "flos": 22089546547200.0, + "grad_norm": 1.500184989865865, + "language_loss": 0.72933066, + "learning_rate": 5.230157217372506e-07, + "loss": 0.75022864, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39453125, + "step": 12715, + "time_per_iteration": 2.4026401042938232 + }, + { + "auxiliary_loss_clip": 0.01055053, + "auxiliary_loss_mlp": 0.01020012, + "balance_loss_clip": 1.00880611, + "balance_loss_mlp": 1.01755226, + "epoch": 0.7645272809258981, + "flos": 25480311390720.0, + "grad_norm": 1.756614377074459, + "language_loss": 0.63239855, + "learning_rate": 5.227609968229871e-07, + "loss": 0.65314925, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.375, + "step": 12716, + "time_per_iteration": 2.424358367919922 + }, + { + "auxiliary_loss_clip": 0.01058344, + "auxiliary_loss_mlp": 0.01026492, + "balance_loss_clip": 1.01426697, + "balance_loss_mlp": 1.0187819, + "epoch": 0.7645874041785661, + "flos": 21540851619840.0, + "grad_norm": 1.4739476467174553, + "language_loss": 0.80261505, + "learning_rate": 5.225063246275844e-07, + "loss": 0.82346338, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39648438, + "step": 12717, + "time_per_iteration": 2.4279706478118896 + }, + { + "auxiliary_loss_clip": 0.01058385, + "auxiliary_loss_mlp": 0.01026126, + "balance_loss_clip": 1.01484871, + "balance_loss_mlp": 1.01985908, + "epoch": 0.764647527431234, + "flos": 20629862847360.0, + "grad_norm": 2.927788988099322, + "language_loss": 0.66224945, + "learning_rate": 5.222517051601301e-07, + "loss": 0.68309456, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.38671875, + "step": 12718, + "time_per_iteration": 2.36490535736084 + }, + { + "auxiliary_loss_clip": 0.01054487, + "auxiliary_loss_mlp": 0.01025445, + "balance_loss_clip": 1.01451397, + "balance_loss_mlp": 1.01820803, + "epoch": 0.764707650683902, + "flos": 21433004830080.0, + "grad_norm": 1.7840032056242268, + "language_loss": 0.7725687, + "learning_rate": 5.219971384297121e-07, + "loss": 0.79336798, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36328125, + "step": 12719, + "time_per_iteration": 2.4240074157714844 + }, + { + "auxiliary_loss_clip": 0.0105688, + "auxiliary_loss_mlp": 0.01020311, + "balance_loss_clip": 1.00899804, + "balance_loss_mlp": 1.0184449, + "epoch": 0.7647677739365699, + "flos": 22710092785920.0, + "grad_norm": 1.7889324675533333, + "language_loss": 0.70139182, + "learning_rate": 5.217426244454133e-07, + "loss": 0.72216368, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38476562, + "step": 12720, + "time_per_iteration": 2.441678047180176 + }, + { + "auxiliary_loss_clip": 0.01056601, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.01622295, + "balance_loss_mlp": 1.01833773, + "epoch": 0.7648278971892379, + "flos": 21323063358720.0, + "grad_norm": 1.6628909240299028, + "language_loss": 0.73699236, + "learning_rate": 5.214881632163182e-07, + "loss": 0.75783312, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3828125, + "step": 12721, + "time_per_iteration": 2.4158198833465576 + }, + { + "auxiliary_loss_clip": 0.01057304, + "auxiliary_loss_mlp": 0.01023008, + "balance_loss_clip": 1.01195765, + "balance_loss_mlp": 1.01916194, + "epoch": 0.764888020441906, + "flos": 20666312173440.0, + "grad_norm": 1.8310018591257453, + "language_loss": 0.73544061, + "learning_rate": 5.212337547515076e-07, + "loss": 0.75624371, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.38085938, + "step": 12722, + "time_per_iteration": 2.3881258964538574 + }, + { + "auxiliary_loss_clip": 0.01056389, + "auxiliary_loss_mlp": 0.01025473, + "balance_loss_clip": 1.01383209, + "balance_loss_mlp": 1.01837826, + "epoch": 0.7649481436945739, + "flos": 25081358751360.0, + "grad_norm": 2.351082089065087, + "language_loss": 0.74100012, + "learning_rate": 5.209793990600601e-07, + "loss": 0.76181871, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37890625, + "step": 12723, + "time_per_iteration": 2.419563055038452 + }, + { + "auxiliary_loss_clip": 0.01057598, + "auxiliary_loss_mlp": 0.01023598, + "balance_loss_clip": 1.01139081, + "balance_loss_mlp": 1.0180192, + "epoch": 0.7650082669472419, + "flos": 24899705614080.0, + "grad_norm": 1.78641297155238, + "language_loss": 0.73228431, + "learning_rate": 5.207250961510536e-07, + "loss": 0.75309628, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39648438, + "step": 12724, + "time_per_iteration": 2.417306900024414 + }, + { + "auxiliary_loss_clip": 0.01054396, + "auxiliary_loss_mlp": 0.01022414, + "balance_loss_clip": 1.01188743, + "balance_loss_mlp": 1.01849198, + "epoch": 0.7650683901999098, + "flos": 14646517729920.0, + "grad_norm": 2.023345903261675, + "language_loss": 0.83898425, + "learning_rate": 5.204708460335632e-07, + "loss": 0.85975236, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.359375, + "step": 12725, + "time_per_iteration": 2.3972008228302 + }, + { + "auxiliary_loss_clip": 0.01057178, + "auxiliary_loss_mlp": 0.01025303, + "balance_loss_clip": 1.01309013, + "balance_loss_mlp": 1.01831484, + "epoch": 0.7651285134525778, + "flos": 26351429523840.0, + "grad_norm": 1.836694072922959, + "language_loss": 0.77529585, + "learning_rate": 5.202166487166626e-07, + "loss": 0.7961207, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38867188, + "step": 12726, + "time_per_iteration": 2.4357211589813232 + }, + { + "auxiliary_loss_clip": 0.01056366, + "auxiliary_loss_mlp": 0.01024702, + "balance_loss_clip": 1.01289427, + "balance_loss_mlp": 1.01775002, + "epoch": 0.7651886367052457, + "flos": 26645782129920.0, + "grad_norm": 1.7114789544125155, + "language_loss": 0.70528245, + "learning_rate": 5.199625042094227e-07, + "loss": 0.72609305, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38671875, + "step": 12727, + "time_per_iteration": 2.4257912635803223 + }, + { + "auxiliary_loss_clip": 0.01053986, + "auxiliary_loss_mlp": 0.0102222, + "balance_loss_clip": 1.0120039, + "balance_loss_mlp": 1.01832628, + "epoch": 0.7652487599579137, + "flos": 25701660610560.0, + "grad_norm": 1.715072100711349, + "language_loss": 0.79661059, + "learning_rate": 5.197084125209144e-07, + "loss": 0.81737262, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.35742188, + "step": 12728, + "time_per_iteration": 2.430539131164551 + }, + { + "auxiliary_loss_clip": 0.01057067, + "auxiliary_loss_mlp": 0.01024046, + "balance_loss_clip": 1.0118506, + "balance_loss_mlp": 1.0184319, + "epoch": 0.7653088832105817, + "flos": 28584299393280.0, + "grad_norm": 1.681620738510284, + "language_loss": 0.55711555, + "learning_rate": 5.19454373660205e-07, + "loss": 0.57792664, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38671875, + "step": 12729, + "time_per_iteration": 2.4450325965881348 + }, + { + "auxiliary_loss_clip": 0.01056006, + "auxiliary_loss_mlp": 0.01023084, + "balance_loss_clip": 1.01208138, + "balance_loss_mlp": 1.01793659, + "epoch": 0.7653690064632497, + "flos": 23365656984960.0, + "grad_norm": 1.6760798420201906, + "language_loss": 0.77903032, + "learning_rate": 5.192003876363609e-07, + "loss": 0.7998212, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.38085938, + "step": 12730, + "time_per_iteration": 2.416482448577881 + }, + { + "auxiliary_loss_clip": 0.01061217, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.01707959, + "balance_loss_mlp": 1.02132952, + "epoch": 0.7654291297159176, + "flos": 15773130259200.0, + "grad_norm": 1.6851603777658815, + "language_loss": 0.68900055, + "learning_rate": 5.18946454458445e-07, + "loss": 0.70990348, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 12731, + "time_per_iteration": 2.425203561782837 + }, + { + "auxiliary_loss_clip": 0.01058851, + "auxiliary_loss_mlp": 0.0102545, + "balance_loss_clip": 1.01423812, + "balance_loss_mlp": 1.01960683, + "epoch": 0.7654892529685856, + "flos": 18033023387520.0, + "grad_norm": 2.1920750214340963, + "language_loss": 0.70766914, + "learning_rate": 5.18692574135522e-07, + "loss": 0.72851217, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.39257812, + "step": 12732, + "time_per_iteration": 2.3895351886749268 + }, + { + "auxiliary_loss_clip": 0.01058213, + "auxiliary_loss_mlp": 0.01024289, + "balance_loss_clip": 1.0122962, + "balance_loss_mlp": 1.01920414, + "epoch": 0.7655493762212535, + "flos": 27234766632960.0, + "grad_norm": 1.5023328491539467, + "language_loss": 0.78310287, + "learning_rate": 5.184387466766491e-07, + "loss": 0.80392784, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 12733, + "time_per_iteration": 2.453371524810791 + }, + { + "auxiliary_loss_clip": 0.01054215, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.01731133, + "balance_loss_mlp": 1.01787376, + "epoch": 0.7656094994739215, + "flos": 20773006888320.0, + "grad_norm": 1.9022977091424347, + "language_loss": 0.77389836, + "learning_rate": 5.181849720908868e-07, + "loss": 0.79472643, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.36328125, + "step": 12734, + "time_per_iteration": 2.401423692703247 + }, + { + "auxiliary_loss_clip": 0.01059652, + "auxiliary_loss_mlp": 0.01023102, + "balance_loss_clip": 1.01057935, + "balance_loss_mlp": 1.01874113, + "epoch": 0.7656696227265896, + "flos": 23038136720640.0, + "grad_norm": 1.8479189033141465, + "language_loss": 0.73696601, + "learning_rate": 5.17931250387291e-07, + "loss": 0.75779355, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.40820312, + "step": 12735, + "time_per_iteration": 2.4535582065582275 + }, + { + "auxiliary_loss_clip": 0.01055888, + "auxiliary_loss_mlp": 0.01025118, + "balance_loss_clip": 1.01382327, + "balance_loss_mlp": 1.0183742, + "epoch": 0.7657297459792575, + "flos": 27524441116800.0, + "grad_norm": 1.467497517487205, + "language_loss": 0.72608948, + "learning_rate": 5.176775815749175e-07, + "loss": 0.74689955, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 12736, + "time_per_iteration": 3.898552179336548 + }, + { + "auxiliary_loss_clip": 0.01060161, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.01763201, + "balance_loss_mlp": 1.01999974, + "epoch": 0.7657898692319255, + "flos": 17127515698560.0, + "grad_norm": 1.899158363003593, + "language_loss": 0.77590859, + "learning_rate": 5.174239656628167e-07, + "loss": 0.79681921, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40234375, + "step": 12737, + "time_per_iteration": 2.421804904937744 + }, + { + "auxiliary_loss_clip": 0.01057966, + "auxiliary_loss_mlp": 0.01024381, + "balance_loss_clip": 1.01259112, + "balance_loss_mlp": 1.01969826, + "epoch": 0.7658499924845934, + "flos": 21464810945280.0, + "grad_norm": 1.7896995188834264, + "language_loss": 0.76402938, + "learning_rate": 5.171704026600418e-07, + "loss": 0.7848528, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3828125, + "step": 12738, + "time_per_iteration": 2.3756449222564697 + }, + { + "auxiliary_loss_clip": 0.01057535, + "auxiliary_loss_mlp": 0.01022594, + "balance_loss_clip": 1.01087594, + "balance_loss_mlp": 1.01889086, + "epoch": 0.7659101157372614, + "flos": 29495392899840.0, + "grad_norm": 1.9029561572341718, + "language_loss": 0.62777776, + "learning_rate": 5.169168925756415e-07, + "loss": 0.64857906, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 12739, + "time_per_iteration": 2.4869589805603027 + }, + { + "auxiliary_loss_clip": 0.01057937, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01652789, + "balance_loss_mlp": 1.01870155, + "epoch": 0.7659702389899293, + "flos": 18550819895040.0, + "grad_norm": 1.8581111221002773, + "language_loss": 0.79125559, + "learning_rate": 5.166634354186612e-07, + "loss": 0.81212592, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39257812, + "step": 12740, + "time_per_iteration": 2.39117431640625 + }, + { + "auxiliary_loss_clip": 0.01057279, + "auxiliary_loss_mlp": 0.01025321, + "balance_loss_clip": 1.01312613, + "balance_loss_mlp": 1.01854146, + "epoch": 0.7660303622425974, + "flos": 23548078172160.0, + "grad_norm": 1.839658646229387, + "language_loss": 0.6553427, + "learning_rate": 5.164100311981478e-07, + "loss": 0.67616868, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38671875, + "step": 12741, + "time_per_iteration": 2.442939281463623 + }, + { + "auxiliary_loss_clip": 0.01055168, + "auxiliary_loss_mlp": 0.0101998, + "balance_loss_clip": 1.00830984, + "balance_loss_mlp": 1.01775932, + "epoch": 0.7660904854952653, + "flos": 18915732092160.0, + "grad_norm": 1.5159897547744954, + "language_loss": 0.76069176, + "learning_rate": 5.161566799231443e-07, + "loss": 0.78144324, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.375, + "step": 12742, + "time_per_iteration": 2.3981966972351074 + }, + { + "auxiliary_loss_clip": 0.01054868, + "auxiliary_loss_mlp": 0.01024348, + "balance_loss_clip": 1.01374412, + "balance_loss_mlp": 1.01861966, + "epoch": 0.7661506087479333, + "flos": 23146437358080.0, + "grad_norm": 1.3771118605551216, + "language_loss": 0.76571405, + "learning_rate": 5.159033816026919e-07, + "loss": 0.78650624, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.36328125, + "step": 12743, + "time_per_iteration": 2.442121744155884 + }, + { + "auxiliary_loss_clip": 0.01055325, + "auxiliary_loss_mlp": 0.01020428, + "balance_loss_clip": 1.00988364, + "balance_loss_mlp": 1.01838934, + "epoch": 0.7662107320006012, + "flos": 17564837788800.0, + "grad_norm": 1.9313099306077839, + "language_loss": 0.74911255, + "learning_rate": 5.156501362458297e-07, + "loss": 0.7698701, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36914062, + "step": 12744, + "time_per_iteration": 3.8271477222442627 + }, + { + "auxiliary_loss_clip": 0.01057819, + "auxiliary_loss_mlp": 0.01020795, + "balance_loss_clip": 1.0087254, + "balance_loss_mlp": 1.01912892, + "epoch": 0.7662708552532692, + "flos": 22302167927040.0, + "grad_norm": 2.5312635249106217, + "language_loss": 0.71527064, + "learning_rate": 5.153969438615964e-07, + "loss": 0.7360568, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38671875, + "step": 12745, + "time_per_iteration": 3.818575382232666 + }, + { + "auxiliary_loss_clip": 0.01056964, + "auxiliary_loss_mlp": 0.01026809, + "balance_loss_clip": 1.01583576, + "balance_loss_mlp": 1.01895022, + "epoch": 0.7663309785059371, + "flos": 15741149587200.0, + "grad_norm": 2.2464862703203443, + "language_loss": 0.76700854, + "learning_rate": 5.151438044590273e-07, + "loss": 0.78784621, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.38085938, + "step": 12746, + "time_per_iteration": 2.368812084197998 + }, + { + "auxiliary_loss_clip": 0.01006779, + "auxiliary_loss_mlp": 0.0100205, + "balance_loss_clip": 1.00119734, + "balance_loss_mlp": 1.00061798, + "epoch": 0.7663911017586051, + "flos": 62159780620800.0, + "grad_norm": 0.6639589590035155, + "language_loss": 0.56811351, + "learning_rate": 5.148907180471565e-07, + "loss": 0.58820182, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.06152344, + "step": 12747, + "time_per_iteration": 3.069406270980835 + }, + { + "auxiliary_loss_clip": 0.01053496, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.01543736, + "balance_loss_mlp": 1.01776218, + "epoch": 0.7664512250112732, + "flos": 26504802593280.0, + "grad_norm": 2.24567040782505, + "language_loss": 0.72643971, + "learning_rate": 5.146376846350151e-07, + "loss": 0.74722898, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.35742188, + "step": 12748, + "time_per_iteration": 2.4321255683898926 + }, + { + "auxiliary_loss_clip": 0.01055319, + "auxiliary_loss_mlp": 0.01021903, + "balance_loss_clip": 1.00960088, + "balance_loss_mlp": 1.01804304, + "epoch": 0.7665113482639411, + "flos": 16248717066240.0, + "grad_norm": 2.0652875748951574, + "language_loss": 0.69085455, + "learning_rate": 5.143847042316351e-07, + "loss": 0.71162677, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.37304688, + "step": 12749, + "time_per_iteration": 2.3445277214050293 + }, + { + "auxiliary_loss_clip": 0.01056406, + "auxiliary_loss_mlp": 0.01022957, + "balance_loss_clip": 1.01109624, + "balance_loss_mlp": 1.01861823, + "epoch": 0.7665714715166091, + "flos": 27196676472960.0, + "grad_norm": 1.5260638266582187, + "language_loss": 0.7341311, + "learning_rate": 5.141317768460425e-07, + "loss": 0.75492471, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.37890625, + "step": 12750, + "time_per_iteration": 2.4386119842529297 + }, + { + "auxiliary_loss_clip": 0.01054896, + "auxiliary_loss_mlp": 0.01019825, + "balance_loss_clip": 1.00925744, + "balance_loss_mlp": 1.01849377, + "epoch": 0.766631594769277, + "flos": 21066766001280.0, + "grad_norm": 2.2669873863835046, + "language_loss": 0.75671262, + "learning_rate": 5.13878902487265e-07, + "loss": 0.77745986, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36328125, + "step": 12751, + "time_per_iteration": 2.3687355518341064 + }, + { + "auxiliary_loss_clip": 0.0105795, + "auxiliary_loss_mlp": 0.01025218, + "balance_loss_clip": 1.01386344, + "balance_loss_mlp": 1.01972616, + "epoch": 0.766691718021945, + "flos": 24096808010880.0, + "grad_norm": 1.7477421491037914, + "language_loss": 0.60996836, + "learning_rate": 5.136260811643263e-07, + "loss": 0.63080007, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 12752, + "time_per_iteration": 2.4087424278259277 + }, + { + "auxiliary_loss_clip": 0.0105702, + "auxiliary_loss_mlp": 0.01024309, + "balance_loss_clip": 1.0120542, + "balance_loss_mlp": 1.0183816, + "epoch": 0.7667518412746129, + "flos": 23439533155200.0, + "grad_norm": 2.3940932645021915, + "language_loss": 0.69418132, + "learning_rate": 5.133733128862506e-07, + "loss": 0.71499467, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.38671875, + "step": 12753, + "time_per_iteration": 3.827907085418701 + }, + { + "auxiliary_loss_clip": 0.01055628, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.01723158, + "balance_loss_mlp": 1.01796722, + "epoch": 0.766811964527281, + "flos": 18147852449280.0, + "grad_norm": 2.2299327085974676, + "language_loss": 0.62078673, + "learning_rate": 5.131205976620565e-07, + "loss": 0.64162809, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37695312, + "step": 12754, + "time_per_iteration": 2.37272572517395 + }, + { + "auxiliary_loss_clip": 0.01056725, + "auxiliary_loss_mlp": 0.01020484, + "balance_loss_clip": 1.00924277, + "balance_loss_mlp": 1.01976979, + "epoch": 0.7668720877799489, + "flos": 19535056433280.0, + "grad_norm": 2.289409115920051, + "language_loss": 0.80074072, + "learning_rate": 5.128679355007633e-07, + "loss": 0.82151282, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.36914062, + "step": 12755, + "time_per_iteration": 2.3629045486450195 + }, + { + "auxiliary_loss_clip": 0.01056528, + "auxiliary_loss_mlp": 0.01023002, + "balance_loss_clip": 1.01130724, + "balance_loss_mlp": 1.0196774, + "epoch": 0.7669322110326169, + "flos": 22673224523520.0, + "grad_norm": 1.8193426166757112, + "language_loss": 0.70182824, + "learning_rate": 5.126153264113891e-07, + "loss": 0.72262359, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3671875, + "step": 12756, + "time_per_iteration": 2.400109052658081 + }, + { + "auxiliary_loss_clip": 0.01056356, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.01857662, + "balance_loss_mlp": 1.01856184, + "epoch": 0.7669923342852848, + "flos": 26468178710400.0, + "grad_norm": 3.154641730831522, + "language_loss": 0.70074868, + "learning_rate": 5.123627704029465e-07, + "loss": 0.72160983, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37890625, + "step": 12757, + "time_per_iteration": 2.429011583328247 + }, + { + "auxiliary_loss_clip": 0.01058248, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.0149579, + "balance_loss_mlp": 1.01994872, + "epoch": 0.7670524575379528, + "flos": 22855052217600.0, + "grad_norm": 1.8575351422788435, + "language_loss": 0.76184845, + "learning_rate": 5.121102674844509e-07, + "loss": 0.78270692, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.3828125, + "step": 12758, + "time_per_iteration": 2.3911612033843994 + }, + { + "auxiliary_loss_clip": 0.01054619, + "auxiliary_loss_mlp": 0.01028419, + "balance_loss_clip": 1.01661706, + "balance_loss_mlp": 1.01802111, + "epoch": 0.7671125807906207, + "flos": 22451142165120.0, + "grad_norm": 1.7917693290577883, + "language_loss": 0.76310873, + "learning_rate": 5.118578176649124e-07, + "loss": 0.78393912, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.3671875, + "step": 12759, + "time_per_iteration": 2.4130661487579346 + }, + { + "auxiliary_loss_clip": 0.01054836, + "auxiliary_loss_mlp": 0.0101934, + "balance_loss_clip": 1.00830173, + "balance_loss_mlp": 1.01735973, + "epoch": 0.7671727040432887, + "flos": 35370088266240.0, + "grad_norm": 2.5824639889306127, + "language_loss": 0.683065, + "learning_rate": 5.116054209533404e-07, + "loss": 0.70380676, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 12760, + "time_per_iteration": 2.5723557472229004 + }, + { + "auxiliary_loss_clip": 0.01057354, + "auxiliary_loss_mlp": 0.01026315, + "balance_loss_clip": 1.01417398, + "balance_loss_mlp": 1.0196991, + "epoch": 0.7672328272959568, + "flos": 22493770801920.0, + "grad_norm": 1.3627755570604536, + "language_loss": 0.79253733, + "learning_rate": 5.113530773587418e-07, + "loss": 0.81337404, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.37695312, + "step": 12761, + "time_per_iteration": 2.4302048683166504 + }, + { + "auxiliary_loss_clip": 0.01055362, + "auxiliary_loss_mlp": 0.01025918, + "balance_loss_clip": 1.01444364, + "balance_loss_mlp": 1.01840103, + "epoch": 0.7672929505486247, + "flos": 22814588085120.0, + "grad_norm": 2.3692939275257956, + "language_loss": 0.66224539, + "learning_rate": 5.111007868901232e-07, + "loss": 0.68305814, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.36914062, + "step": 12762, + "time_per_iteration": 2.4195151329040527 + }, + { + "auxiliary_loss_clip": 0.01056482, + "auxiliary_loss_mlp": 0.0102158, + "balance_loss_clip": 1.00983191, + "balance_loss_mlp": 1.01821661, + "epoch": 0.7673530738012927, + "flos": 20337814391040.0, + "grad_norm": 1.8026237703487635, + "language_loss": 0.74298859, + "learning_rate": 5.108485495564876e-07, + "loss": 0.76376927, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 12763, + "time_per_iteration": 2.406956672668457 + }, + { + "auxiliary_loss_clip": 0.0105684, + "auxiliary_loss_mlp": 0.01024483, + "balance_loss_clip": 1.0131402, + "balance_loss_mlp": 1.01878762, + "epoch": 0.7674131970539606, + "flos": 34932137771520.0, + "grad_norm": 1.6580146560082512, + "language_loss": 0.70726252, + "learning_rate": 5.105963653668366e-07, + "loss": 0.72807568, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38085938, + "step": 12764, + "time_per_iteration": 2.527526617050171 + }, + { + "auxiliary_loss_clip": 0.01053147, + "auxiliary_loss_mlp": 0.01024938, + "balance_loss_clip": 1.01395249, + "balance_loss_mlp": 1.01678586, + "epoch": 0.7674733203066286, + "flos": 28327618010880.0, + "grad_norm": 1.519055326903604, + "language_loss": 0.69792819, + "learning_rate": 5.103442343301696e-07, + "loss": 0.71870905, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.36328125, + "step": 12765, + "time_per_iteration": 2.465924024581909 + }, + { + "auxiliary_loss_clip": 0.0105338, + "auxiliary_loss_mlp": 0.01021507, + "balance_loss_clip": 1.01103425, + "balance_loss_mlp": 1.01684833, + "epoch": 0.7675334435592965, + "flos": 16288797173760.0, + "grad_norm": 2.7426727830636026, + "language_loss": 0.61142457, + "learning_rate": 5.100921564554863e-07, + "loss": 0.63217342, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.36523438, + "step": 12766, + "time_per_iteration": 2.388141632080078 + }, + { + "auxiliary_loss_clip": 0.01006685, + "auxiliary_loss_mlp": 0.01000825, + "balance_loss_clip": 1.00000811, + "balance_loss_mlp": 1.00057316, + "epoch": 0.7675935668119646, + "flos": 64822641194880.0, + "grad_norm": 0.7318688011906712, + "language_loss": 0.6079455, + "learning_rate": 5.098401317517802e-07, + "loss": 0.62802064, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.00817871, + "router_z_loss_mlp": 0.06103516, + "step": 12767, + "time_per_iteration": 3.1558616161346436 + }, + { + "auxiliary_loss_clip": 0.01054253, + "auxiliary_loss_mlp": 0.01020081, + "balance_loss_clip": 1.00965059, + "balance_loss_mlp": 1.01835895, + "epoch": 0.7676536900646325, + "flos": 22674271864320.0, + "grad_norm": 1.939498414189914, + "language_loss": 0.69803745, + "learning_rate": 5.095881602280472e-07, + "loss": 0.71878082, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.359375, + "step": 12768, + "time_per_iteration": 2.418407678604126 + }, + { + "auxiliary_loss_clip": 0.01060246, + "auxiliary_loss_mlp": 0.01024658, + "balance_loss_clip": 1.01315451, + "balance_loss_mlp": 1.01992798, + "epoch": 0.7677138133173005, + "flos": 26938563724800.0, + "grad_norm": 1.7812857899679542, + "language_loss": 0.70615983, + "learning_rate": 5.093362418932796e-07, + "loss": 0.72700882, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.40234375, + "step": 12769, + "time_per_iteration": 2.4563164710998535 + }, + { + "auxiliary_loss_clip": 0.01058117, + "auxiliary_loss_mlp": 0.01020307, + "balance_loss_clip": 1.00840425, + "balance_loss_mlp": 1.01917624, + "epoch": 0.7677739365699684, + "flos": 23798580243840.0, + "grad_norm": 2.054622536835918, + "language_loss": 0.80295056, + "learning_rate": 5.090843767564659e-07, + "loss": 0.82373482, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38867188, + "step": 12770, + "time_per_iteration": 2.4045560359954834 + }, + { + "auxiliary_loss_clip": 0.01054378, + "auxiliary_loss_mlp": 0.01021127, + "balance_loss_clip": 1.0108037, + "balance_loss_mlp": 1.01852834, + "epoch": 0.7678340598226364, + "flos": 34454176992000.0, + "grad_norm": 1.5908620222497154, + "language_loss": 0.78629696, + "learning_rate": 5.088325648265961e-07, + "loss": 0.80705202, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.35742188, + "step": 12771, + "time_per_iteration": 2.5123751163482666 + }, + { + "auxiliary_loss_clip": 0.01056563, + "auxiliary_loss_mlp": 0.01021271, + "balance_loss_clip": 1.00988102, + "balance_loss_mlp": 1.01882768, + "epoch": 0.7678941830753043, + "flos": 23840615387520.0, + "grad_norm": 1.2987980380033497, + "language_loss": 0.68074393, + "learning_rate": 5.085808061126559e-07, + "loss": 0.70152235, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37695312, + "step": 12772, + "time_per_iteration": 2.4595835208892822 + }, + { + "auxiliary_loss_clip": 0.01054749, + "auxiliary_loss_mlp": 0.01020799, + "balance_loss_clip": 1.01039159, + "balance_loss_mlp": 1.0188272, + "epoch": 0.7679543063279723, + "flos": 25409751799680.0, + "grad_norm": 1.4749229894762574, + "language_loss": 0.73295164, + "learning_rate": 5.083291006236317e-07, + "loss": 0.75370711, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.359375, + "step": 12773, + "time_per_iteration": 2.4561283588409424 + }, + { + "auxiliary_loss_clip": 0.01054754, + "auxiliary_loss_mlp": 0.01021783, + "balance_loss_clip": 1.01123333, + "balance_loss_mlp": 1.0183022, + "epoch": 0.7680144295806404, + "flos": 27961204625280.0, + "grad_norm": 1.9502697213642477, + "language_loss": 0.81116855, + "learning_rate": 5.080774483685033e-07, + "loss": 0.83193392, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36523438, + "step": 12774, + "time_per_iteration": 2.482565402984619 + }, + { + "auxiliary_loss_clip": 0.01056889, + "auxiliary_loss_mlp": 0.01024728, + "balance_loss_clip": 1.0129447, + "balance_loss_mlp": 1.01954055, + "epoch": 0.7680745528333083, + "flos": 20411760384000.0, + "grad_norm": 2.734979291461665, + "language_loss": 0.69734263, + "learning_rate": 5.078258493562539e-07, + "loss": 0.71815884, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.375, + "step": 12775, + "time_per_iteration": 3.8946874141693115 + }, + { + "auxiliary_loss_clip": 0.01006402, + "auxiliary_loss_mlp": 0.01001276, + "balance_loss_clip": 1.00036991, + "balance_loss_mlp": 1.00038671, + "epoch": 0.7681346760859763, + "flos": 68397433148160.0, + "grad_norm": 0.6903218878939044, + "language_loss": 0.56880224, + "learning_rate": 5.075743035958617e-07, + "loss": 0.58887899, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.06005859, + "step": 12776, + "time_per_iteration": 3.090618848800659 + }, + { + "auxiliary_loss_clip": 0.0105568, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.01811957, + "balance_loss_mlp": 1.01921344, + "epoch": 0.7681947993386442, + "flos": 21250409086080.0, + "grad_norm": 1.9096782417493667, + "language_loss": 0.73411071, + "learning_rate": 5.073228110963035e-07, + "loss": 0.75496149, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.36328125, + "step": 12777, + "time_per_iteration": 2.4299380779266357 + }, + { + "auxiliary_loss_clip": 0.01057449, + "auxiliary_loss_mlp": 0.0102619, + "balance_loss_clip": 1.01504374, + "balance_loss_mlp": 1.02038026, + "epoch": 0.7682549225913122, + "flos": 21396625326720.0, + "grad_norm": 1.958319646411997, + "language_loss": 0.7048676, + "learning_rate": 5.070713718665538e-07, + "loss": 0.72570395, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37109375, + "step": 12778, + "time_per_iteration": 2.4222779273986816 + }, + { + "auxiliary_loss_clip": 0.01056554, + "auxiliary_loss_mlp": 0.01023068, + "balance_loss_clip": 1.0121305, + "balance_loss_mlp": 1.01922321, + "epoch": 0.7683150458439801, + "flos": 23037822518400.0, + "grad_norm": 1.8646443833436117, + "language_loss": 0.70084965, + "learning_rate": 5.068199859155875e-07, + "loss": 0.72164583, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37304688, + "step": 12779, + "time_per_iteration": 2.4595999717712402 + }, + { + "auxiliary_loss_clip": 0.01057556, + "auxiliary_loss_mlp": 0.0102351, + "balance_loss_clip": 1.01194668, + "balance_loss_mlp": 1.01944757, + "epoch": 0.7683751690966482, + "flos": 67330070795520.0, + "grad_norm": 1.5352238396865154, + "language_loss": 0.58626795, + "learning_rate": 5.065686532523748e-07, + "loss": 0.60707855, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38085938, + "step": 12780, + "time_per_iteration": 2.811880588531494 + }, + { + "auxiliary_loss_clip": 0.01059474, + "auxiliary_loss_mlp": 0.01025765, + "balance_loss_clip": 1.01224709, + "balance_loss_mlp": 1.01940727, + "epoch": 0.7684352923493161, + "flos": 21797812293120.0, + "grad_norm": 1.9528125912013732, + "language_loss": 0.67996407, + "learning_rate": 5.063173738858852e-07, + "loss": 0.70081651, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.40039062, + "step": 12781, + "time_per_iteration": 2.4630095958709717 + }, + { + "auxiliary_loss_clip": 0.01054297, + "auxiliary_loss_mlp": 0.010213, + "balance_loss_clip": 1.01033843, + "balance_loss_mlp": 1.01757812, + "epoch": 0.7684954156019841, + "flos": 25846445485440.0, + "grad_norm": 1.4271174854485744, + "language_loss": 0.79446983, + "learning_rate": 5.060661478250858e-07, + "loss": 0.81522584, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3671875, + "step": 12782, + "time_per_iteration": 2.462989330291748 + }, + { + "auxiliary_loss_clip": 0.01056902, + "auxiliary_loss_mlp": 0.01024005, + "balance_loss_clip": 1.01166081, + "balance_loss_mlp": 1.01854229, + "epoch": 0.768555538854652, + "flos": 25446201125760.0, + "grad_norm": 1.720941326298002, + "language_loss": 0.66709507, + "learning_rate": 5.05814975078944e-07, + "loss": 0.68790412, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.3828125, + "step": 12783, + "time_per_iteration": 3.874603271484375 + }, + { + "auxiliary_loss_clip": 0.01056507, + "auxiliary_loss_mlp": 0.01027187, + "balance_loss_clip": 1.01487899, + "balance_loss_mlp": 1.01858413, + "epoch": 0.76861566210732, + "flos": 19645347018240.0, + "grad_norm": 1.9159267734971204, + "language_loss": 0.69146347, + "learning_rate": 5.055638556564217e-07, + "loss": 0.71230042, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.37890625, + "step": 12784, + "time_per_iteration": 3.7960760593414307 + }, + { + "auxiliary_loss_clip": 0.01055476, + "auxiliary_loss_mlp": 0.01020416, + "balance_loss_clip": 1.01048589, + "balance_loss_mlp": 1.01851678, + "epoch": 0.7686757853599879, + "flos": 22417939595520.0, + "grad_norm": 1.8245804133807833, + "language_loss": 0.8062076, + "learning_rate": 5.053127895664804e-07, + "loss": 0.82696652, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.36914062, + "step": 12785, + "time_per_iteration": 2.4031145572662354 + }, + { + "auxiliary_loss_clip": 0.01056492, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.01469517, + "balance_loss_mlp": 1.01885939, + "epoch": 0.768735908612656, + "flos": 47772529580160.0, + "grad_norm": 1.620215763055785, + "language_loss": 0.77226174, + "learning_rate": 5.050617768180823e-07, + "loss": 0.79309952, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.37695312, + "step": 12786, + "time_per_iteration": 2.628635883331299 + }, + { + "auxiliary_loss_clip": 0.0105478, + "auxiliary_loss_mlp": 0.01024351, + "balance_loss_clip": 1.01280534, + "balance_loss_mlp": 1.01762152, + "epoch": 0.7687960318653239, + "flos": 30261876088320.0, + "grad_norm": 2.1045626186461703, + "language_loss": 0.58876914, + "learning_rate": 5.048108174201826e-07, + "loss": 0.60956043, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37109375, + "step": 12787, + "time_per_iteration": 2.4704439640045166 + }, + { + "auxiliary_loss_clip": 0.01054368, + "auxiliary_loss_mlp": 0.01024039, + "balance_loss_clip": 1.0131011, + "balance_loss_mlp": 1.01694775, + "epoch": 0.7688561551179919, + "flos": 19572413454720.0, + "grad_norm": 1.5206243761616776, + "language_loss": 0.76175392, + "learning_rate": 5.045599113817394e-07, + "loss": 0.782538, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.375, + "step": 12788, + "time_per_iteration": 2.3959901332855225 + }, + { + "auxiliary_loss_clip": 0.01055571, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.0177834, + "balance_loss_mlp": 1.01792204, + "epoch": 0.7689162783706599, + "flos": 22782677235840.0, + "grad_norm": 1.7063274520738798, + "language_loss": 0.66915113, + "learning_rate": 5.043090587117056e-07, + "loss": 0.69000471, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.375, + "step": 12789, + "time_per_iteration": 2.428682327270508 + }, + { + "auxiliary_loss_clip": 0.01056825, + "auxiliary_loss_mlp": 0.0102357, + "balance_loss_clip": 1.01128006, + "balance_loss_mlp": 1.01889515, + "epoch": 0.7689764016233278, + "flos": 34202767224960.0, + "grad_norm": 1.604696244671686, + "language_loss": 0.68723762, + "learning_rate": 5.040582594190352e-07, + "loss": 0.70804167, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.37890625, + "step": 12790, + "time_per_iteration": 2.574496269226074 + }, + { + "auxiliary_loss_clip": 0.01055516, + "auxiliary_loss_mlp": 0.01023639, + "balance_loss_clip": 1.01246285, + "balance_loss_mlp": 1.0182538, + "epoch": 0.7690365248759958, + "flos": 17273522471040.0, + "grad_norm": 5.421375343718315, + "language_loss": 0.68664223, + "learning_rate": 5.038075135126765e-07, + "loss": 0.7074337, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37304688, + "step": 12791, + "time_per_iteration": 2.3811724185943604 + }, + { + "auxiliary_loss_clip": 0.01056716, + "auxiliary_loss_mlp": 0.01023528, + "balance_loss_clip": 1.01291275, + "balance_loss_mlp": 1.01988447, + "epoch": 0.7690966481286637, + "flos": 18222182467200.0, + "grad_norm": 1.7199152311893096, + "language_loss": 0.79530632, + "learning_rate": 5.035568210015795e-07, + "loss": 0.8161087, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.36914062, + "step": 12792, + "time_per_iteration": 2.436178207397461 + }, + { + "auxiliary_loss_clip": 0.01061383, + "auxiliary_loss_mlp": 0.01020695, + "balance_loss_clip": 1.00824928, + "balance_loss_mlp": 1.021456, + "epoch": 0.7691567713813318, + "flos": 21536662256640.0, + "grad_norm": 2.090239508693481, + "language_loss": 0.72707492, + "learning_rate": 5.033061818946902e-07, + "loss": 0.74789572, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.3984375, + "step": 12793, + "time_per_iteration": 3.8012757301330566 + }, + { + "auxiliary_loss_clip": 0.01056393, + "auxiliary_loss_mlp": 0.01024306, + "balance_loss_clip": 1.0124805, + "balance_loss_mlp": 1.01942277, + "epoch": 0.7692168946339997, + "flos": 39378571528320.0, + "grad_norm": 1.7437031990671128, + "language_loss": 0.63395631, + "learning_rate": 5.030555962009532e-07, + "loss": 0.65476334, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.36914062, + "step": 12794, + "time_per_iteration": 2.5472841262817383 + }, + { + "auxiliary_loss_clip": 0.01059776, + "auxiliary_loss_mlp": 0.01023797, + "balance_loss_clip": 1.01191223, + "balance_loss_mlp": 1.0204252, + "epoch": 0.7692770178866677, + "flos": 25008774301440.0, + "grad_norm": 1.8021393023952204, + "language_loss": 0.76126599, + "learning_rate": 5.028050639293111e-07, + "loss": 0.78210175, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 12795, + "time_per_iteration": 2.4169421195983887 + }, + { + "auxiliary_loss_clip": 0.01056262, + "auxiliary_loss_mlp": 0.01023387, + "balance_loss_clip": 1.01134717, + "balance_loss_mlp": 1.01745939, + "epoch": 0.7693371411393356, + "flos": 24715154833920.0, + "grad_norm": 1.577327608362423, + "language_loss": 0.71977836, + "learning_rate": 5.025545850887054e-07, + "loss": 0.74057484, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38671875, + "step": 12796, + "time_per_iteration": 2.4328126907348633 + }, + { + "auxiliary_loss_clip": 0.01056358, + "auxiliary_loss_mlp": 0.01024513, + "balance_loss_clip": 1.01306319, + "balance_loss_mlp": 1.01822853, + "epoch": 0.7693972643920036, + "flos": 15923884976640.0, + "grad_norm": 3.3060024155343037, + "language_loss": 0.64842832, + "learning_rate": 5.023041596880748e-07, + "loss": 0.66923702, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38085938, + "step": 12797, + "time_per_iteration": 2.3687801361083984 + }, + { + "auxiliary_loss_clip": 0.01056462, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01504946, + "balance_loss_mlp": 1.01830173, + "epoch": 0.7694573876446715, + "flos": 25404864209280.0, + "grad_norm": 1.8053251721176131, + "language_loss": 0.91582143, + "learning_rate": 5.02053787736356e-07, + "loss": 0.93665886, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3828125, + "step": 12798, + "time_per_iteration": 2.44700026512146 + }, + { + "auxiliary_loss_clip": 0.01056357, + "auxiliary_loss_mlp": 0.01021206, + "balance_loss_clip": 1.00995231, + "balance_loss_mlp": 1.01899004, + "epoch": 0.7695175108973396, + "flos": 16653290434560.0, + "grad_norm": 1.9834333965577904, + "language_loss": 0.82489133, + "learning_rate": 5.018034692424843e-07, + "loss": 0.84566689, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.375, + "step": 12799, + "time_per_iteration": 2.3768038749694824 + }, + { + "auxiliary_loss_clip": 0.01057795, + "auxiliary_loss_mlp": 0.01024915, + "balance_loss_clip": 1.0131495, + "balance_loss_mlp": 1.01901531, + "epoch": 0.7695776341500075, + "flos": 13625657308800.0, + "grad_norm": 3.2645667872596382, + "language_loss": 0.78033286, + "learning_rate": 5.015532042153933e-07, + "loss": 0.80115998, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 12800, + "time_per_iteration": 2.381880760192871 + }, + { + "auxiliary_loss_clip": 0.01059694, + "auxiliary_loss_mlp": 0.01022423, + "balance_loss_clip": 1.00967336, + "balance_loss_mlp": 1.01879478, + "epoch": 0.7696377574026755, + "flos": 24275633328000.0, + "grad_norm": 1.7593279957319803, + "language_loss": 0.84165031, + "learning_rate": 5.013029926640138e-07, + "loss": 0.86247146, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 12801, + "time_per_iteration": 2.4316234588623047 + }, + { + "auxiliary_loss_clip": 0.01055897, + "auxiliary_loss_mlp": 0.01023046, + "balance_loss_clip": 1.01178098, + "balance_loss_mlp": 1.01857448, + "epoch": 0.7696978806553435, + "flos": 20922085860480.0, + "grad_norm": 1.8996392933346098, + "language_loss": 0.72092128, + "learning_rate": 5.010528345972749e-07, + "loss": 0.74171072, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37304688, + "step": 12802, + "time_per_iteration": 2.425363063812256 + }, + { + "auxiliary_loss_clip": 0.01056116, + "auxiliary_loss_mlp": 0.01027811, + "balance_loss_clip": 1.0165993, + "balance_loss_mlp": 1.01840937, + "epoch": 0.7697580039080114, + "flos": 22928509451520.0, + "grad_norm": 1.7169972414112693, + "language_loss": 0.75934255, + "learning_rate": 5.008027300241056e-07, + "loss": 0.78018188, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37695312, + "step": 12803, + "time_per_iteration": 2.4496994018554688 + }, + { + "auxiliary_loss_clip": 0.01006322, + "auxiliary_loss_mlp": 0.01000449, + "balance_loss_clip": 0.99964422, + "balance_loss_mlp": 1.00026155, + "epoch": 0.7698181271606794, + "flos": 68714305447680.0, + "grad_norm": 0.7495378366620181, + "language_loss": 0.55820143, + "learning_rate": 5.005526789534294e-07, + "loss": 0.57826912, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.00805664, + "router_z_loss_mlp": 0.06054688, + "step": 12804, + "time_per_iteration": 3.0666561126708984 + }, + { + "auxiliary_loss_clip": 0.01006739, + "auxiliary_loss_mlp": 0.0100094, + "balance_loss_clip": 1.00021923, + "balance_loss_mlp": 1.00050163, + "epoch": 0.7698782504133473, + "flos": 67406249249280.0, + "grad_norm": 0.7439893425814005, + "language_loss": 0.54022503, + "learning_rate": 5.003026813941715e-07, + "loss": 0.56030184, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.00720215, + "router_z_loss_mlp": 0.06225586, + "step": 12805, + "time_per_iteration": 3.1967613697052 + }, + { + "auxiliary_loss_clip": 0.01060575, + "auxiliary_loss_mlp": 0.01023844, + "balance_loss_clip": 1.0114994, + "balance_loss_mlp": 1.02158427, + "epoch": 0.7699383736660154, + "flos": 22487835870720.0, + "grad_norm": 2.0296189647641096, + "language_loss": 0.72759855, + "learning_rate": 5.000527373552528e-07, + "loss": 0.74844277, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.390625, + "step": 12806, + "time_per_iteration": 2.393480062484741 + }, + { + "auxiliary_loss_clip": 0.01055559, + "auxiliary_loss_mlp": 0.01025341, + "balance_loss_clip": 1.01352739, + "balance_loss_mlp": 1.01765347, + "epoch": 0.7699984969186833, + "flos": 21538756938240.0, + "grad_norm": 1.7955607858153158, + "language_loss": 0.7482059, + "learning_rate": 4.998028468455946e-07, + "loss": 0.76901495, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37890625, + "step": 12807, + "time_per_iteration": 2.3542680740356445 + }, + { + "auxiliary_loss_clip": 0.010602, + "auxiliary_loss_mlp": 0.01028578, + "balance_loss_clip": 1.01604342, + "balance_loss_mlp": 1.01922333, + "epoch": 0.7700586201713513, + "flos": 21718210659840.0, + "grad_norm": 1.9571696528307472, + "language_loss": 0.80124265, + "learning_rate": 4.995530098741128e-07, + "loss": 0.82213044, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.41015625, + "step": 12808, + "time_per_iteration": 2.3900980949401855 + }, + { + "auxiliary_loss_clip": 0.01059754, + "auxiliary_loss_mlp": 0.01027447, + "balance_loss_clip": 1.0155021, + "balance_loss_mlp": 1.01984191, + "epoch": 0.7701187434240192, + "flos": 27854754289920.0, + "grad_norm": 2.0315289383476984, + "language_loss": 0.79292738, + "learning_rate": 4.993032264497248e-07, + "loss": 0.81379938, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3984375, + "step": 12809, + "time_per_iteration": 2.448192834854126 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01019884, + "balance_loss_clip": 1.00956619, + "balance_loss_mlp": 1.01769745, + "epoch": 0.7701788666766872, + "flos": 28620050492160.0, + "grad_norm": 1.4887947655002374, + "language_loss": 0.70473206, + "learning_rate": 4.990534965813446e-07, + "loss": 0.72545975, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.3515625, + "step": 12810, + "time_per_iteration": 2.455221652984619 + }, + { + "auxiliary_loss_clip": 0.01059679, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.01368141, + "balance_loss_mlp": 1.0193764, + "epoch": 0.7702389899293551, + "flos": 14245575143040.0, + "grad_norm": 2.788292839072021, + "language_loss": 0.78231329, + "learning_rate": 4.988038202778842e-07, + "loss": 0.80317968, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40234375, + "step": 12811, + "time_per_iteration": 2.335257053375244 + }, + { + "auxiliary_loss_clip": 0.01006608, + "auxiliary_loss_mlp": 0.01000482, + "balance_loss_clip": 0.99963611, + "balance_loss_mlp": 1.00047851, + "epoch": 0.7702991131820232, + "flos": 70570847105280.0, + "grad_norm": 0.8186544625081337, + "language_loss": 0.56663704, + "learning_rate": 4.985541975482533e-07, + "loss": 0.58670795, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.00848389, + "router_z_loss_mlp": 0.0612793, + "step": 12812, + "time_per_iteration": 3.085603952407837 + }, + { + "auxiliary_loss_clip": 0.01058415, + "auxiliary_loss_mlp": 0.01024431, + "balance_loss_clip": 1.01264691, + "balance_loss_mlp": 1.0189302, + "epoch": 0.7703592364346911, + "flos": 25478949847680.0, + "grad_norm": 1.658146892595542, + "language_loss": 0.7206167, + "learning_rate": 4.983046284013615e-07, + "loss": 0.74144518, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 12813, + "time_per_iteration": 2.4428155422210693 + }, + { + "auxiliary_loss_clip": 0.01057565, + "auxiliary_loss_mlp": 0.0102191, + "balance_loss_clip": 1.00975657, + "balance_loss_mlp": 1.0187974, + "epoch": 0.7704193596873591, + "flos": 19279911150720.0, + "grad_norm": 1.5834931071221947, + "language_loss": 0.66598994, + "learning_rate": 4.980551128461152e-07, + "loss": 0.68678468, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.38671875, + "step": 12814, + "time_per_iteration": 2.4111006259918213 + }, + { + "auxiliary_loss_clip": 0.01057454, + "auxiliary_loss_mlp": 0.01023024, + "balance_loss_clip": 1.01149058, + "balance_loss_mlp": 1.01917076, + "epoch": 0.7704794829400271, + "flos": 23657356327680.0, + "grad_norm": 1.9665011417000087, + "language_loss": 0.65696704, + "learning_rate": 4.978056508914175e-07, + "loss": 0.67777181, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 12815, + "time_per_iteration": 3.8464977741241455 + }, + { + "auxiliary_loss_clip": 0.0105736, + "auxiliary_loss_mlp": 0.01022787, + "balance_loss_clip": 1.01085973, + "balance_loss_mlp": 1.01846695, + "epoch": 0.770539606192695, + "flos": 18988316542080.0, + "grad_norm": 2.4359925363255774, + "language_loss": 0.65263456, + "learning_rate": 4.975562425461723e-07, + "loss": 0.67343605, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 12816, + "time_per_iteration": 2.3990042209625244 + }, + { + "auxiliary_loss_clip": 0.01057866, + "auxiliary_loss_mlp": 0.01026446, + "balance_loss_clip": 1.01453149, + "balance_loss_mlp": 1.01971495, + "epoch": 0.770599729445363, + "flos": 11829585859200.0, + "grad_norm": 2.2849221339199692, + "language_loss": 0.71799827, + "learning_rate": 4.973068878192803e-07, + "loss": 0.73884141, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 12817, + "time_per_iteration": 2.3679358959198 + }, + { + "auxiliary_loss_clip": 0.01055996, + "auxiliary_loss_mlp": 0.01025815, + "balance_loss_clip": 1.01437092, + "balance_loss_mlp": 1.01782453, + "epoch": 0.770659852698031, + "flos": 17821623905280.0, + "grad_norm": 2.6098853136174336, + "language_loss": 0.61117858, + "learning_rate": 4.9705758671964e-07, + "loss": 0.63199669, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.3828125, + "step": 12818, + "time_per_iteration": 2.365363121032715 + }, + { + "auxiliary_loss_clip": 0.01055862, + "auxiliary_loss_mlp": 0.01020315, + "balance_loss_clip": 1.00981259, + "balance_loss_mlp": 1.01892209, + "epoch": 0.770719975950699, + "flos": 21870885502080.0, + "grad_norm": 1.6433733873350533, + "language_loss": 0.67383111, + "learning_rate": 4.96808339256148e-07, + "loss": 0.69459289, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.36914062, + "step": 12819, + "time_per_iteration": 2.446286201477051 + }, + { + "auxiliary_loss_clip": 0.01057787, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.01454496, + "balance_loss_mlp": 1.01927924, + "epoch": 0.7707800992033669, + "flos": 21323971054080.0, + "grad_norm": 1.71653912936421, + "language_loss": 0.57441938, + "learning_rate": 4.965591454377005e-07, + "loss": 0.5952552, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.38476562, + "step": 12820, + "time_per_iteration": 2.4226369857788086 + }, + { + "auxiliary_loss_clip": 0.01054901, + "auxiliary_loss_mlp": 0.01024999, + "balance_loss_clip": 1.0131613, + "balance_loss_mlp": 1.01751828, + "epoch": 0.7708402224560349, + "flos": 28178294659200.0, + "grad_norm": 1.823697904430834, + "language_loss": 0.80866814, + "learning_rate": 4.96310005273189e-07, + "loss": 0.82946712, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37304688, + "step": 12821, + "time_per_iteration": 2.458505153656006 + }, + { + "auxiliary_loss_clip": 0.01057586, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.0138166, + "balance_loss_mlp": 1.0191083, + "epoch": 0.7709003457087028, + "flos": 15376167567360.0, + "grad_norm": 2.6686723482443577, + "language_loss": 0.68569589, + "learning_rate": 4.960609187715057e-07, + "loss": 0.70652688, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38476562, + "step": 12822, + "time_per_iteration": 2.3820762634277344 + }, + { + "auxiliary_loss_clip": 0.01055181, + "auxiliary_loss_mlp": 0.01027136, + "balance_loss_clip": 1.01508975, + "balance_loss_mlp": 1.01785755, + "epoch": 0.7709604689613708, + "flos": 30620713708800.0, + "grad_norm": 1.6815241480924767, + "language_loss": 0.72387123, + "learning_rate": 4.958118859415393e-07, + "loss": 0.74469447, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.37304688, + "step": 12823, + "time_per_iteration": 3.9900059700012207 + }, + { + "auxiliary_loss_clip": 0.01060905, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.01360583, + "balance_loss_mlp": 1.01928091, + "epoch": 0.7710205922140387, + "flos": 20300282812800.0, + "grad_norm": 1.7155130781150418, + "language_loss": 0.7646004, + "learning_rate": 4.955629067921785e-07, + "loss": 0.78547919, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.41601562, + "step": 12824, + "time_per_iteration": 3.8808798789978027 + }, + { + "auxiliary_loss_clip": 0.01056922, + "auxiliary_loss_mlp": 0.01022775, + "balance_loss_clip": 1.01139617, + "balance_loss_mlp": 1.01981664, + "epoch": 0.7710807154667068, + "flos": 19643252336640.0, + "grad_norm": 12.8059531451642, + "language_loss": 0.72564387, + "learning_rate": 4.953139813323066e-07, + "loss": 0.74644083, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37109375, + "step": 12825, + "time_per_iteration": 2.417590618133545 + }, + { + "auxiliary_loss_clip": 0.01054963, + "auxiliary_loss_mlp": 0.01023927, + "balance_loss_clip": 1.01359725, + "balance_loss_mlp": 1.0184195, + "epoch": 0.7711408387193747, + "flos": 20005441447680.0, + "grad_norm": 1.385718280882504, + "language_loss": 0.73628509, + "learning_rate": 4.950651095708087e-07, + "loss": 0.75707394, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.36523438, + "step": 12826, + "time_per_iteration": 2.4155447483062744 + }, + { + "auxiliary_loss_clip": 0.01057394, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.01378345, + "balance_loss_mlp": 1.01802969, + "epoch": 0.7712009619720427, + "flos": 24970020825600.0, + "grad_norm": 1.5046645560409386, + "language_loss": 0.81061101, + "learning_rate": 4.948162915165659e-07, + "loss": 0.83144718, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 12827, + "time_per_iteration": 2.426349401473999 + }, + { + "auxiliary_loss_clip": 0.01006855, + "auxiliary_loss_mlp": 0.01001933, + "balance_loss_clip": 1.00109291, + "balance_loss_mlp": 1.00065422, + "epoch": 0.7712610852247107, + "flos": 63246347953920.0, + "grad_norm": 0.8585819174035294, + "language_loss": 0.5519051, + "learning_rate": 4.945675271784577e-07, + "loss": 0.57199299, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.00842285, + "router_z_loss_mlp": 0.06201172, + "step": 12828, + "time_per_iteration": 3.198864221572876 + }, + { + "auxiliary_loss_clip": 0.01055047, + "auxiliary_loss_mlp": 0.0102183, + "balance_loss_clip": 1.01139903, + "balance_loss_mlp": 1.01809382, + "epoch": 0.7713212084773786, + "flos": 18696861578880.0, + "grad_norm": 2.1186524317210282, + "language_loss": 0.69775486, + "learning_rate": 4.943188165653622e-07, + "loss": 0.71852368, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.36914062, + "step": 12829, + "time_per_iteration": 2.361795663833618 + }, + { + "auxiliary_loss_clip": 0.01059571, + "auxiliary_loss_mlp": 0.01026737, + "balance_loss_clip": 1.01465464, + "balance_loss_mlp": 1.02041471, + "epoch": 0.7713813317300466, + "flos": 14172501934080.0, + "grad_norm": 2.5932960490282446, + "language_loss": 0.79464316, + "learning_rate": 4.940701596861552e-07, + "loss": 0.81550622, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39257812, + "step": 12830, + "time_per_iteration": 2.4150502681732178 + }, + { + "auxiliary_loss_clip": 0.01060103, + "auxiliary_loss_mlp": 0.01025224, + "balance_loss_clip": 1.01220632, + "balance_loss_mlp": 1.01954472, + "epoch": 0.7714414549827145, + "flos": 25702742862720.0, + "grad_norm": 1.6801927205923683, + "language_loss": 0.78019631, + "learning_rate": 4.938215565497102e-07, + "loss": 0.80104959, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 12831, + "time_per_iteration": 2.4297356605529785 + }, + { + "auxiliary_loss_clip": 0.010567, + "auxiliary_loss_mlp": 0.01026274, + "balance_loss_clip": 1.01528895, + "balance_loss_mlp": 1.01859689, + "epoch": 0.7715015782353826, + "flos": 30553994367360.0, + "grad_norm": 1.8792171816327148, + "language_loss": 0.60872531, + "learning_rate": 4.935730071648992e-07, + "loss": 0.62955505, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.38085938, + "step": 12832, + "time_per_iteration": 3.8874473571777344 + }, + { + "auxiliary_loss_clip": 0.01057179, + "auxiliary_loss_mlp": 0.01024378, + "balance_loss_clip": 1.01209331, + "balance_loss_mlp": 1.0172199, + "epoch": 0.7715617014880505, + "flos": 20228326767360.0, + "grad_norm": 1.6162031287856515, + "language_loss": 0.79098338, + "learning_rate": 4.933245115405928e-07, + "loss": 0.81179893, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.3984375, + "step": 12833, + "time_per_iteration": 2.4181137084960938 + }, + { + "auxiliary_loss_clip": 0.0100652, + "auxiliary_loss_mlp": 0.01003714, + "balance_loss_clip": 1.00296879, + "balance_loss_mlp": 1.00035548, + "epoch": 0.7716218247407185, + "flos": 63662059912320.0, + "grad_norm": 0.8443801640079429, + "language_loss": 0.55413073, + "learning_rate": 4.930760696856593e-07, + "loss": 0.57423306, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.00744629, + "router_z_loss_mlp": 0.06152344, + "step": 12834, + "time_per_iteration": 3.079965353012085 + }, + { + "auxiliary_loss_clip": 0.01058512, + "auxiliary_loss_mlp": 0.01022004, + "balance_loss_clip": 1.01060772, + "balance_loss_mlp": 1.01993108, + "epoch": 0.7716819479933864, + "flos": 19790795208960.0, + "grad_norm": 2.315272153199747, + "language_loss": 0.72894633, + "learning_rate": 4.928276816089643e-07, + "loss": 0.74975145, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 12835, + "time_per_iteration": 2.4274275302886963 + }, + { + "auxiliary_loss_clip": 0.01058165, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.01801991, + "balance_loss_mlp": 1.01833117, + "epoch": 0.7717420712460544, + "flos": 18441192625920.0, + "grad_norm": 1.849043008868538, + "language_loss": 0.70283222, + "learning_rate": 4.92579347319372e-07, + "loss": 0.72371387, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 12836, + "time_per_iteration": 2.4085962772369385 + }, + { + "auxiliary_loss_clip": 0.01055926, + "auxiliary_loss_mlp": 0.01023539, + "balance_loss_clip": 1.01205277, + "balance_loss_mlp": 1.0183717, + "epoch": 0.7718021944987223, + "flos": 35188016192640.0, + "grad_norm": 2.028330886781929, + "language_loss": 0.5983547, + "learning_rate": 4.923310668257466e-07, + "loss": 0.61914933, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.375, + "step": 12837, + "time_per_iteration": 2.5610857009887695 + }, + { + "auxiliary_loss_clip": 0.0100626, + "auxiliary_loss_mlp": 0.01001656, + "balance_loss_clip": 1.00073183, + "balance_loss_mlp": 1.0001508, + "epoch": 0.7718623177513904, + "flos": 67919612014080.0, + "grad_norm": 0.86710320478393, + "language_loss": 0.55804145, + "learning_rate": 4.920828401369457e-07, + "loss": 0.57812059, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06103516, + "step": 12838, + "time_per_iteration": 3.0166378021240234 + }, + { + "auxiliary_loss_clip": 0.01055246, + "auxiliary_loss_mlp": 0.01022899, + "balance_loss_clip": 1.01201463, + "balance_loss_mlp": 1.01893699, + "epoch": 0.7719224410040583, + "flos": 18580601151360.0, + "grad_norm": 2.1488777509129675, + "language_loss": 0.69447446, + "learning_rate": 4.918346672618303e-07, + "loss": 0.71525598, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.36328125, + "step": 12839, + "time_per_iteration": 2.3595192432403564 + }, + { + "auxiliary_loss_clip": 0.01055635, + "auxiliary_loss_mlp": 0.01024345, + "balance_loss_clip": 1.01344371, + "balance_loss_mlp": 1.01881146, + "epoch": 0.7719825642567263, + "flos": 23074690780800.0, + "grad_norm": 1.8619497063181838, + "language_loss": 0.8026315, + "learning_rate": 4.915865482092554e-07, + "loss": 0.82343137, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3671875, + "step": 12840, + "time_per_iteration": 2.3821587562561035 + }, + { + "auxiliary_loss_clip": 0.01056487, + "auxiliary_loss_mlp": 0.01025812, + "balance_loss_clip": 1.01450467, + "balance_loss_mlp": 1.01884258, + "epoch": 0.7720426875093943, + "flos": 20338058770560.0, + "grad_norm": 1.7624813125197907, + "language_loss": 0.79980242, + "learning_rate": 4.913384829880778e-07, + "loss": 0.82062542, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 12841, + "time_per_iteration": 2.452223062515259 + }, + { + "auxiliary_loss_clip": 0.01057248, + "auxiliary_loss_mlp": 0.01023753, + "balance_loss_clip": 1.01204062, + "balance_loss_mlp": 1.01915908, + "epoch": 0.7721028107620622, + "flos": 23879508508800.0, + "grad_norm": 1.467477161965978, + "language_loss": 0.76137948, + "learning_rate": 4.910904716071476e-07, + "loss": 0.78218943, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 12842, + "time_per_iteration": 2.441494941711426 + }, + { + "auxiliary_loss_clip": 0.01059548, + "auxiliary_loss_mlp": 0.01023501, + "balance_loss_clip": 1.01190174, + "balance_loss_mlp": 1.02046585, + "epoch": 0.7721629340147302, + "flos": 26650355518080.0, + "grad_norm": 1.5437574775896858, + "language_loss": 0.68443334, + "learning_rate": 4.908425140753178e-07, + "loss": 0.70526385, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 12843, + "time_per_iteration": 2.47453236579895 + }, + { + "auxiliary_loss_clip": 0.0105785, + "auxiliary_loss_mlp": 0.01020713, + "balance_loss_clip": 1.00954342, + "balance_loss_mlp": 1.01822793, + "epoch": 0.7722230572673981, + "flos": 21177789724800.0, + "grad_norm": 1.8618479062163766, + "language_loss": 0.73312914, + "learning_rate": 4.905946104014373e-07, + "loss": 0.75391483, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.39648438, + "step": 12844, + "time_per_iteration": 2.394684314727783 + }, + { + "auxiliary_loss_clip": 0.01056356, + "auxiliary_loss_mlp": 0.01023343, + "balance_loss_clip": 1.01208341, + "balance_loss_mlp": 1.01885962, + "epoch": 0.7722831805200662, + "flos": 27963404040960.0, + "grad_norm": 1.5652896758882475, + "language_loss": 0.71422648, + "learning_rate": 4.903467605943515e-07, + "loss": 0.7350235, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.375, + "step": 12845, + "time_per_iteration": 2.4713268280029297 + }, + { + "auxiliary_loss_clip": 0.01054295, + "auxiliary_loss_mlp": 0.01021512, + "balance_loss_clip": 1.01078331, + "balance_loss_mlp": 1.01786685, + "epoch": 0.7723433037727341, + "flos": 33874164708480.0, + "grad_norm": 1.817292464706298, + "language_loss": 0.64385676, + "learning_rate": 4.900989646629068e-07, + "loss": 0.66461486, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36328125, + "step": 12846, + "time_per_iteration": 2.5108962059020996 + }, + { + "auxiliary_loss_clip": 0.01006787, + "auxiliary_loss_mlp": 0.01000747, + "balance_loss_clip": 0.99987078, + "balance_loss_mlp": 1.00069237, + "epoch": 0.7724034270254021, + "flos": 62844951519360.0, + "grad_norm": 0.7738417348067956, + "language_loss": 0.61724484, + "learning_rate": 4.898512226159461e-07, + "loss": 0.63732016, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06103516, + "step": 12847, + "time_per_iteration": 3.010693311691284 + }, + { + "auxiliary_loss_clip": 0.01055511, + "auxiliary_loss_mlp": 0.01020633, + "balance_loss_clip": 1.00944519, + "balance_loss_mlp": 1.01743686, + "epoch": 0.77246355027807, + "flos": 23294329344000.0, + "grad_norm": 1.7254210403132741, + "language_loss": 0.79223037, + "learning_rate": 4.896035344623108e-07, + "loss": 0.81299174, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.38085938, + "step": 12848, + "time_per_iteration": 2.4058492183685303 + }, + { + "auxiliary_loss_clip": 0.01058014, + "auxiliary_loss_mlp": 0.01021477, + "balance_loss_clip": 1.00997329, + "balance_loss_mlp": 1.01849985, + "epoch": 0.772523673530738, + "flos": 20120235598080.0, + "grad_norm": 1.8806282716343377, + "language_loss": 0.72563231, + "learning_rate": 4.893559002108396e-07, + "loss": 0.7464273, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39453125, + "step": 12849, + "time_per_iteration": 2.4281771183013916 + }, + { + "auxiliary_loss_clip": 0.01055116, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.01448834, + "balance_loss_mlp": 1.0181942, + "epoch": 0.7725837967834059, + "flos": 17819180110080.0, + "grad_norm": 1.820684934582389, + "language_loss": 0.7823602, + "learning_rate": 4.891083198703711e-07, + "loss": 0.80316728, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36914062, + "step": 12850, + "time_per_iteration": 2.3573880195617676 + }, + { + "auxiliary_loss_clip": 0.01055781, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.01557374, + "balance_loss_mlp": 1.01726222, + "epoch": 0.772643920036074, + "flos": 27197688902400.0, + "grad_norm": 1.629109573705429, + "language_loss": 0.8010062, + "learning_rate": 4.888607934497402e-07, + "loss": 0.82184309, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38476562, + "step": 12851, + "time_per_iteration": 2.4439566135406494 + }, + { + "auxiliary_loss_clip": 0.01057498, + "auxiliary_loss_mlp": 0.01023028, + "balance_loss_clip": 1.01114249, + "balance_loss_mlp": 1.01975155, + "epoch": 0.7727040432887419, + "flos": 21578453020800.0, + "grad_norm": 1.536362848997671, + "language_loss": 0.74005902, + "learning_rate": 4.886133209577803e-07, + "loss": 0.76086432, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37695312, + "step": 12852, + "time_per_iteration": 2.4233803749084473 + }, + { + "auxiliary_loss_clip": 0.01056141, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.0151782, + "balance_loss_mlp": 1.01819706, + "epoch": 0.7727641665414099, + "flos": 22235553319680.0, + "grad_norm": 1.703926521248014, + "language_loss": 0.82282829, + "learning_rate": 4.883659024033228e-07, + "loss": 0.84365356, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37890625, + "step": 12853, + "time_per_iteration": 2.4063644409179688 + }, + { + "auxiliary_loss_clip": 0.01006653, + "auxiliary_loss_mlp": 0.01000575, + "balance_loss_clip": 0.99972814, + "balance_loss_mlp": 1.00045323, + "epoch": 0.7728242897940779, + "flos": 54828822867840.0, + "grad_norm": 0.7948759946176458, + "language_loss": 0.54474783, + "learning_rate": 4.88118537795199e-07, + "loss": 0.56482011, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.00848389, + "router_z_loss_mlp": 0.06201172, + "step": 12854, + "time_per_iteration": 4.4854044914245605 + }, + { + "auxiliary_loss_clip": 0.01057495, + "auxiliary_loss_mlp": 0.01022341, + "balance_loss_clip": 1.0103308, + "balance_loss_mlp": 1.01855445, + "epoch": 0.7728844130467458, + "flos": 34460461036800.0, + "grad_norm": 1.4349614418277663, + "language_loss": 0.7171905, + "learning_rate": 4.878712271422342e-07, + "loss": 0.73798883, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 12855, + "time_per_iteration": 2.520953416824341 + }, + { + "auxiliary_loss_clip": 0.01055907, + "auxiliary_loss_mlp": 0.01022539, + "balance_loss_clip": 1.01014686, + "balance_loss_mlp": 1.01840806, + "epoch": 0.7729445362994138, + "flos": 18915348067200.0, + "grad_norm": 1.759098310588767, + "language_loss": 0.67821884, + "learning_rate": 4.876239704532566e-07, + "loss": 0.69900328, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.375, + "step": 12856, + "time_per_iteration": 2.367647171020508 + }, + { + "auxiliary_loss_clip": 0.01057018, + "auxiliary_loss_mlp": 0.01022508, + "balance_loss_clip": 1.01120734, + "balance_loss_mlp": 1.01825762, + "epoch": 0.7730046595520818, + "flos": 22198964348160.0, + "grad_norm": 3.1910288722015765, + "language_loss": 0.78821254, + "learning_rate": 4.873767677370884e-07, + "loss": 0.80900788, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38671875, + "step": 12857, + "time_per_iteration": 2.417023181915283 + }, + { + "auxiliary_loss_clip": 0.01056417, + "auxiliary_loss_mlp": 0.01023621, + "balance_loss_clip": 1.01155102, + "balance_loss_mlp": 1.01814389, + "epoch": 0.7730647828047498, + "flos": 13551501847680.0, + "grad_norm": 1.9522132957226817, + "language_loss": 0.78226435, + "learning_rate": 4.871296190025535e-07, + "loss": 0.8030647, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.3828125, + "step": 12858, + "time_per_iteration": 2.382589101791382 + }, + { + "auxiliary_loss_clip": 0.01055347, + "auxiliary_loss_mlp": 0.01020421, + "balance_loss_clip": 1.00894713, + "balance_loss_mlp": 1.01718569, + "epoch": 0.7731249060574177, + "flos": 21975101510400.0, + "grad_norm": 1.8207248836639827, + "language_loss": 0.77305728, + "learning_rate": 4.868825242584704e-07, + "loss": 0.79381496, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 12859, + "time_per_iteration": 2.429746627807617 + }, + { + "auxiliary_loss_clip": 0.01053706, + "auxiliary_loss_mlp": 0.01021615, + "balance_loss_clip": 1.01176238, + "balance_loss_mlp": 1.01876807, + "epoch": 0.7731850293100857, + "flos": 22600709896320.0, + "grad_norm": 1.7069726535805008, + "language_loss": 0.67770994, + "learning_rate": 4.866354835136575e-07, + "loss": 0.69846314, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.34960938, + "step": 12860, + "time_per_iteration": 2.4255974292755127 + }, + { + "auxiliary_loss_clip": 0.0105687, + "auxiliary_loss_mlp": 0.01023942, + "balance_loss_clip": 1.01186645, + "balance_loss_mlp": 1.01711988, + "epoch": 0.7732451525627536, + "flos": 14097613334400.0, + "grad_norm": 2.2367486958215115, + "language_loss": 0.74189442, + "learning_rate": 4.863884967769323e-07, + "loss": 0.76270258, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3984375, + "step": 12861, + "time_per_iteration": 2.440732479095459 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.0102117, + "balance_loss_clip": 1.00924897, + "balance_loss_mlp": 1.01940346, + "epoch": 0.7733052758154216, + "flos": 21468965397120.0, + "grad_norm": 1.6225466233968695, + "language_loss": 0.74884629, + "learning_rate": 4.86141564057107e-07, + "loss": 0.76964581, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.39453125, + "step": 12862, + "time_per_iteration": 2.483633279800415 + }, + { + "auxiliary_loss_clip": 0.01055307, + "auxiliary_loss_mlp": 0.01024348, + "balance_loss_clip": 1.01434064, + "balance_loss_mlp": 1.01833248, + "epoch": 0.7733653990680895, + "flos": 21393308747520.0, + "grad_norm": 1.4965371117275095, + "language_loss": 0.82980847, + "learning_rate": 4.858946853629957e-07, + "loss": 0.85060501, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.36914062, + "step": 12863, + "time_per_iteration": 5.199566841125488 + }, + { + "auxiliary_loss_clip": 0.01053633, + "auxiliary_loss_mlp": 0.01024451, + "balance_loss_clip": 1.01314425, + "balance_loss_mlp": 1.01721752, + "epoch": 0.7734255223207576, + "flos": 17675093462400.0, + "grad_norm": 1.5959380259216414, + "language_loss": 0.73396289, + "learning_rate": 4.856478607034085e-07, + "loss": 0.7547437, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36523438, + "step": 12864, + "time_per_iteration": 2.371082305908203 + }, + { + "auxiliary_loss_clip": 0.01055591, + "auxiliary_loss_mlp": 0.01023105, + "balance_loss_clip": 1.01229239, + "balance_loss_mlp": 1.01763225, + "epoch": 0.7734856455734255, + "flos": 25229599850880.0, + "grad_norm": 1.698819361375691, + "language_loss": 0.82063293, + "learning_rate": 4.854010900871534e-07, + "loss": 0.84141994, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.38085938, + "step": 12865, + "time_per_iteration": 2.4399631023406982 + }, + { + "auxiliary_loss_clip": 0.01059448, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01334381, + "balance_loss_mlp": 1.01939011, + "epoch": 0.7735457688260935, + "flos": 23432201769600.0, + "grad_norm": 2.191742057853246, + "language_loss": 0.71613693, + "learning_rate": 4.851543735230372e-07, + "loss": 0.73699534, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40039062, + "step": 12866, + "time_per_iteration": 2.4142086505889893 + }, + { + "auxiliary_loss_clip": 0.01058348, + "auxiliary_loss_mlp": 0.0102729, + "balance_loss_clip": 1.01491022, + "balance_loss_mlp": 1.01915526, + "epoch": 0.7736058920787615, + "flos": 18728388403200.0, + "grad_norm": 4.443391956306494, + "language_loss": 0.64392167, + "learning_rate": 4.849077110198652e-07, + "loss": 0.66477805, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39257812, + "step": 12867, + "time_per_iteration": 2.3362815380096436 + }, + { + "auxiliary_loss_clip": 0.01055149, + "auxiliary_loss_mlp": 0.01023687, + "balance_loss_clip": 1.01266062, + "balance_loss_mlp": 1.01830339, + "epoch": 0.7736660153314294, + "flos": 22892199770880.0, + "grad_norm": 1.8950181363858456, + "language_loss": 0.71197259, + "learning_rate": 4.846611025864398e-07, + "loss": 0.73276097, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3671875, + "step": 12868, + "time_per_iteration": 2.4158711433410645 + }, + { + "auxiliary_loss_clip": 0.01058231, + "auxiliary_loss_mlp": 0.01024317, + "balance_loss_clip": 1.01236022, + "balance_loss_mlp": 1.0188185, + "epoch": 0.7737261385840974, + "flos": 13800258351360.0, + "grad_norm": 2.026103341901846, + "language_loss": 0.78236079, + "learning_rate": 4.844145482315616e-07, + "loss": 0.8031863, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 12869, + "time_per_iteration": 2.363309621810913 + }, + { + "auxiliary_loss_clip": 0.01052992, + "auxiliary_loss_mlp": 0.01019749, + "balance_loss_clip": 1.00848973, + "balance_loss_mlp": 1.01687324, + "epoch": 0.7737862618367654, + "flos": 28729468293120.0, + "grad_norm": 1.836098358146471, + "language_loss": 0.73366964, + "learning_rate": 4.841680479640291e-07, + "loss": 0.75439703, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.36132812, + "step": 12870, + "time_per_iteration": 2.4794821739196777 + }, + { + "auxiliary_loss_clip": 0.01057079, + "auxiliary_loss_mlp": 0.01022592, + "balance_loss_clip": 1.01128519, + "balance_loss_mlp": 1.01873016, + "epoch": 0.7738463850894334, + "flos": 17017644049920.0, + "grad_norm": 1.9334545884687668, + "language_loss": 0.75766128, + "learning_rate": 4.839216017926409e-07, + "loss": 0.778458, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 12871, + "time_per_iteration": 3.79901123046875 + }, + { + "auxiliary_loss_clip": 0.0105593, + "auxiliary_loss_mlp": 0.0102404, + "balance_loss_clip": 1.01323378, + "balance_loss_mlp": 1.0183537, + "epoch": 0.7739065083421013, + "flos": 20702970967680.0, + "grad_norm": 1.6164683339026842, + "language_loss": 0.69289947, + "learning_rate": 4.836752097261898e-07, + "loss": 0.71369916, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.375, + "step": 12872, + "time_per_iteration": 2.4024178981781006 + }, + { + "auxiliary_loss_clip": 0.01055335, + "auxiliary_loss_mlp": 0.01023853, + "balance_loss_clip": 1.01118088, + "balance_loss_mlp": 1.01725602, + "epoch": 0.7739666315947693, + "flos": 20696372720640.0, + "grad_norm": 1.9822327253519525, + "language_loss": 0.85266769, + "learning_rate": 4.834288717734707e-07, + "loss": 0.87345958, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38085938, + "step": 12873, + "time_per_iteration": 2.39754319190979 + }, + { + "auxiliary_loss_clip": 0.01059111, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.01966393, + "epoch": 0.7740267548474372, + "flos": 29569373804160.0, + "grad_norm": 2.409695859434699, + "language_loss": 0.6548152, + "learning_rate": 4.831825879432744e-07, + "loss": 0.67571098, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 12874, + "time_per_iteration": 2.4460866451263428 + }, + { + "auxiliary_loss_clip": 0.01056754, + "auxiliary_loss_mlp": 0.01024701, + "balance_loss_clip": 1.01279747, + "balance_loss_mlp": 1.01948893, + "epoch": 0.7740868781001052, + "flos": 23657984732160.0, + "grad_norm": 1.5375786758613266, + "language_loss": 0.62561363, + "learning_rate": 4.829363582443888e-07, + "loss": 0.64642817, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37109375, + "step": 12875, + "time_per_iteration": 2.422497034072876 + }, + { + "auxiliary_loss_clip": 0.01056004, + "auxiliary_loss_mlp": 0.01021929, + "balance_loss_clip": 1.01072955, + "balance_loss_mlp": 1.0185405, + "epoch": 0.7741470013527731, + "flos": 24716167263360.0, + "grad_norm": 2.240591376628956, + "language_loss": 0.65418899, + "learning_rate": 4.826901826856029e-07, + "loss": 0.67496824, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.375, + "step": 12876, + "time_per_iteration": 2.4420337677001953 + }, + { + "auxiliary_loss_clip": 0.01053447, + "auxiliary_loss_mlp": 0.01025022, + "balance_loss_clip": 1.01440632, + "balance_loss_mlp": 1.01744485, + "epoch": 0.7742071246054412, + "flos": 21870571299840.0, + "grad_norm": 2.085848912304642, + "language_loss": 0.70935875, + "learning_rate": 4.824440612757006e-07, + "loss": 0.73014343, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.359375, + "step": 12877, + "time_per_iteration": 2.3883330821990967 + }, + { + "auxiliary_loss_clip": 0.01057411, + "auxiliary_loss_mlp": 0.01025022, + "balance_loss_clip": 1.01297033, + "balance_loss_mlp": 1.01868987, + "epoch": 0.7742672478581091, + "flos": 22673154700800.0, + "grad_norm": 1.8248014267738981, + "language_loss": 0.82750738, + "learning_rate": 4.821979940234675e-07, + "loss": 0.84833181, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38671875, + "step": 12878, + "time_per_iteration": 2.428403615951538 + }, + { + "auxiliary_loss_clip": 0.01057617, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.01515901, + "balance_loss_mlp": 1.01876569, + "epoch": 0.7743273711107771, + "flos": 18839970708480.0, + "grad_norm": 2.061062578956839, + "language_loss": 0.73554027, + "learning_rate": 4.819519809376824e-07, + "loss": 0.75639302, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38867188, + "step": 12879, + "time_per_iteration": 2.3783457279205322 + }, + { + "auxiliary_loss_clip": 0.01053256, + "auxiliary_loss_mlp": 0.01022116, + "balance_loss_clip": 1.01133966, + "balance_loss_mlp": 1.01768374, + "epoch": 0.7743874943634451, + "flos": 28728106750080.0, + "grad_norm": 1.9396828623627635, + "language_loss": 0.79464954, + "learning_rate": 4.81706022027127e-07, + "loss": 0.81540328, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.35546875, + "step": 12880, + "time_per_iteration": 2.5190188884735107 + }, + { + "auxiliary_loss_clip": 0.01056037, + "auxiliary_loss_mlp": 0.01028421, + "balance_loss_clip": 1.01680374, + "balance_loss_mlp": 1.0193491, + "epoch": 0.774447617616113, + "flos": 21908521814400.0, + "grad_norm": 1.46905390520183, + "language_loss": 0.70409405, + "learning_rate": 4.814601173005781e-07, + "loss": 0.72493851, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.3671875, + "step": 12881, + "time_per_iteration": 2.471578598022461 + }, + { + "auxiliary_loss_clip": 0.01060206, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.01616681, + "balance_loss_mlp": 1.01980233, + "epoch": 0.774507740868781, + "flos": 19499619536640.0, + "grad_norm": 1.8773134643903082, + "language_loss": 0.84829533, + "learning_rate": 4.812142667668113e-07, + "loss": 0.86918736, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40234375, + "step": 12882, + "time_per_iteration": 2.4253427982330322 + }, + { + "auxiliary_loss_clip": 0.01006321, + "auxiliary_loss_mlp": 0.01000988, + "balance_loss_clip": 1.0001471, + "balance_loss_mlp": 1.00027931, + "epoch": 0.774567864121449, + "flos": 59423113699200.0, + "grad_norm": 0.7863837082831483, + "language_loss": 0.60504061, + "learning_rate": 4.809684704346e-07, + "loss": 0.62511373, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.00842285, + "router_z_loss_mlp": 0.06030273, + "step": 12883, + "time_per_iteration": 3.016136884689331 + }, + { + "auxiliary_loss_clip": 0.0105837, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.01523519, + "balance_loss_mlp": 1.01918805, + "epoch": 0.774627987374117, + "flos": 13224470342400.0, + "grad_norm": 2.0331790823205207, + "language_loss": 0.72690183, + "learning_rate": 4.807227283127173e-07, + "loss": 0.7477603, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.390625, + "step": 12884, + "time_per_iteration": 2.410663366317749 + }, + { + "auxiliary_loss_clip": 0.01054798, + "auxiliary_loss_mlp": 0.01020887, + "balance_loss_clip": 1.01060545, + "balance_loss_mlp": 1.01822042, + "epoch": 0.7746881106267849, + "flos": 21393064368000.0, + "grad_norm": 1.688426141219347, + "language_loss": 0.8116883, + "learning_rate": 4.804770404099323e-07, + "loss": 0.83244514, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.36523438, + "step": 12885, + "time_per_iteration": 2.4070563316345215 + }, + { + "auxiliary_loss_clip": 0.01054423, + "auxiliary_loss_mlp": 0.01024005, + "balance_loss_clip": 1.01293039, + "balance_loss_mlp": 1.01737595, + "epoch": 0.7747482338794529, + "flos": 25628168465280.0, + "grad_norm": 1.7975012604642804, + "language_loss": 0.64147973, + "learning_rate": 4.80231406735013e-07, + "loss": 0.66226399, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37109375, + "step": 12886, + "time_per_iteration": 2.469620943069458 + }, + { + "auxiliary_loss_clip": 0.01056135, + "auxiliary_loss_mlp": 0.01022948, + "balance_loss_clip": 1.01270771, + "balance_loss_mlp": 1.01747012, + "epoch": 0.7748083571321208, + "flos": 11546125597440.0, + "grad_norm": 1.9219123866014838, + "language_loss": 0.69431019, + "learning_rate": 4.79985827296725e-07, + "loss": 0.715101, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.38671875, + "step": 12887, + "time_per_iteration": 2.4023280143737793 + }, + { + "auxiliary_loss_clip": 0.01054915, + "auxiliary_loss_mlp": 0.01026176, + "balance_loss_clip": 1.01541102, + "balance_loss_mlp": 1.01820481, + "epoch": 0.7748684803847888, + "flos": 19061424662400.0, + "grad_norm": 1.7116080763355674, + "language_loss": 0.70506406, + "learning_rate": 4.79740302103834e-07, + "loss": 0.72587502, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.3671875, + "step": 12888, + "time_per_iteration": 2.424818754196167 + }, + { + "auxiliary_loss_clip": 0.01057138, + "auxiliary_loss_mlp": 0.01024014, + "balance_loss_clip": 1.01203942, + "balance_loss_mlp": 1.01808882, + "epoch": 0.7749286036374567, + "flos": 22272072468480.0, + "grad_norm": 1.6301168056171922, + "language_loss": 0.81433076, + "learning_rate": 4.794948311651004e-07, + "loss": 0.83514231, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 12889, + "time_per_iteration": 2.4070613384246826 + }, + { + "auxiliary_loss_clip": 0.01056125, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.0149616, + "balance_loss_mlp": 1.01903152, + "epoch": 0.7749887268901248, + "flos": 20191458504960.0, + "grad_norm": 1.9903064907115964, + "language_loss": 0.79839611, + "learning_rate": 4.792494144892845e-07, + "loss": 0.81921774, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37109375, + "step": 12890, + "time_per_iteration": 2.4104716777801514 + }, + { + "auxiliary_loss_clip": 0.01060094, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.01480198, + "balance_loss_mlp": 1.02050877, + "epoch": 0.7750488501427927, + "flos": 20336557582080.0, + "grad_norm": 1.7402955516134138, + "language_loss": 0.66316307, + "learning_rate": 4.790040520851464e-07, + "loss": 0.68402779, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.39648438, + "step": 12891, + "time_per_iteration": 2.38301944732666 + }, + { + "auxiliary_loss_clip": 0.01055731, + "auxiliary_loss_mlp": 0.01021736, + "balance_loss_clip": 1.01035094, + "balance_loss_mlp": 1.0185796, + "epoch": 0.7751089733954607, + "flos": 28362845439360.0, + "grad_norm": 1.3877698685435806, + "language_loss": 0.78626657, + "learning_rate": 4.7875874396144e-07, + "loss": 0.80704117, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37109375, + "step": 12892, + "time_per_iteration": 2.4428346157073975 + }, + { + "auxiliary_loss_clip": 0.01058212, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01420069, + "balance_loss_mlp": 1.01888633, + "epoch": 0.7751690966481286, + "flos": 16942930007040.0, + "grad_norm": 1.9908799815115477, + "language_loss": 0.66750276, + "learning_rate": 4.785134901269214e-07, + "loss": 0.68834209, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.39257812, + "step": 12893, + "time_per_iteration": 2.361713409423828 + }, + { + "auxiliary_loss_clip": 0.01006871, + "auxiliary_loss_mlp": 0.01000617, + "balance_loss_clip": 0.99971724, + "balance_loss_mlp": 1.00081015, + "epoch": 0.7752292199007966, + "flos": 65664362096640.0, + "grad_norm": 0.8110052969969225, + "language_loss": 0.58501101, + "learning_rate": 4.782682905903424e-07, + "loss": 0.60508585, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.06054688, + "step": 12894, + "time_per_iteration": 4.527516603469849 + }, + { + "auxiliary_loss_clip": 0.01058654, + "auxiliary_loss_mlp": 0.01024255, + "balance_loss_clip": 1.01254225, + "balance_loss_mlp": 1.01979947, + "epoch": 0.7752893431534646, + "flos": 20593622989440.0, + "grad_norm": 2.4898539588538258, + "language_loss": 0.78118563, + "learning_rate": 4.780231453604544e-07, + "loss": 0.80201471, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38867188, + "step": 12895, + "time_per_iteration": 2.3921680450439453 + }, + { + "auxiliary_loss_clip": 0.010577, + "auxiliary_loss_mlp": 0.0102375, + "balance_loss_clip": 1.01212716, + "balance_loss_mlp": 1.01963675, + "epoch": 0.7753494664061326, + "flos": 20484309922560.0, + "grad_norm": 1.6835964857594032, + "language_loss": 0.68050671, + "learning_rate": 4.777780544460046e-07, + "loss": 0.70132113, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38085938, + "step": 12896, + "time_per_iteration": 2.4266014099121094 + }, + { + "auxiliary_loss_clip": 0.0105531, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01578271, + "balance_loss_mlp": 1.01813257, + "epoch": 0.7754095896588006, + "flos": 20264880827520.0, + "grad_norm": 2.022686422745023, + "language_loss": 0.68869299, + "learning_rate": 4.775330178557409e-07, + "loss": 0.70951337, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 12897, + "time_per_iteration": 2.427767515182495 + }, + { + "auxiliary_loss_clip": 0.01056844, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.01475453, + "balance_loss_mlp": 1.01864982, + "epoch": 0.7754697129114685, + "flos": 23329975708800.0, + "grad_norm": 1.6731754762939108, + "language_loss": 0.75317729, + "learning_rate": 4.772880355984073e-07, + "loss": 0.77402008, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38085938, + "step": 12898, + "time_per_iteration": 2.44557785987854 + }, + { + "auxiliary_loss_clip": 0.01056318, + "auxiliary_loss_mlp": 0.01024129, + "balance_loss_clip": 1.01145077, + "balance_loss_mlp": 1.01754546, + "epoch": 0.7755298361641365, + "flos": 17346665502720.0, + "grad_norm": 1.723484274987464, + "language_loss": 0.699251, + "learning_rate": 4.77043107682747e-07, + "loss": 0.72005546, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38671875, + "step": 12899, + "time_per_iteration": 2.3998239040374756 + }, + { + "auxiliary_loss_clip": 0.01055157, + "auxiliary_loss_mlp": 0.01023365, + "balance_loss_clip": 1.01170087, + "balance_loss_mlp": 1.01781058, + "epoch": 0.7755899594168044, + "flos": 19644858259200.0, + "grad_norm": 1.8993613514165126, + "language_loss": 0.75190258, + "learning_rate": 4.767982341175001e-07, + "loss": 0.77268785, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.375, + "step": 12900, + "time_per_iteration": 2.389375686645508 + }, + { + "auxiliary_loss_clip": 0.01054671, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.0175705, + "balance_loss_mlp": 1.01786637, + "epoch": 0.7756500826694724, + "flos": 27413312659200.0, + "grad_norm": 1.7167799631685872, + "language_loss": 0.77815187, + "learning_rate": 4.765534149114068e-07, + "loss": 0.79897964, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.3671875, + "step": 12901, + "time_per_iteration": 2.506442070007324 + }, + { + "auxiliary_loss_clip": 0.01058273, + "auxiliary_loss_mlp": 0.01023656, + "balance_loss_clip": 1.01159251, + "balance_loss_mlp": 1.02017319, + "epoch": 0.7757102059221404, + "flos": 28729258824960.0, + "grad_norm": 1.5345431610596658, + "language_loss": 0.77058864, + "learning_rate": 4.763086500732032e-07, + "loss": 0.79140788, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38085938, + "step": 12902, + "time_per_iteration": 3.8849873542785645 + }, + { + "auxiliary_loss_clip": 0.01056333, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.01479948, + "balance_loss_mlp": 1.01888335, + "epoch": 0.7757703291748084, + "flos": 22485845923200.0, + "grad_norm": 2.0923484231305385, + "language_loss": 0.72942245, + "learning_rate": 4.7606393961162437e-07, + "loss": 0.75024664, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 12903, + "time_per_iteration": 3.899397611618042 + }, + { + "auxiliary_loss_clip": 0.01056474, + "auxiliary_loss_mlp": 0.01019155, + "balance_loss_clip": 1.00783563, + "balance_loss_mlp": 1.01929402, + "epoch": 0.7758304524274763, + "flos": 21429199491840.0, + "grad_norm": 2.4050101578220096, + "language_loss": 0.76506448, + "learning_rate": 4.7581928353540357e-07, + "loss": 0.78582072, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37109375, + "step": 12904, + "time_per_iteration": 2.4015612602233887 + }, + { + "auxiliary_loss_clip": 0.01053372, + "auxiliary_loss_mlp": 0.01021448, + "balance_loss_clip": 1.01128566, + "balance_loss_mlp": 1.01733661, + "epoch": 0.7758905756801443, + "flos": 23658124377600.0, + "grad_norm": 1.797787479376298, + "language_loss": 0.680004, + "learning_rate": 4.75574681853272e-07, + "loss": 0.70075226, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.36132812, + "step": 12905, + "time_per_iteration": 2.413912057876587 + }, + { + "auxiliary_loss_clip": 0.01055452, + "auxiliary_loss_mlp": 0.01021916, + "balance_loss_clip": 1.01113975, + "balance_loss_mlp": 1.01908112, + "epoch": 0.7759506989328122, + "flos": 28364241893760.0, + "grad_norm": 1.5594227495241728, + "language_loss": 0.67529231, + "learning_rate": 4.7533013457395865e-07, + "loss": 0.69606602, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36328125, + "step": 12906, + "time_per_iteration": 2.4874861240386963 + }, + { + "auxiliary_loss_clip": 0.01056121, + "auxiliary_loss_mlp": 0.01023485, + "balance_loss_clip": 1.01232076, + "balance_loss_mlp": 1.01849186, + "epoch": 0.7760108221854802, + "flos": 14901907392000.0, + "grad_norm": 2.0099874901045562, + "language_loss": 0.81133837, + "learning_rate": 4.750856417061904e-07, + "loss": 0.83213449, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.375, + "step": 12907, + "time_per_iteration": 2.3936517238616943 + }, + { + "auxiliary_loss_clip": 0.01055526, + "auxiliary_loss_mlp": 0.01024948, + "balance_loss_clip": 1.01353383, + "balance_loss_mlp": 1.01916313, + "epoch": 0.7760709454381483, + "flos": 14791651718400.0, + "grad_norm": 2.735851396131494, + "language_loss": 0.67108029, + "learning_rate": 4.7484120325869414e-07, + "loss": 0.69188505, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.36328125, + "step": 12908, + "time_per_iteration": 2.3859989643096924 + }, + { + "auxiliary_loss_clip": 0.01059401, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.01673007, + "balance_loss_mlp": 1.02027798, + "epoch": 0.7761310686908162, + "flos": 17378995288320.0, + "grad_norm": 1.590166039155409, + "language_loss": 0.70055509, + "learning_rate": 4.74596819240191e-07, + "loss": 0.72143292, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.390625, + "step": 12909, + "time_per_iteration": 2.3833229541778564 + }, + { + "auxiliary_loss_clip": 0.01056043, + "auxiliary_loss_mlp": 0.01021167, + "balance_loss_clip": 1.0104506, + "balance_loss_mlp": 1.01947176, + "epoch": 0.7761911919434842, + "flos": 25555374547200.0, + "grad_norm": 1.5909230751782817, + "language_loss": 0.67847502, + "learning_rate": 4.74352489659404e-07, + "loss": 0.69924712, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.36523438, + "step": 12910, + "time_per_iteration": 2.441504716873169 + }, + { + "auxiliary_loss_clip": 0.0105849, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.01533508, + "balance_loss_mlp": 1.02062774, + "epoch": 0.7762513151961521, + "flos": 23178802055040.0, + "grad_norm": 1.6276878986912076, + "language_loss": 0.71439385, + "learning_rate": 4.741082145250519e-07, + "loss": 0.73524076, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37890625, + "step": 12911, + "time_per_iteration": 3.8368091583251953 + }, + { + "auxiliary_loss_clip": 0.01059577, + "auxiliary_loss_mlp": 0.01025631, + "balance_loss_clip": 1.01317978, + "balance_loss_mlp": 1.02048564, + "epoch": 0.7763114384488201, + "flos": 21688534137600.0, + "grad_norm": 2.3568030301072866, + "language_loss": 0.63329136, + "learning_rate": 4.738639938458535e-07, + "loss": 0.65414339, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.390625, + "step": 12912, + "time_per_iteration": 2.4204766750335693 + }, + { + "auxiliary_loss_clip": 0.01006694, + "auxiliary_loss_mlp": 0.01001002, + "balance_loss_clip": 1.00008976, + "balance_loss_mlp": 1.00058055, + "epoch": 0.776371561701488, + "flos": 69021749813760.0, + "grad_norm": 0.7687074480675044, + "language_loss": 0.53221071, + "learning_rate": 4.736198276305223e-07, + "loss": 0.5522877, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.06103516, + "step": 12913, + "time_per_iteration": 3.0955514907836914 + }, + { + "auxiliary_loss_clip": 0.01058379, + "auxiliary_loss_mlp": 0.0102504, + "balance_loss_clip": 1.01398909, + "balance_loss_mlp": 1.0201565, + "epoch": 0.776431684954156, + "flos": 22892793264000.0, + "grad_norm": 2.917499504198331, + "language_loss": 0.62188917, + "learning_rate": 4.7337571588777406e-07, + "loss": 0.64272338, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 12914, + "time_per_iteration": 2.419403076171875 + }, + { + "auxiliary_loss_clip": 0.01054357, + "auxiliary_loss_mlp": 0.0102405, + "balance_loss_clip": 1.01216483, + "balance_loss_mlp": 1.01780701, + "epoch": 0.776491808206824, + "flos": 20260656552960.0, + "grad_norm": 1.8043079583201316, + "language_loss": 0.8228097, + "learning_rate": 4.731316586263192e-07, + "loss": 0.84359372, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.36523438, + "step": 12915, + "time_per_iteration": 2.3964407444000244 + }, + { + "auxiliary_loss_clip": 0.01060087, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.0169332, + "balance_loss_mlp": 1.02040124, + "epoch": 0.776551931459492, + "flos": 26757888105600.0, + "grad_norm": 1.7057951430601097, + "language_loss": 0.74678111, + "learning_rate": 4.72887655854868e-07, + "loss": 0.76766765, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.39648438, + "step": 12916, + "time_per_iteration": 2.458080530166626 + }, + { + "auxiliary_loss_clip": 0.0105661, + "auxiliary_loss_mlp": 0.01022672, + "balance_loss_clip": 1.01114488, + "balance_loss_mlp": 1.01845622, + "epoch": 0.7766120547121599, + "flos": 52663162965120.0, + "grad_norm": 1.6140333448660396, + "language_loss": 0.65808976, + "learning_rate": 4.7264370758212766e-07, + "loss": 0.6788826, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38085938, + "step": 12917, + "time_per_iteration": 2.6787028312683105 + }, + { + "auxiliary_loss_clip": 0.01056195, + "auxiliary_loss_mlp": 0.01023692, + "balance_loss_clip": 1.01266527, + "balance_loss_mlp": 1.01755071, + "epoch": 0.7766721779648279, + "flos": 25155025453440.0, + "grad_norm": 1.5162729539906288, + "language_loss": 0.64533943, + "learning_rate": 4.723998138168055e-07, + "loss": 0.66613829, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38671875, + "step": 12918, + "time_per_iteration": 2.4657044410705566 + }, + { + "auxiliary_loss_clip": 0.01055371, + "auxiliary_loss_mlp": 0.01020578, + "balance_loss_clip": 1.01031435, + "balance_loss_mlp": 1.01840746, + "epoch": 0.7767323012174958, + "flos": 23759861679360.0, + "grad_norm": 2.195403426941193, + "language_loss": 0.73799765, + "learning_rate": 4.7215597456760426e-07, + "loss": 0.75875711, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.36914062, + "step": 12919, + "time_per_iteration": 2.440917730331421 + }, + { + "auxiliary_loss_clip": 0.01055259, + "auxiliary_loss_mlp": 0.01025022, + "balance_loss_clip": 1.01304126, + "balance_loss_mlp": 1.017506, + "epoch": 0.7767924244701638, + "flos": 22085671386240.0, + "grad_norm": 1.7616136153068183, + "language_loss": 0.79380286, + "learning_rate": 4.719121898432255e-07, + "loss": 0.81460571, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.37695312, + "step": 12920, + "time_per_iteration": 2.4338152408599854 + }, + { + "auxiliary_loss_clip": 0.01057503, + "auxiliary_loss_mlp": 0.01019869, + "balance_loss_clip": 1.00882435, + "balance_loss_mlp": 1.01919281, + "epoch": 0.7768525477228319, + "flos": 21980547682560.0, + "grad_norm": 1.7704086589803394, + "language_loss": 0.70810258, + "learning_rate": 4.7166845965237033e-07, + "loss": 0.72887623, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 12921, + "time_per_iteration": 2.3940012454986572 + }, + { + "auxiliary_loss_clip": 0.01059845, + "auxiliary_loss_mlp": 0.01024458, + "balance_loss_clip": 1.01151192, + "balance_loss_mlp": 1.01891851, + "epoch": 0.7769126709754998, + "flos": 21793622929920.0, + "grad_norm": 1.844144331118007, + "language_loss": 0.75968653, + "learning_rate": 4.7142478400373686e-07, + "loss": 0.78052956, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40820312, + "step": 12922, + "time_per_iteration": 2.4032235145568848 + }, + { + "auxiliary_loss_clip": 0.01057078, + "auxiliary_loss_mlp": 0.01024254, + "balance_loss_clip": 1.0129528, + "balance_loss_mlp": 1.01923072, + "epoch": 0.7769727942281678, + "flos": 20046952920960.0, + "grad_norm": 1.789853926660899, + "language_loss": 0.80361104, + "learning_rate": 4.7118116290602074e-07, + "loss": 0.82442439, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.37890625, + "step": 12923, + "time_per_iteration": 2.3714065551757812 + }, + { + "auxiliary_loss_clip": 0.01058523, + "auxiliary_loss_mlp": 0.01027092, + "balance_loss_clip": 1.01563048, + "balance_loss_mlp": 1.02089047, + "epoch": 0.7770329174808357, + "flos": 21685776140160.0, + "grad_norm": 2.0252300131581755, + "language_loss": 0.92344648, + "learning_rate": 4.709375963679156e-07, + "loss": 0.94430262, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.37695312, + "step": 12924, + "time_per_iteration": 2.3910956382751465 + }, + { + "auxiliary_loss_clip": 0.01055279, + "auxiliary_loss_mlp": 0.01021644, + "balance_loss_clip": 1.01068854, + "balance_loss_mlp": 1.01757455, + "epoch": 0.7770930407335037, + "flos": 25848051408000.0, + "grad_norm": 1.7475101228707643, + "language_loss": 0.65898907, + "learning_rate": 4.7069408439811574e-07, + "loss": 0.67975831, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37695312, + "step": 12925, + "time_per_iteration": 2.4174680709838867 + }, + { + "auxiliary_loss_clip": 0.01057912, + "auxiliary_loss_mlp": 0.01022709, + "balance_loss_clip": 1.01159883, + "balance_loss_mlp": 1.01949775, + "epoch": 0.7771531639861716, + "flos": 24346856234880.0, + "grad_norm": 2.0052234071886446, + "language_loss": 0.71882766, + "learning_rate": 4.70450627005309e-07, + "loss": 0.73963386, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.38476562, + "step": 12926, + "time_per_iteration": 2.423459053039551 + }, + { + "auxiliary_loss_clip": 0.01057565, + "auxiliary_loss_mlp": 0.01022609, + "balance_loss_clip": 1.01065207, + "balance_loss_mlp": 1.01852489, + "epoch": 0.7772132872388396, + "flos": 25628761958400.0, + "grad_norm": 1.671640839612848, + "language_loss": 0.65484583, + "learning_rate": 4.702072241981854e-07, + "loss": 0.67564762, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 12927, + "time_per_iteration": 2.416715145111084 + }, + { + "auxiliary_loss_clip": 0.01057527, + "auxiliary_loss_mlp": 0.01023603, + "balance_loss_clip": 1.01071024, + "balance_loss_mlp": 1.01913953, + "epoch": 0.7772734104915076, + "flos": 26066223694080.0, + "grad_norm": 1.7880021303500686, + "language_loss": 0.72100979, + "learning_rate": 4.699638759854303e-07, + "loss": 0.74182111, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3828125, + "step": 12928, + "time_per_iteration": 2.4396884441375732 + }, + { + "auxiliary_loss_clip": 0.01053816, + "auxiliary_loss_mlp": 0.01026425, + "balance_loss_clip": 1.01567197, + "balance_loss_mlp": 1.01726413, + "epoch": 0.7773335337441756, + "flos": 22924075708800.0, + "grad_norm": 1.5876907401959357, + "language_loss": 0.74406052, + "learning_rate": 4.6972058237573e-07, + "loss": 0.76486295, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36523438, + "step": 12929, + "time_per_iteration": 2.4062325954437256 + }, + { + "auxiliary_loss_clip": 0.01055901, + "auxiliary_loss_mlp": 0.01022934, + "balance_loss_clip": 1.01110804, + "balance_loss_mlp": 1.01854277, + "epoch": 0.7773936569968435, + "flos": 20775729974400.0, + "grad_norm": 1.9760565263155745, + "language_loss": 0.77568591, + "learning_rate": 4.6947734337776456e-07, + "loss": 0.79647434, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37304688, + "step": 12930, + "time_per_iteration": 2.4109346866607666 + }, + { + "auxiliary_loss_clip": 0.01056473, + "auxiliary_loss_mlp": 0.01026769, + "balance_loss_clip": 1.01536679, + "balance_loss_mlp": 1.01944184, + "epoch": 0.7774537802495115, + "flos": 20371331162880.0, + "grad_norm": 1.8203829613996108, + "language_loss": 0.79725552, + "learning_rate": 4.6923415900021623e-07, + "loss": 0.81808794, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37109375, + "step": 12931, + "time_per_iteration": 2.4399919509887695 + }, + { + "auxiliary_loss_clip": 0.0100689, + "auxiliary_loss_mlp": 0.01000764, + "balance_loss_clip": 0.99992961, + "balance_loss_mlp": 1.00077403, + "epoch": 0.7775139035021794, + "flos": 53908931698560.0, + "grad_norm": 0.8370283352287399, + "language_loss": 0.59797323, + "learning_rate": 4.689910292517634e-07, + "loss": 0.61804974, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.00836182, + "router_z_loss_mlp": 0.06103516, + "step": 12932, + "time_per_iteration": 3.0250189304351807 + }, + { + "auxiliary_loss_clip": 0.01055905, + "auxiliary_loss_mlp": 0.01026283, + "balance_loss_clip": 1.01466012, + "balance_loss_mlp": 1.01830912, + "epoch": 0.7775740267548474, + "flos": 28841155332480.0, + "grad_norm": 1.4529791985736242, + "language_loss": 0.76315075, + "learning_rate": 4.687479541410824e-07, + "loss": 0.78397262, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 12933, + "time_per_iteration": 3.951028823852539 + }, + { + "auxiliary_loss_clip": 0.01056365, + "auxiliary_loss_mlp": 0.01026925, + "balance_loss_clip": 1.01465225, + "balance_loss_mlp": 1.01763451, + "epoch": 0.7776341500075155, + "flos": 21871374261120.0, + "grad_norm": 2.190775776032463, + "language_loss": 0.73039347, + "learning_rate": 4.685049336768478e-07, + "loss": 0.75122643, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38671875, + "step": 12934, + "time_per_iteration": 2.396881580352783 + }, + { + "auxiliary_loss_clip": 0.01056936, + "auxiliary_loss_mlp": 0.01022955, + "balance_loss_clip": 1.01140356, + "balance_loss_mlp": 1.01817966, + "epoch": 0.7776942732601834, + "flos": 20228815526400.0, + "grad_norm": 1.6997720884681773, + "language_loss": 0.70150959, + "learning_rate": 4.682619678677331e-07, + "loss": 0.72230852, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.38671875, + "step": 12935, + "time_per_iteration": 2.427628755569458 + }, + { + "auxiliary_loss_clip": 0.01058134, + "auxiliary_loss_mlp": 0.0102505, + "balance_loss_clip": 1.0139699, + "balance_loss_mlp": 1.01954913, + "epoch": 0.7777543965128514, + "flos": 22230875197440.0, + "grad_norm": 2.006448711575701, + "language_loss": 0.83435321, + "learning_rate": 4.680190567224085e-07, + "loss": 0.85518503, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38476562, + "step": 12936, + "time_per_iteration": 2.39936900138855 + }, + { + "auxiliary_loss_clip": 0.01057994, + "auxiliary_loss_mlp": 0.01023044, + "balance_loss_clip": 1.01162946, + "balance_loss_mlp": 1.01865554, + "epoch": 0.7778145197655193, + "flos": 14501069539200.0, + "grad_norm": 1.9169549984765657, + "language_loss": 0.68054485, + "learning_rate": 4.677762002495422e-07, + "loss": 0.70135522, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39257812, + "step": 12937, + "time_per_iteration": 2.3778576850891113 + }, + { + "auxiliary_loss_clip": 0.01057693, + "auxiliary_loss_mlp": 0.01022722, + "balance_loss_clip": 1.01111746, + "balance_loss_mlp": 1.01882362, + "epoch": 0.7778746430181873, + "flos": 21139280628480.0, + "grad_norm": 1.493937737451838, + "language_loss": 0.87325472, + "learning_rate": 4.6753339845780293e-07, + "loss": 0.89405882, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38867188, + "step": 12938, + "time_per_iteration": 2.397407293319702 + }, + { + "auxiliary_loss_clip": 0.01056591, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.0130825, + "balance_loss_mlp": 1.01841617, + "epoch": 0.7779347662708552, + "flos": 20265334675200.0, + "grad_norm": 1.8868769866793496, + "language_loss": 0.85336411, + "learning_rate": 4.6729065135585456e-07, + "loss": 0.87417036, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.38085938, + "step": 12939, + "time_per_iteration": 2.410005807876587 + }, + { + "auxiliary_loss_clip": 0.0105355, + "auxiliary_loss_mlp": 0.01022258, + "balance_loss_clip": 1.0114398, + "balance_loss_mlp": 1.01791024, + "epoch": 0.7779948895235232, + "flos": 19207990016640.0, + "grad_norm": 1.6931731310896296, + "language_loss": 0.68932253, + "learning_rate": 4.6704795895236016e-07, + "loss": 0.71008062, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.35546875, + "step": 12940, + "time_per_iteration": 2.3793866634368896 + }, + { + "auxiliary_loss_clip": 0.01054382, + "auxiliary_loss_mlp": 0.01020162, + "balance_loss_clip": 1.00942075, + "balance_loss_mlp": 1.01721835, + "epoch": 0.7780550127761912, + "flos": 23913583862400.0, + "grad_norm": 1.5825159707985306, + "language_loss": 0.77741516, + "learning_rate": 4.668053212559804e-07, + "loss": 0.79816067, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37109375, + "step": 12941, + "time_per_iteration": 2.429905414581299 + }, + { + "auxiliary_loss_clip": 0.01057972, + "auxiliary_loss_mlp": 0.01027679, + "balance_loss_clip": 1.01427972, + "balance_loss_mlp": 1.01822853, + "epoch": 0.7781151360288592, + "flos": 32414585742720.0, + "grad_norm": 2.115190682506989, + "language_loss": 0.72816181, + "learning_rate": 4.6656273827537586e-07, + "loss": 0.74901831, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.3984375, + "step": 12942, + "time_per_iteration": 5.365305662155151 + }, + { + "auxiliary_loss_clip": 0.01054148, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.01558256, + "balance_loss_mlp": 1.01764631, + "epoch": 0.7781752592815271, + "flos": 22345285322880.0, + "grad_norm": 2.3067221642199094, + "language_loss": 0.67656207, + "learning_rate": 4.6632021001920163e-07, + "loss": 0.69736516, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.36523438, + "step": 12943, + "time_per_iteration": 2.4075124263763428 + }, + { + "auxiliary_loss_clip": 0.01006507, + "auxiliary_loss_mlp": 0.01001488, + "balance_loss_clip": 1.00068378, + "balance_loss_mlp": 1.00038719, + "epoch": 0.7782353825341951, + "flos": 70457030472960.0, + "grad_norm": 0.7838578031535852, + "language_loss": 0.58512431, + "learning_rate": 4.660777364961148e-07, + "loss": 0.60520428, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.00805664, + "router_z_loss_mlp": 0.06103516, + "step": 12944, + "time_per_iteration": 3.19575572013855 + }, + { + "auxiliary_loss_clip": 0.01057131, + "auxiliary_loss_mlp": 0.01026941, + "balance_loss_clip": 1.01478767, + "balance_loss_mlp": 1.01813269, + "epoch": 0.778295505786863, + "flos": 19061564307840.0, + "grad_norm": 1.8114115236329382, + "language_loss": 0.67910767, + "learning_rate": 4.6583531771476716e-07, + "loss": 0.69994843, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.390625, + "step": 12945, + "time_per_iteration": 2.3659701347351074 + }, + { + "auxiliary_loss_clip": 0.0105726, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.01522601, + "balance_loss_mlp": 1.01937759, + "epoch": 0.778355629039531, + "flos": 20998580382720.0, + "grad_norm": 1.787964550900192, + "language_loss": 0.69056308, + "learning_rate": 4.655929536838117e-07, + "loss": 0.71140879, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.37890625, + "step": 12946, + "time_per_iteration": 2.4233038425445557 + }, + { + "auxiliary_loss_clip": 0.01055139, + "auxiliary_loss_mlp": 0.01026031, + "balance_loss_clip": 1.0138483, + "balance_loss_mlp": 1.01713347, + "epoch": 0.7784157522921991, + "flos": 21397009351680.0, + "grad_norm": 1.6556425115244466, + "language_loss": 0.65674627, + "learning_rate": 4.6535064441189574e-07, + "loss": 0.67755795, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.37890625, + "step": 12947, + "time_per_iteration": 2.3881733417510986 + }, + { + "auxiliary_loss_clip": 0.01055948, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.01481223, + "balance_loss_mlp": 1.01862097, + "epoch": 0.778475875544867, + "flos": 20812807704960.0, + "grad_norm": 2.10288580102498, + "language_loss": 0.65722561, + "learning_rate": 4.651083899076682e-07, + "loss": 0.67805099, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.375, + "step": 12948, + "time_per_iteration": 2.3877623081207275 + }, + { + "auxiliary_loss_clip": 0.01058024, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.01470578, + "balance_loss_mlp": 1.01916814, + "epoch": 0.778535998797535, + "flos": 14354504184960.0, + "grad_norm": 1.9364088591844788, + "language_loss": 0.75222647, + "learning_rate": 4.648661901797746e-07, + "loss": 0.77307403, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38867188, + "step": 12949, + "time_per_iteration": 2.3685662746429443 + }, + { + "auxiliary_loss_clip": 0.01059435, + "auxiliary_loss_mlp": 0.0102674, + "balance_loss_clip": 1.01420486, + "balance_loss_mlp": 1.01969564, + "epoch": 0.7785961220502029, + "flos": 19208513687040.0, + "grad_norm": 1.5047509679841555, + "language_loss": 0.68703938, + "learning_rate": 4.646240452368566e-07, + "loss": 0.70790112, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.3984375, + "step": 12950, + "time_per_iteration": 3.8461949825286865 + }, + { + "auxiliary_loss_clip": 0.01055453, + "auxiliary_loss_mlp": 0.01022969, + "balance_loss_clip": 1.01185858, + "balance_loss_mlp": 1.01728249, + "epoch": 0.7786562453028709, + "flos": 25737586266240.0, + "grad_norm": 2.772627108145196, + "language_loss": 0.71913713, + "learning_rate": 4.643819550875576e-07, + "loss": 0.73992133, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38085938, + "step": 12951, + "time_per_iteration": 2.4139630794525146 + }, + { + "auxiliary_loss_clip": 0.01055062, + "auxiliary_loss_mlp": 0.01020887, + "balance_loss_clip": 1.00935364, + "balance_loss_mlp": 1.01887059, + "epoch": 0.7787163685555388, + "flos": 25738249582080.0, + "grad_norm": 2.631670101243215, + "language_loss": 0.66012341, + "learning_rate": 4.641399197405167e-07, + "loss": 0.68088293, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36328125, + "step": 12952, + "time_per_iteration": 2.445021629333496 + }, + { + "auxiliary_loss_clip": 0.0105166, + "auxiliary_loss_mlp": 0.01023691, + "balance_loss_clip": 1.01343286, + "balance_loss_mlp": 1.01756144, + "epoch": 0.7787764918082068, + "flos": 22746611934720.0, + "grad_norm": 2.1023596362541586, + "language_loss": 0.81366622, + "learning_rate": 4.6389793920437116e-07, + "loss": 0.83441973, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.33984375, + "step": 12953, + "time_per_iteration": 2.443286895751953 + }, + { + "auxiliary_loss_clip": 0.01057728, + "auxiliary_loss_mlp": 0.01024407, + "balance_loss_clip": 1.01256323, + "balance_loss_mlp": 1.01895261, + "epoch": 0.7788366150608748, + "flos": 15190080687360.0, + "grad_norm": 1.9940288405210427, + "language_loss": 0.76752609, + "learning_rate": 4.636560134877563e-07, + "loss": 0.78834748, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.38671875, + "step": 12954, + "time_per_iteration": 2.437037944793701 + }, + { + "auxiliary_loss_clip": 0.01056921, + "auxiliary_loss_mlp": 0.01025096, + "balance_loss_clip": 1.01479018, + "balance_loss_mlp": 1.01885533, + "epoch": 0.7788967383135428, + "flos": 21209316549120.0, + "grad_norm": 1.5464983310566653, + "language_loss": 0.72860408, + "learning_rate": 4.6341414259930703e-07, + "loss": 0.74942422, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.38085938, + "step": 12955, + "time_per_iteration": 2.3840415477752686 + }, + { + "auxiliary_loss_clip": 0.01057202, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.01399815, + "balance_loss_mlp": 1.01868367, + "epoch": 0.7789568615662107, + "flos": 21682075536000.0, + "grad_norm": 2.0318024949805307, + "language_loss": 0.68721467, + "learning_rate": 4.6317232654765434e-07, + "loss": 0.70804107, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38476562, + "step": 12956, + "time_per_iteration": 2.528681993484497 + }, + { + "auxiliary_loss_clip": 0.01056301, + "auxiliary_loss_mlp": 0.01022939, + "balance_loss_clip": 1.01221061, + "balance_loss_mlp": 1.01942277, + "epoch": 0.7790169848188787, + "flos": 26359144934400.0, + "grad_norm": 1.8589894857804874, + "language_loss": 0.76068121, + "learning_rate": 4.6293056534142814e-07, + "loss": 0.78147364, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36914062, + "step": 12957, + "time_per_iteration": 2.439155340194702 + }, + { + "auxiliary_loss_clip": 0.01055429, + "auxiliary_loss_mlp": 0.01019337, + "balance_loss_clip": 1.00811899, + "balance_loss_mlp": 1.01854658, + "epoch": 0.7790771080715466, + "flos": 25515119882880.0, + "grad_norm": 1.74748741687547, + "language_loss": 0.77261615, + "learning_rate": 4.6268885898925593e-07, + "loss": 0.79336381, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.36914062, + "step": 12958, + "time_per_iteration": 2.4492294788360596 + }, + { + "auxiliary_loss_clip": 0.01057739, + "auxiliary_loss_mlp": 0.01023409, + "balance_loss_clip": 1.0112977, + "balance_loss_mlp": 1.01878285, + "epoch": 0.7791372313242146, + "flos": 16033267866240.0, + "grad_norm": 1.9703705150669695, + "language_loss": 0.71793836, + "learning_rate": 4.6244720749976473e-07, + "loss": 0.73874986, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.390625, + "step": 12959, + "time_per_iteration": 2.3781685829162598 + }, + { + "auxiliary_loss_clip": 0.01056328, + "auxiliary_loss_mlp": 0.01026766, + "balance_loss_clip": 1.0150063, + "balance_loss_mlp": 1.01887333, + "epoch": 0.7791973545768827, + "flos": 23841069235200.0, + "grad_norm": 1.9614319094942556, + "language_loss": 0.77862674, + "learning_rate": 4.622056108815766e-07, + "loss": 0.79945767, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.375, + "step": 12960, + "time_per_iteration": 2.4131064414978027 + }, + { + "auxiliary_loss_clip": 0.01057804, + "auxiliary_loss_mlp": 0.01023273, + "balance_loss_clip": 1.01118553, + "balance_loss_mlp": 1.01891208, + "epoch": 0.7792574778295506, + "flos": 24167297779200.0, + "grad_norm": 5.273703198420149, + "language_loss": 0.7500034, + "learning_rate": 4.619640691433151e-07, + "loss": 0.77081418, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38867188, + "step": 12961, + "time_per_iteration": 2.3848788738250732 + }, + { + "auxiliary_loss_clip": 0.01055807, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.01521111, + "balance_loss_mlp": 1.02014935, + "epoch": 0.7793176010822186, + "flos": 21464007984000.0, + "grad_norm": 1.6098439409110918, + "language_loss": 0.73728615, + "learning_rate": 4.617225822935997e-07, + "loss": 0.75810885, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.35742188, + "step": 12962, + "time_per_iteration": 2.4136857986450195 + }, + { + "auxiliary_loss_clip": 0.0105757, + "auxiliary_loss_mlp": 0.01020899, + "balance_loss_clip": 1.01013458, + "balance_loss_mlp": 1.02009225, + "epoch": 0.7793777243348865, + "flos": 20665683768960.0, + "grad_norm": 1.9341339342075277, + "language_loss": 0.69762051, + "learning_rate": 4.614811503410483e-07, + "loss": 0.71840525, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.375, + "step": 12963, + "time_per_iteration": 2.3588247299194336 + }, + { + "auxiliary_loss_clip": 0.01056985, + "auxiliary_loss_mlp": 0.01023783, + "balance_loss_clip": 1.0121839, + "balance_loss_mlp": 1.01937294, + "epoch": 0.7794378475875545, + "flos": 27124545870720.0, + "grad_norm": 1.637954465454887, + "language_loss": 0.63581932, + "learning_rate": 4.6123977329427724e-07, + "loss": 0.65662694, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 12964, + "time_per_iteration": 2.443828821182251 + }, + { + "auxiliary_loss_clip": 0.01056932, + "auxiliary_loss_mlp": 0.01022862, + "balance_loss_clip": 1.01112032, + "balance_loss_mlp": 1.0183481, + "epoch": 0.7794979708402224, + "flos": 28072891664640.0, + "grad_norm": 5.763803064644594, + "language_loss": 0.76781309, + "learning_rate": 4.609984511618998e-07, + "loss": 0.78861099, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38476562, + "step": 12965, + "time_per_iteration": 2.4385461807250977 + }, + { + "auxiliary_loss_clip": 0.01051799, + "auxiliary_loss_mlp": 0.01021714, + "balance_loss_clip": 1.01100254, + "balance_loss_mlp": 1.01756716, + "epoch": 0.7795580940928905, + "flos": 26868353247360.0, + "grad_norm": 1.498230545991905, + "language_loss": 0.72928405, + "learning_rate": 4.6075718395253016e-07, + "loss": 0.75001919, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.34179688, + "step": 12966, + "time_per_iteration": 2.4373390674591064 + }, + { + "auxiliary_loss_clip": 0.01054237, + "auxiliary_loss_mlp": 0.01021587, + "balance_loss_clip": 1.01174617, + "balance_loss_mlp": 1.01784611, + "epoch": 0.7796182173455584, + "flos": 23834436076800.0, + "grad_norm": 1.4822982731412353, + "language_loss": 0.74266887, + "learning_rate": 4.605159716747762e-07, + "loss": 0.76342708, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.36328125, + "step": 12967, + "time_per_iteration": 2.4343698024749756 + }, + { + "auxiliary_loss_clip": 0.01059051, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.01785111, + "balance_loss_mlp": 1.01984394, + "epoch": 0.7796783405982264, + "flos": 19791214145280.0, + "grad_norm": 2.678464203040502, + "language_loss": 0.72016549, + "learning_rate": 4.6027481433724746e-07, + "loss": 0.74105477, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39257812, + "step": 12968, + "time_per_iteration": 2.4084391593933105 + }, + { + "auxiliary_loss_clip": 0.01056234, + "auxiliary_loss_mlp": 0.01030045, + "balance_loss_clip": 1.01835084, + "balance_loss_mlp": 1.01914644, + "epoch": 0.7797384638508943, + "flos": 15449310599040.0, + "grad_norm": 1.9892092028270463, + "language_loss": 0.60725105, + "learning_rate": 4.6003371194855e-07, + "loss": 0.62811387, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37109375, + "step": 12969, + "time_per_iteration": 2.362010955810547 + }, + { + "auxiliary_loss_clip": 0.01053624, + "auxiliary_loss_mlp": 0.01022021, + "balance_loss_clip": 1.0118345, + "balance_loss_mlp": 1.01849675, + "epoch": 0.7797985871035623, + "flos": 20921701835520.0, + "grad_norm": 1.6658561923702389, + "language_loss": 0.70477211, + "learning_rate": 4.5979266451728825e-07, + "loss": 0.72552854, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.3515625, + "step": 12970, + "time_per_iteration": 2.3880178928375244 + }, + { + "auxiliary_loss_clip": 0.01057771, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.01739264, + "balance_loss_mlp": 1.02013731, + "epoch": 0.7798587103562302, + "flos": 36935803365120.0, + "grad_norm": 2.151903896313296, + "language_loss": 0.72135067, + "learning_rate": 4.5955167205206355e-07, + "loss": 0.74222112, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37695312, + "step": 12971, + "time_per_iteration": 2.509483814239502 + }, + { + "auxiliary_loss_clip": 0.01060071, + "auxiliary_loss_mlp": 0.01024604, + "balance_loss_clip": 1.01212275, + "balance_loss_mlp": 1.02020478, + "epoch": 0.7799188336088982, + "flos": 22418183975040.0, + "grad_norm": 1.656925873981116, + "language_loss": 0.74513906, + "learning_rate": 4.593107345614782e-07, + "loss": 0.76598585, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.3984375, + "step": 12972, + "time_per_iteration": 3.837838888168335 + }, + { + "auxiliary_loss_clip": 0.01054922, + "auxiliary_loss_mlp": 0.01021776, + "balance_loss_clip": 1.01026595, + "balance_loss_mlp": 1.01803517, + "epoch": 0.7799789568615663, + "flos": 18879457322880.0, + "grad_norm": 1.8798987269931648, + "language_loss": 0.71574914, + "learning_rate": 4.590698520541292e-07, + "loss": 0.73651612, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.36914062, + "step": 12973, + "time_per_iteration": 2.3849599361419678 + }, + { + "auxiliary_loss_clip": 0.01053239, + "auxiliary_loss_mlp": 0.01023729, + "balance_loss_clip": 1.01319098, + "balance_loss_mlp": 1.01750064, + "epoch": 0.7800390801142342, + "flos": 20261354780160.0, + "grad_norm": 2.347627823319685, + "language_loss": 0.77389562, + "learning_rate": 4.588290245386135e-07, + "loss": 0.79466534, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.35742188, + "step": 12974, + "time_per_iteration": 2.3914074897766113 + }, + { + "auxiliary_loss_clip": 0.01055818, + "auxiliary_loss_mlp": 0.01021563, + "balance_loss_clip": 1.01017821, + "balance_loss_mlp": 1.01857114, + "epoch": 0.7800992033669022, + "flos": 16689390647040.0, + "grad_norm": 1.563779504469596, + "language_loss": 0.69459081, + "learning_rate": 4.585882520235251e-07, + "loss": 0.71536469, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37304688, + "step": 12975, + "time_per_iteration": 2.3535685539245605 + }, + { + "auxiliary_loss_clip": 0.01054759, + "auxiliary_loss_mlp": 0.0102137, + "balance_loss_clip": 1.01008654, + "balance_loss_mlp": 1.01876926, + "epoch": 0.7801593266195701, + "flos": 18584301755520.0, + "grad_norm": 2.0790139109240617, + "language_loss": 0.81893373, + "learning_rate": 4.583475345174581e-07, + "loss": 0.83969498, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.359375, + "step": 12976, + "time_per_iteration": 2.4159374237060547 + }, + { + "auxiliary_loss_clip": 0.01057968, + "auxiliary_loss_mlp": 0.01022388, + "balance_loss_clip": 1.01006746, + "balance_loss_mlp": 1.01802289, + "epoch": 0.7802194498722381, + "flos": 25483732704000.0, + "grad_norm": 1.441990204223938, + "language_loss": 0.65667772, + "learning_rate": 4.5810687202900087e-07, + "loss": 0.67748129, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.3984375, + "step": 12977, + "time_per_iteration": 2.4624452590942383 + }, + { + "auxiliary_loss_clip": 0.01054927, + "auxiliary_loss_mlp": 0.01016194, + "balance_loss_clip": 1.00649023, + "balance_loss_mlp": 1.01885533, + "epoch": 0.780279573124906, + "flos": 31174959542400.0, + "grad_norm": 1.5096981123858493, + "language_loss": 0.74979937, + "learning_rate": 4.578662645667437e-07, + "loss": 0.77051055, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 0.09667969, + "router_z_loss_mlp": 0.36132812, + "step": 12978, + "time_per_iteration": 2.491560220718384 + }, + { + "auxiliary_loss_clip": 0.01058031, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.0142827, + "balance_loss_mlp": 1.01898551, + "epoch": 0.780339696377574, + "flos": 26942787999360.0, + "grad_norm": 1.825533597322912, + "language_loss": 0.62227309, + "learning_rate": 4.576257121392728e-07, + "loss": 0.6431216, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 12979, + "time_per_iteration": 2.503488302230835 + }, + { + "auxiliary_loss_clip": 0.010564, + "auxiliary_loss_mlp": 0.01022424, + "balance_loss_clip": 1.01139116, + "balance_loss_mlp": 1.01884508, + "epoch": 0.780399819630242, + "flos": 27956386857600.0, + "grad_norm": 1.8376057774230747, + "language_loss": 0.77052152, + "learning_rate": 4.5738521475517265e-07, + "loss": 0.79130971, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 12980, + "time_per_iteration": 2.4577577114105225 + }, + { + "auxiliary_loss_clip": 0.01057435, + "auxiliary_loss_mlp": 0.01023887, + "balance_loss_clip": 1.01275837, + "balance_loss_mlp": 1.01866984, + "epoch": 0.78045994288291, + "flos": 22485845923200.0, + "grad_norm": 2.119278712218449, + "language_loss": 0.77781773, + "learning_rate": 4.571447724230262e-07, + "loss": 0.79863095, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.38867188, + "step": 12981, + "time_per_iteration": 5.248161554336548 + }, + { + "auxiliary_loss_clip": 0.01058375, + "auxiliary_loss_mlp": 0.01025661, + "balance_loss_clip": 1.013973, + "balance_loss_mlp": 1.02033114, + "epoch": 0.7805200661355779, + "flos": 20849780701440.0, + "grad_norm": 2.3302841867972117, + "language_loss": 0.73101944, + "learning_rate": 4.569043851514134e-07, + "loss": 0.75185978, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 12982, + "time_per_iteration": 2.4192709922790527 + }, + { + "auxiliary_loss_clip": 0.01056201, + "auxiliary_loss_mlp": 0.01023063, + "balance_loss_clip": 1.01223826, + "balance_loss_mlp": 1.01828456, + "epoch": 0.7805801893882459, + "flos": 25664792348160.0, + "grad_norm": 1.3776129054231858, + "language_loss": 0.73304307, + "learning_rate": 4.5666405294891497e-07, + "loss": 0.75383568, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37890625, + "step": 12983, + "time_per_iteration": 2.4346978664398193 + }, + { + "auxiliary_loss_clip": 0.01055825, + "auxiliary_loss_mlp": 0.01023123, + "balance_loss_clip": 1.01234674, + "balance_loss_mlp": 1.01834035, + "epoch": 0.7806403126409138, + "flos": 11327010704640.0, + "grad_norm": 2.0106680061707807, + "language_loss": 0.74648625, + "learning_rate": 4.564237758241054e-07, + "loss": 0.76727569, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.375, + "step": 12984, + "time_per_iteration": 2.3709840774536133 + }, + { + "auxiliary_loss_clip": 0.01054153, + "auxiliary_loss_mlp": 0.01023264, + "balance_loss_clip": 1.01242805, + "balance_loss_mlp": 1.01749587, + "epoch": 0.7807004358935818, + "flos": 19572343632000.0, + "grad_norm": 1.816160764779673, + "language_loss": 0.81193793, + "learning_rate": 4.561835537855614e-07, + "loss": 0.83271205, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3671875, + "step": 12985, + "time_per_iteration": 2.3776535987854004 + }, + { + "auxiliary_loss_clip": 0.01055481, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.02185845, + "balance_loss_mlp": 1.01826227, + "epoch": 0.7807605591462499, + "flos": 19134812073600.0, + "grad_norm": 1.6598290550994232, + "language_loss": 0.77035654, + "learning_rate": 4.559433868418552e-07, + "loss": 0.79123825, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37109375, + "step": 12986, + "time_per_iteration": 2.372241258621216 + }, + { + "auxiliary_loss_clip": 0.01056563, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.01694822, + "balance_loss_mlp": 1.01850367, + "epoch": 0.7808206823989178, + "flos": 32373423383040.0, + "grad_norm": 1.694754073590859, + "language_loss": 0.74077457, + "learning_rate": 4.557032750015577e-07, + "loss": 0.76162487, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37890625, + "step": 12987, + "time_per_iteration": 2.4839096069335938 + }, + { + "auxiliary_loss_clip": 0.01006388, + "auxiliary_loss_mlp": 0.01004226, + "balance_loss_clip": 1.00335586, + "balance_loss_mlp": 1.0004003, + "epoch": 0.7808808056515858, + "flos": 55046855508480.0, + "grad_norm": 0.7610185833808351, + "language_loss": 0.55067188, + "learning_rate": 4.554632182732372e-07, + "loss": 0.57077801, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.00872803, + "router_z_loss_mlp": 0.06005859, + "step": 12988, + "time_per_iteration": 3.039015769958496 + }, + { + "auxiliary_loss_clip": 0.01054581, + "auxiliary_loss_mlp": 0.01024097, + "balance_loss_clip": 1.01296902, + "balance_loss_mlp": 1.01890361, + "epoch": 0.7809409289042537, + "flos": 12858650449920.0, + "grad_norm": 3.2963077660146194, + "language_loss": 0.81061453, + "learning_rate": 4.5522321666546216e-07, + "loss": 0.83140123, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.35742188, + "step": 12989, + "time_per_iteration": 2.3590047359466553 + }, + { + "auxiliary_loss_clip": 0.01053391, + "auxiliary_loss_mlp": 0.01021536, + "balance_loss_clip": 1.01049697, + "balance_loss_mlp": 1.01725149, + "epoch": 0.7810010521569217, + "flos": 21686229987840.0, + "grad_norm": 1.4645805450566263, + "language_loss": 0.70655501, + "learning_rate": 4.5498327018679683e-07, + "loss": 0.72730422, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36132812, + "step": 12990, + "time_per_iteration": 3.865032196044922 + }, + { + "auxiliary_loss_clip": 0.01056327, + "auxiliary_loss_mlp": 0.01026285, + "balance_loss_clip": 1.0135355, + "balance_loss_mlp": 1.01871204, + "epoch": 0.7810611754095896, + "flos": 16756319456640.0, + "grad_norm": 1.9005994663659334, + "language_loss": 0.77519882, + "learning_rate": 4.5474337884580436e-07, + "loss": 0.79602492, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.37695312, + "step": 12991, + "time_per_iteration": 2.3888661861419678 + }, + { + "auxiliary_loss_clip": 0.01055828, + "auxiliary_loss_mlp": 0.01026934, + "balance_loss_clip": 1.0153172, + "balance_loss_mlp": 1.0181036, + "epoch": 0.7811212986622577, + "flos": 43505793924480.0, + "grad_norm": 1.6496296018293057, + "language_loss": 0.66055012, + "learning_rate": 4.545035426510453e-07, + "loss": 0.68137777, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37695312, + "step": 12992, + "time_per_iteration": 2.605182409286499 + }, + { + "auxiliary_loss_clip": 0.01055734, + "auxiliary_loss_mlp": 0.01024157, + "balance_loss_clip": 1.01245642, + "balance_loss_mlp": 1.01800895, + "epoch": 0.7811814219149256, + "flos": 21756754667520.0, + "grad_norm": 1.9202576828598419, + "language_loss": 0.61884129, + "learning_rate": 4.5426376161108025e-07, + "loss": 0.63964021, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37695312, + "step": 12993, + "time_per_iteration": 2.456468105316162 + }, + { + "auxiliary_loss_clip": 0.01055147, + "auxiliary_loss_mlp": 0.01021046, + "balance_loss_clip": 1.00974536, + "balance_loss_mlp": 1.01840556, + "epoch": 0.7812415451675936, + "flos": 24060358684800.0, + "grad_norm": 1.5890572502783458, + "language_loss": 0.68527895, + "learning_rate": 4.540240357344649e-07, + "loss": 0.70604086, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3671875, + "step": 12994, + "time_per_iteration": 2.4516077041625977 + }, + { + "auxiliary_loss_clip": 0.01053165, + "auxiliary_loss_mlp": 0.01021661, + "balance_loss_clip": 1.01074743, + "balance_loss_mlp": 1.01705229, + "epoch": 0.7813016684202615, + "flos": 18988700567040.0, + "grad_norm": 4.8023692459883955, + "language_loss": 0.68975806, + "learning_rate": 4.537843650297546e-07, + "loss": 0.71050626, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.359375, + "step": 12995, + "time_per_iteration": 2.4132165908813477 + }, + { + "auxiliary_loss_clip": 0.01053614, + "auxiliary_loss_mlp": 0.0102057, + "balance_loss_clip": 1.01019239, + "balance_loss_mlp": 1.0173893, + "epoch": 0.7813617916729295, + "flos": 25259730220800.0, + "grad_norm": 1.7057568450847491, + "language_loss": 0.74954396, + "learning_rate": 4.53544749505504e-07, + "loss": 0.77028579, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.36328125, + "step": 12996, + "time_per_iteration": 2.419989824295044 + }, + { + "auxiliary_loss_clip": 0.01057879, + "auxiliary_loss_mlp": 0.01022474, + "balance_loss_clip": 1.01070166, + "balance_loss_mlp": 1.01886225, + "epoch": 0.7814219149255974, + "flos": 17965117059840.0, + "grad_norm": 2.4410526026193713, + "language_loss": 0.68234217, + "learning_rate": 4.533051891702622e-07, + "loss": 0.70314574, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.390625, + "step": 12997, + "time_per_iteration": 2.3715569972991943 + }, + { + "auxiliary_loss_clip": 0.01058415, + "auxiliary_loss_mlp": 0.01027323, + "balance_loss_clip": 1.01511562, + "balance_loss_mlp": 1.01859462, + "epoch": 0.7814820381782654, + "flos": 25774978199040.0, + "grad_norm": 1.684878459066326, + "language_loss": 0.82321268, + "learning_rate": 4.5306568403258015e-07, + "loss": 0.84407008, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3984375, + "step": 12998, + "time_per_iteration": 2.448147773742676 + }, + { + "auxiliary_loss_clip": 0.01056807, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.01459467, + "balance_loss_mlp": 1.01844871, + "epoch": 0.7815421614309335, + "flos": 20518594744320.0, + "grad_norm": 2.0032686781594466, + "language_loss": 0.74658984, + "learning_rate": 4.528262341010043e-07, + "loss": 0.76743221, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.3828125, + "step": 12999, + "time_per_iteration": 2.394557476043701 + }, + { + "auxiliary_loss_clip": 0.01058552, + "auxiliary_loss_mlp": 0.01027199, + "balance_loss_clip": 1.01521254, + "balance_loss_mlp": 1.01950192, + "epoch": 0.7816022846836014, + "flos": 21286614032640.0, + "grad_norm": 1.607907167697068, + "language_loss": 0.84490597, + "learning_rate": 4.5258683938408124e-07, + "loss": 0.86576343, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 13000, + "time_per_iteration": 2.3862481117248535 + }, + { + "auxiliary_loss_clip": 0.01056387, + "auxiliary_loss_mlp": 0.01022284, + "balance_loss_clip": 1.01035142, + "balance_loss_mlp": 1.01810575, + "epoch": 0.7816624079362694, + "flos": 19207396523520.0, + "grad_norm": 1.7748933675128802, + "language_loss": 0.66026366, + "learning_rate": 4.5234749989035247e-07, + "loss": 0.68105036, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 13001, + "time_per_iteration": 2.369511365890503 + }, + { + "auxiliary_loss_clip": 0.01055534, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.01423395, + "balance_loss_mlp": 1.0175817, + "epoch": 0.7817225311889373, + "flos": 26103475981440.0, + "grad_norm": 1.5260580175032696, + "language_loss": 0.67006457, + "learning_rate": 4.521082156283609e-07, + "loss": 0.69087481, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37890625, + "step": 13002, + "time_per_iteration": 2.4213011264801025 + }, + { + "auxiliary_loss_clip": 0.01059812, + "auxiliary_loss_mlp": 0.01025339, + "balance_loss_clip": 1.01278043, + "balance_loss_mlp": 1.02035427, + "epoch": 0.7817826544416053, + "flos": 21249885415680.0, + "grad_norm": 2.293883859748109, + "language_loss": 0.72777116, + "learning_rate": 4.5186898660664543e-07, + "loss": 0.74862272, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39453125, + "step": 13003, + "time_per_iteration": 2.442657470703125 + }, + { + "auxiliary_loss_clip": 0.01059761, + "auxiliary_loss_mlp": 0.01024024, + "balance_loss_clip": 1.01262188, + "balance_loss_mlp": 1.02039814, + "epoch": 0.7818427776942732, + "flos": 19931320897920.0, + "grad_norm": 1.6157887297686708, + "language_loss": 0.62222326, + "learning_rate": 4.5162981283374346e-07, + "loss": 0.64306116, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.39453125, + "step": 13004, + "time_per_iteration": 2.374194383621216 + }, + { + "auxiliary_loss_clip": 0.01052313, + "auxiliary_loss_mlp": 0.01021972, + "balance_loss_clip": 1.01181602, + "balance_loss_mlp": 1.01742744, + "epoch": 0.7819029009469413, + "flos": 11362971271680.0, + "grad_norm": 1.88934906713201, + "language_loss": 0.82945365, + "learning_rate": 4.513906943181902e-07, + "loss": 0.85019648, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.34765625, + "step": 13005, + "time_per_iteration": 2.373286247253418 + }, + { + "auxiliary_loss_clip": 0.01054125, + "auxiliary_loss_mlp": 0.01021881, + "balance_loss_clip": 1.0104965, + "balance_loss_mlp": 1.01703477, + "epoch": 0.7819630241996092, + "flos": 24278146945920.0, + "grad_norm": 1.843007980525435, + "language_loss": 0.72785962, + "learning_rate": 4.511516310685206e-07, + "loss": 0.74861968, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.37109375, + "step": 13006, + "time_per_iteration": 2.389967679977417 + }, + { + "auxiliary_loss_clip": 0.01059219, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01878119, + "epoch": 0.7820231474522772, + "flos": 22707858458880.0, + "grad_norm": 2.329117696707678, + "language_loss": 0.71897751, + "learning_rate": 4.5091262309326404e-07, + "loss": 0.73984426, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40429688, + "step": 13007, + "time_per_iteration": 2.4253454208374023 + }, + { + "auxiliary_loss_clip": 0.01057608, + "auxiliary_loss_mlp": 0.01022589, + "balance_loss_clip": 1.01017904, + "balance_loss_mlp": 1.01783085, + "epoch": 0.7820832707049451, + "flos": 20046394339200.0, + "grad_norm": 2.275106236257389, + "language_loss": 0.69050425, + "learning_rate": 4.5067367040095196e-07, + "loss": 0.71130621, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.3984375, + "step": 13008, + "time_per_iteration": 2.373225688934326 + }, + { + "auxiliary_loss_clip": 0.01055758, + "auxiliary_loss_mlp": 0.01020892, + "balance_loss_clip": 1.0088222, + "balance_loss_mlp": 1.0176909, + "epoch": 0.7821433939576131, + "flos": 27161553778560.0, + "grad_norm": 1.834140736153072, + "language_loss": 0.6973294, + "learning_rate": 4.50434773000111e-07, + "loss": 0.7180959, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38085938, + "step": 13009, + "time_per_iteration": 2.445242166519165 + }, + { + "auxiliary_loss_clip": 0.01056062, + "auxiliary_loss_mlp": 0.010218, + "balance_loss_clip": 1.01036191, + "balance_loss_mlp": 1.01805425, + "epoch": 0.782203517210281, + "flos": 22600954275840.0, + "grad_norm": 1.711641962951911, + "language_loss": 0.81680638, + "learning_rate": 4.5019593089926735e-07, + "loss": 0.83758503, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37890625, + "step": 13010, + "time_per_iteration": 2.374439001083374 + }, + { + "auxiliary_loss_clip": 0.01054556, + "auxiliary_loss_mlp": 0.01023653, + "balance_loss_clip": 1.01271534, + "balance_loss_mlp": 1.01789367, + "epoch": 0.782263640462949, + "flos": 29058524657280.0, + "grad_norm": 1.4308151734858463, + "language_loss": 0.62445432, + "learning_rate": 4.4995714410694405e-07, + "loss": 0.64523637, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3671875, + "step": 13011, + "time_per_iteration": 2.4564850330352783 + }, + { + "auxiliary_loss_clip": 0.01055398, + "auxiliary_loss_mlp": 0.01020759, + "balance_loss_clip": 1.01023293, + "balance_loss_mlp": 1.0183953, + "epoch": 0.782323763715617, + "flos": 25298378962560.0, + "grad_norm": 1.6902777967848441, + "language_loss": 0.70332861, + "learning_rate": 4.4971841263166263e-07, + "loss": 0.72409022, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36914062, + "step": 13012, + "time_per_iteration": 3.8740835189819336 + }, + { + "auxiliary_loss_clip": 0.01006708, + "auxiliary_loss_mlp": 0.01003251, + "balance_loss_clip": 1.0023216, + "balance_loss_mlp": 1.00065994, + "epoch": 0.782383886968285, + "flos": 65190695414400.0, + "grad_norm": 0.7042973302135321, + "language_loss": 0.5497967, + "learning_rate": 4.4947973648194446e-07, + "loss": 0.56989634, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06054688, + "step": 13013, + "time_per_iteration": 3.1282777786254883 + }, + { + "auxiliary_loss_clip": 0.01060106, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01516509, + "balance_loss_mlp": 1.01998711, + "epoch": 0.782444010220953, + "flos": 18404464008960.0, + "grad_norm": 1.7637872527446783, + "language_loss": 0.65677941, + "learning_rate": 4.4924111566630474e-07, + "loss": 0.67765319, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 13014, + "time_per_iteration": 2.400663137435913 + }, + { + "auxiliary_loss_clip": 0.01006717, + "auxiliary_loss_mlp": 0.01002075, + "balance_loss_clip": 1.00119317, + "balance_loss_mlp": 1.00067925, + "epoch": 0.7825041334736209, + "flos": 63951313593600.0, + "grad_norm": 0.7281890173480887, + "language_loss": 0.58456677, + "learning_rate": 4.4900255019326126e-07, + "loss": 0.60465467, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.06054688, + "step": 13015, + "time_per_iteration": 3.138761281967163 + }, + { + "auxiliary_loss_clip": 0.01056437, + "auxiliary_loss_mlp": 0.01025682, + "balance_loss_clip": 1.01432681, + "balance_loss_mlp": 1.01937354, + "epoch": 0.7825642567262889, + "flos": 20338338061440.0, + "grad_norm": 1.615853543453255, + "language_loss": 0.74258363, + "learning_rate": 4.4876404007132663e-07, + "loss": 0.76340485, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37109375, + "step": 13016, + "time_per_iteration": 2.398668050765991 + }, + { + "auxiliary_loss_clip": 0.0105722, + "auxiliary_loss_mlp": 0.01025255, + "balance_loss_clip": 1.01246369, + "balance_loss_mlp": 1.01823878, + "epoch": 0.7826243799789568, + "flos": 20262018096000.0, + "grad_norm": 2.2279493579108136, + "language_loss": 0.74982667, + "learning_rate": 4.4852558530901417e-07, + "loss": 0.77065134, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.390625, + "step": 13017, + "time_per_iteration": 2.3834848403930664 + }, + { + "auxiliary_loss_clip": 0.01058085, + "auxiliary_loss_mlp": 0.01023847, + "balance_loss_clip": 1.01180649, + "balance_loss_mlp": 1.01847458, + "epoch": 0.7826845032316249, + "flos": 21132123799680.0, + "grad_norm": 2.210300434558128, + "language_loss": 0.75848496, + "learning_rate": 4.4828718591483185e-07, + "loss": 0.77930427, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39648438, + "step": 13018, + "time_per_iteration": 2.383453130722046 + }, + { + "auxiliary_loss_clip": 0.01055669, + "auxiliary_loss_mlp": 0.01023898, + "balance_loss_clip": 1.01255512, + "balance_loss_mlp": 1.01829398, + "epoch": 0.7827446264842928, + "flos": 22491152449920.0, + "grad_norm": 2.018536978943794, + "language_loss": 0.7727114, + "learning_rate": 4.4804884189728855e-07, + "loss": 0.7935071, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.375, + "step": 13019, + "time_per_iteration": 2.389157295227051 + }, + { + "auxiliary_loss_clip": 0.01054132, + "auxiliary_loss_mlp": 0.01021049, + "balance_loss_clip": 1.00959909, + "balance_loss_mlp": 1.01761103, + "epoch": 0.7828047497369608, + "flos": 28839374853120.0, + "grad_norm": 1.9671978042345706, + "language_loss": 0.6770581, + "learning_rate": 4.4781055326489016e-07, + "loss": 0.69780993, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.36523438, + "step": 13020, + "time_per_iteration": 2.4556071758270264 + }, + { + "auxiliary_loss_clip": 0.0105865, + "auxiliary_loss_mlp": 0.01024281, + "balance_loss_clip": 1.01287234, + "balance_loss_mlp": 1.01985002, + "epoch": 0.7828648729896287, + "flos": 23256588297600.0, + "grad_norm": 1.9647031890086917, + "language_loss": 0.62272978, + "learning_rate": 4.475723200261405e-07, + "loss": 0.6435591, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38867188, + "step": 13021, + "time_per_iteration": 5.222066402435303 + }, + { + "auxiliary_loss_clip": 0.0105451, + "auxiliary_loss_mlp": 0.01024648, + "balance_loss_clip": 1.01442575, + "balance_loss_mlp": 1.01845658, + "epoch": 0.7829249962422967, + "flos": 25264478165760.0, + "grad_norm": 1.6314099273607383, + "language_loss": 0.69236737, + "learning_rate": 4.473341421895409e-07, + "loss": 0.71315902, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.359375, + "step": 13022, + "time_per_iteration": 2.4167442321777344 + }, + { + "auxiliary_loss_clip": 0.01006725, + "auxiliary_loss_mlp": 0.01001348, + "balance_loss_clip": 1.00047779, + "balance_loss_mlp": 1.00070524, + "epoch": 0.7829851194949646, + "flos": 70718704179840.0, + "grad_norm": 0.6387867294017018, + "language_loss": 0.56186354, + "learning_rate": 4.4709601976359267e-07, + "loss": 0.58194423, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.00872803, + "router_z_loss_mlp": 0.06005859, + "step": 13023, + "time_per_iteration": 3.164440631866455 + }, + { + "auxiliary_loss_clip": 0.01052639, + "auxiliary_loss_mlp": 0.01020125, + "balance_loss_clip": 1.00993848, + "balance_loss_mlp": 1.01692748, + "epoch": 0.7830452427476327, + "flos": 25659765112320.0, + "grad_norm": 1.628757256238824, + "language_loss": 0.67264646, + "learning_rate": 4.468579527567922e-07, + "loss": 0.69337416, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.35546875, + "step": 13024, + "time_per_iteration": 2.3996026515960693 + }, + { + "auxiliary_loss_clip": 0.01057356, + "auxiliary_loss_mlp": 0.01021212, + "balance_loss_clip": 1.00918412, + "balance_loss_mlp": 1.0187552, + "epoch": 0.7831053660003006, + "flos": 22783200906240.0, + "grad_norm": 2.0596948368439145, + "language_loss": 0.77846003, + "learning_rate": 4.466199411776366e-07, + "loss": 0.79924572, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38671875, + "step": 13025, + "time_per_iteration": 2.4500997066497803 + }, + { + "auxiliary_loss_clip": 0.01056946, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.01827407, + "balance_loss_mlp": 1.01924133, + "epoch": 0.7831654892529686, + "flos": 25811078411520.0, + "grad_norm": 1.5962096288048664, + "language_loss": 0.78169477, + "learning_rate": 4.463819850346193e-07, + "loss": 0.80255485, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37890625, + "step": 13026, + "time_per_iteration": 2.413207769393921 + }, + { + "auxiliary_loss_clip": 0.01055493, + "auxiliary_loss_mlp": 0.01021518, + "balance_loss_clip": 1.01048529, + "balance_loss_mlp": 1.01942277, + "epoch": 0.7832256125056366, + "flos": 20770667827200.0, + "grad_norm": 1.8436647043500294, + "language_loss": 0.75473273, + "learning_rate": 4.4614408433623295e-07, + "loss": 0.7755028, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.359375, + "step": 13027, + "time_per_iteration": 2.3690052032470703 + }, + { + "auxiliary_loss_clip": 0.01057012, + "auxiliary_loss_mlp": 0.01021198, + "balance_loss_clip": 1.01043916, + "balance_loss_mlp": 1.0194838, + "epoch": 0.7832857357583045, + "flos": 21505484545920.0, + "grad_norm": 1.792845266195981, + "language_loss": 0.70334578, + "learning_rate": 4.459062390909669e-07, + "loss": 0.72412789, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.375, + "step": 13028, + "time_per_iteration": 2.3793179988861084 + }, + { + "auxiliary_loss_clip": 0.01060412, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.01968086, + "balance_loss_mlp": 1.01979756, + "epoch": 0.7833458590109725, + "flos": 18076804099200.0, + "grad_norm": 2.067897904574163, + "language_loss": 0.6920681, + "learning_rate": 4.456684493073093e-07, + "loss": 0.71299231, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.40625, + "step": 13029, + "time_per_iteration": 3.8080103397369385 + }, + { + "auxiliary_loss_clip": 0.0105739, + "auxiliary_loss_mlp": 0.01025491, + "balance_loss_clip": 1.01293206, + "balance_loss_mlp": 1.01837206, + "epoch": 0.7834059822636404, + "flos": 28287607726080.0, + "grad_norm": 1.8228276849709655, + "language_loss": 0.81119555, + "learning_rate": 4.454307149937475e-07, + "loss": 0.83202434, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.390625, + "step": 13030, + "time_per_iteration": 2.4426496028900146 + }, + { + "auxiliary_loss_clip": 0.01057465, + "auxiliary_loss_mlp": 0.0102464, + "balance_loss_clip": 1.01251638, + "balance_loss_mlp": 1.01719975, + "epoch": 0.7834661055163085, + "flos": 31684866082560.0, + "grad_norm": 2.331717073103173, + "language_loss": 0.70366287, + "learning_rate": 4.451930361587637e-07, + "loss": 0.72448397, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 13031, + "time_per_iteration": 2.5349555015563965 + }, + { + "auxiliary_loss_clip": 0.01057157, + "auxiliary_loss_mlp": 0.01022981, + "balance_loss_clip": 1.0112927, + "balance_loss_mlp": 1.01796985, + "epoch": 0.7835262287689764, + "flos": 12932352063360.0, + "grad_norm": 3.4745002110474323, + "language_loss": 0.78697777, + "learning_rate": 4.4495541281084126e-07, + "loss": 0.80777907, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39257812, + "step": 13032, + "time_per_iteration": 2.3827695846557617 + }, + { + "auxiliary_loss_clip": 0.01061034, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.01391184, + "balance_loss_mlp": 1.02095282, + "epoch": 0.7835863520216444, + "flos": 16142301642240.0, + "grad_norm": 2.0248629307474717, + "language_loss": 0.62625599, + "learning_rate": 4.4471784495845986e-07, + "loss": 0.64712453, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.40039062, + "step": 13033, + "time_per_iteration": 2.3838448524475098 + }, + { + "auxiliary_loss_clip": 0.01055882, + "auxiliary_loss_mlp": 0.01025135, + "balance_loss_clip": 1.01305342, + "balance_loss_mlp": 1.01801777, + "epoch": 0.7836464752743123, + "flos": 11509117689600.0, + "grad_norm": 1.3804181097164798, + "language_loss": 0.71952069, + "learning_rate": 4.444803326100988e-07, + "loss": 0.74033087, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.37890625, + "step": 13034, + "time_per_iteration": 2.370311737060547 + }, + { + "auxiliary_loss_clip": 0.01058304, + "auxiliary_loss_mlp": 0.01024082, + "balance_loss_clip": 1.01244676, + "balance_loss_mlp": 1.01956606, + "epoch": 0.7837065985269803, + "flos": 18222706137600.0, + "grad_norm": 1.8556799603930623, + "language_loss": 0.72093219, + "learning_rate": 4.442428757742322e-07, + "loss": 0.74175608, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38671875, + "step": 13035, + "time_per_iteration": 2.3869528770446777 + }, + { + "auxiliary_loss_clip": 0.01058073, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.01642752, + "balance_loss_mlp": 1.01965332, + "epoch": 0.7837667217796482, + "flos": 24753244993920.0, + "grad_norm": 2.2146103350746515, + "language_loss": 0.66309512, + "learning_rate": 4.4400547445933624e-07, + "loss": 0.68396103, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38476562, + "step": 13036, + "time_per_iteration": 2.4489943981170654 + }, + { + "auxiliary_loss_clip": 0.01060399, + "auxiliary_loss_mlp": 0.01026433, + "balance_loss_clip": 1.01334357, + "balance_loss_mlp": 1.0191474, + "epoch": 0.7838268450323163, + "flos": 22382013939840.0, + "grad_norm": 1.9219887143205314, + "language_loss": 0.73008239, + "learning_rate": 4.4376812867388236e-07, + "loss": 0.75095063, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.4140625, + "step": 13037, + "time_per_iteration": 2.419644594192505 + }, + { + "auxiliary_loss_clip": 0.01056587, + "auxiliary_loss_mlp": 0.01027091, + "balance_loss_clip": 1.01536715, + "balance_loss_mlp": 1.01853609, + "epoch": 0.7838869682849842, + "flos": 19499270423040.0, + "grad_norm": 1.9519259684520127, + "language_loss": 0.60136932, + "learning_rate": 4.4353083842634077e-07, + "loss": 0.62220609, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38085938, + "step": 13038, + "time_per_iteration": 2.37154221534729 + }, + { + "auxiliary_loss_clip": 0.01055324, + "auxiliary_loss_mlp": 0.01022285, + "balance_loss_clip": 1.01158023, + "balance_loss_mlp": 1.01859546, + "epoch": 0.7839470915376522, + "flos": 32891394447360.0, + "grad_norm": 1.5238491851912368, + "language_loss": 0.68031681, + "learning_rate": 4.4329360372517957e-07, + "loss": 0.70109284, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.3671875, + "step": 13039, + "time_per_iteration": 2.518425464630127 + }, + { + "auxiliary_loss_clip": 0.01052783, + "auxiliary_loss_mlp": 0.01020248, + "balance_loss_clip": 1.00998402, + "balance_loss_mlp": 1.01617515, + "epoch": 0.7840072147903202, + "flos": 29674811710080.0, + "grad_norm": 1.93894028868375, + "language_loss": 0.68990904, + "learning_rate": 4.430564245788662e-07, + "loss": 0.71063936, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.3671875, + "step": 13040, + "time_per_iteration": 2.44022798538208 + }, + { + "auxiliary_loss_clip": 0.01057557, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.0135262, + "balance_loss_mlp": 1.01870036, + "epoch": 0.7840673380429881, + "flos": 18185768052480.0, + "grad_norm": 1.9678596583393781, + "language_loss": 0.67202604, + "learning_rate": 4.428193009958634e-07, + "loss": 0.69285768, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38867188, + "step": 13041, + "time_per_iteration": 2.3916399478912354 + }, + { + "auxiliary_loss_clip": 0.01006741, + "auxiliary_loss_mlp": 0.01000602, + "balance_loss_clip": 0.99962491, + "balance_loss_mlp": 1.0006882, + "epoch": 0.7841274612956561, + "flos": 66342725280000.0, + "grad_norm": 0.6606197163111749, + "language_loss": 0.57451409, + "learning_rate": 4.425822329846338e-07, + "loss": 0.5945875, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.06054688, + "step": 13042, + "time_per_iteration": 2.9967308044433594 + }, + { + "auxiliary_loss_clip": 0.01056964, + "auxiliary_loss_mlp": 0.01021736, + "balance_loss_clip": 1.01008391, + "balance_loss_mlp": 1.01850748, + "epoch": 0.784187584548324, + "flos": 26647353141120.0, + "grad_norm": 2.3031858905355302, + "language_loss": 0.83866662, + "learning_rate": 4.4234522055363885e-07, + "loss": 0.85945356, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.38476562, + "step": 13043, + "time_per_iteration": 2.431544303894043 + }, + { + "auxiliary_loss_clip": 0.01055016, + "auxiliary_loss_mlp": 0.01018811, + "balance_loss_clip": 1.00812984, + "balance_loss_mlp": 1.01832795, + "epoch": 0.7842477078009921, + "flos": 25738947809280.0, + "grad_norm": 1.4018061654608374, + "language_loss": 0.88539112, + "learning_rate": 4.42108263711336e-07, + "loss": 0.90612936, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.3671875, + "step": 13044, + "time_per_iteration": 2.421405076980591 + }, + { + "auxiliary_loss_clip": 0.01058427, + "auxiliary_loss_mlp": 0.01026145, + "balance_loss_clip": 1.01354456, + "balance_loss_mlp": 1.01887047, + "epoch": 0.78430783105366, + "flos": 21979884366720.0, + "grad_norm": 1.849388508466791, + "language_loss": 0.79021835, + "learning_rate": 4.4187136246618183e-07, + "loss": 0.81106406, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39648438, + "step": 13045, + "time_per_iteration": 2.4509806632995605 + }, + { + "auxiliary_loss_clip": 0.01055897, + "auxiliary_loss_mlp": 0.01024139, + "balance_loss_clip": 1.01295757, + "balance_loss_mlp": 1.01878428, + "epoch": 0.784367954306328, + "flos": 23841139057920.0, + "grad_norm": 1.4685160073893746, + "language_loss": 0.72320235, + "learning_rate": 4.4163451682663045e-07, + "loss": 0.74400276, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37109375, + "step": 13046, + "time_per_iteration": 2.412144422531128 + }, + { + "auxiliary_loss_clip": 0.01055544, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.01509213, + "balance_loss_mlp": 1.01886272, + "epoch": 0.7844280775589959, + "flos": 24825515241600.0, + "grad_norm": 1.618189154896625, + "language_loss": 0.63229024, + "learning_rate": 4.413977268011355e-07, + "loss": 0.65311539, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.3671875, + "step": 13047, + "time_per_iteration": 2.4402246475219727 + }, + { + "auxiliary_loss_clip": 0.01056608, + "auxiliary_loss_mlp": 0.01022824, + "balance_loss_clip": 1.01139212, + "balance_loss_mlp": 1.01899242, + "epoch": 0.7844882008116639, + "flos": 22454563478400.0, + "grad_norm": 1.764364491943588, + "language_loss": 0.73106837, + "learning_rate": 4.411609923981454e-07, + "loss": 0.75186265, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37695312, + "step": 13048, + "time_per_iteration": 2.3915462493896484 + }, + { + "auxiliary_loss_clip": 0.01058833, + "auxiliary_loss_mlp": 0.01023718, + "balance_loss_clip": 1.01166606, + "balance_loss_mlp": 1.02022958, + "epoch": 0.7845483240643318, + "flos": 26102847576960.0, + "grad_norm": 2.327737703916881, + "language_loss": 0.74689901, + "learning_rate": 4.4092431362611006e-07, + "loss": 0.76772451, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.38476562, + "step": 13049, + "time_per_iteration": 2.4360105991363525 + }, + { + "auxiliary_loss_clip": 0.01059547, + "auxiliary_loss_mlp": 0.01023933, + "balance_loss_clip": 1.01058209, + "balance_loss_mlp": 1.01872063, + "epoch": 0.7846084473169999, + "flos": 19353298561920.0, + "grad_norm": 1.7187736110558944, + "language_loss": 0.72476423, + "learning_rate": 4.406876904934758e-07, + "loss": 0.74559903, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.40820312, + "step": 13050, + "time_per_iteration": 2.39689564704895 + }, + { + "auxiliary_loss_clip": 0.01056877, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.01711035, + "balance_loss_mlp": 1.01945269, + "epoch": 0.7846685705696678, + "flos": 23324843738880.0, + "grad_norm": 1.9839593739656485, + "language_loss": 0.73651576, + "learning_rate": 4.404511230086867e-07, + "loss": 0.75736487, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.375, + "step": 13051, + "time_per_iteration": 3.862736701965332 + }, + { + "auxiliary_loss_clip": 0.01057977, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.01435757, + "balance_loss_mlp": 1.01889038, + "epoch": 0.7847286938223358, + "flos": 35808073672320.0, + "grad_norm": 1.8854553075285443, + "language_loss": 0.73380387, + "learning_rate": 4.4021461118018476e-07, + "loss": 0.75465554, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.390625, + "step": 13052, + "time_per_iteration": 2.520700454711914 + }, + { + "auxiliary_loss_clip": 0.01053863, + "auxiliary_loss_mlp": 0.0102432, + "balance_loss_clip": 1.0132159, + "balance_loss_mlp": 1.0181756, + "epoch": 0.7847888170750038, + "flos": 18477188104320.0, + "grad_norm": 1.8552694748860015, + "language_loss": 0.68783903, + "learning_rate": 4.399781550164119e-07, + "loss": 0.70862085, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.35742188, + "step": 13053, + "time_per_iteration": 2.3850817680358887 + }, + { + "auxiliary_loss_clip": 0.01054793, + "auxiliary_loss_mlp": 0.01020588, + "balance_loss_clip": 1.00964451, + "balance_loss_mlp": 1.01809144, + "epoch": 0.7848489403276717, + "flos": 25117982634240.0, + "grad_norm": 1.873009225448655, + "language_loss": 0.70140213, + "learning_rate": 4.3974175452580555e-07, + "loss": 0.72215593, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.3671875, + "step": 13054, + "time_per_iteration": 2.405362367630005 + }, + { + "auxiliary_loss_clip": 0.01058266, + "auxiliary_loss_mlp": 0.01024704, + "balance_loss_clip": 1.01278281, + "balance_loss_mlp": 1.01918221, + "epoch": 0.7849090635803397, + "flos": 26501311457280.0, + "grad_norm": 1.636316004598888, + "language_loss": 0.7585659, + "learning_rate": 4.395054097168027e-07, + "loss": 0.77939558, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.390625, + "step": 13055, + "time_per_iteration": 2.5897514820098877 + }, + { + "auxiliary_loss_clip": 0.01059768, + "auxiliary_loss_mlp": 0.01026476, + "balance_loss_clip": 1.01369071, + "balance_loss_mlp": 1.0186075, + "epoch": 0.7849691868330076, + "flos": 20958605009280.0, + "grad_norm": 1.8087863555374721, + "language_loss": 0.7463491, + "learning_rate": 4.3926912059783763e-07, + "loss": 0.7672115, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.41015625, + "step": 13056, + "time_per_iteration": 2.3856632709503174 + }, + { + "auxiliary_loss_clip": 0.01056463, + "auxiliary_loss_mlp": 0.01021243, + "balance_loss_clip": 1.00930452, + "balance_loss_mlp": 1.01849174, + "epoch": 0.7850293100856757, + "flos": 26066293516800.0, + "grad_norm": 1.5878407692364005, + "language_loss": 0.70187062, + "learning_rate": 4.39032887177343e-07, + "loss": 0.72264767, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.37890625, + "step": 13057, + "time_per_iteration": 2.4534389972686768 + }, + { + "auxiliary_loss_clip": 0.01056932, + "auxiliary_loss_mlp": 0.01022225, + "balance_loss_clip": 1.01088214, + "balance_loss_mlp": 1.01920772, + "epoch": 0.7850894333383436, + "flos": 22490803336320.0, + "grad_norm": 1.7336920452067808, + "language_loss": 0.76825356, + "learning_rate": 4.3879670946374923e-07, + "loss": 0.7890451, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 13058, + "time_per_iteration": 2.3897364139556885 + }, + { + "auxiliary_loss_clip": 0.01057872, + "auxiliary_loss_mlp": 0.01023272, + "balance_loss_clip": 1.0112915, + "balance_loss_mlp": 1.01999521, + "epoch": 0.7851495565910116, + "flos": 20557592599680.0, + "grad_norm": 2.623544220880909, + "language_loss": 0.7080701, + "learning_rate": 4.385605874654845e-07, + "loss": 0.72888154, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.37890625, + "step": 13059, + "time_per_iteration": 2.388418436050415 + }, + { + "auxiliary_loss_clip": 0.0105984, + "auxiliary_loss_mlp": 0.01024657, + "balance_loss_clip": 1.01178861, + "balance_loss_mlp": 1.01920533, + "epoch": 0.7852096798436795, + "flos": 15923919888000.0, + "grad_norm": 2.043361683345302, + "language_loss": 0.7678445, + "learning_rate": 4.383245211909765e-07, + "loss": 0.78868949, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 13060, + "time_per_iteration": 2.3641040325164795 + }, + { + "auxiliary_loss_clip": 0.01057436, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.01249647, + "balance_loss_mlp": 1.01919341, + "epoch": 0.7852698030963475, + "flos": 19061285016960.0, + "grad_norm": 2.0946224908290816, + "language_loss": 0.76887745, + "learning_rate": 4.380885106486494e-07, + "loss": 0.7896989, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3828125, + "step": 13061, + "time_per_iteration": 5.2913124561309814 + }, + { + "auxiliary_loss_clip": 0.01057245, + "auxiliary_loss_mlp": 0.01023287, + "balance_loss_clip": 1.01137769, + "balance_loss_mlp": 1.0187999, + "epoch": 0.7853299263490154, + "flos": 24643233699840.0, + "grad_norm": 2.1251084746642137, + "language_loss": 0.7423557, + "learning_rate": 4.378525558469255e-07, + "loss": 0.763161, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38476562, + "step": 13062, + "time_per_iteration": 2.4266674518585205 + }, + { + "auxiliary_loss_clip": 0.01055108, + "auxiliary_loss_mlp": 0.01024223, + "balance_loss_clip": 1.01258278, + "balance_loss_mlp": 1.01815379, + "epoch": 0.7853900496016835, + "flos": 22016892274560.0, + "grad_norm": 1.3785267866413355, + "language_loss": 0.68563467, + "learning_rate": 4.37616656794225e-07, + "loss": 0.70642799, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.36914062, + "step": 13063, + "time_per_iteration": 2.4046854972839355 + }, + { + "auxiliary_loss_clip": 0.01058314, + "auxiliary_loss_mlp": 0.01026715, + "balance_loss_clip": 1.01536584, + "balance_loss_mlp": 1.02105653, + "epoch": 0.7854501728543514, + "flos": 30226090078080.0, + "grad_norm": 5.26900995597084, + "language_loss": 0.72889233, + "learning_rate": 4.3738081349896805e-07, + "loss": 0.74974263, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37109375, + "step": 13064, + "time_per_iteration": 2.448859453201294 + }, + { + "auxiliary_loss_clip": 0.01058368, + "auxiliary_loss_mlp": 0.01025473, + "balance_loss_clip": 1.01401651, + "balance_loss_mlp": 1.01820552, + "epoch": 0.7855102961070194, + "flos": 18842693794560.0, + "grad_norm": 1.6345751441980818, + "language_loss": 0.6829797, + "learning_rate": 4.3714502596956926e-07, + "loss": 0.7038182, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.40234375, + "step": 13065, + "time_per_iteration": 2.3666253089904785 + }, + { + "auxiliary_loss_clip": 0.01056614, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.01651907, + "balance_loss_mlp": 1.01882255, + "epoch": 0.7855704193596874, + "flos": 22308870908160.0, + "grad_norm": 1.8251744263354364, + "language_loss": 0.76703262, + "learning_rate": 4.36909294214445e-07, + "loss": 0.78787982, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.37890625, + "step": 13066, + "time_per_iteration": 2.391582727432251 + }, + { + "auxiliary_loss_clip": 0.01006838, + "auxiliary_loss_mlp": 0.01001085, + "balance_loss_clip": 1.00009549, + "balance_loss_mlp": 1.00061858, + "epoch": 0.7856305426123553, + "flos": 60001136035200.0, + "grad_norm": 0.7198335005273792, + "language_loss": 0.5340693, + "learning_rate": 4.366736182420074e-07, + "loss": 0.55414855, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.06201172, + "step": 13067, + "time_per_iteration": 3.1003193855285645 + }, + { + "auxiliary_loss_clip": 0.01057343, + "auxiliary_loss_mlp": 0.01025714, + "balance_loss_clip": 1.01343513, + "balance_loss_mlp": 1.01840734, + "epoch": 0.7856906658650233, + "flos": 21867603834240.0, + "grad_norm": 1.523898612655227, + "language_loss": 0.67802584, + "learning_rate": 4.3643799806066693e-07, + "loss": 0.69885635, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.38867188, + "step": 13068, + "time_per_iteration": 2.403552532196045 + }, + { + "auxiliary_loss_clip": 0.01056686, + "auxiliary_loss_mlp": 0.01024291, + "balance_loss_clip": 1.01295972, + "balance_loss_mlp": 1.01991296, + "epoch": 0.7857507891176913, + "flos": 23621814696960.0, + "grad_norm": 1.9515606513427683, + "language_loss": 0.67656797, + "learning_rate": 4.3620243367883167e-07, + "loss": 0.6973778, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3671875, + "step": 13069, + "time_per_iteration": 3.898364782333374 + }, + { + "auxiliary_loss_clip": 0.01057978, + "auxiliary_loss_mlp": 0.01025167, + "balance_loss_clip": 1.01260257, + "balance_loss_mlp": 1.01935816, + "epoch": 0.7858109123703593, + "flos": 25518890309760.0, + "grad_norm": 1.4628044204728512, + "language_loss": 0.76938051, + "learning_rate": 4.359669251049096e-07, + "loss": 0.79021198, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.38671875, + "step": 13070, + "time_per_iteration": 2.4138858318328857 + }, + { + "auxiliary_loss_clip": 0.01055619, + "auxiliary_loss_mlp": 0.01022513, + "balance_loss_clip": 1.01143241, + "balance_loss_mlp": 1.01803207, + "epoch": 0.7858710356230272, + "flos": 17456432417280.0, + "grad_norm": 1.8587597528840825, + "language_loss": 0.66169971, + "learning_rate": 4.3573147234730536e-07, + "loss": 0.68248105, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.375, + "step": 13071, + "time_per_iteration": 2.3856303691864014 + }, + { + "auxiliary_loss_clip": 0.01055488, + "auxiliary_loss_mlp": 0.01022406, + "balance_loss_clip": 1.01127815, + "balance_loss_mlp": 1.01780891, + "epoch": 0.7859311588756952, + "flos": 24678565862400.0, + "grad_norm": 1.5736295997627603, + "language_loss": 0.74165046, + "learning_rate": 4.3549607541441993e-07, + "loss": 0.76242942, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 13072, + "time_per_iteration": 2.45031476020813 + }, + { + "auxiliary_loss_clip": 0.01058046, + "auxiliary_loss_mlp": 0.010218, + "balance_loss_clip": 1.00960505, + "balance_loss_mlp": 1.01945019, + "epoch": 0.7859912821283631, + "flos": 21798056672640.0, + "grad_norm": 1.7659676567235922, + "language_loss": 0.71232188, + "learning_rate": 4.352607343146559e-07, + "loss": 0.73312032, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38671875, + "step": 13073, + "time_per_iteration": 2.4287004470825195 + }, + { + "auxiliary_loss_clip": 0.01056828, + "auxiliary_loss_mlp": 0.01023341, + "balance_loss_clip": 1.01125908, + "balance_loss_mlp": 1.01821899, + "epoch": 0.7860514053810311, + "flos": 20846324476800.0, + "grad_norm": 1.7080497622334085, + "language_loss": 0.7721228, + "learning_rate": 4.3502544905641113e-07, + "loss": 0.79292452, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38671875, + "step": 13074, + "time_per_iteration": 2.3892223834991455 + }, + { + "auxiliary_loss_clip": 0.01054903, + "auxiliary_loss_mlp": 0.0102193, + "balance_loss_clip": 1.01049805, + "balance_loss_mlp": 1.01857924, + "epoch": 0.786111528633699, + "flos": 24314561360640.0, + "grad_norm": 1.547872704373708, + "language_loss": 0.74929428, + "learning_rate": 4.347902196480826e-07, + "loss": 0.77006269, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.36328125, + "step": 13075, + "time_per_iteration": 2.4187567234039307 + }, + { + "auxiliary_loss_clip": 0.01056026, + "auxiliary_loss_mlp": 0.01023945, + "balance_loss_clip": 1.01285839, + "balance_loss_mlp": 1.01829886, + "epoch": 0.7861716518863671, + "flos": 24352023116160.0, + "grad_norm": 1.744560709507233, + "language_loss": 0.69105679, + "learning_rate": 4.3455504609806426e-07, + "loss": 0.71185648, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 13076, + "time_per_iteration": 2.4407360553741455 + }, + { + "auxiliary_loss_clip": 0.01055428, + "auxiliary_loss_mlp": 0.01025747, + "balance_loss_clip": 1.01386213, + "balance_loss_mlp": 1.01842713, + "epoch": 0.786231775139035, + "flos": 14021677393920.0, + "grad_norm": 2.269634445084744, + "language_loss": 0.80294251, + "learning_rate": 4.3431992841475004e-07, + "loss": 0.82375425, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37109375, + "step": 13077, + "time_per_iteration": 2.3455491065979004 + }, + { + "auxiliary_loss_clip": 0.0105755, + "auxiliary_loss_mlp": 0.01019546, + "balance_loss_clip": 1.00809014, + "balance_loss_mlp": 1.01883614, + "epoch": 0.786291898391703, + "flos": 33722991054720.0, + "grad_norm": 1.6622470127725513, + "language_loss": 0.70808363, + "learning_rate": 4.340848666065302e-07, + "loss": 0.7288546, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 13078, + "time_per_iteration": 2.4816648960113525 + }, + { + "auxiliary_loss_clip": 0.01057682, + "auxiliary_loss_mlp": 0.01023588, + "balance_loss_clip": 1.01287675, + "balance_loss_mlp": 1.02003813, + "epoch": 0.786352021644371, + "flos": 25810310361600.0, + "grad_norm": 1.4264554339564919, + "language_loss": 0.81405079, + "learning_rate": 4.338498606817935e-07, + "loss": 0.83486348, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.37695312, + "step": 13079, + "time_per_iteration": 2.435631513595581 + }, + { + "auxiliary_loss_clip": 0.01054342, + "auxiliary_loss_mlp": 0.01023524, + "balance_loss_clip": 1.01248562, + "balance_loss_mlp": 1.01794684, + "epoch": 0.7864121448970389, + "flos": 28909620241920.0, + "grad_norm": 1.5519579699810089, + "language_loss": 0.68571162, + "learning_rate": 4.336149106489262e-07, + "loss": 0.70649028, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.36328125, + "step": 13080, + "time_per_iteration": 2.4396955966949463 + }, + { + "auxiliary_loss_clip": 0.01057431, + "auxiliary_loss_mlp": 0.01023313, + "balance_loss_clip": 1.01135087, + "balance_loss_mlp": 1.01919127, + "epoch": 0.7864722681497069, + "flos": 19207815459840.0, + "grad_norm": 1.6588350959624374, + "language_loss": 0.69863701, + "learning_rate": 4.3338001651631464e-07, + "loss": 0.71944445, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3828125, + "step": 13081, + "time_per_iteration": 2.3731164932250977 + }, + { + "auxiliary_loss_clip": 0.01056833, + "auxiliary_loss_mlp": 0.01024683, + "balance_loss_clip": 1.01331043, + "balance_loss_mlp": 1.01818109, + "epoch": 0.7865323914023749, + "flos": 21870501477120.0, + "grad_norm": 4.621975607808734, + "language_loss": 0.68417412, + "learning_rate": 4.331451782923392e-07, + "loss": 0.70498931, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 13082, + "time_per_iteration": 2.4195332527160645 + }, + { + "auxiliary_loss_clip": 0.01056919, + "auxiliary_loss_mlp": 0.01024492, + "balance_loss_clip": 1.01309609, + "balance_loss_mlp": 1.01940989, + "epoch": 0.7865925146550429, + "flos": 25519134689280.0, + "grad_norm": 1.632213386602927, + "language_loss": 0.66167539, + "learning_rate": 4.3291039598538237e-07, + "loss": 0.68248951, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.375, + "step": 13083, + "time_per_iteration": 2.4182755947113037 + }, + { + "auxiliary_loss_clip": 0.01058946, + "auxiliary_loss_mlp": 0.01025464, + "balance_loss_clip": 1.0123806, + "balance_loss_mlp": 1.01886225, + "epoch": 0.7866526379077108, + "flos": 19096407711360.0, + "grad_norm": 2.2967042816552117, + "language_loss": 0.74464053, + "learning_rate": 4.3267566960382273e-07, + "loss": 0.76548457, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40039062, + "step": 13084, + "time_per_iteration": 2.3915398120880127 + }, + { + "auxiliary_loss_clip": 0.01058651, + "auxiliary_loss_mlp": 0.01026293, + "balance_loss_clip": 1.0147115, + "balance_loss_mlp": 1.02028537, + "epoch": 0.7867127611603788, + "flos": 16173025505280.0, + "grad_norm": 2.5080728650068878, + "language_loss": 0.70141828, + "learning_rate": 4.324409991560367e-07, + "loss": 0.72226775, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.3828125, + "step": 13085, + "time_per_iteration": 2.3873722553253174 + }, + { + "auxiliary_loss_clip": 0.01058929, + "auxiliary_loss_mlp": 0.01023072, + "balance_loss_clip": 1.01087677, + "balance_loss_mlp": 1.0188235, + "epoch": 0.7867728844130467, + "flos": 20772692686080.0, + "grad_norm": 1.659309893255621, + "language_loss": 0.724334, + "learning_rate": 4.3220638465039916e-07, + "loss": 0.74515402, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.40234375, + "step": 13086, + "time_per_iteration": 2.3950905799865723 + }, + { + "auxiliary_loss_clip": 0.01055989, + "auxiliary_loss_mlp": 0.01022564, + "balance_loss_clip": 1.01147735, + "balance_loss_mlp": 1.01818287, + "epoch": 0.7868330076657147, + "flos": 21759093728640.0, + "grad_norm": 6.946196622198737, + "language_loss": 0.74481165, + "learning_rate": 4.319718260952823e-07, + "loss": 0.76559716, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37695312, + "step": 13087, + "time_per_iteration": 2.3815841674804688 + }, + { + "auxiliary_loss_clip": 0.01055559, + "auxiliary_loss_mlp": 0.01021688, + "balance_loss_clip": 1.01070261, + "balance_loss_mlp": 1.01867688, + "epoch": 0.7868931309183826, + "flos": 25699565928960.0, + "grad_norm": 1.5179774637955405, + "language_loss": 0.71412927, + "learning_rate": 4.317373234990587e-07, + "loss": 0.73490179, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36914062, + "step": 13088, + "time_per_iteration": 2.416982889175415 + }, + { + "auxiliary_loss_clip": 0.01006559, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 0.99997407, + "balance_loss_mlp": 1.00047898, + "epoch": 0.7869532541710507, + "flos": 64641267348480.0, + "grad_norm": 0.6765078920765363, + "language_loss": 0.55478454, + "learning_rate": 4.3150287687009477e-07, + "loss": 0.57485974, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.06079102, + "step": 13089, + "time_per_iteration": 3.115201473236084 + }, + { + "auxiliary_loss_clip": 0.01006725, + "auxiliary_loss_mlp": 0.01000934, + "balance_loss_clip": 0.99997389, + "balance_loss_mlp": 1.00049961, + "epoch": 0.7870133774237186, + "flos": 67449925226880.0, + "grad_norm": 0.724397360014529, + "language_loss": 0.50017875, + "learning_rate": 4.3126848621675905e-07, + "loss": 0.52025539, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.06201172, + "step": 13090, + "time_per_iteration": 3.1644983291625977 + }, + { + "auxiliary_loss_clip": 0.01055016, + "auxiliary_loss_mlp": 0.01024176, + "balance_loss_clip": 1.01362598, + "balance_loss_mlp": 1.0187875, + "epoch": 0.7870735006763866, + "flos": 26067096478080.0, + "grad_norm": 1.5788929424135427, + "language_loss": 0.8385824, + "learning_rate": 4.3103415154741583e-07, + "loss": 0.85937434, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36328125, + "step": 13091, + "time_per_iteration": 3.903646945953369 + }, + { + "auxiliary_loss_clip": 0.01006756, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 1.00038612, + "balance_loss_mlp": 1.00061655, + "epoch": 0.7871336239290546, + "flos": 70286095123200.0, + "grad_norm": 0.7120338271648537, + "language_loss": 0.63978136, + "learning_rate": 4.307998728704281e-07, + "loss": 0.6598624, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.06152344, + "step": 13092, + "time_per_iteration": 3.1061651706695557 + }, + { + "auxiliary_loss_clip": 0.01058166, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.01624775, + "balance_loss_mlp": 1.01861191, + "epoch": 0.7871937471817225, + "flos": 15777668736000.0, + "grad_norm": 1.9472860356249269, + "language_loss": 0.79107958, + "learning_rate": 4.305656501941557e-07, + "loss": 0.81194097, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 13093, + "time_per_iteration": 2.3652384281158447 + }, + { + "auxiliary_loss_clip": 0.01056169, + "auxiliary_loss_mlp": 0.0101993, + "balance_loss_clip": 1.00911796, + "balance_loss_mlp": 1.01896453, + "epoch": 0.7872538704343905, + "flos": 20484205188480.0, + "grad_norm": 1.743297884319831, + "language_loss": 0.75674433, + "learning_rate": 4.3033148352695915e-07, + "loss": 0.77750528, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37304688, + "step": 13094, + "time_per_iteration": 2.3833940029144287 + }, + { + "auxiliary_loss_clip": 0.01057902, + "auxiliary_loss_mlp": 0.01023517, + "balance_loss_clip": 1.01212668, + "balance_loss_mlp": 1.01984358, + "epoch": 0.7873139936870585, + "flos": 25081498396800.0, + "grad_norm": 1.5796605199051252, + "language_loss": 0.79766363, + "learning_rate": 4.3009737287719327e-07, + "loss": 0.81847781, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38085938, + "step": 13095, + "time_per_iteration": 2.452667474746704 + }, + { + "auxiliary_loss_clip": 0.01058273, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01364255, + "balance_loss_mlp": 1.018677, + "epoch": 0.7873741169397265, + "flos": 30881863745280.0, + "grad_norm": 1.4825611022660772, + "language_loss": 0.67728609, + "learning_rate": 4.2986331825321455e-07, + "loss": 0.69812584, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 13096, + "time_per_iteration": 2.4735169410705566 + }, + { + "auxiliary_loss_clip": 0.0105321, + "auxiliary_loss_mlp": 0.01019563, + "balance_loss_clip": 1.00964499, + "balance_loss_mlp": 1.01771688, + "epoch": 0.7874342401923944, + "flos": 46790178255360.0, + "grad_norm": 1.4285389863594837, + "language_loss": 0.7059899, + "learning_rate": 4.296293196633745e-07, + "loss": 0.72671759, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.35546875, + "step": 13097, + "time_per_iteration": 2.6336073875427246 + }, + { + "auxiliary_loss_clip": 0.01059454, + "auxiliary_loss_mlp": 0.01024798, + "balance_loss_clip": 1.0123111, + "balance_loss_mlp": 1.02072072, + "epoch": 0.7874943634450624, + "flos": 23583480157440.0, + "grad_norm": 1.7071525039867868, + "language_loss": 0.80464482, + "learning_rate": 4.293953771160257e-07, + "loss": 0.82548738, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.38671875, + "step": 13098, + "time_per_iteration": 2.413896083831787 + }, + { + "auxiliary_loss_clip": 0.01055741, + "auxiliary_loss_mlp": 0.01022669, + "balance_loss_clip": 1.01148129, + "balance_loss_mlp": 1.01881528, + "epoch": 0.7875544866977303, + "flos": 20190201696000.0, + "grad_norm": 1.6530376769730224, + "language_loss": 0.74195242, + "learning_rate": 4.291614906195147e-07, + "loss": 0.76273656, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.36914062, + "step": 13099, + "time_per_iteration": 2.4078047275543213 + }, + { + "auxiliary_loss_clip": 0.0105649, + "auxiliary_loss_mlp": 0.01020891, + "balance_loss_clip": 1.00896406, + "balance_loss_mlp": 1.01849294, + "epoch": 0.7876146099503983, + "flos": 22601443034880.0, + "grad_norm": 1.5256520903770563, + "language_loss": 0.78092813, + "learning_rate": 4.2892766018218985e-07, + "loss": 0.8017019, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37890625, + "step": 13100, + "time_per_iteration": 3.7959301471710205 + }, + { + "auxiliary_loss_clip": 0.01063078, + "auxiliary_loss_mlp": 0.01023818, + "balance_loss_clip": 1.0105139, + "balance_loss_mlp": 1.02018869, + "epoch": 0.7876747332030662, + "flos": 10705102922880.0, + "grad_norm": 2.306751175417003, + "language_loss": 0.66161376, + "learning_rate": 4.286938858123963e-07, + "loss": 0.68248272, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.4296875, + "step": 13101, + "time_per_iteration": 3.7550573348999023 + }, + { + "auxiliary_loss_clip": 0.01056534, + "auxiliary_loss_mlp": 0.01024927, + "balance_loss_clip": 1.01348329, + "balance_loss_mlp": 1.01870012, + "epoch": 0.7877348564557343, + "flos": 38397791214720.0, + "grad_norm": 1.9983448927258551, + "language_loss": 0.77611971, + "learning_rate": 4.2846016751847494e-07, + "loss": 0.79693437, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37890625, + "step": 13102, + "time_per_iteration": 2.5554263591766357 + }, + { + "auxiliary_loss_clip": 0.01057321, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.01622772, + "balance_loss_mlp": 1.0192616, + "epoch": 0.7877949797084022, + "flos": 18328632802560.0, + "grad_norm": 1.968016911520255, + "language_loss": 0.82728076, + "learning_rate": 4.282265053087681e-07, + "loss": 0.8481245, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.38085938, + "step": 13103, + "time_per_iteration": 2.3781328201293945 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.01027464, + "balance_loss_clip": 1.0147326, + "balance_loss_mlp": 1.02005386, + "epoch": 0.7878551029610702, + "flos": 25805702062080.0, + "grad_norm": 1.9484197230262201, + "language_loss": 0.81235921, + "learning_rate": 4.279928991916137e-07, + "loss": 0.83322352, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.38867188, + "step": 13104, + "time_per_iteration": 2.4466450214385986 + }, + { + "auxiliary_loss_clip": 0.01058201, + "auxiliary_loss_mlp": 0.01024879, + "balance_loss_clip": 1.01286292, + "balance_loss_mlp": 1.01888299, + "epoch": 0.7879152262137382, + "flos": 22341689452800.0, + "grad_norm": 1.6376700607063654, + "language_loss": 0.74020272, + "learning_rate": 4.2775934917535015e-07, + "loss": 0.76103354, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.39257812, + "step": 13105, + "time_per_iteration": 2.3898561000823975 + }, + { + "auxiliary_loss_clip": 0.01058351, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.01888454, + "balance_loss_mlp": 1.01884794, + "epoch": 0.7879753494664061, + "flos": 24784317970560.0, + "grad_norm": 1.5291238952126185, + "language_loss": 0.75555497, + "learning_rate": 4.275258552683101e-07, + "loss": 0.77645373, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39453125, + "step": 13106, + "time_per_iteration": 2.414935350418091 + }, + { + "auxiliary_loss_clip": 0.01058455, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.0138433, + "balance_loss_mlp": 1.01904297, + "epoch": 0.7880354727190741, + "flos": 16908156426240.0, + "grad_norm": 1.9767599781524996, + "language_loss": 0.72921538, + "learning_rate": 4.272924174788279e-07, + "loss": 0.75006211, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 13107, + "time_per_iteration": 2.360187292098999 + }, + { + "auxiliary_loss_clip": 0.01057488, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.01422536, + "balance_loss_mlp": 1.0187912, + "epoch": 0.7880955959717421, + "flos": 22229583477120.0, + "grad_norm": 2.0171492516199843, + "language_loss": 0.7332108, + "learning_rate": 4.2705903581523396e-07, + "loss": 0.75404495, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38671875, + "step": 13108, + "time_per_iteration": 2.4177567958831787 + }, + { + "auxiliary_loss_clip": 0.01006784, + "auxiliary_loss_mlp": 0.0100268, + "balance_loss_clip": 1.00171447, + "balance_loss_mlp": 1.00060678, + "epoch": 0.7881557192244101, + "flos": 69180082162560.0, + "grad_norm": 0.8934868753960808, + "language_loss": 0.60388857, + "learning_rate": 4.268257102858568e-07, + "loss": 0.6239832, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.06176758, + "step": 13109, + "time_per_iteration": 4.414198398590088 + }, + { + "auxiliary_loss_clip": 0.01055508, + "auxiliary_loss_mlp": 0.01024075, + "balance_loss_clip": 1.0129652, + "balance_loss_mlp": 1.01866531, + "epoch": 0.788215842477078, + "flos": 24934304638080.0, + "grad_norm": 1.615465251196256, + "language_loss": 0.72367048, + "learning_rate": 4.265924408990227e-07, + "loss": 0.7444663, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36914062, + "step": 13110, + "time_per_iteration": 2.438549518585205 + }, + { + "auxiliary_loss_clip": 0.01056132, + "auxiliary_loss_mlp": 0.01021748, + "balance_loss_clip": 1.01089406, + "balance_loss_mlp": 1.01925969, + "epoch": 0.788275965729746, + "flos": 26105221549440.0, + "grad_norm": 1.6260641607768003, + "language_loss": 0.75465417, + "learning_rate": 4.263592276630583e-07, + "loss": 0.77543294, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3671875, + "step": 13111, + "time_per_iteration": 2.463726758956909 + }, + { + "auxiliary_loss_clip": 0.0105692, + "auxiliary_loss_mlp": 0.01025497, + "balance_loss_clip": 1.01405275, + "balance_loss_mlp": 1.01967096, + "epoch": 0.7883360889824139, + "flos": 21213750291840.0, + "grad_norm": 1.8736760558237155, + "language_loss": 0.61407983, + "learning_rate": 4.2612607058628413e-07, + "loss": 0.63490403, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37304688, + "step": 13112, + "time_per_iteration": 2.386531114578247 + }, + { + "auxiliary_loss_clip": 0.0105574, + "auxiliary_loss_mlp": 0.01023665, + "balance_loss_clip": 1.01157141, + "balance_loss_mlp": 1.01696897, + "epoch": 0.7883962122350819, + "flos": 21141480044160.0, + "grad_norm": 1.6884208568414383, + "language_loss": 0.67355788, + "learning_rate": 4.258929696770226e-07, + "loss": 0.69435191, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.38671875, + "step": 13113, + "time_per_iteration": 2.400778293609619 + }, + { + "auxiliary_loss_clip": 0.01057042, + "auxiliary_loss_mlp": 0.01022964, + "balance_loss_clip": 1.01172292, + "balance_loss_mlp": 1.01957893, + "epoch": 0.7884563354877498, + "flos": 15302047017600.0, + "grad_norm": 2.1226438996874357, + "language_loss": 0.58488882, + "learning_rate": 4.2565992494359127e-07, + "loss": 0.60568893, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.375, + "step": 13114, + "time_per_iteration": 2.3719820976257324 + }, + { + "auxiliary_loss_clip": 0.01058709, + "auxiliary_loss_mlp": 0.01023467, + "balance_loss_clip": 1.01136088, + "balance_loss_mlp": 1.01912558, + "epoch": 0.7885164587404179, + "flos": 24387180721920.0, + "grad_norm": 1.6887988922444375, + "language_loss": 0.66814649, + "learning_rate": 4.254269363943086e-07, + "loss": 0.68896824, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.39648438, + "step": 13115, + "time_per_iteration": 2.448004961013794 + }, + { + "auxiliary_loss_clip": 0.01056793, + "auxiliary_loss_mlp": 0.01022312, + "balance_loss_clip": 1.01068902, + "balance_loss_mlp": 1.01919627, + "epoch": 0.7885765819930858, + "flos": 14385193136640.0, + "grad_norm": 2.251312911274626, + "language_loss": 0.88600957, + "learning_rate": 4.2519400403748796e-07, + "loss": 0.90680069, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.375, + "step": 13116, + "time_per_iteration": 2.3864922523498535 + }, + { + "auxiliary_loss_clip": 0.01060889, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.01613688, + "balance_loss_mlp": 1.01925278, + "epoch": 0.7886367052457538, + "flos": 18258945995520.0, + "grad_norm": 8.78683125050798, + "language_loss": 0.72217953, + "learning_rate": 4.2496112788144157e-07, + "loss": 0.74308407, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.41601562, + "step": 13117, + "time_per_iteration": 2.4365267753601074 + }, + { + "auxiliary_loss_clip": 0.01054524, + "auxiliary_loss_mlp": 0.01021361, + "balance_loss_clip": 1.01056695, + "balance_loss_mlp": 1.01882339, + "epoch": 0.7886968284984217, + "flos": 15304176610560.0, + "grad_norm": 1.8454646352575275, + "language_loss": 0.80167067, + "learning_rate": 4.2472830793448234e-07, + "loss": 0.82242954, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.35742188, + "step": 13118, + "time_per_iteration": 2.3755056858062744 + }, + { + "auxiliary_loss_clip": 0.01055632, + "auxiliary_loss_mlp": 0.01022364, + "balance_loss_clip": 1.01047266, + "balance_loss_mlp": 1.01815808, + "epoch": 0.7887569517510897, + "flos": 21214378696320.0, + "grad_norm": 1.747091649417682, + "language_loss": 0.73361886, + "learning_rate": 4.244955442049165e-07, + "loss": 0.75439876, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.375, + "step": 13119, + "time_per_iteration": 2.409320592880249 + }, + { + "auxiliary_loss_clip": 0.01059324, + "auxiliary_loss_mlp": 0.01020645, + "balance_loss_clip": 1.00868821, + "balance_loss_mlp": 1.01952052, + "epoch": 0.7888170750037578, + "flos": 22710127697280.0, + "grad_norm": 1.2800931777143858, + "language_loss": 0.71886683, + "learning_rate": 4.242628367010528e-07, + "loss": 0.73966652, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.3984375, + "step": 13120, + "time_per_iteration": 2.418099880218506 + }, + { + "auxiliary_loss_clip": 0.01057189, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.01843071, + "balance_loss_mlp": 1.01872194, + "epoch": 0.7888771982564257, + "flos": 36427677304320.0, + "grad_norm": 1.368005249712339, + "language_loss": 0.78645778, + "learning_rate": 4.240301854311943e-07, + "loss": 0.80732715, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.38476562, + "step": 13121, + "time_per_iteration": 2.525477647781372 + }, + { + "auxiliary_loss_clip": 0.01056275, + "auxiliary_loss_mlp": 0.01024153, + "balance_loss_clip": 1.01263165, + "balance_loss_mlp": 1.01875687, + "epoch": 0.7889373215090937, + "flos": 27308712625920.0, + "grad_norm": 1.4477759630226559, + "language_loss": 0.80236048, + "learning_rate": 4.2379759040364594e-07, + "loss": 0.8231647, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.375, + "step": 13122, + "time_per_iteration": 2.4369192123413086 + }, + { + "auxiliary_loss_clip": 0.0105795, + "auxiliary_loss_mlp": 0.01022332, + "balance_loss_clip": 1.01128125, + "balance_loss_mlp": 1.01906955, + "epoch": 0.7889974447617616, + "flos": 19827977673600.0, + "grad_norm": 1.8608637335102873, + "language_loss": 0.77975893, + "learning_rate": 4.235650516267058e-07, + "loss": 0.80056173, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.38867188, + "step": 13123, + "time_per_iteration": 2.381650686264038 + }, + { + "auxiliary_loss_clip": 0.01057533, + "auxiliary_loss_mlp": 0.01025973, + "balance_loss_clip": 1.01352179, + "balance_loss_mlp": 1.01818001, + "epoch": 0.7890575680144296, + "flos": 17270345537280.0, + "grad_norm": 2.4667646233858993, + "language_loss": 0.66852736, + "learning_rate": 4.2333256910867467e-07, + "loss": 0.68936247, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.39453125, + "step": 13124, + "time_per_iteration": 2.362889289855957 + }, + { + "auxiliary_loss_clip": 0.0105848, + "auxiliary_loss_mlp": 0.0102054, + "balance_loss_clip": 1.00799906, + "balance_loss_mlp": 1.01942146, + "epoch": 0.7891176912670975, + "flos": 27598910780160.0, + "grad_norm": 2.2220856974826573, + "language_loss": 0.72748399, + "learning_rate": 4.2310014285784824e-07, + "loss": 0.74827409, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 13125, + "time_per_iteration": 2.4263088703155518 + }, + { + "auxiliary_loss_clip": 0.01056111, + "auxiliary_loss_mlp": 0.01019603, + "balance_loss_clip": 1.0071938, + "balance_loss_mlp": 1.01699209, + "epoch": 0.7891778145197655, + "flos": 22710546633600.0, + "grad_norm": 1.769395356966317, + "language_loss": 0.72572637, + "learning_rate": 4.228677728825216e-07, + "loss": 0.7464835, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.390625, + "step": 13126, + "time_per_iteration": 2.373316526412964 + }, + { + "auxiliary_loss_clip": 0.01059028, + "auxiliary_loss_mlp": 0.01023519, + "balance_loss_clip": 1.01125848, + "balance_loss_mlp": 1.01906455, + "epoch": 0.7892379377724335, + "flos": 17309832151680.0, + "grad_norm": 1.9550819868994407, + "language_loss": 0.76914835, + "learning_rate": 4.2263545919098663e-07, + "loss": 0.7899738, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40039062, + "step": 13127, + "time_per_iteration": 2.355447292327881 + }, + { + "auxiliary_loss_clip": 0.01055858, + "auxiliary_loss_mlp": 0.01019259, + "balance_loss_clip": 1.00794637, + "balance_loss_mlp": 1.01977909, + "epoch": 0.7892980610251015, + "flos": 25774489440000.0, + "grad_norm": 1.7789590582009136, + "language_loss": 0.85896111, + "learning_rate": 4.2240320179153576e-07, + "loss": 0.87971222, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36132812, + "step": 13128, + "time_per_iteration": 2.4104199409484863 + }, + { + "auxiliary_loss_clip": 0.01055663, + "auxiliary_loss_mlp": 0.0102415, + "balance_loss_clip": 1.01374352, + "balance_loss_mlp": 1.01869369, + "epoch": 0.7893581842777694, + "flos": 21578871957120.0, + "grad_norm": 2.366064807375237, + "language_loss": 0.6899389, + "learning_rate": 4.221710006924557e-07, + "loss": 0.71073711, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.36914062, + "step": 13129, + "time_per_iteration": 2.397728204727173 + }, + { + "auxiliary_loss_clip": 0.0100681, + "auxiliary_loss_mlp": 0.01002148, + "balance_loss_clip": 1.00122464, + "balance_loss_mlp": 1.00066137, + "epoch": 0.7894183075304374, + "flos": 69012917239680.0, + "grad_norm": 0.7144664982722442, + "language_loss": 0.61620682, + "learning_rate": 4.2193885590203424e-07, + "loss": 0.63629645, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.06152344, + "step": 13130, + "time_per_iteration": 3.025944232940674 + }, + { + "auxiliary_loss_clip": 0.0105832, + "auxiliary_loss_mlp": 0.0102109, + "balance_loss_clip": 1.01032567, + "balance_loss_mlp": 1.01999974, + "epoch": 0.7894784307831053, + "flos": 24242116556160.0, + "grad_norm": 1.5637595647668392, + "language_loss": 0.73591256, + "learning_rate": 4.217067674285557e-07, + "loss": 0.75670671, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.3828125, + "step": 13131, + "time_per_iteration": 3.8663880825042725 + }, + { + "auxiliary_loss_clip": 0.01061811, + "auxiliary_loss_mlp": 0.01022684, + "balance_loss_clip": 1.00899899, + "balance_loss_mlp": 1.01865864, + "epoch": 0.7895385540357733, + "flos": 20265509232000.0, + "grad_norm": 4.691730083250328, + "language_loss": 0.69829607, + "learning_rate": 4.2147473528030295e-07, + "loss": 0.71914113, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.43164062, + "step": 13132, + "time_per_iteration": 2.4445760250091553 + }, + { + "auxiliary_loss_clip": 0.01059526, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.01704741, + "balance_loss_mlp": 1.0199182, + "epoch": 0.7895986772884414, + "flos": 20995508183040.0, + "grad_norm": 1.7810037845503668, + "language_loss": 0.62010801, + "learning_rate": 4.2124275946555655e-07, + "loss": 0.64100707, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.39453125, + "step": 13133, + "time_per_iteration": 2.396824598312378 + }, + { + "auxiliary_loss_clip": 0.01061784, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.01585698, + "balance_loss_mlp": 1.01998901, + "epoch": 0.7896588005411093, + "flos": 18657095673600.0, + "grad_norm": 2.056762619138602, + "language_loss": 0.79582876, + "learning_rate": 4.2101083999259424e-07, + "loss": 0.81673729, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41796875, + "step": 13134, + "time_per_iteration": 2.3437347412109375 + }, + { + "auxiliary_loss_clip": 0.01056862, + "auxiliary_loss_mlp": 0.01024864, + "balance_loss_clip": 1.01293099, + "balance_loss_mlp": 1.01884127, + "epoch": 0.7897189237937773, + "flos": 18404917856640.0, + "grad_norm": 6.336164544533581, + "language_loss": 0.79917371, + "learning_rate": 4.2077897686969455e-07, + "loss": 0.81999099, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.38085938, + "step": 13135, + "time_per_iteration": 2.383042573928833 + }, + { + "auxiliary_loss_clip": 0.01055115, + "auxiliary_loss_mlp": 0.01025576, + "balance_loss_clip": 1.01365471, + "balance_loss_mlp": 1.01768649, + "epoch": 0.7897790470464452, + "flos": 23730499359360.0, + "grad_norm": 2.140815459868324, + "language_loss": 0.72581565, + "learning_rate": 4.2054717010512997e-07, + "loss": 0.74662256, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.37304688, + "step": 13136, + "time_per_iteration": 2.430723190307617 + }, + { + "auxiliary_loss_clip": 0.01062169, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.01565993, + "balance_loss_mlp": 1.01968336, + "epoch": 0.7898391702991132, + "flos": 15918194424960.0, + "grad_norm": 1.8756117029364474, + "language_loss": 0.708552, + "learning_rate": 4.203154197071745e-07, + "loss": 0.72945452, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.42578125, + "step": 13137, + "time_per_iteration": 2.3713135719299316 + }, + { + "auxiliary_loss_clip": 0.01055842, + "auxiliary_loss_mlp": 0.01022869, + "balance_loss_clip": 1.01087689, + "balance_loss_mlp": 1.01819813, + "epoch": 0.7898992935517811, + "flos": 19828012584960.0, + "grad_norm": 1.7709972513000896, + "language_loss": 0.7411803, + "learning_rate": 4.200837256840981e-07, + "loss": 0.76196742, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.37695312, + "step": 13138, + "time_per_iteration": 2.396695852279663 + }, + { + "auxiliary_loss_clip": 0.01057038, + "auxiliary_loss_mlp": 0.01023698, + "balance_loss_clip": 1.01233768, + "balance_loss_mlp": 1.01845443, + "epoch": 0.7899594168044491, + "flos": 16215339939840.0, + "grad_norm": 1.91573512396227, + "language_loss": 0.68887037, + "learning_rate": 4.1985208804416985e-07, + "loss": 0.7096777, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.38671875, + "step": 13139, + "time_per_iteration": 3.8135974407196045 + }, + { + "auxiliary_loss_clip": 0.01006822, + "auxiliary_loss_mlp": 0.01000859, + "balance_loss_clip": 0.99994087, + "balance_loss_mlp": 1.000664, + "epoch": 0.790019540057117, + "flos": 68327257582080.0, + "grad_norm": 0.8574880457704557, + "language_loss": 0.6026243, + "learning_rate": 4.196205067956551e-07, + "loss": 0.62270111, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.06152344, + "step": 13140, + "time_per_iteration": 3.1216113567352295 + }, + { + "auxiliary_loss_clip": 0.01059154, + "auxiliary_loss_mlp": 0.01026465, + "balance_loss_clip": 1.0146277, + "balance_loss_mlp": 1.02004981, + "epoch": 0.7900796633097851, + "flos": 30331562895360.0, + "grad_norm": 1.5389895214494296, + "language_loss": 0.75769204, + "learning_rate": 4.1938898194681995e-07, + "loss": 0.77854824, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.390625, + "step": 13141, + "time_per_iteration": 3.919273614883423 + }, + { + "auxiliary_loss_clip": 0.01058232, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01702821, + "balance_loss_mlp": 1.01879001, + "epoch": 0.790139786562453, + "flos": 22125716582400.0, + "grad_norm": 1.8897658521341854, + "language_loss": 0.68844306, + "learning_rate": 4.191575135059262e-07, + "loss": 0.70931625, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.39453125, + "step": 13142, + "time_per_iteration": 2.4047350883483887 + }, + { + "auxiliary_loss_clip": 0.01006886, + "auxiliary_loss_mlp": 0.01001617, + "balance_loss_clip": 1.00074124, + "balance_loss_mlp": 1.00079679, + "epoch": 0.790199909815121, + "flos": 58204296535680.0, + "grad_norm": 0.8272467794253001, + "language_loss": 0.60023612, + "learning_rate": 4.189261014812344e-07, + "loss": 0.62032115, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.06103516, + "step": 13143, + "time_per_iteration": 2.8726766109466553 + }, + { + "auxiliary_loss_clip": 0.01060547, + "auxiliary_loss_mlp": 0.01023618, + "balance_loss_clip": 1.01175714, + "balance_loss_mlp": 1.02049446, + "epoch": 0.7902600330677889, + "flos": 34531858500480.0, + "grad_norm": 1.4264241451401625, + "language_loss": 0.74385059, + "learning_rate": 4.186947458810024e-07, + "loss": 0.76469219, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40039062, + "step": 13144, + "time_per_iteration": 2.506726026535034 + }, + { + "auxiliary_loss_clip": 0.01060242, + "auxiliary_loss_mlp": 0.01025242, + "balance_loss_clip": 1.01253402, + "balance_loss_mlp": 1.01937222, + "epoch": 0.7903201563204569, + "flos": 22345285322880.0, + "grad_norm": 1.9124180945165163, + "language_loss": 0.83021748, + "learning_rate": 4.184634467134884e-07, + "loss": 0.85107231, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40820312, + "step": 13145, + "time_per_iteration": 2.393202543258667 + }, + { + "auxiliary_loss_clip": 0.01052985, + "auxiliary_loss_mlp": 0.01019079, + "balance_loss_clip": 1.00854719, + "balance_loss_mlp": 1.01681817, + "epoch": 0.790380279573125, + "flos": 22052468816640.0, + "grad_norm": 1.6588216547397028, + "language_loss": 0.72797656, + "learning_rate": 4.1823220398694527e-07, + "loss": 0.74869722, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36132812, + "step": 13146, + "time_per_iteration": 2.4095189571380615 + }, + { + "auxiliary_loss_clip": 0.01058517, + "auxiliary_loss_mlp": 0.0102236, + "balance_loss_clip": 1.01068985, + "balance_loss_mlp": 1.0188688, + "epoch": 0.7904404028257929, + "flos": 20301574533120.0, + "grad_norm": 1.7770614018686197, + "language_loss": 0.75787437, + "learning_rate": 4.180010177096256e-07, + "loss": 0.77868313, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39648438, + "step": 13147, + "time_per_iteration": 2.382213592529297 + }, + { + "auxiliary_loss_clip": 0.0105649, + "auxiliary_loss_mlp": 0.01026888, + "balance_loss_clip": 1.01484847, + "balance_loss_mlp": 1.01883161, + "epoch": 0.7905005260784609, + "flos": 20007955065600.0, + "grad_norm": 1.6394120391932847, + "language_loss": 0.72255844, + "learning_rate": 4.177698878897806e-07, + "loss": 0.74339223, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.37695312, + "step": 13148, + "time_per_iteration": 3.8963088989257812 + }, + { + "auxiliary_loss_clip": 0.01056104, + "auxiliary_loss_mlp": 0.01024263, + "balance_loss_clip": 1.0122999, + "balance_loss_mlp": 1.01706994, + "epoch": 0.7905606493311288, + "flos": 26904732750720.0, + "grad_norm": 1.7205590214707167, + "language_loss": 0.71768916, + "learning_rate": 4.175388145356584e-07, + "loss": 0.73849279, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.390625, + "step": 13149, + "time_per_iteration": 2.484783887863159 + }, + { + "auxiliary_loss_clip": 0.01059934, + "auxiliary_loss_mlp": 0.01024792, + "balance_loss_clip": 1.01085031, + "balance_loss_mlp": 1.01974654, + "epoch": 0.7906207725837968, + "flos": 23695097374080.0, + "grad_norm": 1.9577992143864795, + "language_loss": 0.65549618, + "learning_rate": 4.1730779765550527e-07, + "loss": 0.67634344, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.40234375, + "step": 13150, + "time_per_iteration": 2.442737340927124 + }, + { + "auxiliary_loss_clip": 0.01056486, + "auxiliary_loss_mlp": 0.01023587, + "balance_loss_clip": 1.01244068, + "balance_loss_mlp": 1.01798248, + "epoch": 0.7906808958364647, + "flos": 20847825665280.0, + "grad_norm": 2.386582164698536, + "language_loss": 0.69897461, + "learning_rate": 4.17076837257565e-07, + "loss": 0.71977532, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.38671875, + "step": 13151, + "time_per_iteration": 2.425090789794922 + }, + { + "auxiliary_loss_clip": 0.01059773, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.01334786, + "balance_loss_mlp": 1.01945114, + "epoch": 0.7907410190891327, + "flos": 40733585372160.0, + "grad_norm": 1.2744882571804226, + "language_loss": 0.70183623, + "learning_rate": 4.16845933350082e-07, + "loss": 0.72269392, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 13152, + "time_per_iteration": 2.5683774948120117 + }, + { + "auxiliary_loss_clip": 0.01057041, + "auxiliary_loss_mlp": 0.01025186, + "balance_loss_clip": 1.01346779, + "balance_loss_mlp": 1.01941395, + "epoch": 0.7908011423418007, + "flos": 13260326175360.0, + "grad_norm": 1.9375922751738073, + "language_loss": 0.72849059, + "learning_rate": 4.16615085941294e-07, + "loss": 0.74931288, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.37695312, + "step": 13153, + "time_per_iteration": 2.3897414207458496 + }, + { + "auxiliary_loss_clip": 0.01055532, + "auxiliary_loss_mlp": 0.01023383, + "balance_loss_clip": 1.01182532, + "balance_loss_mlp": 1.01743829, + "epoch": 0.7908612655944687, + "flos": 19753752389760.0, + "grad_norm": 1.5829252240649383, + "language_loss": 0.7901358, + "learning_rate": 4.163842950394414e-07, + "loss": 0.81092501, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37890625, + "step": 13154, + "time_per_iteration": 2.394437551498413 + }, + { + "auxiliary_loss_clip": 0.0106158, + "auxiliary_loss_mlp": 0.01025479, + "balance_loss_clip": 1.01214504, + "balance_loss_mlp": 1.01995087, + "epoch": 0.7909213888471366, + "flos": 21286683855360.0, + "grad_norm": 1.892564507685364, + "language_loss": 0.69977295, + "learning_rate": 4.161535606527595e-07, + "loss": 0.72064352, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41601562, + "step": 13155, + "time_per_iteration": 2.477783679962158 + }, + { + "auxiliary_loss_clip": 0.01057411, + "auxiliary_loss_mlp": 0.01021571, + "balance_loss_clip": 1.00938821, + "balance_loss_mlp": 1.01895905, + "epoch": 0.7909815120998046, + "flos": 22527776332800.0, + "grad_norm": 1.8300486897178272, + "language_loss": 0.76917809, + "learning_rate": 4.1592288278948294e-07, + "loss": 0.78996795, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.3828125, + "step": 13156, + "time_per_iteration": 2.406022071838379 + }, + { + "auxiliary_loss_clip": 0.01056302, + "auxiliary_loss_mlp": 0.01023395, + "balance_loss_clip": 1.01226676, + "balance_loss_mlp": 1.01857209, + "epoch": 0.7910416353524725, + "flos": 26726396192640.0, + "grad_norm": 2.017793582539305, + "language_loss": 0.65316147, + "learning_rate": 4.156922614578435e-07, + "loss": 0.67395842, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37695312, + "step": 13157, + "time_per_iteration": 2.4560275077819824 + }, + { + "auxiliary_loss_clip": 0.01058409, + "auxiliary_loss_mlp": 0.0102603, + "balance_loss_clip": 1.01338744, + "balance_loss_mlp": 1.01945186, + "epoch": 0.7911017586051405, + "flos": 24643687547520.0, + "grad_norm": 2.5565445863044167, + "language_loss": 0.74015796, + "learning_rate": 4.1546169666607246e-07, + "loss": 0.7610023, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.390625, + "step": 13158, + "time_per_iteration": 2.438110113143921 + }, + { + "auxiliary_loss_clip": 0.01055618, + "auxiliary_loss_mlp": 0.01019549, + "balance_loss_clip": 1.00881445, + "balance_loss_mlp": 1.01818633, + "epoch": 0.7911618818578086, + "flos": 17564558497920.0, + "grad_norm": 2.0628850290932164, + "language_loss": 0.67477381, + "learning_rate": 4.1523118842239756e-07, + "loss": 0.69552553, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.375, + "step": 13159, + "time_per_iteration": 2.3640263080596924 + }, + { + "auxiliary_loss_clip": 0.01055318, + "auxiliary_loss_mlp": 0.01025508, + "balance_loss_clip": 1.01420665, + "balance_loss_mlp": 1.01836467, + "epoch": 0.7912220051104765, + "flos": 16720882560000.0, + "grad_norm": 1.6554356216531625, + "language_loss": 0.67005646, + "learning_rate": 4.15000736735045e-07, + "loss": 0.69086468, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36914062, + "step": 13160, + "time_per_iteration": 2.419971227645874 + }, + { + "auxiliary_loss_clip": 0.01052294, + "auxiliary_loss_mlp": 0.01020847, + "balance_loss_clip": 1.01101875, + "balance_loss_mlp": 1.01692677, + "epoch": 0.7912821283631445, + "flos": 13697892645120.0, + "grad_norm": 2.502631037057988, + "language_loss": 0.73074567, + "learning_rate": 4.14770341612239e-07, + "loss": 0.75147712, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 0.09814453, + "router_z_loss_mlp": 0.35351562, + "step": 13161, + "time_per_iteration": 2.372652053833008 + }, + { + "auxiliary_loss_clip": 0.01055993, + "auxiliary_loss_mlp": 0.01024136, + "balance_loss_clip": 1.01191688, + "balance_loss_mlp": 1.01855361, + "epoch": 0.7913422516158124, + "flos": 23767891292160.0, + "grad_norm": 1.7135442997279255, + "language_loss": 0.73321861, + "learning_rate": 4.1454000306220193e-07, + "loss": 0.75401986, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.375, + "step": 13162, + "time_per_iteration": 2.4345834255218506 + }, + { + "auxiliary_loss_clip": 0.01058361, + "auxiliary_loss_mlp": 0.01024395, + "balance_loss_clip": 1.01274276, + "balance_loss_mlp": 1.01899827, + "epoch": 0.7914023748684804, + "flos": 19937220917760.0, + "grad_norm": 1.6834373763756127, + "language_loss": 0.71799767, + "learning_rate": 4.1430972109315367e-07, + "loss": 0.73882526, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.39453125, + "step": 13163, + "time_per_iteration": 2.404517889022827 + }, + { + "auxiliary_loss_clip": 0.01057828, + "auxiliary_loss_mlp": 0.01019683, + "balance_loss_clip": 1.00751758, + "balance_loss_mlp": 1.01938415, + "epoch": 0.7914624981211483, + "flos": 20593762634880.0, + "grad_norm": 2.3707959937909435, + "language_loss": 0.69880152, + "learning_rate": 4.1407949571331226e-07, + "loss": 0.71957666, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3828125, + "step": 13164, + "time_per_iteration": 2.413146495819092 + }, + { + "auxiliary_loss_clip": 0.01056088, + "auxiliary_loss_mlp": 0.01020847, + "balance_loss_clip": 1.00987411, + "balance_loss_mlp": 1.01865256, + "epoch": 0.7915226213738163, + "flos": 21798370874880.0, + "grad_norm": 3.659324030383354, + "language_loss": 0.66293472, + "learning_rate": 4.1384932693089446e-07, + "loss": 0.68370402, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.375, + "step": 13165, + "time_per_iteration": 2.406938076019287 + }, + { + "auxiliary_loss_clip": 0.01052619, + "auxiliary_loss_mlp": 0.01020107, + "balance_loss_clip": 1.00999808, + "balance_loss_mlp": 1.01723075, + "epoch": 0.7915827446264843, + "flos": 16287470542080.0, + "grad_norm": 2.063709598695455, + "language_loss": 0.72268581, + "learning_rate": 4.136192147541142e-07, + "loss": 0.74341303, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.35351562, + "step": 13166, + "time_per_iteration": 2.386481761932373 + }, + { + "auxiliary_loss_clip": 0.01055579, + "auxiliary_loss_mlp": 0.01023062, + "balance_loss_clip": 1.01095617, + "balance_loss_mlp": 1.01743031, + "epoch": 0.7916428678791523, + "flos": 25701416231040.0, + "grad_norm": 2.0722821186489218, + "language_loss": 0.77134115, + "learning_rate": 4.1338915919118353e-07, + "loss": 0.79212749, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.3828125, + "step": 13167, + "time_per_iteration": 2.4367942810058594 + }, + { + "auxiliary_loss_clip": 0.01055464, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.01527333, + "balance_loss_mlp": 1.01812148, + "epoch": 0.7917029911318202, + "flos": 23877378915840.0, + "grad_norm": 1.679931690003386, + "language_loss": 0.84365284, + "learning_rate": 4.1315916025031216e-07, + "loss": 0.86447126, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37304688, + "step": 13168, + "time_per_iteration": 2.456425905227661 + }, + { + "auxiliary_loss_clip": 0.01057323, + "auxiliary_loss_mlp": 0.01022115, + "balance_loss_clip": 1.01080203, + "balance_loss_mlp": 1.0190866, + "epoch": 0.7917631143844882, + "flos": 21645696032640.0, + "grad_norm": 1.7037041447274444, + "language_loss": 0.73046732, + "learning_rate": 4.1292921793970947e-07, + "loss": 0.75126177, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 13169, + "time_per_iteration": 2.4365181922912598 + }, + { + "auxiliary_loss_clip": 0.01057678, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.01493371, + "balance_loss_mlp": 1.01831913, + "epoch": 0.7918232376371561, + "flos": 38872644883200.0, + "grad_norm": 1.710354649606429, + "language_loss": 0.66883171, + "learning_rate": 4.1269933226757934e-07, + "loss": 0.68968666, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39453125, + "step": 13170, + "time_per_iteration": 3.98796010017395 + }, + { + "auxiliary_loss_clip": 0.0105696, + "auxiliary_loss_mlp": 0.01022366, + "balance_loss_clip": 1.01170242, + "balance_loss_mlp": 1.01809752, + "epoch": 0.7918833608898241, + "flos": 20774542988160.0, + "grad_norm": 1.7253679837215057, + "language_loss": 0.71022725, + "learning_rate": 4.124695032421277e-07, + "loss": 0.73102057, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.38867188, + "step": 13171, + "time_per_iteration": 2.405642509460449 + }, + { + "auxiliary_loss_clip": 0.01055636, + "auxiliary_loss_mlp": 0.01022494, + "balance_loss_clip": 1.01107335, + "balance_loss_mlp": 1.01822925, + "epoch": 0.7919434841424922, + "flos": 33908763732480.0, + "grad_norm": 1.6479258485629182, + "language_loss": 0.67121935, + "learning_rate": 4.1223973087155594e-07, + "loss": 0.69200069, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37304688, + "step": 13172, + "time_per_iteration": 2.500047206878662 + }, + { + "auxiliary_loss_clip": 0.01056884, + "auxiliary_loss_mlp": 0.01025491, + "balance_loss_clip": 1.01398754, + "balance_loss_mlp": 1.01870656, + "epoch": 0.7920036073951601, + "flos": 21063728712960.0, + "grad_norm": 1.7721410446100136, + "language_loss": 0.79452062, + "learning_rate": 4.1201001516406377e-07, + "loss": 0.81534445, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 13173, + "time_per_iteration": 2.426175117492676 + }, + { + "auxiliary_loss_clip": 0.01055178, + "auxiliary_loss_mlp": 0.01020855, + "balance_loss_clip": 1.01032281, + "balance_loss_mlp": 1.01770949, + "epoch": 0.7920637306478281, + "flos": 23654947443840.0, + "grad_norm": 2.1528840078546594, + "language_loss": 0.76620173, + "learning_rate": 4.11780356127849e-07, + "loss": 0.78696203, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.375, + "step": 13174, + "time_per_iteration": 2.409764528274536 + }, + { + "auxiliary_loss_clip": 0.01054586, + "auxiliary_loss_mlp": 0.01025483, + "balance_loss_clip": 1.01462293, + "balance_loss_mlp": 1.01789677, + "epoch": 0.792123853900496, + "flos": 27194302500480.0, + "grad_norm": 3.905694524992657, + "language_loss": 0.7176789, + "learning_rate": 4.115507537711085e-07, + "loss": 0.73847955, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.3671875, + "step": 13175, + "time_per_iteration": 2.462824821472168 + }, + { + "auxiliary_loss_clip": 0.01055661, + "auxiliary_loss_mlp": 0.01023456, + "balance_loss_clip": 1.01177382, + "balance_loss_mlp": 1.01808918, + "epoch": 0.792183977153164, + "flos": 19097664520320.0, + "grad_norm": 2.025292836583373, + "language_loss": 0.59513402, + "learning_rate": 4.1132120810203607e-07, + "loss": 0.61592519, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.375, + "step": 13176, + "time_per_iteration": 2.3875834941864014 + }, + { + "auxiliary_loss_clip": 0.01058853, + "auxiliary_loss_mlp": 0.01023748, + "balance_loss_clip": 1.01169038, + "balance_loss_mlp": 1.02060568, + "epoch": 0.7922441004058319, + "flos": 17127899723520.0, + "grad_norm": 1.6939797860379637, + "language_loss": 0.73208797, + "learning_rate": 4.110917191288219e-07, + "loss": 0.75291401, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3828125, + "step": 13177, + "time_per_iteration": 2.453428268432617 + }, + { + "auxiliary_loss_clip": 0.01055326, + "auxiliary_loss_mlp": 0.01025793, + "balance_loss_clip": 1.01419985, + "balance_loss_mlp": 1.01896, + "epoch": 0.7923042236585, + "flos": 17820681298560.0, + "grad_norm": 1.8266703779137148, + "language_loss": 0.64040118, + "learning_rate": 4.1086228685965786e-07, + "loss": 0.66121233, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.36328125, + "step": 13178, + "time_per_iteration": 2.415870428085327 + }, + { + "auxiliary_loss_clip": 0.01056237, + "auxiliary_loss_mlp": 0.01024093, + "balance_loss_clip": 1.01313806, + "balance_loss_mlp": 1.01850641, + "epoch": 0.7923643469111679, + "flos": 29933901976320.0, + "grad_norm": 1.4032628157150377, + "language_loss": 0.6863063, + "learning_rate": 4.1063291130273115e-07, + "loss": 0.70710963, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37695312, + "step": 13179, + "time_per_iteration": 3.9271657466888428 + }, + { + "auxiliary_loss_clip": 0.01055183, + "auxiliary_loss_mlp": 0.01021804, + "balance_loss_clip": 1.00992465, + "balance_loss_mlp": 1.01755428, + "epoch": 0.7924244701638359, + "flos": 22673608548480.0, + "grad_norm": 2.2406465854285793, + "language_loss": 0.74163628, + "learning_rate": 4.1040359246622724e-07, + "loss": 0.76240611, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.375, + "step": 13180, + "time_per_iteration": 3.9655165672302246 + }, + { + "auxiliary_loss_clip": 0.01059788, + "auxiliary_loss_mlp": 0.01025397, + "balance_loss_clip": 1.01313591, + "balance_loss_mlp": 1.01962972, + "epoch": 0.7924845934165038, + "flos": 17967176830080.0, + "grad_norm": 1.9512570307586616, + "language_loss": 0.81480205, + "learning_rate": 4.1017433035832983e-07, + "loss": 0.83565384, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.40234375, + "step": 13181, + "time_per_iteration": 2.3819713592529297 + }, + { + "auxiliary_loss_clip": 0.01055447, + "auxiliary_loss_mlp": 0.01022814, + "balance_loss_clip": 1.01116776, + "balance_loss_mlp": 1.01818371, + "epoch": 0.7925447166691718, + "flos": 23475842835840.0, + "grad_norm": 1.714316074773999, + "language_loss": 0.77535981, + "learning_rate": 4.099451249872221e-07, + "loss": 0.7961424, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.37304688, + "step": 13182, + "time_per_iteration": 2.491558313369751 + }, + { + "auxiliary_loss_clip": 0.01059723, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.01559281, + "balance_loss_mlp": 1.01950753, + "epoch": 0.7926048399218397, + "flos": 20446568876160.0, + "grad_norm": 1.9473128655692293, + "language_loss": 0.73811042, + "learning_rate": 4.097159763610816e-07, + "loss": 0.75899041, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40234375, + "step": 13183, + "time_per_iteration": 2.3899013996124268 + }, + { + "auxiliary_loss_clip": 0.01055459, + "auxiliary_loss_mlp": 0.01021522, + "balance_loss_clip": 1.01043606, + "balance_loss_mlp": 1.01841474, + "epoch": 0.7926649631745077, + "flos": 37158514128000.0, + "grad_norm": 1.9559490164433309, + "language_loss": 0.67859161, + "learning_rate": 4.0948688448808767e-07, + "loss": 0.69936144, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.37109375, + "step": 13184, + "time_per_iteration": 2.5371737480163574 + }, + { + "auxiliary_loss_clip": 0.01058618, + "auxiliary_loss_mlp": 0.01021809, + "balance_loss_clip": 1.00994742, + "balance_loss_mlp": 1.0193603, + "epoch": 0.7927250864271758, + "flos": 17017678961280.0, + "grad_norm": 1.7268675246985445, + "language_loss": 0.70512664, + "learning_rate": 4.092578493764152e-07, + "loss": 0.72593093, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39453125, + "step": 13185, + "time_per_iteration": 2.3895580768585205 + }, + { + "auxiliary_loss_clip": 0.01058096, + "auxiliary_loss_mlp": 0.01023232, + "balance_loss_clip": 1.01103115, + "balance_loss_mlp": 1.01877213, + "epoch": 0.7927852096798437, + "flos": 17748236494080.0, + "grad_norm": 1.8506115986424891, + "language_loss": 0.65025651, + "learning_rate": 4.090288710342391e-07, + "loss": 0.67106974, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.39257812, + "step": 13186, + "time_per_iteration": 2.42875075340271 + }, + { + "auxiliary_loss_clip": 0.0105749, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01335394, + "balance_loss_mlp": 1.01922667, + "epoch": 0.7928453329325117, + "flos": 23837403542400.0, + "grad_norm": 1.5034506519549515, + "language_loss": 0.7691741, + "learning_rate": 4.087999494697292e-07, + "loss": 0.7899996, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 13187, + "time_per_iteration": 2.4249346256256104 + }, + { + "auxiliary_loss_clip": 0.01055296, + "auxiliary_loss_mlp": 0.0102425, + "balance_loss_clip": 1.01315165, + "balance_loss_mlp": 1.01826632, + "epoch": 0.7929054561851796, + "flos": 17454023533440.0, + "grad_norm": 2.158400092633665, + "language_loss": 0.72727108, + "learning_rate": 4.085710846910566e-07, + "loss": 0.7480666, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37109375, + "step": 13188, + "time_per_iteration": 3.851219892501831 + }, + { + "auxiliary_loss_clip": 0.01058497, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.0160718, + "balance_loss_mlp": 1.01896858, + "epoch": 0.7929655794378476, + "flos": 21980198568960.0, + "grad_norm": 2.280341391922602, + "language_loss": 0.62937033, + "learning_rate": 4.083422767063882e-07, + "loss": 0.65024734, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.39453125, + "step": 13189, + "time_per_iteration": 2.411515712738037 + }, + { + "auxiliary_loss_clip": 0.01057194, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.0160737, + "balance_loss_mlp": 1.01933205, + "epoch": 0.7930257026905155, + "flos": 17272998800640.0, + "grad_norm": 2.534673052058533, + "language_loss": 0.72248816, + "learning_rate": 4.0811352552388987e-07, + "loss": 0.74333417, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 13190, + "time_per_iteration": 2.4025189876556396 + }, + { + "auxiliary_loss_clip": 0.0105269, + "auxiliary_loss_mlp": 0.01021391, + "balance_loss_clip": 1.01111531, + "balance_loss_mlp": 1.01772761, + "epoch": 0.7930858259431836, + "flos": 27299565849600.0, + "grad_norm": 1.703003500329984, + "language_loss": 0.73403025, + "learning_rate": 4.078848311517249e-07, + "loss": 0.75477105, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.34960938, + "step": 13191, + "time_per_iteration": 2.4779510498046875 + }, + { + "auxiliary_loss_clip": 0.01055286, + "auxiliary_loss_mlp": 0.01028904, + "balance_loss_clip": 1.01670909, + "balance_loss_mlp": 1.0175941, + "epoch": 0.7931459491958515, + "flos": 19862751254400.0, + "grad_norm": 1.805328535425813, + "language_loss": 0.6885432, + "learning_rate": 4.076561935980545e-07, + "loss": 0.70938516, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.37695312, + "step": 13192, + "time_per_iteration": 2.4206907749176025 + }, + { + "auxiliary_loss_clip": 0.01057869, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01639247, + "balance_loss_mlp": 1.01942468, + "epoch": 0.7932060724485195, + "flos": 23146053333120.0, + "grad_norm": 1.548288107821883, + "language_loss": 0.75113916, + "learning_rate": 4.0742761287103946e-07, + "loss": 0.77200246, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.3828125, + "step": 13193, + "time_per_iteration": 2.413346529006958 + }, + { + "auxiliary_loss_clip": 0.01058605, + "auxiliary_loss_mlp": 0.01023743, + "balance_loss_clip": 1.011482, + "balance_loss_mlp": 1.01986337, + "epoch": 0.7932661957011874, + "flos": 29933552862720.0, + "grad_norm": 1.4942350936581301, + "language_loss": 0.71932179, + "learning_rate": 4.0719908897883526e-07, + "loss": 0.74014527, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.38671875, + "step": 13194, + "time_per_iteration": 2.497157335281372 + }, + { + "auxiliary_loss_clip": 0.01059379, + "auxiliary_loss_mlp": 0.01025451, + "balance_loss_clip": 1.0131309, + "balance_loss_mlp": 1.01861, + "epoch": 0.7933263189538554, + "flos": 22558185993600.0, + "grad_norm": 1.9848895463579719, + "language_loss": 0.70114458, + "learning_rate": 4.06970621929599e-07, + "loss": 0.72199285, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40820312, + "step": 13195, + "time_per_iteration": 2.420795440673828 + }, + { + "auxiliary_loss_clip": 0.01055856, + "auxiliary_loss_mlp": 0.01021912, + "balance_loss_clip": 1.01095676, + "balance_loss_mlp": 1.01828361, + "epoch": 0.7933864422065233, + "flos": 25478007240960.0, + "grad_norm": 1.4803727532140225, + "language_loss": 0.77564299, + "learning_rate": 4.067422117314834e-07, + "loss": 0.79642057, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.375, + "step": 13196, + "time_per_iteration": 2.477854013442993 + }, + { + "auxiliary_loss_clip": 0.01060382, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.01550496, + "balance_loss_mlp": 1.02012384, + "epoch": 0.7934465654591913, + "flos": 33581767138560.0, + "grad_norm": 3.3007237329374766, + "language_loss": 0.68962604, + "learning_rate": 4.0651385839264e-07, + "loss": 0.71050113, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.40234375, + "step": 13197, + "time_per_iteration": 2.5177528858184814 + }, + { + "auxiliary_loss_clip": 0.0105536, + "auxiliary_loss_mlp": 0.01022919, + "balance_loss_clip": 1.0114212, + "balance_loss_mlp": 1.01816618, + "epoch": 0.7935066887118594, + "flos": 31431152165760.0, + "grad_norm": 2.78822103025398, + "language_loss": 0.652789, + "learning_rate": 4.0628556192121753e-07, + "loss": 0.67357177, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37304688, + "step": 13198, + "time_per_iteration": 2.511309862136841 + }, + { + "auxiliary_loss_clip": 0.01061138, + "auxiliary_loss_mlp": 0.01028417, + "balance_loss_clip": 1.01538777, + "balance_loss_mlp": 1.02054501, + "epoch": 0.7935668119645273, + "flos": 14681780069760.0, + "grad_norm": 2.106602637681059, + "language_loss": 0.72296864, + "learning_rate": 4.0605732232536494e-07, + "loss": 0.74386418, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.40625, + "step": 13199, + "time_per_iteration": 2.373716354370117 + }, + { + "auxiliary_loss_clip": 0.01057504, + "auxiliary_loss_mlp": 0.01020536, + "balance_loss_clip": 1.00912797, + "balance_loss_mlp": 1.0189116, + "epoch": 0.7936269352171953, + "flos": 18003277042560.0, + "grad_norm": 1.8352975068674109, + "language_loss": 0.80845737, + "learning_rate": 4.058291396132252e-07, + "loss": 0.8292377, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.38671875, + "step": 13200, + "time_per_iteration": 2.4408621788024902 + }, + { + "auxiliary_loss_clip": 0.01055456, + "auxiliary_loss_mlp": 0.0102622, + "balance_loss_clip": 1.01506734, + "balance_loss_mlp": 1.0189569, + "epoch": 0.7936870584698632, + "flos": 18879212943360.0, + "grad_norm": 1.939383103768386, + "language_loss": 0.77335232, + "learning_rate": 4.0560101379294333e-07, + "loss": 0.79416907, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36523438, + "step": 13201, + "time_per_iteration": 2.3845221996307373 + }, + { + "auxiliary_loss_clip": 0.01055112, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.01471162, + "balance_loss_mlp": 1.0181495, + "epoch": 0.7937471817225312, + "flos": 23365901364480.0, + "grad_norm": 1.5290924440410059, + "language_loss": 0.67510223, + "learning_rate": 4.053729448726595e-07, + "loss": 0.69591278, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.36914062, + "step": 13202, + "time_per_iteration": 2.4227826595306396 + }, + { + "auxiliary_loss_clip": 0.01056897, + "auxiliary_loss_mlp": 0.01025734, + "balance_loss_clip": 1.01364589, + "balance_loss_mlp": 1.0179764, + "epoch": 0.7938073049751991, + "flos": 22673329257600.0, + "grad_norm": 4.744065577517871, + "language_loss": 0.74889338, + "learning_rate": 4.051449328605145e-07, + "loss": 0.76971972, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.38867188, + "step": 13203, + "time_per_iteration": 2.433119535446167 + }, + { + "auxiliary_loss_clip": 0.01054964, + "auxiliary_loss_mlp": 0.01023902, + "balance_loss_clip": 1.01275623, + "balance_loss_mlp": 1.01713622, + "epoch": 0.7938674282278672, + "flos": 22850478829440.0, + "grad_norm": 1.6717169926641082, + "language_loss": 0.66185838, + "learning_rate": 4.0491697776464326e-07, + "loss": 0.68264705, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 13204, + "time_per_iteration": 2.4163665771484375 + }, + { + "auxiliary_loss_clip": 0.01054894, + "auxiliary_loss_mlp": 0.01023173, + "balance_loss_clip": 1.01196718, + "balance_loss_mlp": 1.01782846, + "epoch": 0.7939275514805351, + "flos": 27011392554240.0, + "grad_norm": 1.5183371382448743, + "language_loss": 0.79510689, + "learning_rate": 4.0468907959318257e-07, + "loss": 0.81588757, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37109375, + "step": 13205, + "time_per_iteration": 2.4420182704925537 + }, + { + "auxiliary_loss_clip": 0.01053984, + "auxiliary_loss_mlp": 0.01024832, + "balance_loss_clip": 1.01303625, + "balance_loss_mlp": 1.01702547, + "epoch": 0.7939876747332031, + "flos": 21141759335040.0, + "grad_norm": 2.884550295367611, + "language_loss": 0.77173293, + "learning_rate": 4.044612383542656e-07, + "loss": 0.79252112, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.37109375, + "step": 13206, + "time_per_iteration": 2.4168262481689453 + }, + { + "auxiliary_loss_clip": 0.01060114, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.01475573, + "balance_loss_mlp": 1.01971245, + "epoch": 0.794047797985871, + "flos": 23288115121920.0, + "grad_norm": 2.0998059424747004, + "language_loss": 0.78722078, + "learning_rate": 4.042334540560217e-07, + "loss": 0.80809766, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.40429688, + "step": 13207, + "time_per_iteration": 2.40775203704834 + }, + { + "auxiliary_loss_clip": 0.01056159, + "auxiliary_loss_mlp": 0.01025161, + "balance_loss_clip": 1.01360989, + "balance_loss_mlp": 1.01795864, + "epoch": 0.794107921238539, + "flos": 24606924019200.0, + "grad_norm": 1.7728271050802442, + "language_loss": 0.84282738, + "learning_rate": 4.0400572670658174e-07, + "loss": 0.86364061, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.3828125, + "step": 13208, + "time_per_iteration": 2.4201083183288574 + }, + { + "auxiliary_loss_clip": 0.01006655, + "auxiliary_loss_mlp": 0.0100193, + "balance_loss_clip": 1.00089324, + "balance_loss_mlp": 1.00054884, + "epoch": 0.7941680444912069, + "flos": 64090198448640.0, + "grad_norm": 0.7235636492054731, + "language_loss": 0.58318734, + "learning_rate": 4.0377805631407116e-07, + "loss": 0.60327315, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.06103516, + "step": 13209, + "time_per_iteration": 3.135819911956787 + }, + { + "auxiliary_loss_clip": 0.01059704, + "auxiliary_loss_mlp": 0.01023465, + "balance_loss_clip": 1.01157391, + "balance_loss_mlp": 1.01937556, + "epoch": 0.794228167743875, + "flos": 24387704392320.0, + "grad_norm": 2.0672578195570863, + "language_loss": 0.6358875, + "learning_rate": 4.03550442886617e-07, + "loss": 0.65671921, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.40429688, + "step": 13210, + "time_per_iteration": 3.8470637798309326 + }, + { + "auxiliary_loss_clip": 0.01056413, + "auxiliary_loss_mlp": 0.01025197, + "balance_loss_clip": 1.01334119, + "balance_loss_mlp": 1.01760399, + "epoch": 0.794288290996543, + "flos": 28511226184320.0, + "grad_norm": 1.5412414569410098, + "language_loss": 0.69115567, + "learning_rate": 4.0332288643233994e-07, + "loss": 0.71197176, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38867188, + "step": 13211, + "time_per_iteration": 2.4540295600891113 + }, + { + "auxiliary_loss_clip": 0.01058346, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.01739979, + "balance_loss_mlp": 1.01872838, + "epoch": 0.7943484142492109, + "flos": 25920915148800.0, + "grad_norm": 9.990069417976443, + "language_loss": 0.72230113, + "learning_rate": 4.0309538695936227e-07, + "loss": 0.74318928, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.39648438, + "step": 13212, + "time_per_iteration": 2.4348909854888916 + }, + { + "auxiliary_loss_clip": 0.01056233, + "auxiliary_loss_mlp": 0.01021581, + "balance_loss_clip": 1.01081085, + "balance_loss_mlp": 1.01927042, + "epoch": 0.7944085375018789, + "flos": 23914142444160.0, + "grad_norm": 1.7071181345469182, + "language_loss": 0.80485505, + "learning_rate": 4.0286794447580277e-07, + "loss": 0.82563317, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.36914062, + "step": 13213, + "time_per_iteration": 2.4572036266326904 + }, + { + "auxiliary_loss_clip": 0.01060906, + "auxiliary_loss_mlp": 0.01022636, + "balance_loss_clip": 1.00993454, + "balance_loss_mlp": 1.01987696, + "epoch": 0.7944686607545468, + "flos": 20228920260480.0, + "grad_norm": 2.133831197918638, + "language_loss": 0.6795094, + "learning_rate": 4.026405589897779e-07, + "loss": 0.7003448, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.41015625, + "step": 13214, + "time_per_iteration": 2.423119068145752 + }, + { + "auxiliary_loss_clip": 0.01054476, + "auxiliary_loss_mlp": 0.01020221, + "balance_loss_clip": 1.00915873, + "balance_loss_mlp": 1.01781225, + "epoch": 0.7945287840072148, + "flos": 21979919278080.0, + "grad_norm": 2.042918924420206, + "language_loss": 0.72794002, + "learning_rate": 4.024132305094021e-07, + "loss": 0.74868703, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.3671875, + "step": 13215, + "time_per_iteration": 2.4294397830963135 + }, + { + "auxiliary_loss_clip": 0.0105651, + "auxiliary_loss_mlp": 0.01019885, + "balance_loss_clip": 1.00934649, + "balance_loss_mlp": 1.01923525, + "epoch": 0.7945889072598827, + "flos": 26396467044480.0, + "grad_norm": 2.322918391816529, + "language_loss": 0.78990811, + "learning_rate": 4.021859590427896e-07, + "loss": 0.81067204, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.37304688, + "step": 13216, + "time_per_iteration": 2.4822158813476562 + }, + { + "auxiliary_loss_clip": 0.01056744, + "auxiliary_loss_mlp": 0.01020292, + "balance_loss_clip": 1.00924134, + "balance_loss_mlp": 1.01845551, + "epoch": 0.7946490305125508, + "flos": 25809123375360.0, + "grad_norm": 1.7892137562645682, + "language_loss": 0.73688978, + "learning_rate": 4.0195874459804923e-07, + "loss": 0.75766015, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.3828125, + "step": 13217, + "time_per_iteration": 2.4331812858581543 + }, + { + "auxiliary_loss_clip": 0.01054043, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.01653337, + "balance_loss_mlp": 1.01851833, + "epoch": 0.7947091537652187, + "flos": 15960055011840.0, + "grad_norm": 1.8163704964800835, + "language_loss": 0.74339288, + "learning_rate": 4.017315871832909e-07, + "loss": 0.76420331, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.35546875, + "step": 13218, + "time_per_iteration": 2.445070266723633 + }, + { + "auxiliary_loss_clip": 0.01058747, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.02231789, + "balance_loss_mlp": 1.0184288, + "epoch": 0.7947692770178867, + "flos": 18586885196160.0, + "grad_norm": 2.072340378933994, + "language_loss": 0.83683074, + "learning_rate": 4.0150448680662064e-07, + "loss": 0.85776472, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.40234375, + "step": 13219, + "time_per_iteration": 5.3497536182403564 + }, + { + "auxiliary_loss_clip": 0.0105995, + "auxiliary_loss_mlp": 0.01023779, + "balance_loss_clip": 1.01111889, + "balance_loss_mlp": 1.01961315, + "epoch": 0.7948294002705546, + "flos": 20441367083520.0, + "grad_norm": 1.7966558376413435, + "language_loss": 0.75579327, + "learning_rate": 4.012774434761443e-07, + "loss": 0.77663058, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.40429688, + "step": 13220, + "time_per_iteration": 2.4496653079986572 + }, + { + "auxiliary_loss_clip": 0.01057628, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.01663673, + "balance_loss_mlp": 1.0182544, + "epoch": 0.7948895235232226, + "flos": 38179653840000.0, + "grad_norm": 1.767728882084917, + "language_loss": 0.72725737, + "learning_rate": 4.0105045719996333e-07, + "loss": 0.74811918, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.39453125, + "step": 13221, + "time_per_iteration": 2.571389675140381 + }, + { + "auxiliary_loss_clip": 0.01054867, + "auxiliary_loss_mlp": 0.01024678, + "balance_loss_clip": 1.01352572, + "balance_loss_mlp": 1.01805663, + "epoch": 0.7949496467758905, + "flos": 15558902956800.0, + "grad_norm": 2.0944796707756788, + "language_loss": 0.74348176, + "learning_rate": 4.008235279861778e-07, + "loss": 0.76427722, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.3671875, + "step": 13222, + "time_per_iteration": 2.392561912536621 + }, + { + "auxiliary_loss_clip": 0.01006459, + "auxiliary_loss_mlp": 0.01001441, + "balance_loss_clip": 1.00051129, + "balance_loss_mlp": 1.00045133, + "epoch": 0.7950097700285585, + "flos": 70893898715520.0, + "grad_norm": 0.7670673840793614, + "language_loss": 0.67115909, + "learning_rate": 4.0059665584288817e-07, + "loss": 0.69123811, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.06005859, + "step": 13223, + "time_per_iteration": 2.902184009552002 + }, + { + "auxiliary_loss_clip": 0.01055649, + "auxiliary_loss_mlp": 0.01023482, + "balance_loss_clip": 1.01211512, + "balance_loss_mlp": 1.01783085, + "epoch": 0.7950698932812266, + "flos": 23950487036160.0, + "grad_norm": 1.7668615356895292, + "language_loss": 0.70854729, + "learning_rate": 4.003698407781888e-07, + "loss": 0.72933859, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.37890625, + "step": 13224, + "time_per_iteration": 2.4430086612701416 + }, + { + "auxiliary_loss_clip": 0.01006407, + "auxiliary_loss_mlp": 0.01001052, + "balance_loss_clip": 1.00007439, + "balance_loss_mlp": 1.0003078, + "epoch": 0.7951300165338945, + "flos": 60279638883840.0, + "grad_norm": 0.6687571024045246, + "language_loss": 0.55102599, + "learning_rate": 4.001430828001753e-07, + "loss": 0.57110059, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.06103516, + "step": 13225, + "time_per_iteration": 3.1176469326019287 + }, + { + "auxiliary_loss_clip": 0.01053417, + "auxiliary_loss_mlp": 0.01020707, + "balance_loss_clip": 1.00984108, + "balance_loss_mlp": 1.01724648, + "epoch": 0.7951901397865625, + "flos": 22817939575680.0, + "grad_norm": 2.0702653648411147, + "language_loss": 0.64831197, + "learning_rate": 3.9991638191694e-07, + "loss": 0.6690532, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.36132812, + "step": 13226, + "time_per_iteration": 2.413119316101074 + }, + { + "auxiliary_loss_clip": 0.01057378, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.0183816, + "balance_loss_mlp": 1.01852894, + "epoch": 0.7952502630392304, + "flos": 35694326862720.0, + "grad_norm": 1.8049807230823063, + "language_loss": 0.70423555, + "learning_rate": 3.9968973813657316e-07, + "loss": 0.72510451, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.390625, + "step": 13227, + "time_per_iteration": 3.9796862602233887 + }, + { + "auxiliary_loss_clip": 0.01054285, + "auxiliary_loss_mlp": 0.01022071, + "balance_loss_clip": 1.01195621, + "balance_loss_mlp": 1.01815259, + "epoch": 0.7953103862918984, + "flos": 25628657224320.0, + "grad_norm": 1.6454584145315285, + "language_loss": 0.69100487, + "learning_rate": 3.994631514671625e-07, + "loss": 0.71176845, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.359375, + "step": 13228, + "time_per_iteration": 2.4394731521606445 + }, + { + "auxiliary_loss_clip": 0.01056566, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.01775718, + "balance_loss_mlp": 1.01817036, + "epoch": 0.7953705095445663, + "flos": 40550396135040.0, + "grad_norm": 1.3595288285716138, + "language_loss": 0.70875049, + "learning_rate": 3.992366219167955e-07, + "loss": 0.72960681, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.3828125, + "step": 13229, + "time_per_iteration": 2.580904722213745 + }, + { + "auxiliary_loss_clip": 0.01060922, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.01459539, + "balance_loss_mlp": 1.02067995, + "epoch": 0.7954306327972344, + "flos": 27635429928960.0, + "grad_norm": 2.055095336862586, + "language_loss": 0.71767992, + "learning_rate": 3.990101494935558e-07, + "loss": 0.73855901, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.40234375, + "step": 13230, + "time_per_iteration": 2.4633538722991943 + }, + { + "auxiliary_loss_clip": 0.01006508, + "auxiliary_loss_mlp": 0.01001801, + "balance_loss_clip": 1.00083554, + "balance_loss_mlp": 1.00043726, + "epoch": 0.7954907560499023, + "flos": 59500481005440.0, + "grad_norm": 0.9030982225082469, + "language_loss": 0.6357559, + "learning_rate": 3.987837342055256e-07, + "loss": 0.65583897, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.06054688, + "step": 13231, + "time_per_iteration": 3.080192804336548 + }, + { + "auxiliary_loss_clip": 0.01055953, + "auxiliary_loss_mlp": 0.01023696, + "balance_loss_clip": 1.01269913, + "balance_loss_mlp": 1.01883316, + "epoch": 0.7955508793025703, + "flos": 20119502459520.0, + "grad_norm": 1.6003746032081483, + "language_loss": 0.69888771, + "learning_rate": 3.9855737606078457e-07, + "loss": 0.71968424, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37109375, + "step": 13232, + "time_per_iteration": 2.434983015060425 + }, + { + "auxiliary_loss_clip": 0.01055142, + "auxiliary_loss_mlp": 0.01021709, + "balance_loss_clip": 1.01030123, + "balance_loss_mlp": 1.01808667, + "epoch": 0.7956110025552382, + "flos": 26504174188800.0, + "grad_norm": 1.509380459615189, + "language_loss": 0.77750087, + "learning_rate": 3.9833107506741226e-07, + "loss": 0.79826939, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37109375, + "step": 13233, + "time_per_iteration": 2.4424448013305664 + }, + { + "auxiliary_loss_clip": 0.01057888, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.01490259, + "balance_loss_mlp": 1.018255, + "epoch": 0.7956711258079062, + "flos": 22564365304320.0, + "grad_norm": 1.8222134950822373, + "language_loss": 0.78794181, + "learning_rate": 3.9810483123348315e-07, + "loss": 0.80878705, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39648438, + "step": 13234, + "time_per_iteration": 2.4107840061187744 + }, + { + "auxiliary_loss_clip": 0.01055167, + "auxiliary_loss_mlp": 0.01022818, + "balance_loss_clip": 1.01248264, + "balance_loss_mlp": 1.01849377, + "epoch": 0.7957312490605741, + "flos": 17378192327040.0, + "grad_norm": 1.6093258786035363, + "language_loss": 0.70168102, + "learning_rate": 3.978786445670723e-07, + "loss": 0.72246087, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.3671875, + "step": 13235, + "time_per_iteration": 2.3729920387268066 + }, + { + "auxiliary_loss_clip": 0.01055995, + "auxiliary_loss_mlp": 0.01023605, + "balance_loss_clip": 1.01145113, + "balance_loss_mlp": 1.01841116, + "epoch": 0.7957913723132422, + "flos": 22490349488640.0, + "grad_norm": 1.6287395865263128, + "language_loss": 0.67926753, + "learning_rate": 3.9765251507625153e-07, + "loss": 0.70006347, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.375, + "step": 13236, + "time_per_iteration": 2.4099466800689697 + }, + { + "auxiliary_loss_clip": 0.01055697, + "auxiliary_loss_mlp": 0.01023528, + "balance_loss_clip": 1.01199412, + "balance_loss_mlp": 1.01864552, + "epoch": 0.7958514955659101, + "flos": 22636984665600.0, + "grad_norm": 1.7729272527303488, + "language_loss": 0.74884719, + "learning_rate": 3.97426442769091e-07, + "loss": 0.76963949, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.37109375, + "step": 13237, + "time_per_iteration": 2.414081335067749 + }, + { + "auxiliary_loss_clip": 0.01057658, + "auxiliary_loss_mlp": 0.01026534, + "balance_loss_clip": 1.01429129, + "balance_loss_mlp": 1.01935697, + "epoch": 0.7959116188185781, + "flos": 20703180435840.0, + "grad_norm": 2.933865809266817, + "language_loss": 0.71174043, + "learning_rate": 3.9720042765365823e-07, + "loss": 0.73258233, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.3828125, + "step": 13238, + "time_per_iteration": 2.4540088176727295 + }, + { + "auxiliary_loss_clip": 0.01056723, + "auxiliary_loss_mlp": 0.01020777, + "balance_loss_clip": 1.00904703, + "balance_loss_mlp": 1.01850545, + "epoch": 0.7959717420712461, + "flos": 19023718527360.0, + "grad_norm": 1.6283820479974984, + "language_loss": 0.73698854, + "learning_rate": 3.9697446973801885e-07, + "loss": 0.7577635, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 13239, + "time_per_iteration": 2.3815932273864746 + }, + { + "auxiliary_loss_clip": 0.01058103, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.01520121, + "balance_loss_mlp": 1.01874661, + "epoch": 0.796031865323914, + "flos": 26355514152960.0, + "grad_norm": 2.065403586178051, + "language_loss": 0.69366711, + "learning_rate": 3.967485690302381e-07, + "loss": 0.71452814, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.39257812, + "step": 13240, + "time_per_iteration": 2.4290246963500977 + }, + { + "auxiliary_loss_clip": 0.01006563, + "auxiliary_loss_mlp": 0.01000355, + "balance_loss_clip": 0.99933541, + "balance_loss_mlp": 1.0004425, + "epoch": 0.796091988576582, + "flos": 62066493734400.0, + "grad_norm": 0.8894986560323594, + "language_loss": 0.58674288, + "learning_rate": 3.9652272553837583e-07, + "loss": 0.606812, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.0612793, + "step": 13241, + "time_per_iteration": 2.8182222843170166 + }, + { + "auxiliary_loss_clip": 0.01056637, + "auxiliary_loss_mlp": 0.01023109, + "balance_loss_clip": 1.01229668, + "balance_loss_mlp": 1.01893759, + "epoch": 0.7961521118292499, + "flos": 39018546921600.0, + "grad_norm": 2.468219027629776, + "language_loss": 0.64289308, + "learning_rate": 3.9629693927049355e-07, + "loss": 0.66369051, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.37695312, + "step": 13242, + "time_per_iteration": 2.5460193157196045 + }, + { + "auxiliary_loss_clip": 0.01057418, + "auxiliary_loss_mlp": 0.01026588, + "balance_loss_clip": 1.01414824, + "balance_loss_mlp": 1.01882958, + "epoch": 0.796212235081918, + "flos": 21761746992000.0, + "grad_norm": 3.542380505506085, + "language_loss": 0.75763041, + "learning_rate": 3.9607121023464816e-07, + "loss": 0.77847052, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.38671875, + "step": 13243, + "time_per_iteration": 2.395299196243286 + }, + { + "auxiliary_loss_clip": 0.01057836, + "auxiliary_loss_mlp": 0.01024555, + "balance_loss_clip": 1.01282454, + "balance_loss_mlp": 1.01900709, + "epoch": 0.7962723583345859, + "flos": 21177789724800.0, + "grad_norm": 1.7412264535991604, + "language_loss": 0.79559869, + "learning_rate": 3.9584553843889547e-07, + "loss": 0.81642258, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.38867188, + "step": 13244, + "time_per_iteration": 2.4414427280426025 + }, + { + "auxiliary_loss_clip": 0.01058835, + "auxiliary_loss_mlp": 0.01023013, + "balance_loss_clip": 1.01099718, + "balance_loss_mlp": 1.01897597, + "epoch": 0.7963324815872539, + "flos": 17127690255360.0, + "grad_norm": 2.256031647637597, + "language_loss": 0.68318248, + "learning_rate": 3.9561992389128875e-07, + "loss": 0.70400095, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.3984375, + "step": 13245, + "time_per_iteration": 2.3596158027648926 + }, + { + "auxiliary_loss_clip": 0.01057993, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.01441753, + "balance_loss_mlp": 1.01878333, + "epoch": 0.7963926048399218, + "flos": 21396415858560.0, + "grad_norm": 1.4839011302901604, + "language_loss": 0.78961587, + "learning_rate": 3.953943665998802e-07, + "loss": 0.81045759, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.390625, + "step": 13246, + "time_per_iteration": 2.404689073562622 + }, + { + "auxiliary_loss_clip": 0.01056996, + "auxiliary_loss_mlp": 0.01025562, + "balance_loss_clip": 1.0131886, + "balance_loss_mlp": 1.01745737, + "epoch": 0.7964527280925898, + "flos": 25183235698560.0, + "grad_norm": 1.8568349592380167, + "language_loss": 0.6643886, + "learning_rate": 3.9516886657271955e-07, + "loss": 0.68521422, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 13247, + "time_per_iteration": 2.4346210956573486 + }, + { + "auxiliary_loss_clip": 0.01054086, + "auxiliary_loss_mlp": 0.01025022, + "balance_loss_clip": 1.01488972, + "balance_loss_mlp": 1.01760983, + "epoch": 0.7965128513452577, + "flos": 27014674222080.0, + "grad_norm": 1.930831228643912, + "language_loss": 0.71515203, + "learning_rate": 3.949434238178537e-07, + "loss": 0.73594308, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.36523438, + "step": 13248, + "time_per_iteration": 2.44618558883667 + }, + { + "auxiliary_loss_clip": 0.01057931, + "auxiliary_loss_mlp": 0.01025058, + "balance_loss_clip": 1.01243401, + "balance_loss_mlp": 1.01863778, + "epoch": 0.7965729745979258, + "flos": 24418602812160.0, + "grad_norm": 2.334928628169365, + "language_loss": 0.76290458, + "learning_rate": 3.947180383433277e-07, + "loss": 0.78373444, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.39257812, + "step": 13249, + "time_per_iteration": 2.4061801433563232 + }, + { + "auxiliary_loss_clip": 0.01056681, + "auxiliary_loss_mlp": 0.01023794, + "balance_loss_clip": 1.01255298, + "balance_loss_mlp": 1.01934695, + "epoch": 0.7966330978505937, + "flos": 18839481949440.0, + "grad_norm": 3.127821130528732, + "language_loss": 0.6162141, + "learning_rate": 3.944927101571871e-07, + "loss": 0.6370188, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37304688, + "step": 13250, + "time_per_iteration": 3.7892210483551025 + }, + { + "auxiliary_loss_clip": 0.01056992, + "auxiliary_loss_mlp": 0.01023346, + "balance_loss_clip": 1.0114845, + "balance_loss_mlp": 1.01919413, + "epoch": 0.7966932211032617, + "flos": 13151466956160.0, + "grad_norm": 2.1549011440368457, + "language_loss": 0.71653104, + "learning_rate": 3.9426743926747095e-07, + "loss": 0.73733437, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.37890625, + "step": 13251, + "time_per_iteration": 2.394270420074463 + }, + { + "auxiliary_loss_clip": 0.01060085, + "auxiliary_loss_mlp": 0.01024841, + "balance_loss_clip": 1.01182318, + "balance_loss_mlp": 1.02009773, + "epoch": 0.7967533443559297, + "flos": 23948671645440.0, + "grad_norm": 1.8131151076453278, + "language_loss": 0.73235142, + "learning_rate": 3.940422256822191e-07, + "loss": 0.75320065, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.40039062, + "step": 13252, + "time_per_iteration": 2.40120267868042 + }, + { + "auxiliary_loss_clip": 0.01054336, + "auxiliary_loss_mlp": 0.01021634, + "balance_loss_clip": 1.01051211, + "balance_loss_mlp": 1.01852751, + "epoch": 0.7968134676085976, + "flos": 30367593285120.0, + "grad_norm": 1.913452424352483, + "language_loss": 0.66232771, + "learning_rate": 3.9381706940946957e-07, + "loss": 0.68308747, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.35742188, + "step": 13253, + "time_per_iteration": 2.4460058212280273 + }, + { + "auxiliary_loss_clip": 0.01059418, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.01574016, + "balance_loss_mlp": 1.01909184, + "epoch": 0.7968735908612656, + "flos": 23073957642240.0, + "grad_norm": 1.5176823997219717, + "language_loss": 0.67767531, + "learning_rate": 3.9359197045725747e-07, + "loss": 0.69854802, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.40234375, + "step": 13254, + "time_per_iteration": 2.4105193614959717 + }, + { + "auxiliary_loss_clip": 0.01055814, + "auxiliary_loss_mlp": 0.01023232, + "balance_loss_clip": 1.01207411, + "balance_loss_mlp": 1.0184288, + "epoch": 0.7969337141139335, + "flos": 23581245830400.0, + "grad_norm": 2.105748842638186, + "language_loss": 0.69108564, + "learning_rate": 3.933669288336154e-07, + "loss": 0.71187609, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37304688, + "step": 13255, + "time_per_iteration": 2.434213638305664 + }, + { + "auxiliary_loss_clip": 0.01055344, + "auxiliary_loss_mlp": 0.01025058, + "balance_loss_clip": 1.01382852, + "balance_loss_mlp": 1.01757479, + "epoch": 0.7969938373666016, + "flos": 19754834641920.0, + "grad_norm": 1.9407866437576187, + "language_loss": 0.79082876, + "learning_rate": 3.931419445465747e-07, + "loss": 0.81163281, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.37695312, + "step": 13256, + "time_per_iteration": 2.3854732513427734 + }, + { + "auxiliary_loss_clip": 0.01058413, + "auxiliary_loss_mlp": 0.01024335, + "balance_loss_clip": 1.01251554, + "balance_loss_mlp": 1.01986599, + "epoch": 0.7970539606192695, + "flos": 24132943134720.0, + "grad_norm": 1.693168180470689, + "language_loss": 0.66194618, + "learning_rate": 3.929170176041656e-07, + "loss": 0.68277365, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38476562, + "step": 13257, + "time_per_iteration": 2.4157423973083496 + }, + { + "auxiliary_loss_clip": 0.01060156, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.01759481, + "balance_loss_mlp": 1.0201925, + "epoch": 0.7971140838719375, + "flos": 17967630677760.0, + "grad_norm": 2.327717056720125, + "language_loss": 0.7213096, + "learning_rate": 3.926921480144132e-07, + "loss": 0.74220848, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.40039062, + "step": 13258, + "time_per_iteration": 2.3847920894622803 + }, + { + "auxiliary_loss_clip": 0.01057772, + "auxiliary_loss_mlp": 0.01023655, + "balance_loss_clip": 1.01135254, + "balance_loss_mlp": 1.01822984, + "epoch": 0.7971742071246054, + "flos": 19168608136320.0, + "grad_norm": 1.8112407265022272, + "language_loss": 0.68849194, + "learning_rate": 3.9246733578534405e-07, + "loss": 0.70930624, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.39453125, + "step": 13259, + "time_per_iteration": 3.87618350982666 + }, + { + "auxiliary_loss_clip": 0.01055794, + "auxiliary_loss_mlp": 0.01022235, + "balance_loss_clip": 1.01167893, + "balance_loss_mlp": 1.01961231, + "epoch": 0.7972343303772734, + "flos": 27124720427520.0, + "grad_norm": 1.7250417784711114, + "language_loss": 0.73706245, + "learning_rate": 3.9224258092498074e-07, + "loss": 0.75784278, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.36328125, + "step": 13260, + "time_per_iteration": 3.8721532821655273 + }, + { + "auxiliary_loss_clip": 0.01054359, + "auxiliary_loss_mlp": 0.01025089, + "balance_loss_clip": 1.01396108, + "balance_loss_mlp": 1.01802516, + "epoch": 0.7972944536299413, + "flos": 20995578005760.0, + "grad_norm": 1.8093385557351678, + "language_loss": 0.78606296, + "learning_rate": 3.920178834413439e-07, + "loss": 0.80685747, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.36328125, + "step": 13261, + "time_per_iteration": 2.407517910003662 + }, + { + "auxiliary_loss_clip": 0.01053129, + "auxiliary_loss_mlp": 0.01019938, + "balance_loss_clip": 1.00969839, + "balance_loss_mlp": 1.01885653, + "epoch": 0.7973545768826094, + "flos": 21578941779840.0, + "grad_norm": 1.445822321125592, + "language_loss": 0.76222849, + "learning_rate": 3.91793243342452e-07, + "loss": 0.78295916, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.34179688, + "step": 13262, + "time_per_iteration": 2.3819026947021484 + }, + { + "auxiliary_loss_clip": 0.01061044, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.01403666, + "balance_loss_mlp": 1.02016616, + "epoch": 0.7974147001352773, + "flos": 20557487865600.0, + "grad_norm": 2.1522673690093663, + "language_loss": 0.70504296, + "learning_rate": 3.915686606363231e-07, + "loss": 0.72592556, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.41015625, + "step": 13263, + "time_per_iteration": 2.3744029998779297 + }, + { + "auxiliary_loss_clip": 0.01061429, + "auxiliary_loss_mlp": 0.01027737, + "balance_loss_clip": 1.01448083, + "balance_loss_mlp": 1.01947522, + "epoch": 0.7974748233879453, + "flos": 20995717651200.0, + "grad_norm": 1.672410363378535, + "language_loss": 0.70714867, + "learning_rate": 3.9134413533097143e-07, + "loss": 0.72804034, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.41992188, + "step": 13264, + "time_per_iteration": 2.3950250148773193 + }, + { + "auxiliary_loss_clip": 0.01055071, + "auxiliary_loss_mlp": 0.01024847, + "balance_loss_clip": 1.01358151, + "balance_loss_mlp": 1.01832426, + "epoch": 0.7975349466406133, + "flos": 22564086013440.0, + "grad_norm": 3.3427307129641264, + "language_loss": 0.76885939, + "learning_rate": 3.911196674344095e-07, + "loss": 0.78965855, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.3671875, + "step": 13265, + "time_per_iteration": 2.394163131713867 + }, + { + "auxiliary_loss_clip": 0.01056309, + "auxiliary_loss_mlp": 0.01021996, + "balance_loss_clip": 1.01022971, + "balance_loss_mlp": 1.01748228, + "epoch": 0.7975950698932812, + "flos": 21688464314880.0, + "grad_norm": 1.8083105470002074, + "language_loss": 0.75666636, + "learning_rate": 3.90895256954648e-07, + "loss": 0.77744937, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.38671875, + "step": 13266, + "time_per_iteration": 2.3833179473876953 + }, + { + "auxiliary_loss_clip": 0.01055393, + "auxiliary_loss_mlp": 0.0102282, + "balance_loss_clip": 1.01176977, + "balance_loss_mlp": 1.01796985, + "epoch": 0.7976551931459492, + "flos": 19603695899520.0, + "grad_norm": 1.5597155515793963, + "language_loss": 0.84135824, + "learning_rate": 3.9067090389969583e-07, + "loss": 0.86214042, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.375, + "step": 13267, + "time_per_iteration": 3.847872257232666 + }, + { + "auxiliary_loss_clip": 0.01055642, + "auxiliary_loss_mlp": 0.01023756, + "balance_loss_clip": 1.0121932, + "balance_loss_mlp": 1.0183593, + "epoch": 0.7977153163986171, + "flos": 21686579101440.0, + "grad_norm": 1.6825493530635032, + "language_loss": 0.76103669, + "learning_rate": 3.904466082775593e-07, + "loss": 0.78183067, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.37304688, + "step": 13268, + "time_per_iteration": 2.440032482147217 + }, + { + "auxiliary_loss_clip": 0.01056866, + "auxiliary_loss_mlp": 0.01026629, + "balance_loss_clip": 1.01520252, + "balance_loss_mlp": 1.01900339, + "epoch": 0.7977754396512852, + "flos": 23475668279040.0, + "grad_norm": 1.7896339917435877, + "language_loss": 0.77990806, + "learning_rate": 3.902223700962426e-07, + "loss": 0.80074298, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.37890625, + "step": 13269, + "time_per_iteration": 2.459038496017456 + }, + { + "auxiliary_loss_clip": 0.01053498, + "auxiliary_loss_mlp": 0.01023171, + "balance_loss_clip": 1.01266265, + "balance_loss_mlp": 1.01759732, + "epoch": 0.7978355629039531, + "flos": 22381141155840.0, + "grad_norm": 1.747719368568076, + "language_loss": 0.82138854, + "learning_rate": 3.8999818936374964e-07, + "loss": 0.84215522, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.359375, + "step": 13270, + "time_per_iteration": 2.391491651535034 + }, + { + "auxiliary_loss_clip": 0.01056208, + "auxiliary_loss_mlp": 0.01022342, + "balance_loss_clip": 1.0109098, + "balance_loss_mlp": 1.01860464, + "epoch": 0.7978956861566211, + "flos": 20265299763840.0, + "grad_norm": 2.1777000410918066, + "language_loss": 0.59385192, + "learning_rate": 3.8977406608807883e-07, + "loss": 0.61463737, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.375, + "step": 13271, + "time_per_iteration": 2.402200937271118 + }, + { + "auxiliary_loss_clip": 0.01059589, + "auxiliary_loss_mlp": 0.01025046, + "balance_loss_clip": 1.01324463, + "balance_loss_mlp": 1.01940036, + "epoch": 0.797955809409289, + "flos": 28111121470080.0, + "grad_norm": 1.6656432582930378, + "language_loss": 0.7329253, + "learning_rate": 3.895500002772303e-07, + "loss": 0.75377166, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.40234375, + "step": 13272, + "time_per_iteration": 2.4683046340942383 + }, + { + "auxiliary_loss_clip": 0.01057873, + "auxiliary_loss_mlp": 0.01024166, + "balance_loss_clip": 1.01225126, + "balance_loss_mlp": 1.01973999, + "epoch": 0.798015932661957, + "flos": 15558693488640.0, + "grad_norm": 1.7560435787690685, + "language_loss": 0.71772563, + "learning_rate": 3.893259919391989e-07, + "loss": 0.73854601, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.38085938, + "step": 13273, + "time_per_iteration": 2.38565731048584 + }, + { + "auxiliary_loss_clip": 0.01059434, + "auxiliary_loss_mlp": 0.01025572, + "balance_loss_clip": 1.0137285, + "balance_loss_mlp": 1.01851225, + "epoch": 0.7980760559146249, + "flos": 23950068099840.0, + "grad_norm": 2.146120182853053, + "language_loss": 0.7555871, + "learning_rate": 3.89102041081981e-07, + "loss": 0.7764371, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.41015625, + "step": 13274, + "time_per_iteration": 2.414444923400879 + }, + { + "auxiliary_loss_clip": 0.01052371, + "auxiliary_loss_mlp": 0.0102205, + "balance_loss_clip": 1.01240635, + "balance_loss_mlp": 1.01802421, + "epoch": 0.798136179167293, + "flos": 28036826363520.0, + "grad_norm": 1.3417961092296073, + "language_loss": 0.77916002, + "learning_rate": 3.888781477135663e-07, + "loss": 0.79990423, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 0.09667969, + "router_z_loss_mlp": 0.34375, + "step": 13275, + "time_per_iteration": 2.4583959579467773 + }, + { + "auxiliary_loss_clip": 0.01058675, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.01754951, + "balance_loss_mlp": 1.01910627, + "epoch": 0.7981963024199609, + "flos": 35983268208000.0, + "grad_norm": 1.8642136739694597, + "language_loss": 0.63964111, + "learning_rate": 3.88654311841947e-07, + "loss": 0.66053158, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.39453125, + "step": 13276, + "time_per_iteration": 2.485365390777588 + }, + { + "auxiliary_loss_clip": 0.0105488, + "auxiliary_loss_mlp": 0.01021254, + "balance_loss_clip": 1.01032805, + "balance_loss_mlp": 1.01787078, + "epoch": 0.7982564256726289, + "flos": 25883732684160.0, + "grad_norm": 1.5704656959345362, + "language_loss": 0.60777795, + "learning_rate": 3.884305334751101e-07, + "loss": 0.62853932, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.37109375, + "step": 13277, + "time_per_iteration": 2.4547412395477295 + }, + { + "auxiliary_loss_clip": 0.01060153, + "auxiliary_loss_mlp": 0.01026421, + "balance_loss_clip": 1.01468468, + "balance_loss_mlp": 1.02078617, + "epoch": 0.7983165489252969, + "flos": 25737970291200.0, + "grad_norm": 1.9267261164883716, + "language_loss": 0.79577821, + "learning_rate": 3.8820681262104226e-07, + "loss": 0.81664395, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 13278, + "time_per_iteration": 2.4830799102783203 + }, + { + "auxiliary_loss_clip": 0.01057651, + "auxiliary_loss_mlp": 0.01026999, + "balance_loss_clip": 1.0145601, + "balance_loss_mlp": 1.01845646, + "epoch": 0.7983766721779648, + "flos": 21907125360000.0, + "grad_norm": 1.773962991839808, + "language_loss": 0.63640046, + "learning_rate": 3.8798314928772656e-07, + "loss": 0.65724695, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39257812, + "step": 13279, + "time_per_iteration": 2.406543254852295 + }, + { + "auxiliary_loss_clip": 0.01052347, + "auxiliary_loss_mlp": 0.01021229, + "balance_loss_clip": 1.01117945, + "balance_loss_mlp": 1.01781559, + "epoch": 0.7984367954306328, + "flos": 29346244104960.0, + "grad_norm": 1.548009196657155, + "language_loss": 0.70768344, + "learning_rate": 3.877595434831462e-07, + "loss": 0.72841924, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.34570312, + "step": 13280, + "time_per_iteration": 2.5110020637512207 + }, + { + "auxiliary_loss_clip": 0.01058635, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.0164032, + "balance_loss_mlp": 1.01960886, + "epoch": 0.7984969186833007, + "flos": 31356438122880.0, + "grad_norm": 1.6292127107943715, + "language_loss": 0.63035345, + "learning_rate": 3.875359952152812e-07, + "loss": 0.65122896, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.390625, + "step": 13281, + "time_per_iteration": 2.4765682220458984 + }, + { + "auxiliary_loss_clip": 0.0105988, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.0153873, + "balance_loss_mlp": 1.01924515, + "epoch": 0.7985570419359688, + "flos": 24311873185920.0, + "grad_norm": 2.0583935483461016, + "language_loss": 0.79910409, + "learning_rate": 3.8731250449210753e-07, + "loss": 0.81997669, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.40625, + "step": 13282, + "time_per_iteration": 2.4273664951324463 + }, + { + "auxiliary_loss_clip": 0.01055137, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.01392305, + "balance_loss_mlp": 1.01692462, + "epoch": 0.7986171651886367, + "flos": 15741324144000.0, + "grad_norm": 1.6694892858414079, + "language_loss": 0.71473187, + "learning_rate": 3.870890713216031e-07, + "loss": 0.73553956, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.3828125, + "step": 13283, + "time_per_iteration": 2.442596912384033 + }, + { + "auxiliary_loss_clip": 0.01053739, + "auxiliary_loss_mlp": 0.01020333, + "balance_loss_clip": 1.00944912, + "balance_loss_mlp": 1.0166266, + "epoch": 0.7986772884413047, + "flos": 11618605313280.0, + "grad_norm": 2.063387185741619, + "language_loss": 0.73269564, + "learning_rate": 3.868656957117404e-07, + "loss": 0.75343633, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37304688, + "step": 13284, + "time_per_iteration": 2.354681968688965 + }, + { + "auxiliary_loss_clip": 0.01055722, + "auxiliary_loss_mlp": 0.01022645, + "balance_loss_clip": 1.01197004, + "balance_loss_mlp": 1.01825988, + "epoch": 0.7987374116939726, + "flos": 22089965483520.0, + "grad_norm": 1.5644439949202618, + "language_loss": 0.7000227, + "learning_rate": 3.866423776704919e-07, + "loss": 0.72080636, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.375, + "step": 13285, + "time_per_iteration": 2.398601531982422 + }, + { + "auxiliary_loss_clip": 0.01054866, + "auxiliary_loss_mlp": 0.01019175, + "balance_loss_clip": 1.00888157, + "balance_loss_mlp": 1.01797676, + "epoch": 0.7987975349466406, + "flos": 17889844435200.0, + "grad_norm": 1.6094841703348082, + "language_loss": 0.73564076, + "learning_rate": 3.864191172058262e-07, + "loss": 0.75638115, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.36914062, + "step": 13286, + "time_per_iteration": 2.371220827102661 + }, + { + "auxiliary_loss_clip": 0.01057185, + "auxiliary_loss_mlp": 0.01022983, + "balance_loss_clip": 1.01150346, + "balance_loss_mlp": 1.01918483, + "epoch": 0.7988576581993085, + "flos": 19718210759040.0, + "grad_norm": 1.7226976813810408, + "language_loss": 0.75025827, + "learning_rate": 3.8619591432571255e-07, + "loss": 0.77105993, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.38085938, + "step": 13287, + "time_per_iteration": 2.3928823471069336 + }, + { + "auxiliary_loss_clip": 0.01054385, + "auxiliary_loss_mlp": 0.01022546, + "balance_loss_clip": 1.01127446, + "balance_loss_mlp": 1.01770353, + "epoch": 0.7989177814519766, + "flos": 28035150618240.0, + "grad_norm": 1.4919153049673985, + "language_loss": 0.61906147, + "learning_rate": 3.8597276903811446e-07, + "loss": 0.63983071, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.3671875, + "step": 13288, + "time_per_iteration": 2.4558372497558594 + }, + { + "auxiliary_loss_clip": 0.01056936, + "auxiliary_loss_mlp": 0.01023681, + "balance_loss_clip": 1.01287436, + "balance_loss_mlp": 1.01987934, + "epoch": 0.7989779047046445, + "flos": 28869924159360.0, + "grad_norm": 1.992400613759943, + "language_loss": 0.70124942, + "learning_rate": 3.857496813509973e-07, + "loss": 0.72205561, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.37109375, + "step": 13289, + "time_per_iteration": 3.8840672969818115 + }, + { + "auxiliary_loss_clip": 0.01056133, + "auxiliary_loss_mlp": 0.01020775, + "balance_loss_clip": 1.00983119, + "balance_loss_mlp": 1.0192169, + "epoch": 0.7990380279573125, + "flos": 18185907697920.0, + "grad_norm": 1.8593984063401627, + "language_loss": 0.62171745, + "learning_rate": 3.8552665127232073e-07, + "loss": 0.64248645, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.36914062, + "step": 13290, + "time_per_iteration": 2.387965679168701 + }, + { + "auxiliary_loss_clip": 0.01058449, + "auxiliary_loss_mlp": 0.0102346, + "balance_loss_clip": 1.01143742, + "balance_loss_mlp": 1.01928902, + "epoch": 0.7990981512099805, + "flos": 20879073198720.0, + "grad_norm": 2.2113824132512034, + "language_loss": 0.71696389, + "learning_rate": 3.8530367881004656e-07, + "loss": 0.73778301, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.390625, + "step": 13291, + "time_per_iteration": 2.4239273071289062 + }, + { + "auxiliary_loss_clip": 0.0105456, + "auxiliary_loss_mlp": 0.01022343, + "balance_loss_clip": 1.01142991, + "balance_loss_mlp": 1.017452, + "epoch": 0.7991582744626484, + "flos": 26098832770560.0, + "grad_norm": 1.6268774086552549, + "language_loss": 0.6354624, + "learning_rate": 3.850807639721292e-07, + "loss": 0.6562314, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.37109375, + "step": 13292, + "time_per_iteration": 2.4394359588623047 + }, + { + "auxiliary_loss_clip": 0.0105632, + "auxiliary_loss_mlp": 0.01022838, + "balance_loss_clip": 1.01166856, + "balance_loss_mlp": 1.0187571, + "epoch": 0.7992183977153164, + "flos": 35294466528000.0, + "grad_norm": 1.6335375563806467, + "language_loss": 0.56723213, + "learning_rate": 3.8485790676652585e-07, + "loss": 0.58802366, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.375, + "step": 13293, + "time_per_iteration": 2.52298641204834 + }, + { + "auxiliary_loss_clip": 0.01056358, + "auxiliary_loss_mlp": 0.01023611, + "balance_loss_clip": 1.01241755, + "balance_loss_mlp": 1.01870131, + "epoch": 0.7992785209679844, + "flos": 51851781901440.0, + "grad_norm": 2.928002544859207, + "language_loss": 0.71760517, + "learning_rate": 3.846351072011893e-07, + "loss": 0.73840487, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.375, + "step": 13294, + "time_per_iteration": 2.6746654510498047 + }, + { + "auxiliary_loss_clip": 0.01056883, + "auxiliary_loss_mlp": 0.01023025, + "balance_loss_clip": 1.01186693, + "balance_loss_mlp": 1.01881886, + "epoch": 0.7993386442206524, + "flos": 22564016190720.0, + "grad_norm": 1.6873574877086477, + "language_loss": 0.72074074, + "learning_rate": 3.844123652840705e-07, + "loss": 0.74153984, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.37890625, + "step": 13295, + "time_per_iteration": 2.3996481895446777 + }, + { + "auxiliary_loss_clip": 0.01054292, + "auxiliary_loss_mlp": 0.0102286, + "balance_loss_clip": 1.01252484, + "balance_loss_mlp": 1.01869202, + "epoch": 0.7993987674733203, + "flos": 18799471664640.0, + "grad_norm": 1.7194809056275049, + "language_loss": 0.76273394, + "learning_rate": 3.8418968102311866e-07, + "loss": 0.78350544, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.35546875, + "step": 13296, + "time_per_iteration": 2.4231045246124268 + }, + { + "auxiliary_loss_clip": 0.01006584, + "auxiliary_loss_mlp": 0.01001433, + "balance_loss_clip": 1.00059283, + "balance_loss_mlp": 1.00052571, + "epoch": 0.7994588907259883, + "flos": 69418049189760.0, + "grad_norm": 0.6920858630915905, + "language_loss": 0.60810095, + "learning_rate": 3.839670544262801e-07, + "loss": 0.62818116, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.00842285, + "router_z_loss_mlp": 0.06054688, + "step": 13297, + "time_per_iteration": 3.114727258682251 + }, + { + "auxiliary_loss_clip": 0.01057161, + "auxiliary_loss_mlp": 0.01025028, + "balance_loss_clip": 1.01431704, + "balance_loss_mlp": 1.01917934, + "epoch": 0.7995190139786562, + "flos": 13479475979520.0, + "grad_norm": 2.0341142717398855, + "language_loss": 0.70445973, + "learning_rate": 3.837444855015015e-07, + "loss": 0.7252816, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.38085938, + "step": 13298, + "time_per_iteration": 2.411306142807007 + }, + { + "auxiliary_loss_clip": 0.01057357, + "auxiliary_loss_mlp": 0.01024942, + "balance_loss_clip": 1.012514, + "balance_loss_mlp": 1.01784992, + "epoch": 0.7995791372313242, + "flos": 21651770609280.0, + "grad_norm": 2.166303350610755, + "language_loss": 0.75859219, + "learning_rate": 3.835219742567237e-07, + "loss": 0.77941519, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.39453125, + "step": 13299, + "time_per_iteration": 3.845961570739746 + }, + { + "auxiliary_loss_clip": 0.01054119, + "auxiliary_loss_mlp": 0.01022134, + "balance_loss_clip": 1.01133955, + "balance_loss_mlp": 1.0178988, + "epoch": 0.7996392604839921, + "flos": 26066921921280.0, + "grad_norm": 1.778527215558121, + "language_loss": 0.7769053, + "learning_rate": 3.8329952069988925e-07, + "loss": 0.79766774, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.36132812, + "step": 13300, + "time_per_iteration": 2.42018461227417 + }, + { + "auxiliary_loss_clip": 0.01057767, + "auxiliary_loss_mlp": 0.01024524, + "balance_loss_clip": 1.01277041, + "balance_loss_mlp": 1.01910973, + "epoch": 0.7996993837366602, + "flos": 24605771944320.0, + "grad_norm": 1.9432539505543454, + "language_loss": 0.6366576, + "learning_rate": 3.8307712483893596e-07, + "loss": 0.65748048, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.38671875, + "step": 13301, + "time_per_iteration": 2.411553382873535 + }, + { + "auxiliary_loss_clip": 0.01054757, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.01634264, + "balance_loss_mlp": 1.01874185, + "epoch": 0.7997595069893281, + "flos": 20991109351680.0, + "grad_norm": 1.589623746475443, + "language_loss": 0.71534425, + "learning_rate": 3.8285478668180103e-07, + "loss": 0.73616433, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.359375, + "step": 13302, + "time_per_iteration": 2.4010753631591797 + }, + { + "auxiliary_loss_clip": 0.01058869, + "auxiliary_loss_mlp": 0.01025626, + "balance_loss_clip": 1.01381254, + "balance_loss_mlp": 1.01918769, + "epoch": 0.7998196302419961, + "flos": 24425340704640.0, + "grad_norm": 1.9284769435309586, + "language_loss": 0.8407622, + "learning_rate": 3.826325062364184e-07, + "loss": 0.86160719, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.39648438, + "step": 13303, + "time_per_iteration": 2.419189453125 + }, + { + "auxiliary_loss_clip": 0.01055467, + "auxiliary_loss_mlp": 0.0102619, + "balance_loss_clip": 1.01550937, + "balance_loss_mlp": 1.01863527, + "epoch": 0.7998797534946641, + "flos": 30263307454080.0, + "grad_norm": 2.0225070652315886, + "language_loss": 0.5900842, + "learning_rate": 3.8241028351072234e-07, + "loss": 0.61090076, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.3671875, + "step": 13304, + "time_per_iteration": 2.4571754932403564 + }, + { + "auxiliary_loss_clip": 0.01057262, + "auxiliary_loss_mlp": 0.01021165, + "balance_loss_clip": 1.00944066, + "balance_loss_mlp": 1.01779628, + "epoch": 0.799939876747332, + "flos": 23512850743680.0, + "grad_norm": 1.6361914140108722, + "language_loss": 0.69173026, + "learning_rate": 3.821881185126412e-07, + "loss": 0.71251452, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.39453125, + "step": 13305, + "time_per_iteration": 2.418323516845703 + }, + { + "auxiliary_loss_clip": 0.01055343, + "auxiliary_loss_mlp": 0.01020495, + "balance_loss_clip": 1.01032615, + "balance_loss_mlp": 1.01886272, + "epoch": 0.8, + "flos": 19317093615360.0, + "grad_norm": 1.5239494959133248, + "language_loss": 0.69316894, + "learning_rate": 3.819660112501053e-07, + "loss": 0.71392727, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.36523438, + "step": 13306, + "time_per_iteration": 2.3933582305908203 + }, + { + "auxiliary_loss_clip": 0.01056618, + "auxiliary_loss_mlp": 0.0102479, + "balance_loss_clip": 1.01363206, + "balance_loss_mlp": 1.01879776, + "epoch": 0.800060123252668, + "flos": 32411164429440.0, + "grad_norm": 1.4027450806072057, + "language_loss": 0.69361687, + "learning_rate": 3.817439617310396e-07, + "loss": 0.71443093, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.37890625, + "step": 13307, + "time_per_iteration": 3.8959591388702393 + }, + { + "auxiliary_loss_clip": 0.01056214, + "auxiliary_loss_mlp": 0.01026975, + "balance_loss_clip": 1.01599562, + "balance_loss_mlp": 1.01848042, + "epoch": 0.800120246505336, + "flos": 20009595899520.0, + "grad_norm": 1.8027634020281766, + "language_loss": 0.76977444, + "learning_rate": 3.815219699633705e-07, + "loss": 0.79060638, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.37695312, + "step": 13308, + "time_per_iteration": 2.402228832244873 + }, + { + "auxiliary_loss_clip": 0.0105854, + "auxiliary_loss_mlp": 0.0102754, + "balance_loss_clip": 1.01462328, + "balance_loss_mlp": 1.01803279, + "epoch": 0.8001803697580039, + "flos": 31866938156160.0, + "grad_norm": 1.4515852351456338, + "language_loss": 0.78800404, + "learning_rate": 3.8130003595501803e-07, + "loss": 0.80886483, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.40625, + "step": 13309, + "time_per_iteration": 2.4836089611053467 + }, + { + "auxiliary_loss_clip": 0.01006686, + "auxiliary_loss_mlp": 0.01000572, + "balance_loss_clip": 0.99973798, + "balance_loss_mlp": 1.00057352, + "epoch": 0.8002404930106719, + "flos": 63388828679040.0, + "grad_norm": 0.9194706734516416, + "language_loss": 0.64679009, + "learning_rate": 3.810781597139039e-07, + "loss": 0.66686267, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.00836182, + "router_z_loss_mlp": 0.06103516, + "step": 13310, + "time_per_iteration": 2.958512544631958 + }, + { + "auxiliary_loss_clip": 0.0105764, + "auxiliary_loss_mlp": 0.01025808, + "balance_loss_clip": 1.01391077, + "balance_loss_mlp": 1.01929331, + "epoch": 0.8003006162633398, + "flos": 27854230619520.0, + "grad_norm": 1.8054870124255025, + "language_loss": 0.83044225, + "learning_rate": 3.808563412479464e-07, + "loss": 0.85127676, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.3828125, + "step": 13311, + "time_per_iteration": 2.456155776977539 + }, + { + "auxiliary_loss_clip": 0.01056135, + "auxiliary_loss_mlp": 0.01022755, + "balance_loss_clip": 1.01145434, + "balance_loss_mlp": 1.01920259, + "epoch": 0.8003607395160078, + "flos": 18222357024000.0, + "grad_norm": 1.9485894370272892, + "language_loss": 0.66259027, + "learning_rate": 3.8063458056506016e-07, + "loss": 0.68337917, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.36914062, + "step": 13312, + "time_per_iteration": 2.3873751163482666 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 287185980, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1194295986946048e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}